DeepLab Semantic Segmentation: Source Code Analysis

Preface

Algorithmic understanding and engineering are two indispensable skills for an algorithm engineer. I previously introduced DeepLab V1, V2, and V3, but something always felt missing: with only the papers and no source code, it is little more than armchair theory. So today I will walk through these three algorithms together with their source code, and once they are clear I may follow up with an analysis of DeepLab V3 Plus. Since I have recently been reading the PyTorch edition of "Dive into Deep Learning", I will use a PyTorch implementation for the analysis. All of the code analyzed here comes from this PyTorch project: https://github.com/kazuto1011/deeplab-pytorch/tree/master/libs/models

DeepLab V1 Source Code Analysis

For the principles behind DeepLab V1, see my earlier post: https://mp.weixin.qq.com/s/rvP8-Y-CRuq4HFzR0qJWcg . The network analyzed today combines atrous (dilated) convolution with ResNet residual blocks: the first layer is an ordinary convolution with stride = 2, immediately followed by a stride = 2 max pooling, then an ordinary residual layer, a residual layer with stride = 2, and finally residual layers with dilation = 2 and dilation = 4.
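Before the full listing, here is a minimal sketch (my own illustration, not part of the repo) of why this layout yields an output stride of 8: only the three stride-2 stages shrink the feature map, while a dilated 3x3 convolution with padding equal to its dilation keeps the resolution unchanged.

import torch
import torch.nn as nn

# Resolution changes come only from the stride-2 stages:
# 513 -> 257 (7x7 conv, stride 2) -> 129 (3x3 max pool, stride 2, ceil_mode)
# -> 65 (first bottleneck of layer3, stride 2), i.e. output stride 8.
x = torch.randn(1, 64, 65, 65)

# A 3x3 atrous convolution with padding == dilation preserves the spatial size,
# so layer4/layer5 can use dilation 2/4 without shrinking the map further.
atrous = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=4, dilation=4)
print(atrous(x).shape)  # torch.Size([1, 64, 65, 65])

With that in mind, the full DeepLab V1 listing follows.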

from __future__ import absolute_import, print_function

import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the DeepLabV1 network structure
class DeepLabV1(nn.Sequential):
    """
    DeepLab v1: Dilated ResNet + 1x1 Conv
    Note that this is just a container for loading the pretrained COCO model and not mentioned as "v1" in papers.
    """

    def __init__(self, n_classes, n_blocks):
        super(DeepLabV1, self).__init__()
        ch = [64 * 2 ** p for p in range(6)]
        self.add_module("layer1", _Stem(ch[0]))
        self.add_module("layer2", _ResLayer(n_blocks[0], ch[0], ch[2], 1, 1))
        self.add_module("layer3", _ResLayer(n_blocks[1], ch[2], ch[3], 2, 1))
        self.add_module("layer4", _ResLayer(n_blocks[2], ch[3], ch[4], 1, 2))
        self.add_module("layer5", _ResLayer(n_blocks[3], ch[4], ch[5], 1, 4))
        self.add_module("fc", nn.Conv2d(2048, n_classes, 1))

# Pick the BatchNorm implementation: use SyncBatchNorm from the encoding package if it is available, otherwise fall back to torch.nn.BatchNorm2d

try:
    from encoding.nn import SyncBatchNorm

    _BATCH_NORM = SyncBatchNorm
except ImportError:
    _BATCH_NORM = nn.BatchNorm2d

_BOTTLENECK_EXPANSION = 4

# Conv + BatchNorm + ReLU building block
class _ConvBnReLU(nn.Sequential):
    """
    Cascade of 2D convolution, batch norm, and ReLU.
    """

    BATCH_NORM = _BATCH_NORM

    def __init__(
            self, in_ch, out_ch, kernel_size, stride, padding, dilation, relu=True
    ):
        super(_ConvBnReLU, self).__init__()
        self.add_module(
            "conv",
            nn.Conv2d(
                in_ch, out_ch, kernel_size, stride, padding, dilation, bias=False
            ),
        )
        self.add_module("bn", _BATCH_NORM(out_ch, eps=1e-5, momentum=0.999))

        if relu:
            self.add_module("relu", nn.ReLU())


# Bottleneck block: a 1x1 convolution first reduces the channel dimension, a 3x3 convolution follows, a final 1x1 convolution
# restores the channels, and the shortcut is added. The reduction factor is set by _BOTTLENECK_EXPANSION; this is the standard ResNet bottleneck.
class _Bottleneck(nn.Module):
    """
    Bottleneck block of MSRA ResNet.
    """

    def __init__(self, in_ch, out_ch, stride, dilation, downsample):
        super(_Bottleneck, self).__init__()
        mid_ch = out_ch // _BOTTLENECK_EXPANSION
        self.reduce = _ConvBnReLU(in_ch, mid_ch, 1, stride, 0, 1, True)
        self.conv3x3 = _ConvBnReLU(mid_ch, mid_ch, 3, 1, dilation, dilation, True)
        self.increase = _ConvBnReLU(mid_ch, out_ch, 1, 1, 0, 1, False)
        self.shortcut = (
            _ConvBnReLU(in_ch, out_ch, 1, stride, 0, 1, False)
            if downsample
            else lambda x: x  # identity
        )

    def forward(self, x):
        h = self.reduce(x)
        h = self.conv3x3(h)
        h = self.increase(h)
        h += self.shortcut(x)
        return F.relu(h)

# Residual layer: the whole DeepLab V1 backbone is built by stacking these layers; downsampling happens only in the
# first Bottleneck of each layer.
class _ResLayer(nn.Sequential):
    """
    Residual layer with multi grids
    """

    def __init__(self, n_layers, in_ch, out_ch, stride, dilation, multi_grids=None):
        super(_ResLayer, self).__init__()

        if multi_grids is None:
            multi_grids = [1 for _ in range(n_layers)]
        else:
            assert n_layers == len(multi_grids)

        # Downsampling is only in the first block
        for i in range(n_layers):
            self.add_module(
                "block{}".format(i + 1),
                _Bottleneck(
                    in_ch=(in_ch if i == 0 else out_ch),
                    out_ch=out_ch,
                    stride=(stride if i == 0 else 1),
                    dilation=dilation * multi_grids[i],
                    downsample=(True if i == 0 else False),
                ),
            )

# Before entering the residual layers, a 7x7 convolution slides over the input image to enlarge the receptive field;
# with stride 2 and padding 3 it halves the resolution, and the following 3x3, stride-2 max pooling halves it again.
class _Stem(nn.Sequential):
    """
    The 1st conv layer.
    Note that the max pooling is different from both MSRA and FAIR ResNet.
    """

    def __init__(self, out_ch):
        super(_Stem, self).__init__()
        self.add_module("conv1", _ConvBnReLU(3, out_ch, 7, 2, 3, 1))
        self.add_module("pool", nn.MaxPool2d(3, 2, 1, ceil_mode=True))

# Equivalent to a reshape; it is not actually used by the network
class _Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)

# Entry point: print the structure of the constructed DeepLab V1 model, along with the input image resolution and the output resolution
if __name__ == "__main__":
    model = DeepLabV1(n_classes=21, n_blocks=[3, 4, 23, 3])
    model.eval()
    image = torch.randn(1, 3, 513, 513)

    print(model)
    print("input:", image.shape)
    print("output:", model(image).shape)

Let's look at the input and output feature map sizes of the network:

input: torch.Size([1, 3, 513, 513])
output: torch.Size([1, 21, 65, 65])

The network structure should now be quite clear; you can run the Python code to print it, or follow the comments in the source above. Note that during training, the ground truth must be resized to the same size as the model's output feature map.
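As a concrete illustration, here is a minimal sketch of that label resizing step (my own example, not code from the repo): the label map is downsampled with nearest-neighbor interpolation so that class indices are never blended, and the loss is then computed at the 65x65 logit resolution.

import torch
import torch.nn.functional as F

logits = torch.randn(1, 21, 65, 65)            # model output for a 513x513 input
labels = torch.randint(0, 21, (1, 513, 513))   # full-resolution ground truth indices
small = F.interpolate(labels.unsqueeze(1).float(), size=logits.shape[2:],
                      mode="nearest").squeeze(1).long()
loss = F.cross_entropy(logits, small, ignore_index=255)  # 255 is a typical "ignore" label
print(loss.item())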

DeepLab V2 Source Code Analysis

For an interpretation of the DeepLab V2 paper, see my earlier article: https://mp.weixin.qq.com/s/ylv3QfOe_BOuVuxQTd_m_g . In short, DeepLab V2 adds an ASPP module on top of DeepLab V1. ASPP is an Inception-like structure made of parallel atrous convolutions with different dilation rates, which strengthens the model's ability to recognize the same object at multiple scales. Here we again focus only on the source code.
For convenience, the ASPP schematic from the previous article is reproduced here:

[Figure: schematic of the ASPP module]
from __future__ import absolute_import, print_function

import torch
import torch.nn as nn
import torch.nn.functional as F


# The ASPP module, which is the main difference between DeepLab V2 and V1; every other part of the code is identical to V1.
# Each branch is a 3x3 atrous convolution with its own rate, and the branch outputs are fused by summation.
class _ASPP(nn.Module):
    """
    Atrous spatial pyramid pooling (ASPP)
    """

    def __init__(self, in_ch, out_ch, rates):
        super(_ASPP, self).__init__()
        for i, rate in enumerate(rates):
            self.add_module(
                "c{}".format(i),
                nn.Conv2d(in_ch, out_ch, 3, 1, padding=rate, dilation=rate, bias=True),
            )

        for m in self.children():
            nn.init.normal_(m.weight, mean=0, std=0.01)
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        return sum([stage(x) for stage in self.children()])


class DeepLabV2(nn.Sequential):
    """
    DeepLab v2: Dilated ResNet + ASPP
    Output stride is fixed at 8
    """

    def __init__(self, n_classes, n_blocks, atrous_rates):
        super(DeepLabV2, self).__init__()
        ch = [64 * 2 ** p for p in range(6)]
        self.add_module("layer1", _Stem(ch[0]))
        self.add_module("layer2", _ResLayer(n_blocks[0], ch[0], ch[2], 1, 1))
        self.add_module("layer3", _ResLayer(n_blocks[1], ch[2], ch[3], 2, 1))
        self.add_module("layer4", _ResLayer(n_blocks[2], ch[3], ch[4], 1, 2))
        self.add_module("layer5", _ResLayer(n_blocks[3], ch[4], ch[5], 1, 4))
        self.add_module("aspp", _ASPP(ch[5], n_classes, atrous_rates))

    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, _ConvBnReLU.BATCH_NORM):
                m.eval()


try:
    from encoding.nn import SyncBatchNorm

    _BATCH_NORM = SyncBatchNorm
except ImportError:
    _BATCH_NORM = nn.BatchNorm2d

_BOTTLENECK_EXPANSION = 4



class _ConvBnReLU(nn.Sequential):
    """
    Cascade of 2D convolution, batch norm, and ReLU.
    """

    BATCH_NORM = _BATCH_NORM

    def __init__(
            self, in_ch, out_ch, kernel_size, stride, padding, dilation, relu=True
    ):
        super(_ConvBnReLU, self).__init__()
        self.add_module(
            "conv",
            nn.Conv2d(
                in_ch, out_ch, kernel_size, stride, padding, dilation, bias=False
            ),
        )
        self.add_module("bn", _BATCH_NORM(out_ch, eps=1e-5, momentum=0.999))

        if relu:
            self.add_module("relu", nn.ReLU())


class _Bottleneck(nn.Module):
    """
    Bottleneck block of MSRA ResNet.
    """

    def __init__(self, in_ch, out_ch, stride, dilation, downsample):
        super(_Bottleneck, self).__init__()
        mid_ch = out_ch // _BOTTLENECK_EXPANSION
        self.reduce = _ConvBnReLU(in_ch, mid_ch, 1, stride, 0, 1, True)
        self.conv3x3 = _ConvBnReLU(mid_ch, mid_ch, 3, 1, dilation, dilation, True)
        self.increase = _ConvBnReLU(mid_ch, out_ch, 1, 1, 0, 1, False)
        self.shortcut = (
            _ConvBnReLU(in_ch, out_ch, 1, stride, 0, 1, False)
            if downsample
            else lambda x: x  # identity
        )

    def forward(self, x):
        h = self.reduce(x)
        h = self.conv3x3(h)
        h = self.increase(h)
        h += self.shortcut(x)
        return F.relu(h)


class _ResLayer(nn.Sequential):
    """
    Residual layer with multi grids
    """

    def __init__(self, n_layers, in_ch, out_ch, stride, dilation, multi_grids=None):
        super(_ResLayer, self).__init__()

        if multi_grids is None:
            multi_grids = [1 for _ in range(n_layers)]
        else:
            assert n_layers == len(multi_grids)

        # Downsampling is only in the first block
        for i in range(n_layers):
            self.add_module(
                "block{}".format(i + 1),
                _Bottleneck(
                    in_ch=(in_ch if i == 0 else out_ch),
                    out_ch=out_ch,
                    stride=(stride if i == 0 else 1),
                    dilation=dilation * multi_grids[i],
                    downsample=(True if i == 0 else False),
                ),
            )


class _Stem(nn.Sequential):
    """
    The 1st conv layer.
    Note that the max pooling is different from both MSRA and FAIR ResNet.
    """

    def __init__(self, out_ch):
        super(_Stem, self).__init__()
        self.add_module("conv1", _ConvBnReLU(3, out_ch, 7, 2, 3, 1))
        self.add_module("pool", nn.MaxPool2d(3, 2, 1, ceil_mode=True))


if __name__ == "__main__":
    model = DeepLabV2(
        n_classes=21, n_blocks=[3, 4, 23, 3], atrous_rates=[6, 12, 18, 24]
    )
    model.eval()
    image = torch.randn(1, 3, 513, 513)

    print(model)
    print("input:", image.shape)
    print("output:", model(image).shape)

Apart from the ASPP module, the DeepLab V2 code is exactly the same as V1, so there is little else to explain. One point worth noting is that DeepLab V2 is trained with the "poly" learning rate policy:

lr_{iter} = lr_{0} \cdot \left(1 - \frac{iter}{iter_{max}}\right)^{power}

With power = 0.9, this achieves roughly 1.17% better performance than the ordinary step learning rate policy. The author also implemented this in his code, as shown below:

from torch.optim.lr_scheduler import _LRScheduler


class PolynomialLR(_LRScheduler):
    def __init__(self, optimizer, step_size, iter_max, power, last_epoch=-1):
        self.step_size = step_size
        self.iter_max = iter_max
        self.power = power
        super(PolynomialLR, self).__init__(optimizer, last_epoch)

    def polynomial_decay(self, lr):
        return lr * (1 - float(self.last_epoch) / self.iter_max) ** self.power

    def get_lr(self):
        if (
            (self.last_epoch == 0)
            or (self.last_epoch % self.step_size != 0)
            or (self.last_epoch > self.iter_max)
        ):
            return [group["lr"] for group in self.optimizer.param_groups]
        return [self.polynomial_decay(lr) for lr in self.base_lrs]

This class inherits directly from PyTorch's learning rate scheduler base class _LRScheduler, so the learning rate can be adjusted simply by calling scheduler.step() during training; in this repo the scheduler is stepped per iteration, and the polynomial decay is applied every step_size iterations.
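A hypothetical usage sketch (the hyperparameters here are illustrative, not the repo's exact training configuration): an SGD optimizer is wrapped with PolynomialLR and the scheduler is stepped once per iteration.

import torch
import torch.nn as nn

model = nn.Conv2d(3, 21, 1)  # stand-in for the segmentation network
optimizer = torch.optim.SGD(model.parameters(), lr=2.5e-4, momentum=0.9, weight_decay=5e-4)
scheduler = PolynomialLR(optimizer, step_size=10, iter_max=20000, power=0.9)

for iteration in range(20000):
    # ... forward pass, loss.backward() and gradient computation would go here ...
    optimizer.step()
    scheduler.step()  # decays the learning rate towards 0 as iteration approaches iter_max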

The input and output resolutions of the network are the same as for DeepLab V1. For the concrete training procedure and data preparation, see the author's GitHub project: https://github.com/kazuto1011/deeplab-pytorch/tree/master/libs/models

DeepLab V3 Source Code Analysis

For the principles of DeepLab V3, see my earlier post: https://mp.weixin.qq.com/s/D9OX89mklaU4tv74OZMqNg . Here is a brief review of the key tricks DeepLab V3 uses.

  • Batch normalization layers are added inside the ASPP module.
  • The multi-grid strategy is used, i.e. several extra layers of atrous convolutions with different rates are appended at the end of the backbone.
  • ASPP with different atrous rates can effectively capture multi-scale information. However, the paper observes that as the sampling rate increases, the number of valid filter weights (weights applied to the valid feature region rather than to zero padding) shrinks; in the extreme case where the atrous rate equals the feature map size, a 3×3 convolution degenerates into a 1×1 convolution (a short numerical check of this is given right after this list). To handle this and to bring global context into the model, image-level features are used: global average pooling is applied to the feature map, the resulting image-level feature is fed into a 1×1 convolution with 256 filters (with batch normalization), and the output is bilinearly upsampled back to the required spatial size.
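The degeneration mentioned in the last bullet can be verified numerically; the snippet below is my own check, not code from the paper or the repo. When the atrous rate is as large as the feature map, every off-center tap of the 3×3 kernel lands in the zero padding, so the layer produces the same output as a 1×1 convolution that keeps only the center weight.

import torch
import torch.nn as nn

H, rate = 8, 8  # feature map size equal to the atrous rate
conv3x3 = nn.Conv2d(1, 1, 3, padding=rate, dilation=rate, bias=False)
conv1x1 = nn.Conv2d(1, 1, 1, bias=False)
conv1x1.weight.data.copy_(conv3x3.weight.data[:, :, 1:2, 1:2])  # keep only the center tap

x = torch.randn(1, 1, H, H)
print(torch.allclose(conv3x3(x), conv1x1(x), atol=1e-6))  # True: the 3x3 conv acts like a 1x1 conv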

The DeepLab V3 source code is as follows:

from __future__ import absolute_import, print_function

from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F

# Image-level pooling: global average pooling followed by a 1x1 convolution with out_ch channels; the result is then
# bilinearly upsampled back to the spatial size of the feature map that entered the pooling branch
class _ImagePool(nn.Module):
    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.conv = _ConvBnReLU(in_ch, out_ch, 1, 1, 0, 1)

    def forward(self, x):
        _, _, H, W = x.shape
        h = self.pool(x)
        h = self.conv(h)
        h = F.interpolate(h, size=(H, W), mode="bilinear", align_corners=False)
        return h

# The ASPP module as improved in DeepLab V3: a 1x1 convolution branch and an image-level pooling branch are added,
# and the branch outputs are concatenated instead of summed.
class _ASPP(nn.Module):
    """
    Atrous spatial pyramid pooling with image-level feature
    """

    def __init__(self, in_ch, out_ch, rates):
        super(_ASPP, self).__init__()
        self.stages = nn.Module()
        self.stages.add_module("c0", _ConvBnReLU(in_ch, out_ch, 1, 1, 0, 1))
        for i, rate in enumerate(rates):
            self.stages.add_module(
                "c{}".format(i + 1),
                _ConvBnReLU(in_ch, out_ch, 3, 1, padding=rate, dilation=rate),
            )
        self.stages.add_module("imagepool", _ImagePool(in_ch, out_ch))

    def forward(self, x):
        return torch.cat([stage(x) for stage in self.stages.children()], dim=1)

# The complete DeepLabV3 structure: a dilated ResNet with the multi-grid strategy plus the improved ASPP
class DeepLabV3(nn.Sequential):
    """
    DeepLab v3: Dilated ResNet with multi-grid + improved ASPP
    """

    def __init__(self, n_classes, n_blocks, atrous_rates, multi_grids, output_stride):
        super(DeepLabV3, self).__init__()

        # Stride and dilation
        if output_stride == 8:
            s = [1, 2, 1, 1]
            d = [1, 1, 2, 4]
        elif output_stride == 16:
            s = [1, 2, 2, 1]
            d = [1, 1, 1, 2]

        ch = [64 * 2 ** p for p in range(6)]
        self.add_module("layer1", _Stem(ch[0]))
        self.add_module("layer2", _ResLayer(n_blocks[0], ch[0], ch[2], s[0], d[0]))
        self.add_module("layer3", _ResLayer(n_blocks[1], ch[2], ch[3], s[1], d[1]))
        self.add_module("layer4", _ResLayer(n_blocks[2], ch[3], ch[4], s[2], d[2]))
        self.add_module(
            "layer5", _ResLayer(n_blocks[3], ch[4], ch[5], s[3], d[3], multi_grids)
        )
        self.add_module("aspp", _ASPP(ch[5], 256, atrous_rates))
        # Concatenate the final features of all branches, feed them into a 1x1 convolution with 256 channels (with BN),
        # and then into a final 1x1 convolution that produces the logits.
        concat_ch = 256 * (len(atrous_rates) + 2)
        self.add_module("fc1", _ConvBnReLU(concat_ch, 256, 1, 1, 0, 1))
        self.add_module("fc2", nn.Conv2d(256, n_classes, kernel_size=1))


try:
    from encoding.nn import SyncBatchNorm

    _BATCH_NORM = SyncBatchNorm
except ImportError:
    _BATCH_NORM = nn.BatchNorm2d

_BOTTLENECK_EXPANSION = 4

# Same definition as in DeepLab V1
class _ConvBnReLU(nn.Sequential):
    """
    Cascade of 2D convolution, batch norm, and ReLU.
    """

    BATCH_NORM = _BATCH_NORM

    def __init__(
            self, in_ch, out_ch, kernel_size, stride, padding, dilation, relu=True
    ):
        super(_ConvBnReLU, self).__init__()
        self.add_module(
            "conv",
            nn.Conv2d(
                in_ch, out_ch, kernel_size, stride, padding, dilation, bias=False
            ),
        )
        self.add_module("bn", _BATCH_NORM(out_ch, eps=1e-5, momentum=0.999))

        if relu:
            self.add_module("relu", nn.ReLU())


class _Bottleneck(nn.Module):
    """
    Bottleneck block of MSRA ResNet.
    """

    def __init__(self, in_ch, out_ch, stride, dilation, downsample):
        super(_Bottleneck, self).__init__()
        mid_ch = out_ch // _BOTTLENECK_EXPANSION
        self.reduce = _ConvBnReLU(in_ch, mid_ch, 1, stride, 0, 1, True)
        self.conv3x3 = _ConvBnReLU(mid_ch, mid_ch, 3, 1, dilation, dilation, True)
        self.increase = _ConvBnReLU(mid_ch, out_ch, 1, 1, 0, 1, False)
        self.shortcut = (
            _ConvBnReLU(in_ch, out_ch, 1, stride, 0, 1, False)
            if downsample
            else lambda x: x  # identity
        )

    def forward(self, x):
        h = self.reduce(x)
        h = self.conv3x3(h)
        h = self.increase(h)
        h += self.shortcut(x)
        return F.relu(h)


class _ResLayer(nn.Sequential):
    """
    Residual layer with multi grids
    """

    def __init__(self, n_layers, in_ch, out_ch, stride, dilation, multi_grids=None):
        super(_ResLayer, self).__init__()

        if multi_grids is None:
            multi_grids = [1 for _ in range(n_layers)]
        else:
            assert n_layers == len(multi_grids)

        # Downsampling is only in the first block
        for i in range(n_layers):
            self.add_module(
                "block{}".format(i + 1),
                _Bottleneck(
                    in_ch=(in_ch if i == 0 else out_ch),
                    out_ch=out_ch,
                    stride=(stride if i == 0 else 1),
                    dilation=dilation * multi_grids[i],
                    downsample=(True if i == 0 else False),
                ),
            )


class _Stem(nn.Sequential):
    """
    The 1st conv layer.
    Note that the max pooling is different from both MSRA and FAIR ResNet.
    """

    def __init__(self, out_ch):
        super(_Stem, self).__init__()
        self.add_module("conv1", _ConvBnReLU(3, out_ch, 7, 2, 3, 1))
        self.add_module("pool", nn.MaxPool2d(3, 2, 1, ceil_mode=True))


if __name__ == "__main__":
    model = DeepLabV3(
        n_classes=21,
        n_blocks=[3, 4, 23, 3],
        atrous_rates=[6, 12, 18],
        multi_grids=[1, 2, 4],
        output_stride=8,
    )
    model.eval()
    image = torch.randn(1, 3, 513, 513)

    print(model)
    print("input:", image.shape)
    print("output:", model(image).shape)

The differences from V1 and V2 are annotated in detail in the source above. The final output of DeepLab V3 has the same resolution as that of V1/V2, and the training labels are prepared in the same way.
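At inference time, one common way (an option of my own choosing, not necessarily the repo's exact post-processing) to obtain a full-resolution prediction is to bilinearly upsample the 65x65 logits back to the input size and take the per-pixel argmax:

import torch
import torch.nn.functional as F

logits = torch.randn(1, 21, 65, 65)  # model output for a 513x513 input
full = F.interpolate(logits, size=(513, 513), mode="bilinear", align_corners=False)
pred = full.argmax(dim=1)            # (1, 513, 513) predicted class index per pixel
print(pred.shape)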

Conclusion

After this source-level walkthrough, you should have a clear picture of the principles behind DeepLab V1, V2, and V3, how the feature map dimensions change, and how the models are trained, so I will stop here for now. When I find time, I will add a paper interpretation and source code analysis of DeepLab V3 Plus, which will wrap up the semantic segmentation series for the time being. After that, I plan to cover object detection and classification networks, so stay tuned.

Code Link

https://github.com/kazuto1011/deeplab-pytorch/tree/master/libs/models

Feel free to follow my WeChat official account GiantPandaCV, where I look forward to discussing machine learning, deep learning, image algorithms, optimization techniques, competitions, and everyday life with you.

