Convolutional Neural Networks for MNIST





  • im2col（順伝搬において4次元テンソルを2次元の行列に変換）
def im2col(input_data, fil_h, fil_w, stride=1, pad=0):
    """Unfold every filter window of a 4-D batch into one row of a 2-D matrix.

    Parameters
    ----------
    input_data : ndarray of shape (N, C, H, W)
        Batch size, channels, height and width of the input.
    fil_h, fil_w : int
        Height and width of the filter window.
    stride : int
        Step between neighbouring windows.
    pad : int
        Number of zeros added on each side of the height and width axes.

    Returns
    -------
    ndarray of shape (N*out_h*out_w, C*fil_h*fil_w)
        One row per (sample, output row, output column); within a row the
        values are ordered channel-major, then filter row, then filter column.
    """
    N, C, H, W = input_data.shape
    out_h = (H + 2*pad - fil_h) // stride + 1
    out_w = (W + 2*pad - fil_w) // stride + 1
    img = np.pad(input_data, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')

    # Iterate over the (few) filter offsets rather than the (many) output
    # positions: a strided slice gathers, for a fixed offset, the value seen
    # at that offset by every window at once.
    col = np.zeros((N, C, fil_h, fil_w, out_h, out_w))
    for dy in range(fil_h):
        y_stop = dy + stride*out_h
        for dx in range(fil_w):
            x_stop = dx + stride*out_w
            col[:, :, dy, dx, :, :] = img[:, :, dy:y_stop:stride, dx:x_stop:stride]

    # (N, C, fh, fw, oh, ow) -> (N, oh, ow, C, fh, fw) -> 2-D; -1 = C*fil_h*fil_w
    return col.transpose(0, 4, 5, 1, 2, 3).reshape(N*out_h*out_w, -1)
  • col2im（逆伝搬において2次元の行列を4次元テンソルに変換）
def col2im(col, img_shape, fil_h, fil_w, stride=1, pad=0):
    """Fold a 2-D patch matrix back into a 4-D image batch, summing overlaps.

    Parameters
    ----------
    col : ndarray of shape (N*out_h*out_w, C*fil_h*fil_w)
        One flattened filter window per row.
    img_shape : tuple (N, C, H, W)
        Shape of the image batch to rebuild.
    fil_h, fil_w : int
        Height and width of the filter window.
    stride : int
        Step between neighbouring windows.
    pad : int
        Padding that was applied on each side of the height and width axes.

    Returns
    -------
    ndarray of shape (N, C, H, W)
        Each pixel holds the sum of all window entries that cover it; the
        padded border is cropped away.
    """
    batch, channels, height, width = img_shape
    n_rows = (height + 2*pad - fil_h) // stride + 1
    n_cols = (width + 2*pad - fil_w) // stride + 1

    # patches[:, r, c] is the (N, C, fil_h, fil_w) window at output cell (r, c).
    patches = col.reshape(batch, n_rows, n_cols, channels, fil_h, fil_w)

    canvas = np.zeros((batch, channels,
                       height + 2*pad + stride - 1,
                       width + 2*pad + stride - 1))
    for row in range(n_rows):
        top = row * stride
        rows = slice(top, top + fil_h)
        for column in range(n_cols):
            left = column * stride
            canvas[:, :, rows, left:left + fil_w] += patches[:, row, column]

    # Drop the padding so the result matches img_shape.
    return canvas[:, :, pad:height + pad, pad:width + pad]
  • 畳み込み層の実装
class Convolution:
    """2-D convolution layer evaluated as a matrix product on im2col patches.

    Parameters
    ----------
    filter_shape : tuple (FC, C, FH, FW)
        Number of filters, input channels, filter height and filter width.
    stride : int
        Step between neighbouring filter positions.
    pad : int
        Zero padding applied on each side of the height and width axes.

    Attributes
    ----------
    W : ndarray of shape filter_shape, float32
        Filter weights.
    b : ndarray of shape (FC,), float32
        Per-filter bias.
    dW, db : ndarray or None
        Gradients of the loss w.r.t. W and b, set by ``backward``.
    """

    def __init__(self, filter_shape, stride=1, pad=0):
        # Glorot (Xavier) uniform initialisation: U(-limit, limit) with
        # limit = sqrt(6 / (fan_in + fan_out)).
        in_dim = np.prod(filter_shape[1:])                      # C*FH*FW
        out_dim = np.prod(filter_shape[2:]) * filter_shape[0]   # FH*FW*FC
        limit = np.sqrt(6 / (in_dim + out_dim))
        # size=filter_shape gives W the 4-D layout that forward()/backward()
        # unpack; cast to float32 to match the bias.
        self.W = np.random.uniform(low=-limit,
                                   high=limit,
                                   size=filter_shape).astype('float32')
        self.b = np.zeros(filter_shape[0]).astype('float32')
        self.s = stride
        self.p = pad
        # Cached by forward() for use in backward().
        self.x = None
        self.x_col = None
        self.W_col = None
        # Filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Apply the convolution.

        Parameters
        ----------
        x : ndarray of shape (N, C, H, W)

        Returns
        -------
        ndarray of shape (N, FC, out_h, out_w)
        """
        FC, C, FH, FW = self.W.shape
        N, C, H, W = x.shape
        out_h = (H + 2*self.p - FH) // self.s + 1
        out_w = (W + 2*self.p - FW) // self.s + 1
        x_col = im2col(x, FH, FW, self.s, self.p)
        W_col = self.W.reshape(FC, -1).T  # -1 = C*FH*FW
        out = np.dot(x_col, W_col) + self.b
        # Rows are ordered (N, out_h, out_w); move the filter axis to position 1.
        out = out.reshape(N, out_h, out_w, FC).transpose(0, 3, 1, 2)
        self.x = x
        self.x_col = x_col
        self.W_col = W_col
        return out

    def backward(self, delta):
        """Back-propagate through the layer; must be called after ``forward``.

        Parameters
        ----------
        delta : ndarray of shape (N, FC, out_h, out_w)
            Gradient of the loss w.r.t. the layer output.

        Returns
        -------
        ndarray with the shape of the input passed to ``forward``
            Gradient of the loss w.r.t. that input. ``self.dW`` and
            ``self.db`` are updated as a side effect.
        """
        FC, C, FH, FW = self.W.shape
        # Same row layout as x_col: one row per (sample, output row, output col).
        delta = delta.transpose(0, 2, 3, 1).reshape(-1, FC)
        self.db = np.sum(delta, axis=0)
        self.dW = np.dot(self.x_col.T, delta)
        self.dW = self.dW.T.reshape(FC, C, FH, FW)
        dx = np.dot(delta, self.W_col.T)
        dx = col2im(dx, self.x.shape, FH, FW, self.s, self.p)
        return dx
  • Average poolingの実装
class AveragePooling:
    """Average-pooling layer built on im2col / col2im.

    Parameters
    ----------
    filter_shape : sequence (pool_h, pool_w)
        Height and width of the pooling window.
    stride : int
        Step between neighbouring windows.
    pad : int
        Zero padding applied on each side of the height and width axes
        (padded zeros take part in the mean).
    """

    def __init__(self, filter_shape, stride=1, pad=0):
        self.pool_h = filter_shape[0]
        self.pool_w = filter_shape[1]
        self.s = stride
        self.p = pad
        # Cached by forward() for use in backward().
        self.x = None
        self.batch_size = None

    def forward(self, x):
        """Average each pooling window.

        Parameters
        ----------
        x : ndarray of shape (N, C, H, W)

        Returns
        -------
        ndarray of shape (N, C, out_h, out_w)
        """
        N, C, H, W = x.shape
        out_h = (H + 2*self.p - self.pool_h) // self.s + 1
        out_w = (W + 2*self.p - self.pool_w) // self.s + 1
        col = im2col(x, self.pool_h, self.pool_w, self.s, self.p)
        # One row per (sample, output row, output col, channel) window.
        col = col.reshape(-1, self.pool_h*self.pool_w)
        out = np.mean(col, axis=1, keepdims=True)
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)
        self.x = x
        self.batch_size = N
        return out

    def backward(self, delta):
        """Back-propagate through the pooling; call after ``forward``.

        Parameters
        ----------
        delta : ndarray of shape (N, C, out_h, out_w)
            Gradient of the loss w.r.t. the layer output.

        Returns
        -------
        ndarray with the shape of the input passed to ``forward``
            Gradient of the loss w.r.t. that input.
        """
        N, C, H, W = self.x.shape
        window = self.pool_h * self.pool_w
        # Same row order as in forward(): (sample, output row, output col, channel).
        grad = delta.transpose(0, 2, 3, 1).reshape(-1, 1)
        # d(mean)/d(x_i) = 1/window for every element of the window, so each
        # output gradient is shared equally among its inputs. The scaling is
        # by the window size, not by the batch size.
        spread = grad * np.ones((1, window)) / window
        dx_col = spread.reshape(-1, window*C)
        dx = col2im(dx_col, self.x.shape, self.pool_h, self.pool_w, self.s, self.p)
        return dx
  • Max poolingの実装
class MaxPooling:
    """Max-pooling layer built on im2col / col2im.

    Parameters
    ----------
    filter_shape : sequence (pool_h, pool_w)
        Height and width of the pooling window.
    stride : int
        Step between neighbouring windows.
    pad : int
        Zero padding applied on each side of the height and width axes
        (padded zeros compete in the max).
    """

    def __init__(self, filter_shape, stride=1, pad=0):
        self.pool_h = filter_shape[0]
        self.pool_w = filter_shape[1]
        self.s = stride
        self.p = pad
        # Cached by forward() for use in backward().
        self.x = None
        self.idx = None

    def forward(self, x):
        """Take the maximum of each pooling window.

        Parameters
        ----------
        x : ndarray of shape (N, C, H, W)

        Returns
        -------
        ndarray of shape (N, C, out_h, out_w)
        """
        N, C, H, W = x.shape
        out_h = (H + 2*self.p - self.pool_h) // self.s + 1
        out_w = (W + 2*self.p - self.pool_w) // self.s + 1
        col = im2col(x, self.pool_h, self.pool_w, self.s, self.p)
        # One row per (sample, output row, output col, channel) window.
        col = col.reshape(-1, self.pool_h*self.pool_w)
        idx = np.argmax(col, axis=1)
        out = np.max(col, axis=1, keepdims=True)
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)
        self.x = x
        self.idx = idx  # position of the winner inside each window
        return out

    def backward(self, delta):
        """Back-propagate through the pooling; call after ``forward``.

        Parameters
        ----------
        delta : ndarray of shape (N, C, out_h, out_w)
            Gradient of the loss w.r.t. the layer output.

        Returns
        -------
        ndarray with the shape of the input passed to ``forward``
            Gradient of the loss w.r.t. that input.
        """
        N, C, H, W = self.x.shape
        window = self.pool_h * self.pool_w
        # Same row order as in forward(): (sample, output row, output col, channel).
        grad = delta.transpose(0, 2, 3, 1).flatten()
        # The max passes the gradient unchanged to the winning element and
        # zero to the rest; no batch-size scaling belongs here.
        dmax = np.zeros((grad.size, window))
        dmax[np.arange(grad.size), self.idx] = grad
        dx_col = dmax.reshape(-1, window*C)
        dx = col2im(dx_col, self.x.shape, self.pool_h, self.pool_w, self.s, self.p)
        return dx


今回用いたCNNの構造を以下に示す。プーリング層にはMax poolingまたはAverage poolingを用いた。

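| Layer       | Kernel  | Stride | Output   |
|-------------|---------|--------|----------|
| Input       |         |        | 1x28x28  |
| Convolution | 16x5x5  | 1      | 16x24x24 |
| Batch Norm  |         |        |          |
| Pooling     | 2x2     | 2      | 16x12x12 |
| Convolution | 32x5x5  | 1      | 32x8x8   |
| Batch Norm  |         |        |          |
| Pooling     | 2x2     | 2      | 32x4x4   |
| Affine      |         |        | 256      |
| Batch Norm  |         |        |          |
| Affine      |         |        | 10       |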

確率的勾配降下法を使って20回の学習を実行した。F1スコアは、Max poolingを使った場合が0.982、Average poolingを使った場合が0.986となった。全結合層だけで構成されたMLPのスコアは0.980だったので、CNNによってわずかに精度が上がったと言える。

