Neural network training does not converge

    happydezhangning · 2019-02-23 12:38:49 +08:00 · 3891 views
    I recently wrote a CIFAR-10 classifier in python + numpy, without any framework, but training does not converge: the more I train, the closer the output probabilities for the classes get to being identical. I can't find anything wrong in the logic or the math of the forward and backward passes, and nothing I've tried has fixed it; it's driving me crazy. Has anyone run into a similar problem, and how did you solve it?
    14 replies · last reply 2019-02-23 19:18:13 +08:00
    baiye23333    #1    2019-02-23 12:53:30 +08:00 via iPhone
    You haven't even posted your code; what do you expect us to say?
    happydezhangning (OP)    #2    2019-02-23 13:07:41 +08:00
    @baiye23333
    import math
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.image as mpimg

    np.set_printoptions(threshold=np.inf)

    # open a CIFAR-10 batch file
    def unpickle(file):
        import pickle
        with open(file, 'rb') as fo:
            dict = pickle.load(fo, encoding='bytes')
        return dict

    dict1 = unpickle('cifar-10-batches-py/data_batch_1')
    dict2 = unpickle('cifar-10-batches-py/data_batch_2')
    dict3 = unpickle('cifar-10-batches-py/data_batch_3')
    dict4 = unpickle('cifar-10-batches-py/data_batch_4')
    dict5 = unpickle('cifar-10-batches-py/data_batch_5')
    test_dict = unpickle("cifar-10-batches-py/test_batch")

    # reshape each batch from (10000, 3072) to (10000, 32, 32, 3) and fix the orientation
    data1_4d = np.reshape(dict1[b'data'], (10000, 32, 32, 3), order='F')
    data1_4d = np.rot90(data1_4d, k=3, axes=(1, 2))
    data2_4d = np.reshape(dict2[b'data'], (10000, 32, 32, 3), order='F')
    data2_4d = np.rot90(data2_4d, k=3, axes=(1, 2))
    data3_4d = np.reshape(dict3[b'data'], (10000, 32, 32, 3), order='F')
    data3_4d = np.rot90(data3_4d, k=3, axes=(1, 2))
    data4_4d = np.reshape(dict4[b'data'], (10000, 32, 32, 3), order='F')
    data4_4d = np.rot90(data4_4d, k=3, axes=(1, 2))
    data5_4d = np.reshape(dict5[b'data'], (10000, 32, 32, 3), order='F')
    data5_4d = np.rot90(data5_4d, k=3, axes=(1, 2))
    test_data = np.reshape(test_dict[b'data'], (10000, 32, 32, 3), order='F')
    test_data = np.rot90(test_data, k=3, axes=(1, 2))

    label1 = dict1[b'labels']
    label2 = dict2[b'labels']
    label3 = dict3[b'labels']
    label4 = dict4[b'labels']
    label5 = dict5[b'labels']
    test_label = test_dict[b'labels']

    def softmax(x):
        # subtract the max for numerical stability
        x -= np.max(x)
        x = np.exp(x) / np.sum(np.exp(x))
        return x

    # parameter and buffer initialization
    weight = np.random.normal(loc=0, scale=0.01, size=(3, 3, 3, 16))   # conv filters
    bias = np.zeros([16], dtype=np.float64)
    conv_out = np.zeros([30, 30, 16], dtype=np.float64)
    Maxpool_out = np.zeros([15, 15, 16], dtype=np.float64)
    weight_of_fc = np.random.uniform(0, 0.1, size=(3600, 10))
    fc_in = np.zeros([1, 3600], dtype=np.float64)
    softmax_out = np.zeros([1, 10], dtype=np.float64)
    Relu_out = np.zeros([30, 30, 16], dtype=np.float64)
    dl_div_weight = np.zeros([3, 3, 3, 16], dtype=np.float64)
    dl_div_bias = np.zeros([16], dtype=np.float64)


    def fc_forward(in_pic):
        global conv_out, weight, Maxpool_out, bias, Relu_out, softmax_out
        global weight_of_fc, fc_in, dl_div_weight, dl_div_bias
        # Convolutional layer: 16 filters of size 3 x 3 x 3, stride 1, padding 0.
        # Layer input 32 x 32 x 3, output 30 x 30 x 16.
        for i in range(16):
            for j in range(30):
                for k in range(30):
                    conv_out[j][k][i] = (in_pic[j:j+3, k:k+3, 0] * weight[:, :, 0, i]).sum() + \
                                        (in_pic[j:j+3, k:k+3, 1] * weight[:, :, 1, i]).sum() + \
                                        (in_pic[j:j+3, k:k+3, 2] * weight[:, :, 2, i]).sum()
        conv_out += bias
        Relu_out = np.choose(conv_out < 0, (conv_out, 0))  # ReLU activation
        for i in range(16):  # max-pooling layer, 2 x 2 window, stride 2
            for j in range(15):
                for k in range(15):
                    Maxpool_out[j][k][i] = np.max(Relu_out[j*2:j*2+2, k*2:k*2+2, i])
        fc_in = np.reshape(Maxpool_out, (1, 3600))
        fc_out = np.dot(fc_in, weight_of_fc)
        softmax_out = softmax(fc_out)
        return np.argmax(fc_out)

    # loss: cross entropy, loss = -log(p[label])
    def back_forward(inputs, label):  # backward pass: update the conv-layer and fc-layer parameters
        global conv_out, weight, Maxpool_out, bias, Relu_out, softmax_out
        global weight_of_fc, fc_in, dl_div_weight, dl_div_bias
        for index, input_picture in enumerate(inputs):
            num_predict = fc_forward(input_picture)
            print("softmax_out : ", softmax_out)
            print("prediction: ", num_predict, "ground truth: ", label[index])
            # dloss/dz of the fc output z is p - y; after this line softmax_out holds dl/dz
            softmax_out[0][label[index]] -= 1

            dw_fc = np.dot(np.transpose(fc_in), softmax_out)
            # fc_in.T is 3600x1, softmax_out is 1x10, so dw_fc is 3600x10
            dl_div_dfc3600 = np.dot(softmax_out, np.transpose(weight_of_fc))
            # weight_of_fc is 3600x10, dl/dz = softmax_out is 1x10, so dl_div_dfc3600 is 1x3600
            dl_div_dMaxpool_out = np.reshape(dl_div_dfc3600, (15, 15, 16))

            # gradient w.r.t. the ReLU output (the pooling input):
            dl_div_dRelu_out = np.zeros([30, 30, 16], dtype=np.float64)
            for i in range(16):
                for j in range(15):
                    for k in range(15):
                        if Maxpool_out[j][k][i] == Relu_out[j*2][k*2][i]:
                            dl_div_dRelu_out[j*2][k*2][i] = dl_div_dMaxpool_out[j][k][i]
                        elif Maxpool_out[j][k][i] == Relu_out[j*2+1][k*2][i]:
                            dl_div_dRelu_out[j*2+1][k*2][i] = dl_div_dMaxpool_out[j][k][i]
                        elif Maxpool_out[j][k][i] == Relu_out[j*2][k*2+1][i]:
                            dl_div_dRelu_out[j*2][k*2+1][i] = dl_div_dMaxpool_out[j][k][i]
                        else:
                            dl_div_dRelu_out[j*2+1][k*2+1][i] = dl_div_dMaxpool_out[j][k][i]
            # gradient of the loss w.r.t. conv_out (the ReLU input)
            dReluout_div_convout = np.choose(conv_out >= 0, (0, 1))  # d relu_out / d relu_in
            dl_div_convout = dReluout_div_convout * dl_div_dRelu_out  # 30 x 30 x 16

            # gradients of the loss w.r.t. the conv weights and bias
            for i in range(16):
                for j in range(3):
                    for k in range(3):
                        for m in range(3):
                            dl_div_weight[k, m, j, i] = \
                                (input_picture[k:k+30, m:m+30, j] * dl_div_convout[:, :, i]).sum()
                dl_div_bias[i] = dl_div_convout[:, :, i].sum()
            # note: the parameters are updated after every single image (batch size 1)
            weight_of_fc = weight_of_fc - 0.001 * dw_fc
            weight = weight - 0.001 * dl_div_weight
            bias = bias - 0.001 * dl_div_bias

    def train():
        back_forward(data1_4d, label1)
        back_forward(data2_4d, label2)
        back_forward(data3_4d, label3)
        back_forward(data4_4d, label4)
        back_forward(data5_4d, label5)

    train()
    happydezhangning (OP)    #3    2019-02-23 13:09:32 +08:00
    @baiye23333 The model requirements:
    1.2 Network Architecture
    Implement a neural network with layers in the following order:
    Input Image size 32 × 32 × 3.
    Convolutional layer Apply 16 filters with size 3 × 3 × 3, stride 1 and padding 0.
    Layer input 32 × 32 × 3, output 30 × 30 × 16.
    ReLU layer Apply ReLU activation function on each component.
    Layer input 30 × 30 × 16, output 30 × 30 × 16.
    Pooling layer Max-pooling with size 2 × 2 and stride 2.
    Layer input 30 × 30 × 16, output 15 × 15 × 16.
    Fully-connected layer Reshape the data to a vector with length 3600 and fully connect to 10 output nodes.
    Layer input 15 × 15 × 16 = 3600, output size 10.
    Softmax layer Apply softmax function to get the final output, indicating the probability in each category.
    Layer input size 10, output size 10.
    Here you should calculate the forward and backward propagation by yourself.
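    (For reference, a minimal numpy sketch of the shapes that architecture produces at each stage; the tensors are dummy zeros, and the reshape-based max-pool is just one convenient way to write a 2 × 2, stride-2 pooling, not part of the assignment.)

    import numpy as np

    x = np.zeros((32, 32, 3))                                # input image (not actually convolved here)
    conv = np.zeros((30, 30, 16))                            # 3x3x3 conv, 16 filters, stride 1, no padding
    relu = np.maximum(conv, 0)                               # ReLU, same shape: 30x30x16
    pool = relu.reshape(15, 2, 15, 2, 16).max(axis=(1, 3))   # 2x2 max-pool, stride 2 -> 15x15x16
    flat = pool.reshape(1, 3600)                             # flatten: 15 * 15 * 16 = 3600
    logits = flat @ np.zeros((3600, 10))                     # fully-connected layer -> 10 logits
    probs = np.exp(logits) / np.exp(logits).sum()            # softmax -> 10 class probabilities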
    baiye23333    #6    2019-02-23 13:38:36 +08:00 via iPhone
    Try visualizing the gradients of every layer. Gaussian initialization only works when the network has few layers; check whether the per-layer gradients are already getting close to 0.
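    (A minimal sketch of that check, assuming the gradient arrays dw_fc, dl_div_weight and dl_div_bias from the code in #2; report_grads is a hypothetical helper, not something in the OP's program.)

    import numpy as np

    def report_grads(**grads):
        # print the L2 norm and the largest entry of each gradient array,
        # so vanishing (close to 0) or exploding gradients are easy to spot
        for name, g in grads.items():
            print(f"{name}: norm={np.linalg.norm(g):.6g}, max|g|={np.abs(g).max():.6g}")

    # e.g. inside back_forward, after the gradients have been computed:
    # report_grads(dw_fc=dw_fc, dl_div_weight=dl_div_weight, dl_div_bias=dl_div_bias)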
    baiye23333    #7    2019-02-23 13:41:20 +08:00 via iPhone
    Then also normalize your data. Personally I think the gradients have vanished.

    But honestly, your code is pretty badly written...
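    (A sketch of that normalization step, assuming the data*_4d arrays from #2 still hold raw 0-255 pixel values; the per-channel mean subtraction is one common choice, and the helper name normalize is made up here.)

    import numpy as np

    def normalize(batch):
        # scale raw 0-255 pixels into [0, 1], then subtract the per-channel mean
        batch = batch.astype(np.float64) / 255.0
        return batch - batch.mean(axis=(0, 1, 2))

    # e.g. data1_4d = normalize(data1_4d), ..., test_data = normalize(test_data)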
    ypw    #8    2019-02-23 13:48:50 +08:00
    https://dpaste.de/O6JB

    At a glance, your batch_size is 1. Take a look at this Zhihu question; training does not converge with batch_size = 1: https://www.zhihu.com/question/32673260/answer/71137399

    https://pic1.zhimg.com/d6fb7abbaeef80e739d824582a0fa384_r.jpg

    Also, you could first get the whole pipeline working in PyTorch, https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html, and then re-implement the details in numpy.
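    (A rough PyTorch sketch of the same architecture, along the lines of that tutorial; the layer sizes follow the assignment spec in #3 and the optimizer settings are placeholders.)

    import torch
    import torch.nn as nn

    model = nn.Sequential(
        nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=0),  # 32x32x3  -> 30x30x16
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),                  # 30x30x16 -> 15x15x16
        nn.Flatten(),                                            # -> 3600
        nn.Linear(3600, 10),                                     # -> 10 logits
    )
    criterion = nn.CrossEntropyLoss()                            # applies log-softmax internally
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)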
    yemenchun1    #9    2019-02-23 14:12:25 +08:00
    For the weight updates, change 0.001 --> 0.00001 (i.e. lower the learning rate).
    honist    #10    2019-02-23 15:38:52 +08:00 via iPhone
    OP, pasting it like that is as good as not pasting it at all.

    See #8 and #9: with only one sample per update it is hard to get the gradient direction right, and a learning rate that is too large also makes the weights change too much.

    Increase the batch size and adjust the learning rate; increasing the batch size means using several samples to compute the loss before updating the weights once (a rough sketch follows).
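    (A minimal sketch of "one update per mini-batch", shown for the fully-connected layer only; sgd_epoch and its arguments are hypothetical and not taken from the OP's code.)

    import numpy as np

    def sgd_epoch(data, labels, W, lr=1e-3, batch_size=32):
        # data: (N, 3600) flattened inputs, labels: (N,) integer classes, W: (3600, 10)
        for start in range(0, len(data), batch_size):
            xb = data[start:start + batch_size]
            yb = labels[start:start + batch_size]
            logits = xb @ W
            logits -= logits.max(axis=1, keepdims=True)      # numerically stable softmax
            probs = np.exp(logits)
            probs /= probs.sum(axis=1, keepdims=True)
            probs[np.arange(len(yb)), yb] -= 1               # dL/dlogits = p - y for each sample
            grad = xb.T @ probs / len(yb)                    # average the gradient over the batch
            W -= lr * grad                                   # one parameter update per mini-batch
        return W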
    ipwx    #11    2019-02-23 16:00:06 +08:00
    The essence of a mini-batch is estimating an expectation over x from a finite number of samples. That is Monte Carlo integration.

    https://en.wikipedia.org/wiki/Monte_Carlo_method

    The real objective function is E[log p(y|x)], and your log p(y|x) = log Bernoulli-likelihood(y|x).

    A mini-batch of 1 makes that expectation estimate extremely inaccurate, with huge variance, so that most of the time even the direction of the resulting gradient is wrong, and of course training cannot converge.

    That is the mathematical reason why the mini-batch must not be too small.

    - - - -

    Also, it looks like you are not using log Bernoulli-likelihood(y|x) but Bernoulli-likelihood(y|x) itself (without the log).

    That is not a good idea: it causes numerical problems and easily produces NaN.

    You should use log_softmax(t) = t - t_max - log(sum(exp(t - t_max)))
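    (The same formula written out in numpy, together with the matching cross-entropy; a sketch for a single 1-D logits vector t.)

    import numpy as np

    def log_softmax(t):
        t_max = np.max(t)
        return t - t_max - np.log(np.sum(np.exp(t - t_max)))

    def cross_entropy(t, label):
        # loss = -log p(label | x), without ever forming the raw softmax probabilities
        return -log_softmax(t)[label]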
    ipwx    #12    2019-02-23 16:01:08 +08:00
    Correction: it is not a Bernoulli likelihood, it is a categorical likelihood.

    https://en.wikipedia.org/wiki/Categorical_distribution
    zzj0311    #13    2019-02-23 17:20:27 +08:00 via Android
    Could you at least put the indentation back first... For CIFAR-10 the learning rate usually needs to be on the order of 1e-3 to 1e-4, and a batch size of 32/64 works better; a basic three-layer network already gets around 60-70% accuracy. (I can't make sense of the blob you dumped here.)
    EscYezi    #14    2019-02-23 19:18:13 +08:00 via iPhone
    Slightly off topic, but for code snippets I'd suggest using a gist.