# 再探反向传播算法（手写体识别Python实例）

1.网络结构

$\begin{aligned}L &= \text{神经网络总共包含的层数}\\ S_l &= \text{第 } l \text{ 层的神经元数目}\\ K &= \text{输出层的神经元数，亦即分类的数目}\\ w_{ij}^{l} &= \text{第 } l \text{ 层第 } j \text{ 个神经元与第 } l+1 \text{ 层第 } i \text{ 个神经元之间的权重值}\end{aligned}$

$$\begin{aligned}\frac{\partial J}{\partial w^{l}} &= \delta^{l+1}\cdot\left(a^{l}\right)^{T} && \text{(1)}\\ \delta^{l} &= \left(w^{l}\right)^{T}\cdot\delta^{l+1}\ast f'\left(z^{l}\right) && \text{(2)}\\ \delta_{i}^{L} &= \frac{\partial J}{\partial z_{i}^{L}} = \frac{\partial J}{\partial a_{i}^{L}}\cdot\frac{\partial a_{i}^{L}}{\partial z_{i}^{L}} = \frac{\partial J}{\partial a_{i}^{L}}\cdot\frac{\partial f\left(z_{i}^{L}\right)}{\partial z_{i}^{L}} = \frac{\partial J}{\partial a_{i}^{L}}\cdot f'\left(z_{i}^{L}\right) && \text{(3)}\\ \frac{\partial J}{\partial b^{l}} &= \delta^{l+1} && \text{(4)}\end{aligned}$$

2.平方误差目标函数

$$\begin{aligned}J(W,b) &= \left[\frac{1}{m}\sum_{k=1}^{m}J\left(W,b;x^{(k)},y^{(k)}\right)\right] + \frac{\lambda}{2}\sum_{l=1}^{L-1}\sum_{i=1}^{S_{l+1}}\sum_{j=1}^{S_{l}}\left(w_{ij}^{l}\right)^{2}\\ &= \left[\frac{1}{m}\sum_{k=1}^{m}\frac{1}{2}\left(h_{W,b}\left(x^{(k)}\right)-y^{(k)}\right)^{2}\right] + \frac{\lambda}{2}\sum_{l=1}^{L-1}\sum_{i=1}^{S_{l+1}}\sum_{j=1}^{S_{l}}\left(w_{ij}^{l}\right)^{2}\end{aligned}$$

$$\begin{aligned}\frac{\partial}{\partial w_{ij}^{l}}J(W,b) &= \left[\frac{1}{m}\sum_{k=1}^{m}\frac{\partial}{\partial w_{ij}^{l}}J\left(W,b;x^{(k)},y^{(k)}\right)\right] + \lambda w_{ij}^{l}\\ \frac{\partial}{\partial b_{i}^{l}}J(W,b) &= \frac{1}{m}\sum_{k=1}^{m}\frac{\partial}{\partial b_{i}^{l}}J\left(W,b;x^{(k)},y^{(k)}\right)\end{aligned}$$

# --- Squared-error objective: forward pass, cost, and backpropagation ---
m, n = np.shape(X)  # m: number of samples (5000), n: input dimension (400)
a1 = X.T  # input activations, 400 by 5000
z2 = np.dot(W1, a1) + b1  # (25x400)(400x5000) + (25x1) -> 25 by 5000
a2 = sigmoid(z2)  # hidden-layer activations, 25 by 5000
z3 = np.dot(W2, a2) + b2  # (10x25)(25x5000) + (10x1) -> 10 by 5000
a3 = sigmoid(z3)  # output-layer activations, 10 by 5000
# BUG FIX: the data term needs the 1/2 factor so the reported cost matches
# the objective J = (1/m) * sum_k (1/2)(h-y)^2 whose gradient is computed
# below (delta3 = (a3 - y) * f'(z3) is the derivative of the 1/2-weighted
# term). The original used (1/m)*sum((a3-y)**2), i.e. twice the data term.
cost = (1/(2*m))*np.sum((a3-y_label)**2) + (lambd/2)*(np.sum(W1**2)+np.sum(W2**2))

# Output-layer error (formula (3)): dJ/dz3 = (a3 - y) * f'(z3), 10 by 5000
delta3 = -(y_label-a3)*sigmoidGradient(z3)
# Gradient of the data term w.r.t. W2 (formula (1)): 10x5000 dot 5000x25 -> 10x25
df_w2 = np.dot(delta3, a2.T)
# Average over the batch and add the regularization gradient lambda*W2
df_w2 = (1/m)*df_w2 + lambd*W2

# Hidden-layer error (formula (2)): (25x10)(10x5000) * 25x5000 -> 25 by 5000
delta2 = np.dot(W2.T, delta3)*sigmoidGradient(z2)
# Gradient w.r.t. W1: 25x5000 dot 5000x400 -> 25x400
df_w1 = np.dot(delta2, a1.T)
df_w1 = (1/m)*df_w1 + lambd*W1

# Bias gradients (formula (4)): sum the per-sample errors over the batch
df_b1 = (1/m)*np.sum(delta2, axis=1).reshape(b1.shape)
df_b2 = (1/m)*np.sum(delta3, axis=1).reshape(b2.shape)

3.交叉熵目标函数

$$J(W,b) = -\frac{1}{m}\sum_{k=1}^{m}\left[y^{(k)}\log\left(h_{W,b}\left(x^{(k)}\right)\right)+\left(1-y^{(k)}\right)\log\left(1-h_{W,b}\left(x^{(k)}\right)\right)\right] + \frac{\lambda}{2m}\sum_{l=1}^{L-1}\sum_{i=1}^{S_{l+1}}\sum_{j=1}^{S_{l}}\left(w_{ij}^{l}\right)^{2}$$

$$\begin{aligned}\frac{\partial}{\partial w_{ij}^{l}}J(W,b) &= \left[\frac{1}{m}\sum_{k=1}^{m}\frac{\partial}{\partial w_{ij}^{l}}J\left(W,b;x^{(k)},y^{(k)}\right)\right] + \frac{\lambda}{m}w_{ij}^{l}\\ \frac{\partial}{\partial b_{i}^{l}}J(W,b) &= \frac{1}{m}\sum_{k=1}^{m}\frac{\partial}{\partial b_{i}^{l}}J\left(W,b;x^{(k)},y^{(k)}\right)\end{aligned}$$

# --- Cross-entropy objective: same forward pass, different cost/output error ---
a1 = X.T                        # input activations, 400 by 5000
z2 = np.dot(W1, a1) + b1        # hidden pre-activation, 25 by 5000
a2 = sigmoid(z2)                # hidden activations, 25 by 5000
z3 = np.dot(W2, a2) + b2        # output pre-activation, 10 by 5000
a3 = sigmoid(z3)                # output activations, 10 by 5000

# Cross-entropy data term plus L2 penalty scaled by lambda/(2m)
data_term = -y_label*np.log(a3) - (1-y_label)*np.log(1-a3)
cost = (1/m)*np.sum(data_term)
cost = cost + (lambd/(2*m))*(np.sum(W1**2) + np.sum(W2**2))
# Only the objective changed; the forward pass is identical to before.

# With cross-entropy + sigmoid, the sigmoid' factor cancels and the
# output-layer error reduces to a plain difference, 10 by 5000.
delta3 = (a3-y_label)
df_w2 = np.dot(delta3, a2.T)                    # 10x5000 dot 5000x25 -> 10x25
df_w2 = (1/m)*df_w2 + (lambd/m)*W2              # batch average + regularization
# Hidden-layer error: (25x10)(10x5000) * 25x5000 -> 25 by 5000
delta2 = np.dot(W2.T, delta3)*sigmoidGradient(z2)
df_w1 = np.dot(delta2, a1.T)                    # 25x5000 dot 5000x400 -> 25x400
df_w1 = (1/m)*df_w1 + (lambd/m)*W1
# Bias gradients: average the per-sample errors over the batch
df_b1 = (1/m)*delta2.sum(axis=1).reshape(b1.shape)
df_b2 = (1/m)*delta3.sum(axis=1).reshape(b2.shape)

4.其余部分代码说明

4.1 载入数据集

def loadData():
    """Load the digit dataset and build one-hot labels.

    Returns (X, y, y_label): standardized features (5000 x 400), raw labels
    (length-5000 vector), and a 10 x 5000 one-hot label matrix.
    NOTE(review): `data` is assumed to be loaded elsewhere in the file
    (e.g. from a .mat file) — confirm against the full script.
    """
    X = data['X']                 # 5000 by 400, samples by dimensions
    y = data['y'].reshape(5000)
    # One-hot encode: labels are 1..10, so shift to 0-based column indices
    y_label = np.eye(10)[:, y - 1]    # 10 by 5000
    # Standardize features to zero mean / unit variance
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, y_label

4.2训练及保存参数

def train():
    """Train the 400-25-10 network with mini-batch gradient descent and
    pickle the learned parameters to the file 'data'.

    Relies on module-level `X`, `y_label`, `iteration`, `lambd`, `cost`
    and `costFandGradient` being defined elsewhere in the file.
    """
    m, n = np.shape(X)  # m: samples, n: dimensions
    input_layer_size = 400
    hidden_layer_size = 25
    output_layer_size = 10

    # Random initialization in (-eps, eps) so every parameter's absolute
    # value starts below 0.12 (breaks symmetry between hidden units).
    epsilon_init = 0.12
    W1 = np.random.rand(hidden_layer_size, input_layer_size)*2*epsilon_init - epsilon_init
    W2 = np.random.rand(output_layer_size, hidden_layer_size)*2*epsilon_init - epsilon_init
    b1 = np.random.rand(hidden_layer_size, 1)*2*epsilon_init - epsilon_init
    b2 = np.random.rand(output_layer_size, 1)*2*epsilon_init - epsilon_init

    # Learning rate for the descent step.
    # NOTE(review): the original text never defined one — tune as needed.
    alpha = 0.1

    for i in range(iteration):
        # Draw a random mini-batch of 500 of the 5000 samples
        arr = np.arange(5000)
        np.random.shuffle(arr)
        index = arr[:500]
        batch_X = X[index, :]
        batch_y = y_label[:, index]

        c, df_w1, df_w2, df_b1, df_b2 = costFandGradient(batch_X, batch_y, W1, b1, W2, b2, lambd)
        cost.append(round(c, 4))
        # BUG FIX: the original computed the gradients but never applied
        # them, so the parameters were never updated and the network could
        # not learn. Take one gradient-descent step per mini-batch:
        W1 -= alpha * df_w1
        W2 -= alpha * df_w2
        b1 -= alpha * df_b1
        b2 -= alpha * df_b2

    # Persist the parameters as a dict so the next run can skip training
    # and go straight to prediction. BUG FIX: use `with` so the file
    # handle is closed (the original leaked it).
    p = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
    with open('data', 'wb') as f:
        pickle.dump(p, f)

4.3 载入模型参数及预测

def prediction():
    """Load the pickled parameters from the file 'data', run the forward
    pass on module-level `X`, and print the accuracy against `y`.

    Relies on module-level `X`, `y`, `sigmoid` and `accuracy_score`.
    """
    # BUG FIX: the original opened the file but never called pickle.load,
    # then indexed an undefined `data` variable. Load the dict and close
    # the handle with `with` (the original also leaked it).
    with open('data', 'rb') as f:
        params = pickle.load(f)
    W1 = params['W1']
    W2 = params['W2']
    b1 = params['b1']
    b2 = params['b2']

    # Forward pass, identical to training
    a1 = X.T                      # 400 by 5000
    z2 = np.dot(W1, a1) + b1      # (25x400)(400x5000) + (25x1) -> 25 by 5000
    a2 = sigmoid(z2)              # 25 by 5000
    z3 = np.dot(W2, a2) + b2      # (10x25)(25x5000) + (10x1) -> 10 by 5000
    a3 = sigmoid(z3)              # 10 by 5000

    # Predicted class per sample = row of the largest output; +1 because
    # the labels are 1..10 while NumPy indexes from 0. np.argmax picks the
    # first maximum, matching the original np.where(...)[0][0] loop.
    y_pre = np.argmax(a3, axis=0) + 1
    print(accuracy_score(y, y_pre))