def forwardPropagation():
    """Run the whole training set through the network at once.

    Relies on the module-level globals ``X`` (m, n), ``Theta``, ``L``
    (layer count) and ``m`` (example count).  Returns a list where entry
    ``l`` holds layer ``l``'s activations, one row per example; hidden
    layers carry a leading bias column, the output layer does not.
    """
    bias = np.ones((m, 1))
    activations = [None] * (L + 1)
    # Input layer: raw features with the bias column prepended.
    activations[1] = np.hstack((bias, X))
    # Hidden layers 2 .. L-1: sigmoid of the affine map, plus a bias column.
    for layer in range(2, L):
        z = np.matmul(activations[layer - 1], Theta[layer - 1].T)
        activations[layer] = np.hstack((bias, sigmoid(z)))
    # Output layer: no bias column appended.
    activations[L] = sigmoid(np.matmul(activations[L - 1], Theta[L - 1].T))
    return activations
PYTHON
当然,只对单个样本计算前向传播也是可以的:
1 2 3 4 5 6 7 8 9 10 11
def forwardPropagation(x):
    """Feed one example through the network.

    x: column vector of shape (n, 1).
    Returns the per-layer activation list; hidden layers carry a leading
    bias entry, the output layer a[L] does not.
    """
    one = np.ones((1, 1))
    a = [None] * (L + 1)
    # Input layer: the feature vector with a bias unit stacked on top.
    a[1] = np.vstack((one, x))
    for layer in range(2, L):
        z = np.matmul(Theta[layer - 1], a[layer - 1])
        a[layer] = np.vstack((one, sigmoid(z)))  # bias unit on top
    # Output layer: no bias unit.
    a[L] = sigmoid(np.matmul(Theta[L - 1], a[L - 1]))
    return a
PYTHON
代价函数及其正则化:
1 2 3 4 5 6 7 8
def J(lamb):
    """Regularized cross-entropy cost over the whole training set.

    lamb: regularization strength; the bias column of each Theta[l] is
    excluded from the penalty.  Uses the globals Y, Theta, L, m and the
    batch forwardPropagation().
    """
    h = forwardPropagation()[L]  # network output, one row per example
    data_term = -np.sum(Y * np.log(h) + (1 - Y) * np.log(1 - h))
    # Sum of squared weights, skipping each layer's bias column.
    reg_term = sum(np.sum(Theta[l][:, 1:] ** 2) for l in range(1, L))
    return (data_term + lamb / 2 * reg_term) / m
PYTHON
使用 ex4weights.mat 中给定的参数 Θ,在无正则化时,准确率为 97.52%,代价为 0.287629;
在正则化时(λ = 1),准确率为 97.52%,代价为 0.383770。
第二部分·反向传播
反向传播算法:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def backPropagation(lamb):
    """Accumulate back-propagation gradients over all m examples.

    lamb: regularization strength; each Theta[l]'s bias column is
    excluded from the penalty term.
    Returns the list D, where D[l] has shape (s[l+1], s[l]+1) and is the
    gradient of the regularized cost w.r.t. Theta[l].
    Uses the globals L, m, s, Theta, Y and forwardPropagation().
    """
    Delta = [None] * L  # gradient accumulators, shapes (s_{l+1}, s_l+1)
    for l in range(1, L):
        Delta[l] = np.zeros((s[l+1], s[l]+1))
    delta = [None] * (L+1)  # per-layer error column vectors, shapes (s_l, 1)
    a = forwardPropagation()
    for i in range(m):
        # Output-layer error for example i (no bias row in a[L]).
        delta[L] = a[L][i:i+1, :].T - Y[i:i+1, :].T
        # Propagate the error backwards; a*(1-a) is the sigmoid derivative,
        # and [1:, :] strips the bias row before the next step.
        for l in range(L-1, 1, -1):
            delta[l] = (np.matmul(Theta[l].T, delta[l+1]) * (a[l][i:i+1, :].T * (1 - a[l][i:i+1, :].T)))[1:, :]
        # Rank-1 update: outer product of next-layer error with this
        # layer's activation row (which still includes the bias entry).
        for l in range(1, L):
            Delta[l] += np.matmul(delta[l+1], a[l][i:i+1, :])
    D = [None] * L  # final regularized gradients, shapes (s_{l+1}, s_l+1)
    for l in range(1, L):
        # The zero column keeps the bias weights unregularized.
        D[l] = (Delta[l] + lamb * np.hstack((np.zeros((s[l+1], 1)), Theta[l][:, 1:]))) / m
    return D
PYTHON
梯度下降:
1 2 3 4 5 6 7 8 9 10
def GradientDescent(alpha, iteration, lamb=1):
    """Train the network with batch gradient descent.

    alpha:     learning rate.
    iteration: number of gradient steps to take.
    lamb:      regularization strength forwarded to backPropagation()
               and J(); defaults to 1, matching the previously
               hard-coded value, so existing callers are unaffected.

    Randomly re-initializes every Theta[l] in (-0.125, 0.125), appends
    the cost after each step to the global list Z (learning curve), and
    returns the trained Theta.
    """
    # Small symmetric random init to break symmetry between hidden units.
    for l in range(1, L):
        Theta[l] = (np.random.random((s[l+1], s[l]+1)) - 0.5) / 4
    for t in range(iteration):
        D = backPropagation(lamb=lamb)
        for l in range(1, L):
            Theta[l] -= alpha * D[l]
        Z.append(J(lamb=lamb))  # record cost to monitor convergence
    return Theta
PYTHON
梯度检验(把这段代码插入到梯度下降计算出 D 矩阵之后的地方即可):
1 2 3 4 5 6 7 8 9
# Numerical gradient check: compare each analytic gradient D[l][i, j]
# against a centered finite difference of the cost J at the same weight.
# Note 2 * eps == 0.002 exactly in IEEE doubles, so the printed values
# match the original /0.002 formulation bit-for-bit.
eps = 0.001
for l in range(1, L):
    for i in range(s[l+1]):
        for j in range(s[l]+1):
            Theta[l][i, j] -= eps
            J_minus = J(lamb=1)
            Theta[l][i, j] += 2 * eps
            J_plus = J(lamb=1)
            Theta[l][i, j] -= eps  # restore the original weight
            print(D[l][i, j], (J_plus - J_minus) / (2 * eps))