我正在通过尼尔森的神经网络和深度学习进行工作。为了加深我的理解,尼尔森建议改写他的反向传播算法,以采用基于矩阵的方法(由于线性代数库中的优化,因此可能更快)。
目前,我每次都获得9-10%的极低/波动精度。正常情况下,我会继续努力,但是我在3天的大部分时间里都在使用该算法,而且我感觉自己对反向支持的数学掌握得很好。无论如何,我在准确性方面继续产生平庸的结果,因此,任何见识将不胜感激!!!
我正在使用MNIST手写数字数据库。
神经网络功能(此处为反向支持)
"""
neural_net_batch.py
neural_net.py modified to use matrix operations
"""
# Libs
import random
import numpy as np
# Neural Network
class Network(object):
def __init__(self, sizes):
self.num_layers = len(sizes) # Number of layers in network
self.sizes = sizes # Number of neurons in each layer
self.biases = [np.random.randn(y, 1) for y in sizes[1:]] # Bias vector, 1 bias for each neuron in each layer, except input neurons
self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])] # Weight matrix
# Feed Forward Function
# Returns netowrk output for input a
def feedforward(self, a):
for b, w in zip(self.biases, self.weights): # a’ = σ(wa + b)
a = sigmoid(np.dot(w, a)+b)
return a
# Stochastic Gradient Descent
def SGD(self, training_set, epochs, m, eta, test_data):
if test_data: n_test = len(test_data)
n = len(training_set)
# Epoch loop
for j in range(epochs):
# Shuffle training data & parcel out mini batches
random.shuffle(training_set)
mini_batches = [training_set[k:k+m] for k in range(0, n, m)]
# Pass mini batches one by one to be updated
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
# End of Epoch (optional epoch testing)
if test_data:
evaluation = self.evaluate(test_data)
print("Epoch %6i: %5i / %5i" % (j, evaluation, n_test))
else:
print("Epoch %5i complete" % (j))
# Update Mini Batch (Matrix approach)
def update_mini_batch(self, mini_batch, eta):
m = len(mini_batch)
nabla_b = []
nabla_w = []
# Build activation & answer matrices
x = np.asarray([_x.ravel() for _x,_y in mini_batch]) # 10x784 where each row is an input vector
y = np.asarray([_y.ravel() for _x,_y in mini_batch]) # 10x10 where each row is an desired output vector
nabla_b, nabla_w = self.backprop(x, y) # Feed matrices into backpropagation
# Train Biases & weights
self.biases = [b-(eta/m)*nb for b, nb in zip(self.biases, nabla_b)]
self.weights = [w-(eta/m)*nw for w, nw in zip(self.weights, nabla_w)]
def backprop(self, x, y):
# Gradient arrays
nabla_b = [0 for i in self.biases]
nabla_w = [0 for i in self.weights]
w = self.weights
# Vars
m = len(x) # Mini batch size
a = x # Activation matrix temp variable
a_s = [x] # Activation matrix record
z_s = [] # Weighted Activation matrix record
special_b = [] # Special bias matrix to facilitate matrix operations
# Build special bias matrix (repeating biases for each example)
for j in range(len(self.biases)):
special_b.append([])
for k in range(m):
special_b[j].append(self.biases[j].flatten())
special_b[j] = np.asarray(special_b[j])
# Forward pass
# Starting at the input layer move through each layer
for l in range(len(self.sizes)-1):
z = a @ w[l].transpose() + special_b[l]
z_s.append(z)
a = sigmoid(z)
a_s.append(a)
# Backward pass
delta = cost_derivative(a_s[-1], y) * sigmoid_prime(z_s[-1])
nabla_b[-1] = delta
nabla_w[-1] = delta @ a_s[-2]
for n in range(2, self.num_layers):
z = z_s[-n]
sp = sigmoid_prime(z)
delta = self.weights[-n+1].transpose() @ delta * sp.transpose()
nabla_b[-n] = delta
nabla_w[-n] = delta @ a_s[-n-1]
# Create bias vectors by summing bias columns elementwise
for i in range(len(nabla_b)):
temp = []
for j in nabla_b[i]:
temp.append(sum(j))
nabla_b[i] = np.asarray(temp).reshape(-1,1)
return [nabla_b, nabla_w]
def evaluate(self, test_data):
test_results = [(np.argmax(self.feedforward(t[0])), t[1]) for t in test_data]
return sum(int(x==y) for (x, y) in test_results)
# Cost Derivative Function
# Returns the vector of partial derivatives C_x, a for the output activations y
def cost_derivative(output_activations, y):
return(output_activations-y)
# Sigmoid Function
def sigmoid(z):
return 1.0/(1.0+np.exp(-z))
# Sigmoid Prime (Derivative) Function
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
测试脚本
import mnist_data
import neural_net_batch as nn
# Data Sets
training_data, validation_data, test_data = mnist_data.load_data_wrapper()
training_data = list(training_data)
validation_data = list(validation_data)
test_data = list(test_data)
# Network
net = nn.Network([784, 30, 10])
# Perform Stochastic Gradient Descent using MNIST training & test data,
# 30 epochs, mini_batch size of 10, and learning rate of 3.0
net.SGD(list(training_data), 30, 10, 3.0, test_data=test_data)
非常有用的Reddit(u / xdaimon)帮助我忘记了以下答案:
“您的后退通行证应该是
# Backward pass
delta = cost_derivative(a_s[-1], y) * sigmoid_prime(z_s[-1])
nabla_b[-1] = delta.T
nabla_w[-1] = delta.T @ a_s[-2]
for n in range(2, self.num_layers):
z = z_s[-n]
sp = sigmoid_prime(z)
delta = delta @ self.weights[-n+1] * sp
nabla_b[-n] = delta.T
nabla_w[-n] = delta.T @ a_s[-n-1]
查找此错误的一种方法是记住,在计算nabla_w的产品中应该有一个转置的位置。
并且,如果您感兴趣,转置会出现在backprop的矩阵实现中,因为AB与A列和B行的外部乘积之和相同。在这种情况下,A = delta.T和B = a_s [-n-1],因此外部乘积在delta行与a_s [-n-1]行之间。总和中的每个术语都是nabla_w,这正是我们想要的批次中的单个元素。如果最小批量大小为1,您可以轻松地看到delta.T@a_s [-n-1]只是delta向量和激活向量的外积。“
测试不仅显示网络再次准确,而且存在预期的加速。