Neural network fails to learn properly


I'm building a neural network from scratch in C++ for learning purposes and trying to get it to learn the MNIST handwritten-digit dataset. However, every time I fit the network, its MSE stays at essentially the same value. I don't know whether this is caused by the way I wrote the NeuralNetwork class or by the way I fit the data. Here is the relevant code to reproduce the problem:

// PredictedData.hpp
#pragma once

#include <stdint.h>

struct PredictedData
{
  uint64_t m_NeuronIndex;
  double m_NeuronValue;

  PredictedData(
      uint64_t neuronIndex,
      double neuronValue);

  ~PredictedData();
};
// PredictedData.cpp

#include "PredictData.hpp"

PredictedData::PredictedData(
    uint64_t neuronIndex,
    double neuronValue) : m_NeuronIndex(neuronIndex),
                          m_NeuronValue(neuronValue) {}

PredictedData::~PredictedData() {}
// NeuralNetwork.hpp

#pragma once

#include "PredictData.hpp"
#include <iostream>
#include <stdint.h>
#include <vector>

class NeuralNetwork
{
private:
  void CalculateOutputs(const std::vector<double> &inputData);

  void CalculateDeltas(const std::vector<double> &expectedData);

  void ApplyGradientDescent(
      const std::vector<double> &inputData,
      const double &learningRate);

public:
  std::vector<uint64_t> m_Topology;
  std::vector<std::vector<double>> m_CachedDeltas;
  std::vector<std::vector<double>> m_CachedSums;
  std::vector<std::vector<double>> m_CachedOutputs;
  std::vector<std::vector<double>> m_Biases;
  std::vector<std::vector<std::vector<double>>> m_Weights;

  NeuralNetwork(const std::vector<uint64_t> &topology);
  ~NeuralNetwork();

  void Fit(
      const std::vector<double> &trainingData,
      const std::vector<double> &expectedData,
      const double &learningRate = 0.001);

  double GetMSE(const std::vector<double> &expectedData);

  PredictedData Predict(const std::vector<double> &inputData);
};
// NeuralNetwork.cpp
#include "NeuralNetwork.hpp"
#include <cmath>
#include <random>

double SigmoidActivation(const double &x)
{
  return 1 / (1 + std::exp(-x));
}

double SigmoidActivationDerivative(const double &x)
{
  return SigmoidActivation(x) * (1 - SigmoidActivation(x));
}

double SigmoidActivationDerivativeForOutput(const double &y)
{
  return y * (1 - y);
}

double TanhActivation(const double &x)
{
  return std::tanh(x);
  // return 2 / (1 + std::exp(-2 * x)) - 1;
}

double TanhActivationDerivative(const double &x)
{
  const double factor = TanhActivation(x);
  return 1 - factor * factor;
}

double TanhActivationDerivativeForOutput(const double &y)
{
  return 1 - y * y;
}

double ReLUActivation(const double &x)
{
  return x < 0 ? 0 : x;
}

double ReLUActivationDerivative(const double &x)
{
  return x < 0 ? 0 : 1;
}

double ParametricReLUActivation(const double &x, const double &a)
{
  return x < 0 ? 0 : a * x;
}

double ParametricReLUActivationDerivative(const double &x, const double &a)
{
  return x < 0 ? a : 1;
}

double ELUActivation(const double &x, const double &a)
{
  return x < 0 ? 0 : a * (std::exp(x) - 1);
}

double ELUActivationDerivative(const double &x, const double &a)
{
  return x < 0 ? ELUActivation(x, a) + a : 1;
}

double ELUActivationDerivativeForOutput(const double &x, const double &y, const double &a)
{
  return x < 0 ? y + a : 1;
}

double SoftPlusActivation(const double &x)
{
  return std::log(1 + std::exp(x));
}

double SoftPlusActivationDerivative(const double &x)
{
  return 1 / (1 + std::exp(-x));
}

double Cost(
    const double &predicted,
    const double &actual)
{
  double error = actual - predicted;
  return error * error;
}

double CostDerivative(
    const double &predicted,
    const double &actual)
{
  return 2 * (actual - predicted);
}

NeuralNetwork::NeuralNetwork(const std::vector<uint64_t> &topology) : m_Topology(topology)
{
  // reserve for layers
  m_Biases.reserve(topology.size() - 1);
  m_Weights.reserve(topology.size() - 1);
  m_CachedDeltas.reserve(topology.size() - 1);
  m_CachedSums.reserve(topology.size() - 1);
  m_CachedOutputs.reserve(topology.size() - 1);

  for (uint64_t i = 1; i < topology.size(); i++)
  {
    // IMPORTANT! this convention is used throughout
    // topology is given as {input layer, hidden layer 1, hidden layer 2, ..., hidden layer n, output layer}
    // we don't need to allocate memory for the input layer, since it is passed in by the user
    // that's why the real index relative to the topology index is topology index - 1
    const uint64_t l = i - 1;

    // reserve for neuron in layer i
    m_CachedSums.push_back(std::vector<double>());
    m_CachedSums[l].reserve(topology[i]);

    // reserve for neuron in layer i
    m_CachedOutputs.push_back(std::vector<double>());
    m_CachedOutputs[l].reserve(topology[i]);

    // reserve for neuron in layer i
    m_CachedDeltas.push_back(std::vector<double>());
    m_CachedDeltas[l].reserve(topology[i]);

    // reserve for neuron in layer i
    m_Biases.push_back(std::vector<double>());
    m_Biases[l].reserve(topology[i]);

    // reserve for neuron in layer i
    m_Weights.push_back(std::vector<std::vector<double>>());
    m_Weights[l].reserve(topology[i]);

    for (uint64_t j = 0; j < topology[i]; j++)
    {
      // initialize for neuron j in layer i
      m_CachedSums[l].push_back(0);
      m_CachedOutputs[l].push_back(0);
      m_CachedDeltas[l].push_back(0);
      m_Biases[l].push_back(0);

      // reserve one weight per neuron in the previous layer that feeds neuron j
      m_Weights[l].push_back(std::vector<double>());
      m_Weights[l][j].reserve(topology[l]);

      for (uint64_t z = 0; z < topology[l]; z++)
      {
        // initialize weights
        m_Weights[l][j].push_back((double)std::rand() / RAND_MAX * sqrt(1.0 / topology[l]));
      }
    }
  }
}

NeuralNetwork::~NeuralNetwork()
{
}

void NeuralNetwork::CalculateOutputs(const std::vector<double> &inputData)
{
  // layer after input layer
  for (uint64_t i = 0; i < m_Topology[1]; i++)
  {
    m_CachedSums.front()[i] = m_Biases.front()[i];

    // product = output * weight (for each connection into this neuron)
    // compute the products of all input-layer neurons connected to neuron i and sum them
    for (uint64_t j = 0; j < m_Topology.front(); j++)
    {
      m_CachedSums.front()[i] += inputData[j] * m_Weights.front()[i][j];
    }

    // attempt to work with different activation function
    m_CachedOutputs.front()[i] = ReLUActivation(m_CachedSums.front()[i]);
    // m_CachedOutputs.front()[i] = SigmoidActivation(m_CachedSums.front()[i]);
    // m_CachedOutputs.front()[i] = TanhActivation(m_CachedSums.front()[i]);
  }

  // more hidden layer
  for (uint64_t i = 2; i < m_Topology.size() - 1; i++)
  {
    const uint64_t l = i - 1;

    for (uint64_t j = 0; j < m_Topology[i]; j++)
    {
      m_CachedSums[l][j] = m_Biases[l][j];

      // product = output * weight (for each connection into this neuron)
      // compute the products of all previous-layer neurons connected to neuron j and sum them
      for (uint64_t z = 0; z < m_Topology[l]; z++)
      {
        m_CachedSums[l][j] += m_CachedOutputs[l - 1][z] * m_Weights[l][j][z];
      }

      m_CachedOutputs[l][i] = ReLUActivation(m_CachedSums[l][i]);
      // m_CachedOutputs[l][i] = SigmoidActivation(m_CachedSums[l][i]);
      // m_CachedOutputs[l][j] = TanhActivation(m_CachedSums[l][j]);
    }
  }

  // output layer
  for (uint64_t i = 0; i < m_Topology.back(); i++)
  {
    const uint64_t l = m_Topology.size() - 2;

    m_CachedSums.back()[i] = m_Biases.back()[i];

    // product = output * weight (for each connection into this neuron)
    // compute the products of all previous-hidden-layer neurons connected to neuron i and sum them
    for (uint64_t j = 0; j < m_Topology[l]; j++)
    {
      m_CachedSums.back()[i] += m_CachedOutputs[l - 1][j] * m_Weights.back()[i][j];
    }

    // attempt to work with different activation function for output layer
    // m_CachedOutputs.back()[i] = ReLUActivation(m_CachedSums.back()[i]);
    // m_CachedOutputs.back()[i] = SigmoidActivation(m_CachedSums.back()[i]);
    m_CachedOutputs.back()[i] = SigmoidActivation(m_CachedSums.back()[i]);
  }
}

void NeuralNetwork::CalculateDeltas(const std::vector<double> &expectedData)
{
  // output layer
  for (uint64_t i = 0; i < m_Topology.back(); i++)
  {
    // attempt to work with different activation function derivatives for the output layer
    m_CachedDeltas.back()[i] = SigmoidActivationDerivative(m_CachedSums.back()[i]) * CostDerivative(m_CachedOutputs.back()[i], expectedData[i]) / m_Topology.back();
    // m_CachedDeltas.back()[i] = SoftPlusActivationDerivative(m_CachedSums.back()[i]) * CostDerivative(m_CachedOutputs.back()[i], expectedData[i]) / m_Topology.back();
    // m_CachedDeltas.back()[i] = SoftPlusActivationDerivativeForOutput(m_CachedOutputs.back()[i]) * CostDerivative(m_CachedOutputs.back()[i], expectedData[i]) / m_Topology.back();
  }

  // hidden layers
  for (uint64_t i = m_Topology.size() - 2; i > 0; i--)
  {
    const uint64_t l = i - 1;
    for (uint64_t j = 0; j < m_Topology[i]; j++)
    {
      // initialize delta to 0
      m_CachedDeltas[l][j] = 0;

      // sum the deltas of the next layer, each multiplied by the weight connecting that next-layer neuron to neuron j
      for (uint64_t z = 0; z < m_Topology[i + 1]; z++)
      {
        m_CachedDeltas[l][j] += m_CachedDeltas[i][z] * m_Weights[i][z][j];
      }

      // attempt to work with different activation function derivatives for the hidden layers
      // m_CachedDeltas[l][j] *= TanhActivationDerivativeForOutput(m_CachedOutputs[l][j]);
      // m_CachedDeltas[l][j] *= SigmoidActivationDerivativeForOutput(m_CachedOutputs[l][j]);
      m_CachedDeltas[l][j] *= ReLUActivationDerivative(m_CachedSums[l][j]);
      // m_CachedDeltas[l][j] *= SoftPlusActivationDerivative(m_CachedSums[l][j]);
    }
  }
}

void NeuralNetwork::ApplyGradientDescent(const std::vector<double> &inputData, const double &learningRate)
{
  for (uint64_t i = 0; i < m_Topology[1]; i++)
  {
    // common operation to minimize redundancy
    double net = learningRate * m_CachedDeltas.front()[i];

    // change bias by net * 1
    m_Biases.front()[i] -= net;

    for (uint64_t j = 0; j < m_Topology.front(); j++)
    {
      // change weight by net * input
      m_Weights.front()[i][j] -= net * inputData[j];
    }
  }

  for (uint64_t i = 2; i < m_Topology.size(); i++)
  {
    const uint64_t l = i - 1;
    for (uint64_t j = 0; j < m_Topology[i]; j++)
    {
      // common operation to minimize redundancy
      double net = learningRate * m_CachedDeltas[l][j];

      // change bias by net * 1
      m_Biases[l][j] -= net;

      for (uint64_t z = 0; z < m_Topology[l]; z++)
      {
        // change weight by net * prev layer output
        m_Weights[l][j][z] -= net * m_CachedOutputs[l - 1][z];
      }
    }
  }
}

double NeuralNetwork::GetMSE(const std::vector<double> &expectedData)
{
  double SumSE = 0;
  for (uint64_t i = 0; i < m_Topology.back(); i++)
  {
    SumSE += Cost(m_CachedOutputs.back()[i], expectedData[i]);
  }

  return SumSE / (double)m_Topology.back();
}

void NeuralNetwork::Fit(const std::vector<double> &trainingData, const std::vector<double> &expectedData, const double &learningRate)
{
  CalculateOutputs(trainingData);
  CalculateDeltas(expectedData);
  ApplyGradientDescent(trainingData, learningRate);
}

PredictedData NeuralNetwork::Predict(const std::vector<double> &inputData)
{
  CalculateOutputs(inputData);
  PredictedData predictedData(-1, -1000000);

  for (uint64_t i = 0; i < m_Topology.back(); i++)
  {
    if (m_CachedOutputs.back()[i] <= predictedData.m_NeuronValue)
    {
      continue;
    }

    predictedData.m_NeuronIndex = i;
    predictedData.m_NeuronValue = m_CachedOutputs.back()[i];
  }

  return predictedData;
}
// Main.cpp

#include "NeuralNetwork.hpp"
#include <cmath>
#include <fstream>
#include <string>

std::ifstream &Read(std::ifstream &in, void *data, std::streamsize bytes);

template <typename T>
std::ifstream &Read(std::ifstream &in, T &data)
{
  return Read(in, &data, sizeof(T));
}

uint32_t SwapEndian(uint32_t val);

bool ReadMNISTImages(std::ifstream &file, std::vector<std::vector<double>> &data, uint64_t &rows, uint64_t &columns);
bool ReadMNISTLabels(std::ifstream &file, std::vector<uint8_t> &data);

#define LEARNING_RATE 0.1

int main()
{
  std::ifstream imageFile("F:\\DATA\\mnist\\train-images.idx3-ubyte", std::ios::binary);
  if (!imageFile.is_open())
  {
    std::cout << "couldn't open image file\n";
    return 1;
  }

  std::ifstream labelFile("F:\\DATA\\mnist\\train-labels.idx1-ubyte", std::ios::binary);
  if (!labelFile.is_open())
  {
    std::cout << "couldn't open label file\n";
    return 1;
  }

  uint64_t rows;
  uint64_t columns;

  std::vector<std::vector<double>> imageData;
  if (!ReadMNISTImages(imageFile, imageData, rows, columns))
  {
    // failed
    return 1;
  }

  imageFile.close();

  std::vector<uint8_t> labelData;
  if (!ReadMNISTLabels(labelFile, labelData))
  {
    // failed
    return 1;
  }

  labelFile.close();

  uint64_t correct = 0;
  uint64_t wrong = 0;

  // init network
  NeuralNetwork network({rows * columns, 16, 16, 10});

  for (uint64_t i = 0; i < labelData.size() / 6; i++)
  {
    const uint8_t label = labelData[i];
    std::vector<double> expectedData(10, 0);

    // set the element at index label to 1 (one-hot encoding)
    expectedData[label] = 1;

    // fit
    network.Fit(imageData[i], expectedData, LEARNING_RATE);

    // log MSE every 1000 fits
    if (i % 1000 == 0)
    {
      std::cout << "MSE: " << network.GetMSE(expectedData) << '\n';
    }

    // predict
    PredictedData predictedData = network.Predict(imageData[i]);

    // check if the predicted neuron index is the same with the label
    if (predictedData.m_NeuronIndex == label)
    {
      correct++;
    }
    else
    {
      wrong++;
    }
  }

  // log the amount of correct and wrong predictions
  std::cout << "correct: " << correct << ", wrong: " << wrong << ", total: " << wrong + correct << std::endl;

  return 0;
}

std::ifstream &Read(std::ifstream &in, void *data, std::streamsize bytes)
{
  char *buffer = reinterpret_cast<char *>(data);
  return static_cast<std::ifstream &>(in.read(buffer, bytes));
}

uint32_t SwapEndian(uint32_t val)
{
  val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
  return (val << 16) | (val >> 16);
}

bool ReadMNISTImages(std::ifstream &file, std::vector<std::vector<double>> &data, uint64_t &rows, uint64_t &columns)
{
  uint32_t magicNumber;
  uint32_t numberOfImages;
  uint32_t numberOfRows;
  uint32_t numberOfColumns;

  Read(file, &magicNumber, 4);
  magicNumber = SwapEndian(magicNumber);
  if (magicNumber != 2051)
  {
    std::cout << "Incorrect image file magicNumber: " << magicNumber << std::endl;
    return false;
  }

  Read(file, &numberOfImages, 4);
  numberOfImages = SwapEndian(numberOfImages);

  Read(file, &numberOfRows, 4);
  numberOfRows = SwapEndian(numberOfRows);
  rows = numberOfRows;

  Read(file, &numberOfColumns, 4);
  numberOfColumns = SwapEndian(numberOfColumns);
  columns = numberOfColumns;

  data.reserve(numberOfImages);
  for (uint64_t i = 0; i < numberOfImages; i++)
  {
    data.push_back(std::vector<double>());
    data[i].reserve(numberOfRows * numberOfColumns);

    for (uint64_t j = 0; j < numberOfRows * numberOfColumns; j++)
    {
      uint8_t pixel;
      Read(file, &pixel, 1);
      data[i].push_back((double)pixel / 255);
    }
  }

  return true;
}

bool ReadMNISTLabels(std::ifstream &file, std::vector<uint8_t> &data)
{
  uint32_t magicNumber;
  uint32_t numberOfLabels;

  Read(file, &magicNumber, 4);
  magicNumber = SwapEndian(magicNumber);
  if (magicNumber != 2049)
  {
    std::cout << "Incorrect label file magicNumber: " << magicNumber << std::endl;
    return false;
  }

  Read(file, &numberOfLabels, 4);
  numberOfLabels = SwapEndian(numberOfLabels);

  data.reserve(numberOfLabels);

  for (uint64_t i = 0; i < numberOfLabels; i++)
  {
    uint8_t pixel;
    Read(file, pixel);
    data.push_back(pixel);
  }

  return true;
}

These are the MSE outputs:

MSE: 0.898595
MSE: 0.899916
MSE: 0.899992
MSE: 0.899984
MSE: 0.899999
MSE: 0.9
MSE: 0.899999
MSE: 0.899999
MSE: 0.9
MSE: 0.9
correct: 979, wrong: 9021, total: 10000

At first I thought misuse of the activation functions was causing this, but even after switching to different activation functions the output stays the same. The same happens when I change the topology of the hidden layers. I also changed the learning rate; it has some effect on the MSE, but the MSE still ends up at roughly the same value. I don't know why it keeps producing the same output over and over. I expected the MSE to decrease after each fit. Also, when running the same network multiple times with different initializations, the number of correct and wrong predictions and the MSE at each fit stay the same.

Is there a problem with the ApplyGradientDescent method, or am I going crazy? Please help me fix this and explain what went wrong.

c++ machine-learning neural-network mnist
1 Answer

The problem was a missing negative sign in the cost derivative. After adding that single sign, the issue effectively went away. Thanks to @Alexey S. Larionov for pointing out my mistake.
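
For reference, here is a minimal sketch of the fix described above (an illustration, not necessarily the exact patch applied): since Cost computes (actual - predicted)^2, its derivative with respect to the prediction is -2 * (actual - predicted). Without the negative sign, the -= updates in ApplyGradientDescent effectively step up the gradient instead of down, which is consistent with the MSE drifting toward 0.9 instead of decreasing.

// CostDerivative with the missing negative sign added (sketch of the fix)
double CostDerivative(
    const double &predicted,
    const double &actual)
{
  // d/d(predicted) of (actual - predicted)^2 = -2 * (actual - predicted)
  return -2 * (actual - predicted);
}

Equivalently, the sign could be flipped in the weight and bias updates instead; what matters is that the deltas and the subtraction in ApplyGradientDescent together move the parameters down the gradient.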
