What is wrong with my BPTT implementation?


I tried to implement backpropagation through time by hand, but in the end the network fails to converge. I have looked around the web for descriptions and courses on BPTT, and the code does everything accordingly:

  • forward propagation
  • backward propagation of the error
  • gradient calculation based on the expected values
  • weight updates based on the gradients and the learning rate

The way I understand the recurrent derivative is that, in a recurrent neural network, the input coming from the previous step cannot be treated as a constant. So the derivative of w1 at step 3 depends not only on the input of the current step, but also on the previous steps. That is why dw1[1] = net_inputs_train[first_sample_index + 1][0]; is incorrect; it has to be dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3; instead.
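
Spelled out for this single neuron, with h_t denoting its output at step t and (x_t^{(0)}, x_t^{(1)}) the two inputs of that step:

\[
h_t = w_1\,x_t^{(0)} + w_2\,x_t^{(1)} + w_3\,h_{t-1} + b,
\qquad
\frac{\partial h_t}{\partial w_1} = x_t^{(0)} + w_3\,\frac{\partial h_{t-1}}{\partial w_1},
\]

which is exactly the recurrence the dw1/dw2/dw3/derb lines in the code accumulate.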

In the unrolled network, everything else should be "just" backpropagation. Unfortunately the program does not work: the error just jumps around and the network never converges.
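
For the per-step loss E_t = (h_t - y_t)^2 / 2 used below, the error signal backpropagated through the unrolled network satisfies

\[
\delta_t = \frac{\partial E}{\partial h_t} = (h_t - y_t) + w_3\,\delta_{t+1},
\]

so the neuron_deriv values in the code are -\delta_t / 2: the descent sign is folded into the signal, and the extra factor of 1/2 only rescales the effective learning rate.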

I don't know what else I could try; maybe I have misunderstood the concept completely...

#include <iostream>
#include <vector>
#include <cmath>
#include <cstdlib> /* rand */

using namespace std;

int main(int argc, char *argv[]){

  /* Manual BPTT with one custom implemented Neuron */
  double number_of_samples = 3; /* Binary addition dataset: 3 sequences of 3 steps each */
  vector<vector<double>> net_inputs_train = { /* 2 inputs in each step */
      {1,1},    {0,0},  {0,0}, /* 100 + 100 = 110 */
      {1,0},    {0,1},  {1,0}, /* 101 + 010 = 111 */
      {1,0},    {1,1},  {0,0}, /* 110 + 010 = 111 */
  };

  vector<vector<double>> expected_output = { /* 1 output in each step */
      {1},      {1},    {0}, /* 110 */
      {1},      {1},    {1}, /* 111 */
      {1},      {1},    {1}, /* 111 */
  };

  double w1 = 0.5;
  double w2 = 0.5;
  double w3 = 0.5;
  double b = 0.0;

  vector<double> neuron_data(3,0);
  vector<double> neuron_deriv(3,0); /* error signal: partial derivative of the error w.r.t. the neuron output */

  vector<double> dw1(3,0); /* derivative of the neuron output w.r.t. each weight, per step */
  vector<double> dw2(3,0);
  vector<double> dw3(3,0);
  vector<double> derb(3,0);

  int first_sample_index;
  double manual_error = 1.0;
  double learning_rate = 1e-2;
  while(manual_error > learning_rate){ /* loop until the error drops below the 1e-2 threshold */
    for(int mbIter = 0; mbIter < 4; ++mbIter){
      first_sample_index = (rand()%(static_cast<int>(number_of_samples))); /* random starting row: 0, 1 or 2 */

      /* Fill in the data and the derivatives */
      neuron_data[0] = (
        net_inputs_train[first_sample_index][0] * w1
        + net_inputs_train[first_sample_index][1] * w2
        + b
      );
      dw1[0] = net_inputs_train[first_sample_index][0];
      dw2[0] = net_inputs_train[first_sample_index][1];
      dw3[0] = 0;
      derb[0] = 1;

      neuron_data[1] = (
        net_inputs_train[first_sample_index + 1][0] * w1
        + net_inputs_train[first_sample_index + 1][1] * w2
        + neuron_data[0] * w3
        + b
      );
      dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3;
      dw2[1] = net_inputs_train[first_sample_index + 1][1] + dw2[0] * w3;
      dw3[1] = neuron_data[0] + w3 * dw3[0];
      derb[1] = 1 + derb[0] * w3;

      neuron_data[2] = (
        net_inputs_train[first_sample_index + 2][0] * w1
        + net_inputs_train[first_sample_index + 2][1] * w2
        + neuron_data[1] * w3
        + b
      );
      dw1[2] = net_inputs_train[first_sample_index + 2][0] + dw1[1] * w3;
      dw2[2] = net_inputs_train[first_sample_index + 2][1] + dw2[1] * w3;
      dw3[2] = neuron_data[1] + w3 * dw3[1];
      derb[2] = 1 + derb[1] * w3;

      /* Calculate the error and the gradients */
      manual_error = (
        pow((neuron_data[2] - expected_output[first_sample_index + 2][0]),2)/2.0
        +pow((neuron_data[1] - expected_output[first_sample_index + 1][0]),2)/2.0
        +pow((neuron_data[0] - expected_output[first_sample_index + 0][0]),2)/2.0
      );

      neuron_deriv[2] = (
        (-(neuron_data[2] - expected_output[first_sample_index + 2][0])/2.0)
      );
      neuron_deriv[1] = (
        (-(neuron_data[1] - expected_output[first_sample_index + 1][0])/2.0)
        + (w3 * neuron_deriv[2])
      );
      neuron_deriv[0] = (
        (-(neuron_data[0] - expected_output[first_sample_index + 0][0])/2.0)
        + (w3 * neuron_deriv[1])
      );

      w1 += ((
        neuron_deriv[2] * dw1[2]
        + neuron_deriv[1] * dw1[1]
        + neuron_deriv[0] * dw1[0]
      ) / 300.0);

      w2 += (learning_rate * (
        neuron_deriv[2] * dw2[2]
        + neuron_deriv[1] * dw2[1]
        + neuron_deriv[0] * dw2[0]
      ) / number_of_samples);

      w3 += (learning_rate * (
        neuron_deriv[2] * dw3[2]
        + neuron_deriv[1] * dw3[1]
        + neuron_deriv[0] * dw3[0]
      ) / number_of_samples);

      b += (learning_rate * (
        neuron_deriv[2] * derb[2]
        + neuron_deriv[1] * derb[1]
        + neuron_deriv[0] * derb[0]
      ) / number_of_samples);
      std::cout << "\r Error: " << manual_error << "                    ";

    }
  }

  return 0;
}
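
For reference, the per-step computation can also be factored into a loop. This is a minimal sketch, not part of the original program: the function name is made up, and it assumes the same dataset layout as above (three-step sequences of two inputs); it should reproduce the hard-coded steps exactly:

#include <cstddef>
#include <vector>

/* Hypothetical helper: forward pass of one sequence with forward-mode
   accumulation of the derivatives d h_t / d w, mirroring the unrolled
   code above. All vectors must have length equal to the sequence length. */
static void forward_with_derivatives(
    const std::vector<std::vector<double>> &inputs, std::size_t start,
    double w1, double w2, double w3, double b,
    std::vector<double> &h,   /* neuron_data */
    std::vector<double> &dw1, std::vector<double> &dw2,
    std::vector<double> &dw3, std::vector<double> &db){
  for(std::size_t t = 0; t < h.size(); ++t){
    const double h_prev = (t == 0) ? 0.0 : h[t - 1]; /* no recurrent input at t == 0 */
    const double x0 = inputs[start + t][0];
    const double x1 = inputs[start + t][1];
    h[t] = x0 * w1 + x1 * w2 + h_prev * w3 + b;
    /* total derivative = direct term + w3 * derivative carried over from t-1 */
    dw1[t] = x0     + w3 * ((t == 0) ? 0.0 : dw1[t - 1]);
    dw2[t] = x1     + w3 * ((t == 0) ? 0.0 : dw2[t - 1]);
    dw3[t] = h_prev + w3 * ((t == 0) ? 0.0 : dw3[t - 1]);
    db[t]  = 1.0    + w3 * ((t == 0) ? 0.0 : db[t - 1]);
  }
}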


c++ backpropagation back-propagation-through-time
1 Answer

I think this is the typo:
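
  w1 += ((
    neuron_deriv[2] * dw1[2]
    + neuron_deriv[1] * dw1[1]
    + neuron_deriv[0] * dw1[0]
  ) / 300.0);

Unlike the w2, w3 and b updates, this line is missing the learning_rate factor and divides by 300.0 instead of number_of_samples. It should presumably read:

  w1 += (learning_rate * (
    neuron_deriv[2] * dw1[2]
    + neuron_deriv[1] * dw1[1]
    + neuron_deriv[0] * dw1[0]
  ) / number_of_samples);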
