I don't think I have any in-place operations, but I'm getting an in-place operation error


I get the following error:

C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\autograd\__init__.py:199: UserWarning:
Error detected in CudnnRnnBackward0. Traceback of forward call that caused the error:
  File "d:/anaconda/myproject/MYPROJECT77_4class.py", line 328, in <module>
    lstm_output, (hidden_state, cell_state) = lstm_module(backbone_output_flattened, (hidden_state, cell_state))
  File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "d:/anaconda/myproject/MYPROJECT77_4class.py", line 190, in forward
    out, hidden = self.lstm(x, hidden)
  File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\nn\modules\rnn.py", line 775, in forward
    self.dropout, self.training, self.bidirectional, self.batch_first)
  File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\fx\traceback.py", line 57, in format_stack
    return traceback.format_stack()
 (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\autograd\python_anomaly_mode.cpp:119.)
  allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
  File "d:/anaconda/myproject/MYPROJECT77_4class.py", line 358, in <module>
    loss.backward(retain_graph=True)
  File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\_tensor.py", line 489, in backward
    self, gradient, retain_graph, create_graph, inputs=inputs
  File "C:\Users\D3\anaconda3\envs\kittimos\lib\site-packages\torch\autograd\__init__.py", line 199, in backward
    allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [2048, 80352]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
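
For context, the "is at version 2; expected version 1" part refers to autograd's version counter: every tensor saved for the backward pass records a version, and the error is raised when that tensor has been modified in place before backward() runs. A minimal, self-contained snippet (not my model's code, just an illustration) that produces the same class of error:

import torch

# sigmoid saves its output for the backward pass; the in-place mul_()
# bumps that saved tensor's version counter.
x = torch.randn(5, requires_grad=True)
y = torch.sigmoid(x)   # y is saved for backward
y.mul_(2)              # in-place edit: y is now at version 1, expected version 0
y.sum().backward()     # raises the "modified by an inplace operation" RuntimeError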

First, I commented out each part one by one to see where the problem was.

I commented out everything related to the LSTM. In the current code, the output of the LSTM is passed to the fully connected layers. When I fed the tensor that was supposed to go into the LSTM directly into the fully connected layers instead (bypassing the LSTM), the error seemed to disappear, but I'm not sure.

I also checked whether there were in-place operations anywhere else, such as += or -=, but found none.
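
The kinds of in-place operations I was looking for are roughly these (a generic sketch, not code from my project):

import torch
import torch.nn as nn

x = torch.zeros(3)
x += 1                         # in-place: same storage, version counter is bumped
x.add_(1)                      # in-place: the trailing underscore marks in-place ops
x = x + 1                      # NOT in-place: a new tensor is bound to the name x
relu = nn.ReLU(inplace=True)   # this module also modifies its input in place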

Finally, following answers to similar questions on Stack Overflow, I tried using inplace=False in ReLU, but the error stayed the same.

The code is below. (I think the LSTM part is the problem, but I'm including the other parts just in case.)

class UpsampleBlock(nn.Module):
    def __init__(self, in_channels, out_channels, scale_factor):
        super(UpsampleBlock, self).__init__()
        self.layers = nn.Sequential(
            nn.Upsample(scale_factor=scale_factor, mode='bilinear', align_corners=True),
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            #nn.BatchNorm2d(out_channels),
            nn.ReLU()
        )
    def forward(self, x):
        return self.layers(x)  

class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size).to(device)
        self.fc2 = nn.Linear(hidden_size, hidden_size).to(device)
        self.fc3 = nn.Linear(hidden_size , num_classes)

        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(1, -1)  # Flatten the input tensor
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x


class PointPillarsFeatureExtractor(nn.Module):
    def __init__(self, in_channels=9, out_channels=64):
        super(PointPillarsFeatureExtractor, self).__init__()

        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=1)
        self.batchnorm = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU()
    def forward(self, x):
        x = self.conv1(x)
        x = self.batchnorm(x)
        x = self.relu(x)

        # Max pooling over the points (N dimension)
        x, _ = torch.max(x, dim=2)
        return x
   
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, num_layers):
        super(CNNBlock, self).__init__()
        self.layers = nn.Sequential(*[
            nn.Sequential(
                nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size,
                          stride=2 if i == 0 else 1, padding=padding),
                #nn.BatchNorm2d(out_channels),
                nn.ReLU()
            )
            for i in range(num_layers)
        ])

    def forward(self, x):
        return self.layers(x)


class PointPillarsBackbone(nn.Module):
    def __init__(self, in_channels=64, C=64, S=2):
        super(PointPillarsBackbone, self).__init__()

        self.block1 = CNNBlock(in_channels, C, kernel_size=3, stride=2, padding=1, num_layers=1)
        self.block2 = CNNBlock(C, 2 * C, kernel_size=3, stride=2 , padding=1, num_layers=1)
        self.block3 = CNNBlock(2 * C, 4 * C, kernel_size=3, stride=2, padding=1, num_layers=1)

        self.up1 = UpsampleBlock(in_channels,  C//2 , scale_factor=1/4)
        self.up2 = UpsampleBlock(2 * C,   C//2, scale_factor=1/2)
        self.up3 = UpsampleBlock(4 * C,   C//2, scale_factor=1)
    def forward(self, x):
        block1_out = self.block1(x)
        block2_out = self.block2(block1_out)
        block3_out = self.block3(block2_out)

        up1_out = self.up1(block1_out)
        up2_out = self.up2(block2_out)
        up3_out = self.up3(block3_out)

        concat_features = torch.cat((up1_out, up2_out, up3_out), dim=1)
        return concat_features



class LSTMModule(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, batch_first=True):
        super(LSTMModule, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=batch_first)
       
    def forward(self, x, hidden=None):
        if hidden is None:
            out, hidden = self.lstm(x)
        else:
            out, hidden = self.lstm(x, hidden)

        return out, hidden




for i in range(0, 401):
    filename = f"{i:03d}.bin"
    filepath = os.path.join(folder, filename)

    if os.path.isfile(filepath):
        tensor = process_bin_file(filepath)
        tensor = torch.Tensor(tensor)
        tensor = tensor.permute(1, 0, 2).to(device)  
        features = feature_extractor(tensor)
        features = features.permute(1, 0)
        features = features.unsqueeze(0).permute(0, 1, 2)


        x_indices = (((tensor[7, :, 0] - point_cloud_range[0]) // voxel_size[0]).long())
        y_indices = (((tensor[8, :, 0] - point_cloud_range[1]) // voxel_size[1]).long())


        pseudo_image = scatter_features_module(features, x_indices, y_indices)
        backbone_output = backbone_module(pseudo_image)
        print("backbone_output shape  is !!!!! :",  backbone_output.shape)

        backbone_output_flattened = backbone_output.view(1, 1, -1)
        print("backbone_output_flattened shape  is !!!!! :",  backbone_output_flattened.shape)

        lstm_output, (hidden_state, cell_state) = lstm_module(backbone_output_flattened, (hidden_state, cell_state))


        if i >= sequence_length - 1:
            hidden_state = hidden_state[:, 1:, :].clone()
            cell_state = cell_state[:, 1:, :].clone()

            hidden_state = torch.cat([hidden_state, lstm_output[:, -1, :].unsqueeze(1)], dim=1).clone()
            cell_state = torch.cat([cell_state, lstm_output[:, -1, :].unsqueeze(1)], dim=1).clone()
        else:
            hidden_state = lstm_output[:, -1, :].unsqueeze(1).clone()
            cell_state = lstm_output[:, -1, :].unsqueeze(1).clone()
        output = classifier(lstm_output)

        predicted_class = torch.argmax(output[0, 1:]) + 1

        # Calculate the loss and update parameters
        gt_label = torch.tensor([ground_truth[i]], dtype=torch.long).to(device)
        loss = criterion(output, gt_label)
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
python pytorch lstm
1 Answer

I found the mistake I made. The problem was caused by the way cell_state and hidden_state were updated, and adding a few lines of code solved it:

    # Detach the recurrent state so the next backward() does not reach back
    # into the graph of previous iterations.
    hidden = (hidden_state.detach(), cell_state.detach())
    lstm_output2, (hidden_state2, cell_state2) = lstm_module(backbone_output_flattened, hidden)
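
The point is that the recurrent state has to be detached from the previous iteration's graph before being fed back in; otherwise backward() walks into a graph whose LSTM weights optimizer.step() has already modified in place, which is why the weight tensor is reported as "at version 2". A minimal, self-contained sketch of the pattern with toy sizes (the names here are hypothetical, not the question's actual modules):

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=4, batch_first=True)
head = nn.Linear(4, 2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(list(lstm.parameters()) + list(head.parameters()), lr=0.1)

hidden = None
for step in range(3):
    x = torch.randn(1, 1, 8)               # one time step per iteration
    out, hidden = lstm(x, hidden)

    # Detach the state at the iteration boundary. Without this, the old graph
    # stays attached to `hidden`, and after optimizer.step() has changed the
    # LSTM weights in place, the next backward() that reaches that old graph
    # raises the "modified by an inplace operation" error.
    hidden = tuple(h.detach() for h in hidden)

    logits = head(out[:, -1, :])
    loss = criterion(logits, torch.tensor([1]))

    optimizer.zero_grad()
    loss.backward()                         # retain_graph=True is no longer needed
    optimizer.step()

With the state detached every iteration, the graph only covers the current step, so the saved weight versions match and retain_graph=True can be dropped as well.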