My model:
import torch
import torch.nn as nn

class myNet(nn.Module):
    def __init__(self):
        super(myNet,self).__init__()
        self.act1=Dynamic_relu_b(64)
        self.conv1=nn.Conv2d(3,64,3)
        self.pool=nn.AdaptiveAvgPool2d(1)
        self.fc=nn.Linear(64,20)

    def forward(self,x):
        x=self.conv1(x)
        x=self.act1(x)
        x=self.pool(x)
        x=x.view(x.shape[0],-1)
        x=self.fc(x)
        return x
Here is the code to reproduce the experiment:
import numpy as np

def one_hot_smooth_label(x,num_class,smooth=0.1):
    num=x.shape[0]
    labels=torch.zeros((num,num_class))
    for i in range(num):
        labels[i][x[i]]=1
    labels=(1-(num_class-1)/num_class*smooth)*labels+smooth/num_class
    return labels
images=torch.rand((4,3,300,300))
images=images.cuda()
labels=torch.from_numpy(np.array([1,0,0,1]))
model=myNet()
model=model.cuda()
output=model(images)
labels=one_hot_smooth_label(labels,20)
labels = labels.cuda()
criterion=nn.BCEWithLogitsLoss()
loss=criterion(output,labels)
loss.backward()
The error:
RuntimeError Traceback (most recent call last)
<ipython-input-42-1268777e87e6> in <module>()
21
22 loss=criterion(output,labels)
---> 23 loss.backward()
1 frames
/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
98 Variable._execution_engine.run_backward(
99 tensors, grad_tensors, retain_graph, create_graph,
--> 100 allow_unreachable=True) # allow_unreachable flag
101
102
RuntimeError: Function AddBackward0 returned an invalid gradient at index 1 - expected type TensorOptions(dtype=float, device=cpu, layout=Strided, requires_grad=false) but got TensorOptions(dtype=float, device=cuda:0, layout=Strided, requires_grad=false) (validate_outputs at /pytorch/torch/csrc/autograd/engine.cpp:484)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x46 (0x7fcf7711b536 in /usr/local/lib/python3.6/dist-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x2d84224 (0x7fcfb1bad224 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #2: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x548 (0x7fcfb1baed58 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #3: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7fcfb1bb0ce2 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #4: torch::autograd::Engine::thread_init(int) + 0x39 (0x7fcfb1ba9359 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_cpu.so)
frame #5: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7fcfbe2e8378 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch_python.so)
frame #6: <unknown function> + 0xbd6df (0x7fcfe23416df in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #7: <unknown function> + 0x76db (0x7fcfe34236db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #8: clone + 0x3f (0x7fcfe375c88f in /lib/x86_64-linux-gnu/libc.so.6)
After a lot of experimenting I found that act1 in the model is the problem: if I remove act1, the error no longer occurs!
But I don't know why act1 causes it.
The suspicious part of the error seems to be requires_grad=False, and I can't tell which part is responsible for it.
Here is the code for act1 (Dynamic_relu_b):
class Residual(nn.Module):
    def __init__(self, in_channel, R=8, k=2):
        super(Residual, self).__init__()
        self.avg = nn.AdaptiveAvgPool2d((1, 1))
        self.relu = nn.ReLU(inplace=True)
        self.R = R
        self.k = k
        out_channel = int(in_channel / R)
        self.fc1 = nn.Linear(in_channel, out_channel)
        fc_list = []
        for i in range(k):
            fc_list.append(nn.Linear(out_channel, 2 * in_channel))
        self.fc2 = nn.ModuleList(fc_list)

    def forward(self, x):
        x = self.avg(x)
        x = torch.squeeze(x)
        x = self.fc1(x)
        x = self.relu(x)
        result_list = []
        for i in range(self.k):
            result = self.fc2[i](x)
            result = 2 * torch.sigmoid(result) - 1
            result_list.append(result)
        return result_list
class Dynamic_relu_b(nn.Module):
    def __init__(self, inchannel, R=8, k=2):
        super(Dynamic_relu_b, self).__init__()
        self.lambda_alpha = 1
        self.lambda_beta = 0.5
        self.R = R
        self.k = k
        self.init_alpha = torch.zeros(self.k)
        self.init_beta = torch.zeros(self.k)
        self.init_alpha[0] = 1
        self.init_beta[0] = 1
        for i in range(1, k):
            self.init_alpha[i] = 0
            self.init_beta[i] = 0
        self.residual = Residual(inchannel)

    def forward(self, input):
        delta = self.residual(input)
        in_channel = input.shape[1]
        bs = input.shape[0]
        alpha = torch.zeros((self.k, bs, in_channel))
        beta = torch.zeros((self.k, bs, in_channel))
        for i in range(self.k):
            for j, c in enumerate(range(0, in_channel * 2, 2)):
                alpha[i, :, j] = delta[i][:, c]
                beta[i, :, j] = delta[i][:, c + 1]
        alpha1 = alpha[0]
        beta1 = beta[0]
        max_result = self.dynamic_function(alpha1, beta1, input, 0)
        for i in range(1, self.k):
            alphai = alpha[i]
            betai = beta[i]
            result = self.dynamic_function(alphai, betai, input, i)
            max_result = torch.max(max_result, result)
        return max_result

    def dynamic_function(self, alpha, beta, x, k):
        init_alpha = self.init_alpha[k]
        init_beta = self.init_beta[k]
        alpha = init_alpha + self.lambda_alpha * alpha
        beta = init_beta + self.lambda_beta * beta
        bs = x.shape[0]
        channel = x.shape[1]
        results = torch.zeros_like(x)
        for i in range(bs):
            for c in range(channel):
                results[i, c, :, :] = x[i, c] * alpha[i, c] + beta[i, c]
        return results
How should I fix this problem?
In PyTorch, two tensors need to be on the same device to perform any mathematical operation between them. In your case, however, one is on the CPU and the other on the GPU. The error is not as clear as it usually is, because it occurs in the backward pass. You were (un)lucky that your forward pass did not fail. That is because there is an exception to the same-device restriction when scalar values are used in a mathematical operation, e.g. tensor * 2, and it even applies when the scalar is itself a tensor: cpu_tensor * tensor(2, device='cuda:0'). You are using a lot of loops and accessing individual scalars to compute further results.

Even though the forward pass gets away with this, in the backward pass, when the gradients are calculated, each gradient is multiplied with the previous one (applying the chain rule), and at that point the two end up on different devices.
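To make that scalar exception concrete, here is a minimal, self-contained sketch (it only runs if a CUDA device is available; the tensor names are made up for illustration). It shows why a forward pass like yours can mix devices without complaining:

import torch

if torch.cuda.is_available():
    cuda_t = torch.rand(2, 2, device='cuda:0')
    cpu_scalar = torch.tensor(2.0)      # 0-dim tensor living on the CPU

    print(cuda_t * 2)                   # plain Python scalar: always allowed
    print(cuda_t * cpu_scalar)          # also allowed, despite the device mismatch
    # cuda_t * torch.rand(2, 2)         # a non-scalar CPU tensor would raise a RuntimeError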
You have already identified Dynamic_relu_b as the culprit. Inside it, you need to make sure that every tensor you create is on the same device as the input. The two tensors you create in its forward method are:
alpha = torch.zeros((self.k, bs, in_channel))
beta = torch.zeros((self.k, bs, in_channel))
These are created on the CPU, but your input is on the GPU, so you need to put them on the GPU as well. To keep it generic, they should be placed on whatever device the input is on:
alpha = torch.zeros((self.k, bs, in_channel), device=input.device)
beta = torch.zeros((self.k, bs, in_channel), device=input.device)
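The same reasoning applies to the constant tensors created in __init__, such as init_alpha and init_beta. One common pattern (a suggestion, not something your code strictly requires) is to register them as buffers, so that model.cuda() or model.to(device) moves them along with the parameters:

class Dynamic_relu_b(nn.Module):
    def __init__(self, inchannel, R=8, k=2):
        super(Dynamic_relu_b, self).__init__()
        # ... other attributes as before ...
        init_alpha = torch.zeros(k)
        init_beta = torch.zeros(k)
        init_alpha[0] = 1
        init_beta[0] = 1
        # registered buffers are moved by model.cuda()/model.to(device),
        # but are not trained (they are not parameters)
        self.register_buffer('init_alpha', init_alpha)
        self.register_buffer('init_beta', init_beta)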
The biggest issue in your code are the loops. Not only do they obscure the error, they are also terrible for performance, because they can neither be parallelised nor vectorised, which is exactly what makes GPUs fast. I'm certain these loops can be replaced with more efficient operations, but you have to get away from the idea of creating an empty tensor and then filling it element by element.
Let's take one example from dynamic_function:

results = torch.zeros_like(x)
for i in range(bs):
    for c in range(channel):
        results[i, c, :, :] = x[i, c] * alpha[i, c] + beta[i, c]

You are multiplying x (size: [bs, channel, height, width]) with alpha (size: [bs, channel]), where each plane (height, width) of x is multiplied by a different element of alpha (a scalar). That is the same as an element-wise multiplication with a tensor of the same size as the plane [height, width], but where all elements are that one scalar. Thankfully, you don't need to repeat the values yourself, because singleton dimensions (dimensions of size 1) are automatically expanded to match the size of the other tensor; see PyTorch - Broadcasting Semantics for details. That means you only need to reshape alpha (and beta) to size [bs, channel, 1, 1]:
results = x * alpha.view(bs, channel, 1, 1) + beta.view(bs, channel, 1, 1)
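As a quick sanity check (with made-up shapes, not tied to your model), the broadcasted expression produces the same result as the nested loops:

import torch

bs, channel, h, w = 2, 3, 4, 5
x = torch.rand(bs, channel, h, w)
alpha = torch.rand(bs, channel)
beta = torch.rand(bs, channel)

# original approach: fill an empty tensor plane by plane
looped = torch.zeros_like(x)
for i in range(bs):
    for c in range(channel):
        looped[i, c, :, :] = x[i, c] * alpha[i, c] + beta[i, c]

# broadcasted version: the singleton dims expand to [bs, channel, h, w]
broadcasted = x * alpha.view(bs, channel, 1, 1) + beta.view(bs, channel, 1, 1)

print(torch.allclose(looped, broadcasted))  # True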
By getting rid of that loop you gain a lot of performance, and your initial error would also have been much clearer, because the forward pass would have failed with the following message:
File "main.py", line 78, in dynamic_function results = x * alpha.view(bs, channel, 1, 1) + beta.view(bs, channel, 1, 1) RuntimeError: expected device cuda:0 but got device cpu
Then you would have known right away that one of them is on the CPU and the other on the GPU.
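Putting both suggestions together, one possible (schematic, untested) version of dynamic_function without the nested loops could look like this, assuming alpha and beta were created with device=input.device as shown above:

def dynamic_function(self, alpha, beta, x, k):
    # alpha/beta: [bs, channel], already on the same device as x;
    # ideally init_alpha/init_beta live on that device too (e.g. as buffers)
    alpha = self.init_alpha[k] + self.lambda_alpha * alpha
    beta = self.init_beta[k] + self.lambda_beta * beta
    bs, channel = x.shape[0], x.shape[1]
    # broadcast the per-(sample, channel) coefficients over each [height, width] plane
    return x * alpha.view(bs, channel, 1, 1) + beta.view(bs, channel, 1, 1)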