我最近将 CUDA 从 9.0 升级到了 10.2。升级成功后,运行下面的示例代码时总是报错“RuntimeError: NCCL Error 2: unhandled system error”。
我不清楚原因,在 GitHub 和 Stack Overflow 上搜索过,也没有找到答案,希望有人能帮帮我。
import torch
from torchvision import datasets, transforms
import torchvision
from tqdm import tqdm
# Indices of the GPUs that take part in training.
device_ids = [0, 1]
# Base batch size; the loaders below multiply it by the number of GPUs.
BATCH_SIZE = 64

# The only preprocessing step is converting each image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

# MNIST splits stored under ./data/; only the training split requests a
# download.
data_train = datasets.MNIST(
    root="./data/",
    train=True,
    download=True,
    transform=transform,
)
data_test = datasets.MNIST(
    root="./data/",
    train=False,
    transform=transform,
)

# Both loaders use exactly the same batching, shuffling and worker settings.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE * len(device_ids),
    shuffle=True,
    num_workers=2,
)
data_loader_train = torch.utils.data.DataLoader(dataset=data_train, **_loader_kwargs)
data_loader_test = torch.utils.data.DataLoader(dataset=data_test, **_loader_kwargs)
class Model(torch.nn.Module):
    """Small convolutional classifier with a fully connected head.

    ``conv1`` applies two 3x3 convolutions (1 -> 64 -> 128 channels, each
    followed by ReLU) and a 2x2 max-pool.  ``dense`` maps the flattened
    features through a 1024-unit hidden layer with dropout to 10 outputs.
    The head expects 14 * 14 * 128 features per sample, which is what the
    convolutional stack yields for a 1x28x28 input.
    """

    # Number of features per sample handed to the dense head.
    FLAT_FEATURES = 14 * 14 * 128

    def __init__(self):
        super().__init__()
        nn = torch.nn

        # Layers are created in the same order as they are applied.
        feature_layers = [
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        head_layers = [
            nn.Linear(self.FLAT_FEATURES, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(1024, 10),
        ]

        self.conv1 = nn.Sequential(*feature_layers)
        self.dense = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the 10 raw class scores for each sample in ``x``."""
        features = self.conv1(x)
        # Collapse channel and spatial dimensions into one feature vector.
        flattened = features.view(-1, self.FLAT_FEATURES)
        return self.dense(flattened)
# Wrap the model so each batch is split across the GPUs in device_ids; the
# parameters live on device_ids[0] and are broadcast to the other GPUs.
#
# NOTE: that broadcast goes through NCCL.  The log attached to this script
# shows P2P between the GPUs could not be enabled and NCCL then failed to
# create its shared-memory segment ("posix_fallocate failed : No space left
# on device"), which surfaces as "NCCL Error 2: unhandled system error".
# That points at /dev/shm being too small (typical in a container); give it
# more shared memory, e.g. `docker run --shm-size=<size>` or `--ipc=host`.
model = Model()
model = torch.nn.DataParallel(model, device_ids=device_ids)
model = model.cuda(device=device_ids[0])
cost = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
from time import sleep  # not used by the code visible in this script
n_epochs = 50
for epoch in range(n_epochs):
    # Per-epoch accumulators, kept as plain Python numbers so the summary
    # below can use ordinary arithmetic.
    running_loss = 0.0
    running_correct = 0
    print("Epoch {}/{}".format(epoch, n_epochs))
    print("-" * 10)

    # Training pass: re-enable dropout, which the evaluation pass switches off.
    model.train()
    for X_train, y_train in tqdm(data_loader_train):
        X_train = X_train.cuda(device=device_ids[0])
        y_train = y_train.cuda(device=device_ids[0])
        outputs = model(X_train)
        # Predictions are only used for bookkeeping, so keep them out of
        # the autograd graph.
        pred = outputs.detach().argmax(dim=1)
        optimizer.zero_grad()
        loss = cost(outputs, y_train)
        loss.backward()
        optimizer.step()
        # CrossEntropyLoss returns the batch mean; weight it by the batch
        # size so that dividing by the dataset size gives a per-sample mean.
        running_loss += loss.item() * y_train.size(0)
        running_correct += (pred == y_train).sum().item()

    # Evaluation pass: disable dropout and gradient tracking.
    testing_correct = 0
    model.eval()
    with torch.no_grad():
        for X_test, y_test in data_loader_test:
            X_test = X_test.cuda(device=device_ids[0])
            y_test = y_test.cuda(device=device_ids[0])
            outputs = model(X_test)
            pred = outputs.argmax(dim=1)
            testing_correct += (pred == y_test).sum().item()

    # Plain division: torch.true_divide requires a Tensor as its first
    # argument, and running_loss is a Python float.
    print("Loss is:{:.4f}, Train Accuracy is:{:.4f}%, Test Accuracy is:{:.4f}".format(
        running_loss / len(data_train),
        100 * running_correct / len(data_train),
        100 * testing_correct / len(data_test)))

# The state dict is taken from the DataParallel wrapper, so its keys carry
# the "module." prefix.
torch.save(model.state_dict(), "model_parameter.pkl")
以下是错误信息。
Epoch 0/50
----------
0%| | 0/469 [00:00<?, ?it/s]7aea7ed215cf:50693:50693 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.14<0>
7aea7ed215cf:50693:50693 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
7aea7ed215cf:50693:50693 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
7aea7ed215cf:50693:50693 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.14<0>
7aea7ed215cf:50693:50693 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda10.2
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
7aea7ed215cf:50693:50808 [0] NCCL INFO Channel 00/02 : 0 1
7aea7ed215cf:50693:50808 [0] NCCL INFO Channel 01/02 : 0 1
7aea7ed215cf:50693:50809 [1] NCCL INFO Setting affinity for GPU 1 to 3ff003ff
7aea7ed215cf:50693:50808 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
7aea7ed215cf:50693:50808 [0] NCCL INFO Setting affinity for GPU 0 to 3ff003ff
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50809 [1] include/shm.h:28 NCCL WARN Call to posix_fallocate failed : No space left on device
7aea7ed215cf:50693:50809 [1] NCCL INFO include/shm.h:41 -> 2
7aea7ed215cf:50693:50809 [1] include/shm.h:48 NCCL WARN Error while creating shared memory segment nccl-shm-recv-3bd03c4f9664d387-0-0-1 (size 9637888)
7aea7ed215cf:50693:50809 [1] NCCL INFO transport/shm.cc:100 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO transport.cc:34 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO transport.cc:84 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO init.cc:778 -> 2
7aea7ed215cf:50693:50808 [0] include/shm.h:28 NCCL WARN Call to posix_fallocate failed : No space left on device
7aea7ed215cf:50693:50808 [0] NCCL INFO include/shm.h:41 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO init.cc:904 -> 2
7aea7ed215cf:50693:50808 [0] include/shm.h:48 NCCL WARN Error while creating shared memory segment nccl-shm-recv-3bd03c4f9664d387-0-1-0 (size 9637888)
7aea7ed215cf:50693:50808 [0] NCCL INFO transport/shm.cc:100 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO transport.cc:34 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO transport.cc:84 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO init.cc:778 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO group.cc:72 -> 2 [Async thread]
7aea7ed215cf:50693:50808 [0] NCCL INFO init.cc:904 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO group.cc:72 -> 2 [Async thread]
7aea7ed215cf:50693:50693 [0] NCCL INFO init.cc:973 -> 2
0%| | 0/469 [00:03<?, ?it/s]
Traceback (most recent call last):
File "test.py", line 73, in <module>
outputs = model(X_train)
File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 167, in forward
replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 172, in replicate
return replicate(module, device_ids, not torch.is_grad_enabled())
File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
param_copies = _broadcast_coalesced_reshape(params, devices, detach)
File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/replicate.py", line 71, in _broadcast_coalesced_reshape
tensor_copies = Broadcast.apply(devices, *tensors)
File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/_functions.py", line 23, in forward
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: NCCL Error 2: unhandled system error