RuntimeError: NCCL Error 2: unhandled system error

Problem description · Votes: 0 · Answers: 2

I recently upgraded CUDA from 9.0 to 10.2. The upgrade itself succeeded, but the demo below now fails with "RuntimeError: NCCL Error 2: unhandled system error".

I don't know why. I tried searching GitHub and Stack Overflow for an answer, without success, so I hope someone can help.

import torch
from torchvision import datasets, transforms
import torchvision
from tqdm import tqdm
 
device_ids = [0, 1]  # GPUs to use with DataParallel
BATCH_SIZE = 64
 
transform = transforms.Compose([transforms.ToTensor()])
data_train = datasets.MNIST(root = "./data/",
                            transform=transform,
                            train=True,
                            download=True)
data_test = datasets.MNIST(root="./data/",
                           transform=transform,
                           train=False)
 
data_loader_train = torch.utils.data.DataLoader(dataset=data_train,
                                                batch_size=BATCH_SIZE * len(device_ids),
                                                shuffle=True,
                                                num_workers=2)
 
data_loader_test = torch.utils.data.DataLoader(dataset=data_test,
                                               batch_size=BATCH_SIZE * len(device_ids),
                                               shuffle=True,
                                               num_workers=2)
 
 
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = torch.nn.Sequential(
            torch.nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(stride=2, kernel_size=2),
        )
        self.dense = torch.nn.Sequential(
            torch.nn.Linear(14 * 14 * 128, 1024),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.5),
            torch.nn.Linear(1024, 10)
        )

    def forward(self, x):
        x = self.conv1(x)
        x = x.view(-1, 14 * 14 * 128)
        x = self.dense(x)
        return x
 
 
model = Model()

model = torch.nn.DataParallel(model, device_ids=device_ids)

model = model.cuda(device=device_ids[0])
 
cost = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
from time import sleep
n_epochs = 50
for epoch in range(n_epochs):
    running_loss = 0.0
    running_correct = 0
    print("Epoch {}/{}".format(epoch, n_epochs))
    print("-"*10)
    for data in tqdm(data_loader_train):
        X_train, y_train = data
        
        X_train, y_train = X_train.cuda(device=device_ids[0]), y_train.cuda(device=device_ids[0])
        outputs = model(X_train)
        _,pred = torch.max(outputs.data, 1)
        optimizer.zero_grad()
        loss = cost(outputs, y_train)
 
        loss.backward()
        optimizer.step()
        running_loss += loss.data.item()
        running_correct += torch.sum(pred == y_train.data)
    testing_correct = 0
    for data in data_loader_test:
        X_test, y_test = data
        
        X_test, y_test = X_test.cuda(device=device_ids[0]), y_test.cuda(device=device_ids[0])
        outputs = model(X_test)
        _, pred = torch.max(outputs.data, 1)
        testing_correct += torch.sum(pred == y_test.data)
    print("Loss is:{:.4f}, Train Accuracy is:{:.4f}%, Test Accuracy is:{:.4f}".format(torch.true_divide(running_loss, len(data_train)),
                                                                                      torch.true_divide(100*running_correct, len(data_train)),
                                                                                      torch.true_divide(100*testing_correct, len(data_test))))
torch.save(model.state_dict(), "model_parameter.pkl")

Here is the error message:

Epoch 0/50
----------
  0%|                                                                                                                                                                               | 0/469 [00:00<?, ?it/s]7aea7ed215cf:50693:50693 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.14<0>
7aea7ed215cf:50693:50693 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation

7aea7ed215cf:50693:50693 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
7aea7ed215cf:50693:50693 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.14<0>
7aea7ed215cf:50693:50693 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda10.2
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)
7aea7ed215cf:50693:50809 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
7aea7ed215cf:50693:50808 [0] NCCL INFO Channel 00/02 :    0   1
7aea7ed215cf:50693:50808 [0] NCCL INFO Channel 01/02 :    0   1
7aea7ed215cf:50693:50809 [1] NCCL INFO Setting affinity for GPU 1 to 3ff003ff
7aea7ed215cf:50693:50808 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
7aea7ed215cf:50693:50808 [0] NCCL INFO Setting affinity for GPU 0 to 3ff003ff
7aea7ed215cf:50693:50809 [1] NCCL INFO Could not enable P2P between dev 1(=3e000) and dev 0(=3d000)
7aea7ed215cf:50693:50808 [0] NCCL INFO Could not enable P2P between dev 0(=3d000) and dev 1(=3e000)

7aea7ed215cf:50693:50809 [1] include/shm.h:28 NCCL WARN Call to posix_fallocate failed : No space left on device
7aea7ed215cf:50693:50809 [1] NCCL INFO include/shm.h:41 -> 2

7aea7ed215cf:50693:50809 [1] include/shm.h:48 NCCL WARN Error while creating shared memory segment nccl-shm-recv-3bd03c4f9664d387-0-0-1 (size 9637888)
7aea7ed215cf:50693:50809 [1] NCCL INFO transport/shm.cc:100 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO transport.cc:34 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO transport.cc:84 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO init.cc:778 -> 2

7aea7ed215cf:50693:50808 [0] include/shm.h:28 NCCL WARN Call to posix_fallocate failed : No space left on device
7aea7ed215cf:50693:50808 [0] NCCL INFO include/shm.h:41 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO init.cc:904 -> 2

7aea7ed215cf:50693:50808 [0] include/shm.h:48 NCCL WARN Error while creating shared memory segment nccl-shm-recv-3bd03c4f9664d387-0-1-0 (size 9637888)
7aea7ed215cf:50693:50808 [0] NCCL INFO transport/shm.cc:100 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO transport.cc:34 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO transport.cc:84 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO init.cc:778 -> 2
7aea7ed215cf:50693:50809 [1] NCCL INFO group.cc:72 -> 2 [Async thread]
7aea7ed215cf:50693:50808 [0] NCCL INFO init.cc:904 -> 2
7aea7ed215cf:50693:50808 [0] NCCL INFO group.cc:72 -> 2 [Async thread]
7aea7ed215cf:50693:50693 [0] NCCL INFO init.cc:973 -> 2
  0%|                                                                                                                                                                               | 0/469 [00:03<?, ?it/s]
Traceback (most recent call last):
  File "test.py", line 73, in <module>
    outputs = model(X_train)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 167, in forward
    replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 172, in replicate
    return replicate(module, device_ids, not torch.is_grad_enabled())
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
    param_copies = _broadcast_coalesced_reshape(params, devices, detach)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/replicate.py", line 71, in _broadcast_coalesced_reshape
    tensor_copies = Broadcast.apply(devices, *tensors)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/_functions.py", line 23, in forward
    outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
  File "/root/anaconda3/envs/pytorch171/lib/python3.7/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: NCCL Error 2: unhandled system error
Tags: python · pytorch · cuda
2 Answers

4 votes

This is apparently caused by newer versions of NCCL, which include a data path that uses Linux shared memory for communication between ranks on the same node (see here). If that shared-memory setup is misconfigured or unavailable, you may see this problem in any codebase that uses NCCL.

There are two options for solving this problem:

  1. Set up the Linux tmpfs system (typically /dev/shm) correctly, so that NCCL has enough space to create its shared-memory segments.
  2. Set the
    NCCL_SHM_DISABLE
    environment variable to stop NCCL from trying to use this data path (see the documentation here). This forces NCCL to fall back to a potentially slower data path; a sketch of both options is shown below.
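
As a minimal Python sketch of how this might be checked and applied from inside the training script (assuming a Linux host where NCCL's shared-memory transport allocates under /dev/shm; the 1 GiB threshold is only an illustrative value, not an NCCL requirement):

import os
import shutil

# Inspect the tmpfs that NCCL's shared-memory transport allocates from.
shm = shutil.disk_usage("/dev/shm")
print("/dev/shm free: {:.0f} MiB of {:.0f} MiB".format(shm.free / 2**20, shm.total / 2**20))

# If the mount looks too small (illustrative 1 GiB threshold), disable the SHM path.
# This must happen before the first NCCL communicator is created, i.e. before the
# first multi-GPU broadcast that DataParallel performs inside model(X_train).
if shm.free < 2**30:
    os.environ["NCCL_SHM_DISABLE"] = "1"

If the job runs inside a Docker container (the container-style hostname in the log above suggests it may), note that /dev/shm defaults to only 64 MB there; starting the container with a larger --shm-size is the usual way to apply option 1.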

0 votes

I tried NCCL_IB_DISABLE=1 python ..., and it worked for me.
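
The same effect can also be had from inside the script itself; a minimal sketch, assuming the variable is set before the first multi-GPU operation creates an NCCL communicator:

import os

# Equivalent to launching the script as: NCCL_IB_DISABLE=1 python test.py
# It tells NCCL not to use the InfiniBand/verbs transport.
os.environ["NCCL_IB_DISABLE"] = "1"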
