Error when using distributed data parallel in super-gradients: 'DistributedSampler' object has no attribute 'keys'


I am trying to train a YOLO-NAS model from the super-gradients library using 2 GPUs.

I referred to this link.

Also see this one.

Basically, I am trying to do multi-GPU training with 2 GPUs using the distributed data parallel (DDP) strategy, but for some reason it fails with an error saying the sampler has no 'keys' or something like that. Here is the code:


import torch
import os
from PIL import Image

from super_gradients.training import Trainer, dataloaders, models
from super_gradients.training.dataloaders.dataloaders import (
    coco_detection_yolo_format_train, coco_detection_yolo_format_val
)
from super_gradients.training.losses import PPYoloELoss
from super_gradients.training.metrics import DetectionMetrics_050
from super_gradients.training.models.detection_models.pp_yolo_e import (
    PPYoloEPostPredictionCallback
)
from super_gradients.training.utils.distributed_training_utils import setup_device

class config:
    #trainer params
    CHECKPOINT_DIR = 'checkpoints' #specify the path you want to save checkpoints to
    EXPERIMENT_NAME = 'experiment_v2' #specify the experiment name

    #dataset params
    DATA_DIR = 'yolo_data' #parent directory to where data lives

    TRAIN_IMAGES_DIR = 'train/images' #child dir of DATA_DIR where train images are
    TRAIN_LABELS_DIR = 'train/labels' #child dir of DATA_DIR where train labels are

    VAL_IMAGES_DIR = 'val/images' #child dir of DATA_DIR where validation images are
    VAL_LABELS_DIR = 'val/labels' #child dir of DATA_DIR where validation labels are

    # if you have a test set
    TEST_IMAGES_DIR = 'test/images' #child dir of DATA_DIR where test images are
    TEST_LABELS_DIR = 'test/labels' #child dir of DATA_DIR where test labels are

    CLASSES = ['Face'] #what class names do you have

    NUM_CLASSES = len(CLASSES)

    #dataloader params - you can add whatever PyTorch dataloader params you have
    #could be different across train, val, and test
    DATALOADER_PARAMS={
    'batch_size':64,
    'num_workers':4
    }

    # model params
    MODEL_NAME = 'yolo_nas_l' # choose from yolo_nas_s, yolo_nas_m, yolo_nas_l
    PRETRAINED_WEIGHTS = 'coco' #only one option here: coco
    DEVICE = 'cuda' if torch.cuda.is_available() else "cpu"


setup_device(multi_gpu='DDP', num_gpus=2)
trainer = Trainer(experiment_name=config.EXPERIMENT_NAME, ckpt_root_dir=config.CHECKPOINT_DIR)


train_data = coco_detection_yolo_format_train(
    dataset_params={
        'data_dir': config.DATA_DIR,
        'images_dir': config.TRAIN_IMAGES_DIR,
        'labels_dir': config.TRAIN_LABELS_DIR,
        'classes': config.CLASSES
    },
    dataloader_params=config.DATALOADER_PARAMS
)

val_data = coco_detection_yolo_format_val(
    dataset_params={
        'data_dir': config.DATA_DIR,
        'images_dir': config.VAL_IMAGES_DIR,
        'labels_dir': config.VAL_LABELS_DIR,
        'classes': config.CLASSES
    },
    dataloader_params=config.DATALOADER_PARAMS
)

model = models.get(config.MODEL_NAME, 
                   num_classes=config.NUM_CLASSES, 
                   pretrained_weights=config.PRETRAINED_WEIGHTS
                   ).to(config.DEVICE)

train_params = {
    "average_best_models":True,
    "warmup_mode": "linear_epoch_step",
    "warmup_initial_lr": 8e-6,
    "lr_warmup_epochs": 5,
    "initial_lr": 40e-4,
    "lr_mode": "cosine",
    "cosine_final_lr_ratio": 0.1,
    "optimizer": "Adam",
    "optimizer_params": {"weight_decay": 0.0001},
    "zero_weight_decay_on_bias_and_bn": True,
    "ema": True,
    "ema_params": {"decay": 0.9, "decay_type": "threshold"},
    "max_epochs": 300,
    "mixed_precision": True,
    "loss": PPYoloELoss(
        use_static_assigner=False,
        # NOTE: num_classes needs to be defined here
        num_classes=config.NUM_CLASSES,
        reg_max=16
    ),
    "valid_metrics_list": [
        DetectionMetrics_050(
            score_thres=0.1,
            top_k_predictions=300,
            # NOTE: num_classes needs to be defined here
            num_cls=config.NUM_CLASSES,
            normalize_targets=True,
            post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=1000,
                max_predictions=300,
                nms_threshold=0.7
            )
        )
    ],
    "metric_to_watch": '[email protected]'
}

trainer.train(model=model, 
              training_params=train_params, 
              train_loader=train_data, 
              valid_loader=val_data)

Here is the error:

Traceback (most recent call last):
  File "train.py", line 72, in <module>
    val_data = coco_detection_yolo_format_val(
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 286, in coco_detection_yolo_format_val
    return get_data_loader(
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 76, in get_data_loader
    dataloader_params = _process_dataloader_params(cfg, dataloader_params, dataset, train)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 96, in _process_dataloader_params
    dataloader_params = _process_sampler_params(dataloader_params, dataset, default_dataloader_params)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 121, in _process_sampler_params
    dataloader_params = _instantiate_sampler(dataset, dataloader_params)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 144, in _instantiate_sampler
    sampler_name = list(dataloader_params["sampler"].keys())[0]
AttributeError: 'DistributedSampler' object has no attribute 'keys'
Caching annotations: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3226/3226 [00:00<00:00, 5659.64it/s]
Traceback (most recent call last):
  File "train.py", line 72, in <module>
    val_data = coco_detection_yolo_format_val(
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 286, in coco_detection_yolo_format_val
    return get_data_loader(
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 76, in get_data_loader
    dataloader_params = _process_dataloader_params(cfg, dataloader_params, dataset, train)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 96, in _process_dataloader_params
    dataloader_params = _process_sampler_params(dataloader_params, dataset, default_dataloader_params)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 121, in _process_sampler_params
    dataloader_params = _instantiate_sampler(dataset, dataloader_params)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 144, in _instantiate_sampler
    sampler_name = list(dataloader_params["sampler"].keys())[0]
AttributeError: 'DistributedSampler' object has no attribute 'keys'
[2023-06-08 19:04:47] WARNING - api.py - Sending process 74086 closing signal SIGTERM
WARNING: Logging before flag parsing goes to stderr.
W0608 19:04:47.502784 140609054398272 api.py:699] Sending process 74086 closing signal SIGTERM
[2023-06-08 19:04:47] ERROR - api.py - failed (exitcode: 1) local_rank: 0 (pid: 74081) of binary: /opt/conda/bin/python3
E0608 19:04:47.773856 140609054398272 api.py:673] failed (exitcode: 1) local_rank: 0 (pid: 74081) of binary: /opt/conda/bin/python3
Traceback (most recent call last):
  File "train.py", line 56, in <module>
    setup_device(multi_gpu='DDP', num_gpus=2)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/common/decorators/factory_decorator.py", line 36, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/utils/distributed_training_utils.py", line 240, in setup_device
    setup_gpu(multi_gpu, num_gpus)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/utils/distributed_training_utils.py", line 278, in setup_gpu
    restart_script_with_ddp(num_gpus=num_gpus)
  File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/utils/distributed_training_utils.py", line 387, in restart_script_with_ddp
    elastic_launch(config=config, entrypoint=sys.executable)(*sys.argv, *EXTRA_ARGS)
  File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
train.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-06-08_19:04:47
  host      : 0d9c785c0a81
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 74081)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

python deep-learning yolo
1 Answer

Try adding the device argument to the setup call: setup_device(device='cuda', multi_gpu='DDP', num_gpus=2)
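For context, a minimal sketch of how that suggestion would slot into the question's script, assuming the setup_device signature matches the one already used there (the explicit device='cuda' argument is the only change; config is the class defined in the question):

from super_gradients.training import Trainer
from super_gradients.training.utils.distributed_training_utils import setup_device

# Set up 2-GPU DDP, passing the device explicitly as the answer suggests.
# As in the question's script, this runs before the Trainer and the dataloaders
# are created (the traceback shows setup_device restarting the script for DDP).
setup_device(device='cuda', multi_gpu='DDP', num_gpus=2)

# Unchanged from the question; `config` is the class defined there.
trainer = Trainer(experiment_name=config.EXPERIMENT_NAME, ckpt_root_dir=config.CHECKPOINT_DIR)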
