Single-machine multi-GPU training works fine, but multi-machine multi-GPU training fails.
Error message:

The fix follows this official issue:
https://github.com/ultralytics/ultralytics/issues/2132

The problem is solved by modifying the code in two places.
① ultralytics/engine/trainer.py
Original:

from ultralytics.utils import (DEFAULT_CFG, LOGGER, RANK, TQDM, __version__, callbacks, clean_url, colorstr, emojis,
                               yaml_save)
def _setup_ddp(self, world_size):
    """Initializes and sets the DistributedDataParallel parameters for training."""
    torch.cuda.set_device(RANK)
    self.device = torch.device('cuda', RANK)
    # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
    os.environ['NCCL_BLOCKING_WAIT'] = '1'  # set to enforce timeout
    dist.init_process_group(
        'nccl' if dist.is_nccl_available() else 'gloo',
        timeout=timedelta(seconds=10800),  # 3 hours
        rank=RANK,
        world_size=world_size)

Changed to:

from ultralytics.utils import (DEFAULT_CFG, LOGGER, LOCAL_RANK, RANK, TQDM, __version__, callbacks, clean_url, colorstr, emojis,
                               yaml_save)
def _setup_ddp(self, world_size):
    """Initializes and sets the DistributedDataParallel parameters for training."""
    torch.cuda.set_device(LOCAL_RANK)
    self.device = torch.device('cuda', LOCAL_RANK)
    # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
    os.environ['NCCL_BLOCKING_WAIT'] = '1'  # set to enforce timeout
    dist.init_process_group(
        'nccl' if dist.is_nccl_available() else 'gloo',
        timeout=timedelta(seconds=10800))
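
Why this works: under torchrun / torch.distributed.launch, RANK is the process's global index across all nodes, while LOCAL_RANK is its index within its own node, and only LOCAL_RANK is a valid CUDA device ordinal on the second and later machines. Dropping the explicit rank/world_size arguments lets init_process_group fall back to env:// initialization and read RANK and WORLD_SIZE from the launcher's environment. The snippet below is a minimal standalone sketch of the same pattern (an illustration, not Ultralytics code; it assumes the launcher exports RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT):

import os
from datetime import timedelta

import torch
import torch.distributed as dist

def setup_ddp_minimal():
    """Minimal multi-node DDP setup sketch (illustration only, not Ultralytics code)."""
    local_rank = int(os.environ['LOCAL_RANK'])  # GPU index on *this* node
    # Bind the process to its node-local GPU; the global RANK would exceed
    # the GPU count on every node except the first.
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)
    # Without explicit rank/world_size, init_process_group uses env:// and
    # reads RANK and WORLD_SIZE from the environment set by the launcher.
    dist.init_process_group(
        'nccl' if dist.is_nccl_available() else 'gloo',
        timeout=timedelta(seconds=10800))
    return device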

② ultralytics/models/yolo/detect/train.py
Original:

from ultralytics.utils import LOGGER, RANK
def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
    """Construct and return dataloader."""
    assert mode in ['train', 'val']
    with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
        ...  # (rest of the method unchanged)

Changed to:

from ultralytics.utils import LOGGER, RANK, LOCAL_RANK
def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
    """Construct and return dataloader."""
    assert mode in ['train', 'val']
    with torch_distributed_zero_first(LOCAL_RANK):  # init dataset *.cache only once if DDP
        ...  # (rest of the method unchanged)
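
The point of passing LOCAL_RANK into torch_distributed_zero_first is that the dataset *.cache file lives on each node's local disk, so each node's local rank-0 process should build it while the other local processes wait. Conceptually the context manager behaves roughly like the simplified sketch below (an illustration of the idea, not the library's exact implementation):

from contextlib import contextmanager

import torch.distributed as dist

@contextmanager
def zero_first(local_rank: int):
    """Simplified sketch of a 'rank 0 goes first' context manager (illustration only)."""
    initialized = dist.is_available() and dist.is_initialized()
    if initialized and local_rank not in (-1, 0):
        dist.barrier()  # non-zero ranks wait until every rank-0 process reaches the barrier below
    yield  # the one-time work (e.g. building the *.cache) runs here
    if initialized and local_rank == 0:
        dist.barrier()  # rank-0 processes release the waiting peers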