Single-machine multi-GPU training works fine, but multi-machine multi-GPU training fails.
Error message:

The fix follows this official issue:
https://github.com/ultralytics/ultralytics/issues/2132

The problem is solved by modifying the code in two places.
① ultralytics/engine/trainer.py
Original:

from ultralytics.utils import (DEFAULT_CFG, LOGGER, RANK, TQDM, __version__, callbacks, clean_url, colorstr, emojis,
                               yaml_save)
def _setup_ddp(self, world_size):
    """Initializes and sets the DistributedDataParallel parameters for training."""
    torch.cuda.set_device(RANK)
    self.device = torch.device('cuda', RANK)
    # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
    os.environ['NCCL_BLOCKING_WAIT'] = '1'  # set to enforce timeout
    dist.init_process_group(
        'nccl' if dist.is_nccl_available() else 'gloo',
        timeout=timedelta(seconds=10800),  # 3 hours
        rank=RANK,
        world_size=world_size)

Changed to:

from ultralytics.utils import (DEFAULT_CFG, LOGGER, LOCAL_RANK, RANK, TQDM, __version__, callbacks, clean_url, colorstr, emojis,
                               yaml_save)
def _setup_ddp(self, world_size):
    """Initializes and sets the DistributedDataParallel parameters for training."""
    torch.cuda.set_device(LOCAL_RANK)
    self.device = torch.device('cuda', LOCAL_RANK)
    # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
    os.environ['NCCL_BLOCKING_WAIT'] = '1'  # set to enforce timeout
    dist.init_process_group(
        'nccl' if dist.is_nccl_available() else 'gloo',
        timeout=timedelta(seconds=10800))
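
Why this works: under torchrun / torch.distributed.launch, RANK is the process's global index across all nodes, while LOCAL_RANK is its index within its own node, and only LOCAL_RANK is a valid CUDA device ordinal on the second and later machines. Dropping the explicit rank/world_size arguments lets init_process_group fall back to env:// initialization and read RANK and WORLD_SIZE from the launcher's environment. The snippet below is a minimal standalone sketch of the same pattern (an illustration, not Ultralytics code; it assumes the launcher exports RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT):

import os
from datetime import timedelta

import torch
import torch.distributed as dist

def setup_ddp_minimal():
    """Minimal multi-node DDP setup sketch (illustration only, not Ultralytics code)."""
    local_rank = int(os.environ['LOCAL_RANK'])  # GPU index on *this* node
    # Bind the process to its node-local GPU; the global RANK would exceed
    # the GPU count on every node except the first.
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)
    # Without explicit rank/world_size, init_process_group uses env:// and
    # reads RANK and WORLD_SIZE from the environment set by the launcher.
    dist.init_process_group(
        'nccl' if dist.is_nccl_available() else 'gloo',
        timeout=timedelta(seconds=10800))
    return device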

② ultralytics/models/yolo/detect/train.py
Original:

from ultralytics.utils import LOGGER, RANK
def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
    """Construct and return dataloader."""
    assert mode in ['train', 'val']
    with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
        ...  # (rest of the method unchanged)

Changed to:

from ultralytics.utils import LOGGER, RANK, LOCAL_RANK
def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
    """Construct and return dataloader."""
    assert mode in ['train', 'val']
    with torch_distributed_zero_first(LOCAL_RANK):  # init dataset *.cache only once if DDP
        ...  # (rest of the method unchanged)
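
The point of passing LOCAL_RANK into torch_distributed_zero_first is that the dataset *.cache file lives on each node's local disk, so each node's local rank-0 process should build it while the other local processes wait. Conceptually the context manager behaves roughly like the simplified sketch below (an illustration of the idea, not the library's exact implementation):

from contextlib import contextmanager

import torch.distributed as dist

@contextmanager
def zero_first(local_rank: int):
    """Simplified sketch of a 'rank 0 goes first' context manager (illustration only)."""
    initialized = dist.is_available() and dist.is_initialized()
    if initialized and local_rank not in (-1, 0):
        dist.barrier()  # non-zero ranks wait until every rank-0 process reaches the barrier below
    yield  # the one-time work (e.g. building the *.cache) runs here
    if initialized and local_rank == 0:
        dist.barrier()  # rank-0 processes release the waiting peers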