 import torch.nn.functional as F
 import torch.distributed as dist
 import os
-from torchvision import datasets, transforms
+from torchvision import transforms
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.data import DataLoader, DistributedSampler

@@ -36,13 +36,19 @@ def forward(self, x):
         return F.log_softmax(x, dim=1)

 def train():
+    master_addr = os.environ.get("MASTER_ADDR", "localhost")
+    master_port = os.environ.get("MASTER_PORT", "29500")
+    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
+    rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
+    local_rank = rank % torch.cuda.device_count()
+
     # Initialize process group
-    dist.init_process_group(backend="nccl")
-
-    # Get local rank from environment variable
-    local_rank = int(os.environ["LOCAL_RANK"])
-    rank = int(os.environ["RANK"])
-    world_size = int(os.environ["WORLD_SIZE"])
+    dist.init_process_group(
+        backend="nccl",
+        init_method=f"tcp://{master_addr}:{master_port}",
+        world_size=world_size,
+        rank=rank
+    )

     # Set device
     torch.cuda.set_device(local_rank)
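
Note on the initialization change: this hunk reads the process topology from the Open MPI environment (OMPI_COMM_WORLD_SIZE / OMPI_COMM_WORLD_RANK, populated by mpirun) instead of the torchrun-provided RANK / WORLD_SIZE / LOCAL_RANK, and builds an explicit TCP rendezvous from MASTER_ADDR / MASTER_PORT. As a minimal sketch (not part of this change), a hypothetical init_distributed() helper could keep the script launchable under either launcher by falling back from one set of variables to the other:

import os

import torch
import torch.distributed as dist


def init_distributed():
    """Read the process topology from mpirun or torchrun environment variables.

    Sketch only: the Open MPI branch mirrors this PR; the torchrun fallback
    and the helper itself are assumptions, not part of the change.
    """
    if "OMPI_COMM_WORLD_SIZE" in os.environ:            # launched via mpirun
        world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
        rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    else:                                               # launched via torchrun
        world_size = int(os.environ["WORLD_SIZE"])
        rank = int(os.environ["RANK"])

    master_addr = os.environ.get("MASTER_ADDR", "localhost")
    master_port = os.environ.get("MASTER_PORT", "29500")
    local_rank = rank % torch.cuda.device_count()

    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://{master_addr}:{master_port}",
        world_size=world_size,
        rank=rank,
    )
    torch.cuda.set_device(local_rank)
    return rank, world_size, local_rank

Deriving local_rank as rank % torch.cuda.device_count() matches the PR's own computation; it assumes every node has the same number of GPUs and that ranks are assigned node by node.
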
@@ -77,7 +83,8 @@ def transform(example):
         loss.backward()
         optimizer.step()

-        if batch_idx % 10 == 0:
+        dist.all_reduce(loss, op=dist.ReduceOp.AVG)
+        if rank == 0 and batch_idx % 10 == 0:
             print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}")

     if rank == 0:
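
Note on the logging change: because dist.all_reduce(loss, op=dist.ReduceOp.AVG) runs after loss.backward(), it only changes the value rank 0 prints; gradient averaging is still handled by DDP itself. ReduceOp.AVG requires a reasonably recent PyTorch and, to my knowledge, the NCCL backend, so a sum-then-divide variant is an alternative; a small sketch with a hypothetical global_mean_loss() helper:

import torch
import torch.distributed as dist


def global_mean_loss(loss: torch.Tensor, world_size: int) -> torch.Tensor:
    """Average a scalar loss across ranks without ReduceOp.AVG.

    Sketch only (assumed helper, not part of the PR): equivalent to
    dist.all_reduce(loss, op=dist.ReduceOp.AVG) on backends or PyTorch
    versions where AVG is unavailable.
    """
    averaged = loss.detach().clone()    # keep the autograd graph untouched
    dist.all_reduce(averaged, op=dist.ReduceOp.SUM)
    return averaged / world_size
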