
{ "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 1, "tid": 7, "ts": 1713351140570122, "dur": 386, "args": { "External id": 1529, "device": 1, "context": 1, "stream": 7, "correlation": 1529, "bytes": 163840000, "memory bandwidth (GB/s)": 424.2921773719471 } } import torch import torch.distributed as dist import torch.nn as nn import torch.optim as optim import torch.profiler from torch.nn.parallel import DistributedDataParallel as DDP def setup(rank, world_size): import os os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '12355' dist.init_process_group("nccl", rank=rank, world_size=world_size) torch.cuda.set_device(rank) def cleanup(): dist.destroy_process_group() class SimpleModel(nn.Module): def __init__(self): super(SimpleModel, self).__init__() self.fc = nn.Linear(6400, 6400) def forward(self, x): return self.fc(x) def demo_basic(rank, world_size): setup(rank, world_size) # Create model and move it to GPU with id rank model = SimpleModel().to(rank) model = DDP(model, device_ids=[rank]) optimizer = optim.SGD(model.parameters(), lr=0.01) # Create a random tensor to simulate input data inputs = torch.randn(200, 6400).to(rank) labels = torch.randn(200, 6400).to(rank) with torch.profiler.profile( activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], schedule=torch.profiler.schedule(wait=1, warmup=1, active=3), on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'), profile_memory=True, # Track memory allocation/deallocation. 
with_stack=True ) as prof: for _ in range(10): outputs = model(inputs) loss = nn.functional.mse_loss(outputs, labels) optimizer.zero_grad() loss.backward() optimizer.step() prof.step() cleanup() def main(): world_size = 2 torch.multiprocessing.spawn(demo_basic, args=(world_size,), nprocs=world_size, join=True) if __name__ == "__main__": main() 第一个代码块是第二个代码块的日志的一份信息,我想测试两个卡之间的通信带宽,但是这个代码块一中的 bandwidth 有点看不懂了,为什么能达到 400+GB/s,硬件时 PCIE 4.0 x16 单机双卡 4090, 我用 https://github.com/NVIDIA/cuda-samples/tree/master/Samples/5_Domain_Specific/p2pBandwidthLatencyTest 测试了 p2p=disable 时的带宽(见代码块三),求老哥/师傅们解惑
Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)
   D\D      0      1
     0 919.12   2.28
     1   2.49 812.51