Multi-GPU Communication for NVIDIA B200 & AMD MI350X
NVIDIA Collective Communications Library (NCCL)
ROCm Communication Collectives Library (RCCL)
// --- NCCL AllReduce example (NVIDIA) ---
// Assumes the caller has already set up: `nRanks` (total ranks), `myRank`
// (this rank's index), `id` (an ncclUniqueId created by rank 0 via
// ncclGetUniqueId and broadcast to all ranks — TODO confirm in caller),
// and `size` (element count, not bytes).

// Initialize communicator (one per rank; collective across all ranks).
ncclComm_t comm;
ncclCommInitRank(&comm, nRanks, id, myRank);

// Create a CUDA stream; NCCL collectives are enqueued asynchronously on it.
cudaStream_t stream;
cudaStreamCreate(&stream);

// Allocate device buffers for the collective.
float *sendbuff, *recvbuff;
cudaMalloc(&sendbuff, size * sizeof(float));
cudaMalloc(&recvbuff, size * sizeof(float));

// Perform AllReduce: element-wise sum across all ranks, result in recvbuff.
// NOTE(review): return codes (ncclResult_t / cudaError_t) are ignored here
// for brevity — production code should wrap every call in a check macro.
ncclAllReduce(sendbuff,   // send buffer (device pointer)
              recvbuff,   // recv buffer (device pointer)
              size,       // element count
              ncclFloat,  // datatype
              ncclSum,    // reduction op
              comm,       // communicator
              stream);    // CUDA stream

// The collective is asynchronous: block the host until the stream drains.
cudaStreamSynchronize(stream);
// Initialize (same API as NCCL) ncclComm_t comm; ncclCommInitRank(&comm, nRanks, id, myRank); // Create HIP stream hipStream_t stream; hipStreamCreate(&stream); // Allocate GPU buffers float *sendbuff, *recvbuff; hipMalloc(&sendbuff, size * sizeof(float)); hipMalloc(&recvbuff, size * sizeof(float)); // Perform AllReduce (identical call!) ncclAllReduce( sendbuff, // send buffer recvbuff, // recv buffer size, // count ncclFloat, // datatype ncclSum, // reduction op comm, // communicator stream // HIP stream ); // Wait for completion hipStreamSynchronize(stream);