I want to use PyTorch's distributed package for point-to-point communication between two ranks, but execution fails with the NCCL error shown at the end of this post.
Posted: 12 Jan 2025, 11:27
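For context, the standard blocking point-to-point pattern in torch.distributed looks roughly like this. This is a minimal sketch, assuming two processes on one machine with the gloo backend; the helper name run_p2p and the port are illustrative, not from my actual code:

Code: Select all
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def run_p2p(rank, world_size):
    # env:// rendezvous on localhost; port is a placeholder
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    tensor = torch.zeros(4)
    if rank == 0:
        tensor += 1
        dist.send(tensor, dst=1)   # blocking send to rank 1
    elif rank == 1:
        dist.recv(tensor, src=0)   # blocking receive from rank 0
        print(f"rank 1 received {tensor}")
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(run_p2p, args=(2,), nprocs=2, join=True)

My actual code follows this pattern but uses four ranks, with the send/recv happening between ranks 1 and 2: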
Code: Select all
import torch
import torch.distributed as dist
from torch.multiprocessing import spawn

# SCALE, init_process, load, and Radix2EvaluationDomain are defined elsewhere in the project.

def runTpoly(rank, size, pp, cs, pkArithmetics_evals,
             pkSelectors_evals, domain):
    init_process(rank, size)
    group2 = dist.new_group([1, 2])  # subgroup of ranks 1 and 2; new_group runs on every rank
    if rank == 0:
        device = torch.device(f"cuda:{rank}")
        wo_eval_8n = torch.ones(SCALE * 8 * 1, 4, dtype=torch.int64, device='cuda')
    if rank == 1:
        wo_eval_8n = torch.ones(SCALE * 8 * 10, 4, dtype=torch.int64, device='cuda')
        wo_eval_8n = wo_eval_8n + wo_eval_8n
        dist.send(wo_eval_8n, 2)  # blocking send to rank 2
    if rank == 2:
        wo_eval_8n = torch.ones(SCALE * 8 * 10, 4, dtype=torch.int64, device='cuda')
        print(wo_eval_8n.size())
        dist.recv(wo_eval_8n, 1)  # blocking receive from rank 1
        print(wo_eval_8n)
    if rank == 3:
        wo_eval_8n = torch.ones(SCALE * 10 * 10, 4, dtype=torch.int64, device='cuda')
        print(wo_eval_8n.size())
    # Clean up the process group
    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = 4  # number of GPUs
    print(torch.__file__)
    pp, pk, cs = load("/home/whyin/data/9-data/")
    domain = Radix2EvaluationDomain.new(cs.circuit_bound())
    spawn(runTpoly,
          args=(world_size, pp, cs, pk.arithmetics_evals, pk.selectors_evals, domain),
          nprocs=world_size, join=True)
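The init_process helper is not shown above; a minimal sketch of what such a helper typically does, assuming env:// rendezvous and the NCCL backend (the address and port here are placeholders, not my actual cluster settings):

Code: Select all
import os
import torch.distributed as dist

def init_process(rank, size, backend="nccl"):
    # Placeholder rendezvous settings; adjust to the actual cluster.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend, rank=rank, world_size=size)

With this setup, the run aborts with the error below.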

Code: Select all
RuntimeError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer
Exception raised from recvBytes at /home/whyin/pnp_new/PNP/torch/csrc/distributed/c10d/Utils.hpp:616 (most recent call first)
Why does rank 2 fail to retrieve the ncclUniqueId from the store (store->get('1:2')), and what is the correct way to do point-to-point send/recv between ranks 1 and 2 here?