From 23dffbee6f64805feaa7b298ff45379691e4354d Mon Sep 17 00:00:00 2001 From: glaxy Date: Wed, 17 Dec 2025 21:28:08 +0800 Subject: [PATCH 1/3] fix --- src/proxy.cc | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/proxy.cc b/src/proxy.cc index 4d51a7b1d..0b6a73b9b 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -252,27 +252,27 @@ ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState* } ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) { - printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == ncclPatternSend ? "Send" : op->pattern == ncclPatternRecv ? "Recv" : "Coll"); + printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, ncclFuncToString((ncclFunc_t)op->coll)); for (int s=0; s<op->nsubs; s++) { struct ncclProxySubArgs* sub = op->subs+s; + printf(" | %s channel %s/%02d", sub->connection->send ? "send" : "recv", ncclTransports[sub->connection->transport]->name, sub->channelId); if (op->state == ncclProxyOpProgress) { char status = ' '; - if (op->pattern == ncclPatternRecv) { + if (sub->connection->send) { + if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init + else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU + else if (sub->done < sub->transmitted) status = 'S'; // Sending + else status = 'D'; // Done + printf(": %d -> %d / status %c (nsteps %d, posted %ld, transmitted %ld, done %ld)", sub->rank, sub->peer, status, sub->nsteps, sub->posted, sub->transmitted, sub->done); + } else { if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init else if (sub->received < sub->posted) status = 'R'; // Receiving else if (sub->received < sub->transmitted) status = 'R'; // Receiving else if (sub->transmitted < sub->received) status = 'F'; // Flushing else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU else status = 'D'; // Done - } else if 
(op->pattern == ncclPatternSend) { - if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init - else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU - else if (sub->done < sub->transmitted) status = 'S'; // Sending - else status = 'D'; // Done + printf(": %d <- %d / status %c (nsteps %d, posted %ld, received %ld, transmitted %ld, done %ld)", sub->rank, sub->peer, status, sub->nsteps, sub->posted, sub->received, sub->transmitted, sub->done); } - printf(" %d%c/%d", sub->peer, status, sub->channelId); - } else { - printf(" %d/%d", sub->peer, sub->channelId); } } printf("]"); @@ -858,7 +858,7 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int } #include <signal.h> -static ncclProxyProgressState* ncclLastProxyState; +static __thread ncclProxyProgressState* ncclLastProxyState; void ncclDumpProxyState(int signal) { dumpProxyState(ncclLastProxyState); } From cea6821b2060b0baecb30ccfdd67bec87a8c0bf2 Mon Sep 17 00:00:00 2001 From: Zhang Y <42159666+Rhai2307@users.noreply.github.com> Date: Fri, 19 Dec 2025 09:01:39 +0800 Subject: [PATCH 2/3] fix (#2) # Conflicts: # src/proxy.cc Co-authored-by: glaxy --- src/proxy.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/proxy.cc b/src/proxy.cc index 0b6a73b9b..5464b2303 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -263,7 +263,7 @@ ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU else if (sub->done < sub->transmitted) status = 'S'; // Sending else status = 'D'; // Done - printf(": %d -> %d / status %c (nsteps %d, posted %ld, transmitted %ld, done %ld)", sub->rank, sub->peer, status, sub->nsteps, sub->posted, sub->transmitted, sub->done); + printf(": %d -> %d / status %c (nsteps %d, posted %lu, transmitted %lu, done %lu)", sub->rank, sub->peer, status, sub->nsteps, sub->posted, sub->transmitted, sub->done); } else { if 
(sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init else if (sub->received < sub->posted) status = 'R'; // Receiving @@ -271,7 +271,7 @@ ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) else if (sub->transmitted < sub->received) status = 'F'; // Flushing else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU else status = 'D'; // Done - printf(": %d <- %d / status %c (nsteps %d, posted %ld, received %ld, transmitted %ld, done %ld)", sub->rank, sub->peer, status, sub->nsteps, sub->posted, sub->received, sub->transmitted, sub->done); + printf(": %d <- %d / status %c (nsteps %d, posted %lu, received %lu, transmitted %lu, done %lu)", sub->rank, sub->peer, status, sub->nsteps, sub->posted, sub->received, sub->transmitted, sub->done); } } } From 7944e21e5383bc615dca3401909f35d5490a6a86 Mon Sep 17 00:00:00 2001 From: Zhang Y <42159666+Rhai2307@users.noreply.github.com> Date: Fri, 19 Dec 2025 18:24:27 +0800 Subject: [PATCH 3/3] print commHash (#3) Co-authored-by: glaxy --- src/include/proxy.h | 2 ++ src/proxy.cc | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/include/proxy.h b/src/include/proxy.h index 4a79612f4..545bef12f 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -100,6 +100,7 @@ struct ncclProxyOp { bool incWorkCounter; int eActivationMask; void* taskEventHandle; + uint64_t commHash; int rank; int peer; pid_t pid; @@ -164,6 +165,7 @@ struct ncclProxySubArgs { struct ncclProxyArgs { struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; proxyProgressFunc_t progress; + uint64_t commHash; int nsubs; int done; int onePPN; diff --git a/src/proxy.cc b/src/proxy.cc index 5464b2303..e739bfc94 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -252,7 +252,7 @@ ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState* } ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) { - printf("[%d-%d|%ld| %s", 
poolIndex, opIndex, op->opCount, ncclFuncToString((ncclFunc_t)op->coll)); + printf("[0x%lx|%d-%d|%ld| %s", op->commHash, poolIndex, opIndex, op->opCount, ncclFuncToString((ncclFunc_t)op->coll)); for (int s=0; s<op->nsubs; s++) { struct ncclProxySubArgs* sub = op->subs+s; printf(" | %s channel %s/%02d", sub->connection->send ? "send" : "recv", ncclTransports[sub->connection->transport]->name, sub->channelId); @@ -406,6 +406,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr } //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress)); args->done = 0; + args->commHash = op->commHash; args->opCount = op->opCount; args->sliceSteps = op->sliceSteps; args->chunkSteps = op->chunkSteps; @@ -564,7 +565,7 @@ static ncclResult_t SaveProxyProfiler(struct ncclComm* comm, struct ncclProxyOp* static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { if (peer < 0) return ncclSuccess; - + op->commHash = comm->commHash; struct ncclChannelPeer* peerComm = channel->peers[peer]; struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex; if (connector->transportComm == NULL) {