
Commit 19c9727

feat: add Work definition, fix gradient_as_bucket_view option
1 parent bc69a62

10 files changed, +324 -104 lines changed


infini_train/include/nn/parallel/process_group.h

Lines changed: 3 additions & 4 deletions
@@ -11,6 +11,7 @@
 #endif
 
 #include "infini_train/include/nn/parallel/reduce_op_type.h"
+#include "infini_train/include/nn/parallel/work.h"
 
 namespace infini_train {
 class Tensor;
@@ -55,10 +56,8 @@ class ProcessGroup {
 
     std::vector<std::shared_ptr<Tensor>> NcclRecv(std::vector<std::shared_ptr<Tensor>> tensors, int src_rank) const;
 
-    // Overlap helper functions
-    void EnqueueAllReduce(cudaEvent_t ready_event, cudaEvent_t done_event, const std::shared_ptr<Tensor> &tensor,
-                          function::ReduceOpType reduce_op) const;
-    void WaitAllReduceDone(cudaEvent_t done_event, const std::shared_ptr<Tensor> &tensor) const;
+    // Async communication functions
+    std::shared_ptr<Work> AllReduceAsync(const std::shared_ptr<Tensor> &tensor, function::ReduceOpType reduce_op) const;
 
 private:
     std::vector<ncclComm_t> comms_;
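
The new `AllReduceAsync` replaces the event-based `EnqueueAllReduce`/`WaitAllReduceDone` pair with a single call that returns a `Work` handle. A minimal caller-side sketch of the intended pattern is below; `pg` and `grad_bucket` are illustrative names, not identifiers from this commit, and the exact timeout semantics are an assumption.

    // Hypothetical usage sketch: enqueue the collective, keep computing, then
    // check or wait on the returned handle from the host if needed.
    std::shared_ptr<Work> work = pg->AllReduceAsync(grad_bucket, function::ReduceOpType::kAvg);

    // ... enqueue more independent work on the compute stream; it overlaps with the collective ...

    if (!work->Wait(std::chrono::milliseconds(5000))) {  // a zero timeout presumably means wait indefinitely
        LOG(ERROR) << "All-reduce did not complete within the timeout";
    }
    CHECK(work->IsSuccess());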

infini_train/include/nn/parallel/reducer.h

Lines changed: 8 additions & 11 deletions
@@ -30,12 +30,15 @@ std::vector<std::vector<size_t>> ComputeBucketAssignmentBySize(const std::vector
                                                                const std::vector<size_t> &tensor_indices = {});
 
 struct ReducerOptions {
+    // Pack all Reducer-related args together
+    // Ref: https://docs.pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html
+
     // Max capacity for each bucket(in MB)
     size_t first_bucket_cap_mb = 128;
     size_t normal_bucket_cap_mb = 512;
 
     // When set true, map param.grad directly to the slice of bucket.flat(same address in memory) instead of memcpy
-    bool gradient_as_bucket_view = false;
+    bool gradient_as_bucket_view = true;
 };
 
 // DDP Reducer that handles gradient bucketing in backward
@@ -50,7 +53,9 @@ class Reducer : public std::enable_shared_from_this<Reducer> {
     */
     explicit Reducer(std::vector<std::shared_ptr<Tensor>> parameters, std::vector<std::vector<size_t>> bucket_indices,
                      const ReducerOptions &opts);
-    ~Reducer();
+
+    // Attach PostAllReduceHooks to params
+    void AttachHooksToParameters();
 
     // Prepare bucket info for next step
     void PrepareForBackward();
@@ -91,7 +96,7 @@ class Reducer : public std::enable_shared_from_this<Reducer> {
 
         // Views into the `gradients` tensor for each individual gradient
         std::vector<std::shared_ptr<Tensor>> bucket_views_in;
-        // NOTE(zbl): reserved for occasions where grads have different stride/layout
+        // TODO(zbl): reserved for occasions where grads have different stride/layout
        std::vector<std::shared_ptr<Tensor>> bucket_views_out;
 
         // Number of gradients left to be computed before the bucket is ready to be reduced
@@ -104,18 +109,10 @@ class Reducer : public std::enable_shared_from_this<Reducer> {
         // If `true`, then this implies that `bucket.variables.size() == 1`.
         // TODO(zbl): support logics for sparse gradient later
         bool expect_sparse_gradient = false;
-
-#ifdef USE_CUDA
-        // Event to mark that AllReduce is completed
-        cudaEvent_t allreduce_done = nullptr;
-        // Event to mark that all tensors' grad in bucket are ready
-        cudaEvent_t bucket_ready = nullptr;
-#endif
     };
 
 private:
     void InitializeBuckets(const std::vector<std::vector<size_t>> &bucket_indices);
-    void AttachHooksToParameters();
 
     // NOTE(zbl): all grads are assumed dense and stored continously in bucket for now
     void MarkVariableReadyDense(size_t variable_index);
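
With this commit `gradient_as_bucket_view` defaults to `true`, so each `param.grad` aliases its slice of the bucket's flat buffer and the post-all-reduce copy-back is skipped. A hedged configuration sketch for opting back out, assuming DDP is constructed as in `distributed_data_parallel.cc` below; `model` is an illustrative name:

    // Sketch only: override the new default.
    ReducerOptions opts;
    opts.first_bucket_cap_mb = 64;         // smaller first bucket starts communication earlier
    opts.normal_bucket_cap_mb = 256;
    opts.gradient_as_bucket_view = false;  // grads keep their own storage; bucket slices are copied back
    auto ddp = std::make_shared<DistributedDataParallel>(model, /*device_id=*/0, opts);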

infini_train/include/nn/parallel/work.h

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <exception>
+#include <memory>
+#include <mutex>
+
+#ifdef USE_CUDA
+#include <cuda_runtime.h>
+#endif
+#ifdef USE_NCCL
+#include <nccl.h>
+#endif
+
+#include "infini_train/include/device.h"
+
+namespace infini_train::nn::parallel {
+
+class Work {
+public:
+    virtual ~Work() = default;
+
+    virtual bool Wait(std::chrono::milliseconds timeout = std::chrono::milliseconds::zero()) = 0;
+
+    virtual bool IsCompleted() const = 0;
+    virtual bool IsSuccess() const = 0;
+
+    virtual void Synchronize() const = 0;
+
+    virtual std::exception_ptr exception() const = 0;
+
+    virtual void *ready_event() const = 0;
+    virtual void *done_event() const = 0;
+};
+
+#ifdef USE_NCCL
+class WorkNccl final : public Work {
+public:
+    WorkNccl(const Device *device, ncclComm_t comm);
+    ~WorkNccl() override;
+
+    bool Wait(std::chrono::milliseconds timeout = std::chrono::milliseconds::zero()) override;
+
+    bool IsCompleted() const override;
+    bool IsSuccess() const override;
+
+    void Synchronize() const override;
+
+    std::exception_ptr exception() const override { return exception_; };
+
+    void *ready_event() const override { return reinterpret_cast<void *>(ready_event_); };
+    void *done_event() const override { return reinterpret_cast<void *>(done_event_); };
+
+private:
+    bool CheckNcclStatus();
+    void SetException(std::exception_ptr e);
+
+private:
+    Device *device_ = nullptr;
+    cudaEvent_t ready_event_;
+    cudaEvent_t done_event_;
+    ncclComm_t comm_;
+
+    mutable std::mutex mutex_;
+    std::exception_ptr exception_;
+    std::atomic<bool> completed_{false};
+    std::atomic<bool> success_{false};
+};
+#endif
+
+} // namespace infini_train::nn::parallel
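
The corresponding `work.cc` is not part of this excerpt. A minimal sketch of how `IsCompleted`/`Wait` could be built on the `done_event_` recorded by `AllReduceAsync`, assuming standard CUDA event semantics (`cudaEventQuery` returns `cudaSuccess` once the recorded work has finished, `cudaErrorNotReady` otherwise); error propagation through `exception_`/`success_` is omitted:

    // Sketch only; not the implementation shipped in this commit.
    bool WorkNccl::IsCompleted() const {
        if (completed_.load()) {
            return true;
        }
        return cudaEventQuery(done_event_) == cudaSuccess;
    }

    bool WorkNccl::Wait(std::chrono::milliseconds timeout) {
        if (timeout == std::chrono::milliseconds::zero()) {
            // Block the calling host thread until the comm stream passes done_event_.
            return cudaEventSynchronize(done_event_) == cudaSuccess;
        }
        const auto deadline = std::chrono::steady_clock::now() + timeout;
        while (!IsCompleted()) {
            if (std::chrono::steady_clock::now() >= deadline) {
                return false; // timed out
            }
            std::this_thread::yield(); // requires <thread>
        }
        return true;
    }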

infini_train/include/tensor.h

Lines changed: 3 additions & 0 deletions
@@ -82,6 +82,9 @@ class Tensor : public std::enable_shared_from_this<Tensor> {
     Tensor To(const Device *device);
     Tensor To(DataType dtype);
 
+    void CopyFrom(const Tensor &src);
+    void CopyFrom(const std::shared_ptr<Tensor> &src);
+
     // operator overloading
     std::shared_ptr<Tensor> Equals(const std::shared_ptr<Tensor> &other);
     std::shared_ptr<Tensor> Equals(float scalar);

infini_train/src/autograd/accumulate.cc

Lines changed: 5 additions & 2 deletions
@@ -27,13 +27,16 @@ AccumulateGrad::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_output
     if (grad_output) {
         if (grad) {
             if (tensor_->ConsumeGradOverwriteFlag()) {
-                auto new_grad = std::make_shared<Tensor>(*grad_output.get(), 0, grad_output->Dims());
-                tensor_->set_grad(std::move(new_grad));
+                // If the tensor is marked to overwrite its current grad on next grad update
+                // See notes in `infini_train::nn::parallel::Reducer::PrepareForBackward()`
+                // NOTE(zbl): must copy, cannot change grad buffer address
+                grad->CopyFrom(grad_output);
             } else {
                 auto kernel = Dispatcher::Instance().GetKernel({device->Type(), "AccumulateGrad"});
                 kernel.Call<void>(grad_output, learning_rate_, grad);
             }
         } else {
+            // NOTE(zbl): check whether need to do copying instead of slicing
             auto new_grad = std::make_shared<Tensor>(*grad_output.get(), 0, grad_output->Dims());
             tensor_->set_grad(std::move(new_grad));
         }
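
The "must copy, cannot change grad buffer address" note is the crux of the fix: with `gradient_as_bucket_view`, `grad` aliases a slice of the bucket's flat buffer, so rebinding the `shared_ptr` would silently detach the parameter from the bucket. A self-contained illustration with raw buffers (no project types; the real `CopyFrom` presumably dispatches a device copy):

    #include <cassert>
    #include <cstring>
    #include <vector>

    int main() {
        std::vector<float> bucket_flat(8, 0.0f);    // stand-in for bucket.contents
        float *grad_view = bucket_flat.data() + 4;  // stand-in for one bucket_views_in slice
        float incoming[4] = {1.f, 2.f, 3.f, 4.f};   // stand-in for grad_output

        // CopyFrom-style update: write through the aliased storage; the bucket sees it.
        std::memcpy(grad_view, incoming, sizeof(incoming));
        assert(bucket_flat[4] == 1.f);

        // A set_grad(new_tensor)-style update would instead rebind the parameter to a
        // fresh buffer, leaving bucket_flat[4..7] stale for the upcoming all-reduce.
        return 0;
    }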

infini_train/src/nn/parallel/distributed_data_parallel.cc

Lines changed: 3 additions & 3 deletions
@@ -20,8 +20,7 @@ constexpr char kModuleName[] = "module";
 DistributedDataParallel::DistributedDataParallel(std::shared_ptr<nn::Module> module, int device_id,
                                                  const ReducerOptions &opts) {
     for (auto &param : module->Parameters()) {
-        auto device = param->GetDevice();
-        CHECK_EQ(device->Index(), device_id) << "All parameters must be on the same device as the module";
+        CHECK_EQ(param->GetDevice()->Index(), device_id) << "All parameters must be on the same device as the module";
     }
     for (auto &buffer : module->Buffers()) {
         CHECK_EQ(buffer->GetDevice()->Index(), device_id) << "All buffers must be on the same device as the module";
@@ -35,7 +34,8 @@ DistributedDataParallel::DistributedDataParallel(std::shared_ptr<nn::Module> mod
     std::vector<size_t> bucket_size_limits = {first_cap_bytes, normal_cap_bytes};
     auto bucket_indices = ComputeBucketAssignmentBySize(params, bucket_size_limits);
 
-    reducer_ = std::make_shared<Reducer>(std::move(params), bucket_indices, opts);
+    reducer_ = std::make_shared<Reducer>(params, bucket_indices, opts);
+    reducer_->AttachHooksToParameters();
 }
 
 std::vector<std::shared_ptr<Tensor>>
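
Moving `AttachHooksToParameters()` out of the `Reducer` constructor and calling it here is likely motivated by `Reducer` deriving from `std::enable_shared_from_this` (see `reducer.h` above): `shared_from_this()` is only valid once the object is already owned by a `shared_ptr`, so hooks that capture it cannot be attached from inside the constructor. A self-contained illustration of the two-phase pattern:

    #include <iostream>
    #include <memory>

    struct Hooked : std::enable_shared_from_this<Hooked> {
        void AttachHooks() {
            // Safe here; calling shared_from_this() inside the constructor throws std::bad_weak_ptr.
            auto self = shared_from_this();
            std::cout << "hook captured owner, use_count=" << self.use_count() << "\n";
        }
    };

    int main() {
        auto obj = std::make_shared<Hooked>();
        obj->AttachHooks(); // mirrors reducer_->AttachHooksToParameters() above
        return 0;
    }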

infini_train/src/nn/parallel/process_group.cc

Lines changed: 18 additions & 20 deletions
@@ -335,36 +335,34 @@ std::vector<std::shared_ptr<Tensor>> ProcessGroup::NcclRecv(std::vector<std::sha
     return tensors;
 }
 
-void ProcessGroup::EnqueueAllReduce(cudaEvent_t ready_event, cudaEvent_t done_event,
-                                    const std::shared_ptr<Tensor> &tensor, function::ReduceOpType reduce_op) const {
-    CHECK(ready_event && done_event) << "Events must be created.";
+std::shared_ptr<Work> ProcessGroup::AllReduceAsync(const std::shared_ptr<Tensor> &tensor,
+                                                   function::ReduceOpType reduce_op) const {
+    void *buffer = tensor->DataPtr();
     const auto *device = dynamic_cast<const CudaDevice *>(tensor->GetDevice());
-    CHECK(std::find(devices_.begin(), devices_.end(), device) != devices_.end())
-        << "Device of target Tensor is not in current ProcessGroup";
+    device->SetDevice();
+
+    auto comm = device_comm_map_.at(device);
 
     cudaStream_t compute_stream = device->Stream();
     cudaStream_t comm_stream = device_stream_map_.at(device);
 
-    cudaEventRecord(ready_event, compute_stream);
-    cudaStreamWaitEvent(comm_stream, ready_event, 0);
+    auto work = std::make_shared<WorkNccl>(device, comm);
+
+    cudaEvent_t ready_event = reinterpret_cast<cudaEvent_t>(work->ready_event());
+    cudaEvent_t done_event = reinterpret_cast<cudaEvent_t>(work->done_event());
+
+    CUDA_CHECK(cudaEventRecord(ready_event, compute_stream));
+    CUDA_CHECK(cudaStreamWaitEvent(comm_stream, ready_event, 0));
 
     // Perform NcclAllReduce on comm stream
-    device->SetDevice();
-    NCCL_CHECK(ncclAllReduce(tensor->DataPtr(), tensor->DataPtr(), tensor->NumElements(),
-                             kNcclDtypeMap.at(tensor->Dtype()), kNcclReduceOpMap.at(reduce_op),
-                             device_comm_map_.at(device), comm_stream));
+    NCCL_CHECK(ncclAllReduce(buffer, buffer, tensor->NumElements(), kNcclDtypeMap.at(tensor->Dtype()),
+                             kNcclReduceOpMap.at(reduce_op), comm, comm_stream));
 
-    cudaEventRecord(done_event, comm_stream);
-}
+    CUDA_CHECK(cudaEventRecord(done_event, comm_stream));
+    CUDA_CHECK(cudaStreamWaitEvent(compute_stream, done_event, 0));
 
-void ProcessGroup::WaitAllReduceDone(cudaEvent_t done_event, const std::shared_ptr<Tensor> &tensor) const {
-    CHECK(done_event) << "Events must be created.";
-    const auto *device = dynamic_cast<const CudaDevice *>(tensor->GetDevice());
-    CHECK(std::find(devices_.begin(), devices_.end(), device) != devices_.end())
-        << "Device of target Tensor is not in current ProcessGroup";
-    cudaStreamWaitEvent(device->Stream(), done_event, 0);
+    return std::move(work);
 }
-
 #endif
 
 ProcessGroupFactory *ProcessGroupFactory::Instance() {
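
Besides returning a `Work` handle, `AllReduceAsync` changes the stream ordering: the compute stream now waits on `done_event` at enqueue time, so the separate `WaitAllReduceDone` step disappears and later kernels on the compute stream are ordered after the collective while the host stays free (host-side blocking is still available via `Work::Wait`). A self-contained sketch of the event pattern using only the plain CUDA runtime API (no NCCL):

    #include <cuda_runtime.h>

    // Order a communication stream after the data produced on the compute stream,
    // and order later compute after the communication, with no host-side sync.
    void OrderCommBetweenCompute(cudaStream_t compute_stream, cudaStream_t comm_stream,
                                 cudaEvent_t ready_event, cudaEvent_t done_event) {
        cudaEventRecord(ready_event, compute_stream);       // gradients are ready here
        cudaStreamWaitEvent(comm_stream, ready_event, 0);   // collective starts only after that

        // ... enqueue the collective on comm_stream (ncclAllReduce in the real code) ...

        cudaEventRecord(done_event, comm_stream);           // marks the collective's completion
        cudaStreamWaitEvent(compute_stream, done_event, 0); // later compute waits on it
    }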

infini_train/src/nn/parallel/reducer.cc

Lines changed: 18 additions & 63 deletions
@@ -16,6 +16,7 @@
 #include "infini_train/include/common/cuda/common_cuda.h"
 #include "infini_train/include/device.h"
 #include "infini_train/include/nn/parallel/utils.h"
+#include "infini_train/include/nn/parallel/work.h"
 
 namespace infini_train::nn::parallel {
 namespace {
@@ -178,43 +179,9 @@ Reducer::Reducer(std::vector<std::shared_ptr<Tensor>> parameters, std::vector<st
     : params_(std::move(parameters)), opts_(opts) {
     BuildBuckets(bucket_indices);
     ready_seen_this_iter_.assign(params_.size(), 0);
-    AttachHooksToParameters();
-}
-
-Reducer::~Reducer() {
-#ifdef USE_CUDA
-    for (auto &b : buckets_) {
-        if (!b.contents) {
-            continue;
-        }
-        if (b.contents->GetDevice()->Type() == DeviceType::kCUDA) {
-            if (b.allreduce_done) {
-                CUDA_CHECK(cudaEventDestroy(b.allreduce_done));
-            }
-            if (b.bucket_ready) {
-                CUDA_CHECK(cudaEventDestroy(b.bucket_ready));
-            }
-        }
-    }
-#endif
 }
 
 void Reducer::InitializeBuckets(const std::vector<std::vector<size_t>> &bucket_indices) {
-#ifdef USE_CUDA
-    for (auto &b : buckets_) {
-        if (!b.contents) {
-            continue;
-        }
-        if (b.contents->GetDevice()->Type() == DeviceType::kCUDA) {
-            if (b.allreduce_done) {
-                CUDA_CHECK(cudaEventDestroy(b.allreduce_done));
-            }
-            if (b.bucket_ready) {
-                CUDA_CHECK(cudaEventDestroy(b.bucket_ready));
-            }
-        }
-    }
-#endif
     buckets_.clear();
     locators_.clear();
     next_bucket_ = 0;
@@ -235,16 +202,6 @@ void Reducer::InitializeBucketViews(Bucket &bucket) {
     }
     // Set (out == in) by default when all grads are dense
     bucket.bucket_views_out = bucket.bucket_views_in;
-
-    if (opts_.gradient_as_bucket_view) {
-        for (size_t i = 0; i < bucket.variables.size(); ++i) {
-            auto &v = bucket.variables[i];
-            auto g = v->grad();
-            if (g && g.get() != bucket.bucket_views_in[i].get()) {
-                v->set_grad(bucket.bucket_views_in[i]);
-            }
-        }
-    }
 }
 
 void Reducer::BuildBuckets(const std::vector<std::vector<size_t>> &bucket_indices) {
@@ -280,16 +237,8 @@ void Reducer::BuildBuckets(const std::vector<std::vector<size_t>> &bucket_indice
     auto dev = bucket.variables.front()->GetDevice();
     bucket.contents
         = std::make_shared<Tensor>(std::vector<int64_t>{static_cast<int64_t>(total_elems)}, bucket.dtype, dev);
-    // bucket.contents->Fill(0);
     bucket.pending = bucket.variables.size();
 
-#ifdef USE_CUDA
-    if (bucket.contents->GetDevice()->Type() == DeviceType::kCUDA) {
-        CUDA_CHECK(cudaEventCreateWithFlags(&bucket.allreduce_done, cudaEventDisableTiming));
-        CUDA_CHECK(cudaEventCreateWithFlags(&bucket.bucket_ready, cudaEventDisableTiming));
-    }
-#endif
-
     bucket.variable_indices = bucket_indices[bucket_idx];
     InitializeBucketViews(bucket);
     buckets_.push_back(std::move(bucket));
@@ -368,11 +317,18 @@ void Reducer::PrepareForBackward() {
         auto view = bucket.bucket_views_in[i];
         auto grad = param->grad();
 
-        if (grad == nullptr) {
-            param->MarkGradOverwriteOnNextAccum();
+        // NOTE(zbl): This will affect behaviors in `infini_train::autograd::AccumulateGrad::Backward()`
+        // If ZeroGrad(set_to_none=True), grad is nullptr at this point
+        // If ZeroGrad(set_to_none=False), grad is set to view of bucket.contents (or modified by user)
+        // Either way, we reset grad to view of bucket.contents
+        // Since bucket.contents might not be zeroed, we need to overwrite it on next grad accumulation
+        if (!grad || (grad.get() != view.get())) {
+            if (grad) {
+                LOG(WARNING) << "gradient_as_bucket_view is enabled, but param " << param
+                             << " has a non-view grad tensor. Automatically overwriting it with bucket view.";
+            }
             param->set_grad(view);
-        } else {
-            CHECK_EQ(grad.get(), view.get()) << "Param's gradient should be a slice of bucket's flat buffer.";
+            param->MarkGradOverwriteOnNextAccum();
         }
     }
 }
@@ -456,25 +412,24 @@ void Reducer::FinalizeBucketDense(size_t bucket_index) {
     auto &bucket = buckets_.at(bucket_index);
     auto ddp_pg = ProcessGroupFactory::Instance()->Get(GetDataParallelProcessGroupName(bucket.device_rank));
 
+    std::shared_ptr<Work> work;
     if (comm_hook_) {
         std::vector<std::shared_ptr<Tensor>> bucket_view{bucket.contents};
         // NOTE(zbl): Custom hook should do in-place operations
         // e.g. comm_hook_(GradBucket{bucket_view})[0];
         // FIXME(zbl): support custom hook later
         LOG(FATAL) << "Custom hook is not supported now";
     } else {
-        ddp_pg->EnqueueAllReduce(bucket.bucket_ready, bucket.allreduce_done, bucket.contents,
-                                 function::ReduceOpType::kAvg);
+        work = ddp_pg->AllReduceAsync(bucket.contents, function::ReduceOpType::kAvg);
    }
 
     if (!opts_.gradient_as_bucket_view) {
         for (size_t i = 0; i < bucket.variables.size(); ++i) {
-            // Directly assgin bucket slice to grad instead of copying
-            // Same behavior as `CopyBucketToGrad(bucket.contents, bucket.variables[i]->grad(), bucket.offsets[i]);`
-            bucket.variables[i]->set_grad(bucket.bucket_views_in[i]);
+            // NOTE(zbl): For better performance, try `bucket.variables[i]->set_grad(bucket.bucket_views_in[i]);`
+            // to directly assign bucket slice to grad instead of copying
+            CopyBucketToGrad(bucket.contents, bucket.variables[i]->grad(), bucket.offsets[i]);
+            // bucket.variables[i]->set_grad(bucket.bucket_views_in[i]);
         }
     }
-
-    ddp_pg->WaitAllReduceDone(bucket.allreduce_done, bucket.contents);
 }
 } // namespace infini_train::nn::parallel
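
With `gradient_as_bucket_view` disabled, `FinalizeBucketDense` now copies each slice of the reduced flat buffer back into the parameter's own grad storage (`CopyBucketToGrad`) instead of re-aliasing grads into the bucket. A self-contained sketch of what such an offset-based copy does conceptually; the real helper works on `Tensor`s and presumably dispatches a device kernel rather than a host memcpy:

    #include <cstddef>
    #include <cstring>

    // Copy one parameter's slice out of the flat, already-all-reduced bucket buffer
    // into that parameter's own gradient storage (offset and count in elements).
    void CopyBucketSliceToGrad(const float *bucket_flat, float *grad,
                               std::size_t offset, std::size_t num_elements) {
        std::memcpy(grad, bucket_flat + offset, num_elements * sizeof(float));
    }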
