
Commit ed1a608

fix: fix requested changes and add sync in profiler
Parent: 0053fa8

10 files changed: 70 additions, 41 deletions

infini_train/include/nn/parallel/distributed_data_parallel.h

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ class DistributedDataParallel : public nn::Module {
     std::vector<std::shared_ptr<Tensor>> Forward(const std::vector<std::shared_ptr<Tensor>> &input_tensors) override;

 private:
-    std::shared_ptr<Reducer> reducer_;
+    std::shared_ptr<Reducer> reducer_ = nullptr;
 };

 } // namespace infini_train::nn::parallel
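Note: a default-constructed std::shared_ptr is already null, so the explicit `= nullptr` documents intent rather than changing behavior: with this commit, `reducer_` legitimately stays empty when gradient bucketing is disabled (see distributed_data_parallel.cc below).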

infini_train/include/nn/parallel/process_group.h

Lines changed: 4 additions & 2 deletions
@@ -11,14 +11,16 @@
 #endif

 #include "infini_train/include/nn/parallel/reduce_op_type.h"
-#include "infini_train/include/nn/parallel/work.h"

 namespace infini_train {
 class Tensor;
 class Device;
 namespace nn {
 class Module;
-}
+namespace parallel {
+class Work;
+} // namespace parallel
+} // namespace nn

 } // namespace infini_train

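Note: forward-declaring `Work` here instead of including work.h trims the header's dependencies; translation units that actually use `Work` now include work.h themselves, as process_group.cc does later in this commit.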

infini_train/include/nn/parallel/reducer.h

Lines changed: 23 additions & 6 deletions
@@ -1,21 +1,34 @@
 #pragma once

+#include <atomic>
 #include <memory>
 #include <mutex>
 #include <vector>

-#include "infini_train/include/autograd/function_hook.h"
+#include "infini_train/include/datatype.h"
 #include "infini_train/include/nn/parallel/parallel_functional.h"
-#include "infini_train/include/tensor.h"
+
+namespace infini_train {
+class Tensor;
+class Device;
+namespace autograd {
+class PostAccumulateGradHook;
+} // namespace autograd
+} // namespace infini_train

 namespace infini_train::nn::parallel {
+namespace {
+constexpr int kFirstBucketCapMB = 25;
+constexpr int kNormalBucketCapMB = 25;
+constexpr size_t kBytesPerMB = 1024ULL * 1024ULL;
+} // namespace

 // GradBucket passes bucket contents tensor to DDP communication hook.
 // ref: https://github.com/pytorch/pytorch/blob/main/torch/csrc/distributed/c10d/comm.hpp
 class GradBucket {
 public:
     explicit GradBucket(const std::vector<std::shared_ptr<Tensor>> &tensors) : tensors_(tensors) {}
-    const std::vector<std::shared_ptr<Tensor>> &getTensors() const { return tensors_; }
+    const std::vector<std::shared_ptr<Tensor>> &tensors() const { return tensors_; }

 private:
     std::vector<std::shared_ptr<Tensor>> tensors_;
@@ -34,11 +47,15 @@ struct ReducerOptions {
     // Ref: https://docs.pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html

     // Max capacity for each bucket(in MB)
-    size_t first_bucket_cap_mb = 128;
-    size_t normal_bucket_cap_mb = 512;
+    size_t first_bucket_cap_mb = kFirstBucketCapMB;
+    size_t normal_bucket_cap_mb = kNormalBucketCapMB;

     // When set true, map param.grad directly to the slice of bucket.flat(same address in memory) instead of memcpy
     bool gradient_as_bucket_view = true;
+
+    // Whether to enable gradient bucketing
+    // FIXME(zbl): should enable gradient bucketing by default
+    bool gradient_bucketing_enabled = true;
 };

 // DDP Reducer that handles gradient bucketing in backward
@@ -60,7 +77,7 @@ class Reducer : public std::enable_shared_from_this<Reducer> {
     // Prepare bucket info for next step
     void PrepareForBackward();

-    // For custom DDP hook to overwrite the default AllReduce. T
+    // For custom DDP hook to overwrite the default AllReduce.
     // This can be used for algorithms like Gradient Compression/GossipGrad.
     // Hook is registered using `Reducer::RegisterCommHook()`.
     // TODO(zbl): Leave the placeholder for the moment
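For context, a minimal sketch of how a caller might set the new ReducerOptions fields (the setup code, `model`, and `device_id` here are hypothetical, not part of this commit):

    #include "infini_train/include/nn/parallel/distributed_data_parallel.h"

    using infini_train::nn::parallel::DistributedDataParallel;
    using infini_train::nn::parallel::ReducerOptions;

    // Hypothetical setup: `model` is a std::shared_ptr<nn::Module>,
    // `device_id` identifies the device all parameters live on.
    ReducerOptions opts;
    opts.first_bucket_cap_mb = 25;          // default is now kFirstBucketCapMB (25 MB)
    opts.normal_bucket_cap_mb = 25;         // default is now kNormalBucketCapMB (25 MB)
    opts.gradient_as_bucket_view = true;    // grads alias slices of the flat bucket buffer
    opts.gradient_bucketing_enabled = true; // false falls back to per-parameter AllReduce hooks

    auto ddp = std::make_shared<DistributedDataParallel>(model, device_id, opts);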

infini_train/include/tensor.h

Lines changed: 1 addition & 1 deletion
@@ -196,7 +196,7 @@ class Tensor : public std::enable_shared_from_this<Tensor> {
     std::shared_ptr<Tensor> RequiresGrad();

     std::shared_ptr<Tensor> grad() const;
-    void set_grad(std::shared_ptr<Tensor> grad);
+    void set_grad(std::shared_ptr<Tensor> &grad);

     bool requires_grad() const;
     void set_requires_grad(bool requires_grad);

infini_train/src/autograd/accumulate.cc

Lines changed: 2 additions & 2 deletions
@@ -36,9 +36,9 @@ AccumulateGrad::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_output
             kernel.Call<void>(grad_output, learning_rate_, grad);
         }
     } else {
-        // NOTE(zbl): check whether need to do copying instead of slicing
+        // FIXME(zbl): check whether need to do copying instead of slicing
         auto new_grad = std::make_shared<Tensor>(*grad_output.get(), 0, grad_output->Dims());
-        tensor_->set_grad(std::move(new_grad));
+        tensor_->set_grad(new_grad);
     }
     auto hook = tensor_->post_accumulate_grad_hook();
     if (hook != nullptr) {

infini_train/src/nn/parallel/distributed_data_parallel.cc

Lines changed: 19 additions & 9 deletions
@@ -20,22 +20,32 @@ constexpr char kModuleName[] = "module";
 DistributedDataParallel::DistributedDataParallel(std::shared_ptr<nn::Module> module, int device_id,
                                                  const ReducerOptions &opts) {
     for (auto &param : module->Parameters()) {
-        CHECK_EQ(param->GetDevice()->Index(), device_id) << "All parameters must be on the same device as the module";
+        auto device = param->GetDevice();
+        CHECK_EQ(device->Index(), device_id) << "All parameters must be on the same device as the module";
+        if (!opts.gradient_bucketing_enabled) {
+            auto ddp_pg
+                = ProcessGroupFactory::Instance()->Get(GetDataParallelProcessGroupName(device->rank().thread_rank()));
+            auto hook = std::make_unique<infini_train::autograd::AllReducePostAccumulateHook>(
+                function::ReduceOpType::kAvg, ddp_pg);
+            param->RegisterPostAccumulateGradHook(std::move(hook));
+        }
     }
     for (auto &buffer : module->Buffers()) {
         CHECK_EQ(buffer->GetDevice()->Index(), device_id) << "All buffers must be on the same device as the module";
     }
     modules_[kModuleName] = std::move(module);

-    // Bucket Assignment
-    auto params = modules_[kModuleName]->Parameters();
-    const size_t first_cap_bytes = opts.first_bucket_cap_mb * 1024ULL * 1024ULL;
-    const size_t normal_cap_bytes = opts.normal_bucket_cap_mb * 1024ULL * 1024ULL;
-    std::vector<size_t> bucket_size_limits = {first_cap_bytes, normal_cap_bytes};
-    auto bucket_indices = ComputeBucketAssignmentBySize(params, bucket_size_limits);
+    if (opts.gradient_bucketing_enabled) {
+        // Bucket Assignment
+        auto params = modules_[kModuleName]->Parameters();
+        const size_t first_cap_bytes = opts.first_bucket_cap_mb * kBytesPerMB;
+        const size_t normal_cap_bytes = opts.normal_bucket_cap_mb * kBytesPerMB;
+        std::vector<size_t> bucket_size_limits = {first_cap_bytes, normal_cap_bytes};
+        auto bucket_indices = ComputeBucketAssignmentBySize(params, bucket_size_limits);

-    reducer_ = std::make_shared<Reducer>(params, bucket_indices, opts);
-    reducer_->AttachHooksToParameters();
+        reducer_ = std::make_shared<Reducer>(params, bucket_indices, opts);
+        reducer_->AttachHooksToParameters();
+    }
 }

 std::vector<std::shared_ptr<Tensor>>
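Note: this gives DDP two communication paths. With gradient_bucketing_enabled set to true, the Reducer owns the hooks and all-reduces size-capped buckets of gradients; with it set to false, every parameter gets its own AllReducePostAccumulateHook, so each gradient is averaged (ReduceOpType::kAvg) individually as soon as it is accumulated.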

infini_train/src/nn/parallel/process_group.cc

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,5 @@
 #include "infini_train/include/nn/parallel/process_group.h"

-#include <algorithm>
 #include <numeric>
 #include <vector>

@@ -14,6 +13,7 @@
 #include "infini_train/include/datatype.h"
 #include "infini_train/include/device.h"
 #include "infini_train/include/nn/parallel/global.h"
+#include "infini_train/include/nn/parallel/work.h"
 #include "infini_train/include/tensor.h"

 namespace infini_train {
@@ -57,8 +57,8 @@ ProcessGroup::ProcessGroup(const std::vector<int> &device_indices) : comm_size_(

         device->SetDevice();
         int low, high;
-        cudaDeviceGetStreamPriorityRange(&low, &high);
-        cudaStreamCreateWithPriority(&comm_streams_[i], cudaStreamNonBlocking, high);
+        CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&low, &high));
+        CUDA_CHECK(cudaStreamCreateWithPriority(&comm_streams_[i], cudaStreamNonBlocking, high));
         device_stream_map_[device] = comm_streams_[i];
     }

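Note: wrapping the two stream-setup calls in CUDA_CHECK means a failure now aborts loudly instead of silently yielding an invalid communication stream. The project's actual macro lives in common_cuda.h; a typical definition looks roughly like this (a sketch, not necessarily the project's exact code):

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Evaluate a CUDA runtime call and abort with file/line context on error.
    #define CUDA_CHECK(call)                                                    \
        do {                                                                    \
            cudaError_t err_ = (call);                                          \
            if (err_ != cudaSuccess) {                                          \
                std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",              \
                             cudaGetErrorString(err_), __FILE__, __LINE__);     \
                std::abort();                                                   \
            }                                                                   \
        } while (0)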

infini_train/src/nn/parallel/reducer.cc

Lines changed: 10 additions & 15 deletions
@@ -14,9 +14,9 @@

 #include "infini_train/include/autograd/function_hook.h"
 #include "infini_train/include/common/cuda/common_cuda.h"
-#include "infini_train/include/device.h"
 #include "infini_train/include/nn/parallel/utils.h"
 #include "infini_train/include/nn/parallel/work.h"
+#include "infini_train/include/tensor.h"

 namespace infini_train::nn::parallel {
 namespace {
@@ -106,7 +106,6 @@ std::vector<std::vector<size_t>> ComputeBucketAssignmentBySize(const std::vector
             return (std::hash<int>()(k.dev) << 1) ^ std::hash<int>()(static_cast<int>(k.dtype));
         }
     };
-    auto key_of = [&](size_t i) -> Key { return Key{tensors[i]->GetDevice()->Index(), tensors[i]->Dtype()}; };

     // Maintain the current state of each bucket
     struct State {
@@ -117,8 +116,6 @@ std::vector<std::vector<size_t>> ComputeBucketAssignmentBySize(const std::vector

     std::unordered_map<Key, State, KeyHash> states;
     std::vector<Key> key_order;
-    // NOTE(zbl): Assume combinations of (device, dtype) <= 8
-    states.reserve(8);

     std::vector<std::vector<size_t>> buckets_all;
     buckets_all.reserve(tensors.size());
@@ -130,9 +127,7 @@ std::vector<std::vector<size_t>> ComputeBucketAssignmentBySize(const std::vector
         }
     };

-    auto current_cap = [&](const State &s) -> size_t { return bucket_size_limits[s.limit_idx]; };
-
-    auto flush_current_bucket = [&](State &s) {
+    auto flushCurrentBucket = [&](State &s) {
         if (!s.current_tensors.empty()) {
             buckets_all.push_back(std::move(s.current_tensors));
             s.current_tensors.clear();
@@ -146,7 +141,7 @@ std::vector<std::vector<size_t>> ComputeBucketAssignmentBySize(const std::vector
         const auto &tensor = tensors[idx_in_order];
         CHECK(tensor);

-        const Key k = key_of(idx_in_order);
+        const Key k = Key{tensors[idx_in_order]->GetDevice()->Index(), tensors[idx_in_order]->Dtype()};
         auto it = states.find(k);
         if (it == states.end()) {
             it = states.emplace(k, State{}).first;
@@ -156,20 +151,20 @@ std::vector<std::vector<size_t>> ComputeBucketAssignmentBySize(const std::vector

         const size_t element_size_in_bytes = kDataTypeToSize.at(tensor->Dtype());
         const size_t bytes = tensor->NumElements() * element_size_in_bytes;
-        const size_t cap = current_cap(state);
+        const size_t cap = bucket_size_limits[state.limit_idx];

         // Assign current tensor to current bucket first
         state.current_tensors.push_back(idx_in_order);
         state.current_bytes += bytes;

         // If current bucket is out of capacity, then flush and move on to the next bucket
         if (state.current_bytes >= cap) {
-            flush_current_bucket(state);
+            flushCurrentBucket(state);
         }
     }

     // Flush the last bucket of each group manually
-    for (auto &key : key_order) { flush_current_bucket(states[key]); }
+    for (auto &key : key_order) { flushCurrentBucket(states[key]); }

     return buckets_all;
 }
@@ -215,6 +210,7 @@ void Reducer::BuildBuckets(const std::vector<std::vector<size_t>> &bucket_indice
         CHECK(!bucket_indices[bucket_idx].empty());
         const auto &first_param = params_[bucket_indices[bucket_idx][0]];
         bucket.dtype = first_param->Dtype();
+        // FIXME(zbl): use global_rank() in multi-node settings
         bucket.device_rank = first_param->GetDevice()->rank().thread_rank();

         size_t total_elems = 0;
@@ -274,8 +270,8 @@ void Reducer::RebuildBuckets() {
         tensors_in_order.push_back(params_[global_idx]);
     }

-    const size_t first_cap_bytes = opts_.first_bucket_cap_mb * 1024ULL * 1024ULL;
-    const size_t normal_cap_bytes = opts_.normal_bucket_cap_mb * 1024ULL * 1024ULL;
+    const size_t first_cap_bytes = opts_.first_bucket_cap_mb * kBytesPerMB;
+    const size_t normal_cap_bytes = opts_.normal_bucket_cap_mb * kBytesPerMB;
     std::vector<size_t> bucket_size_limits = {first_cap_bytes, normal_cap_bytes};
     auto new_bucket_indices = ComputeBucketAssignmentBySize(tensors_in_order, bucket_size_limits, full_order);

@@ -364,8 +360,7 @@ void Reducer::MarkVariableReadyDense(size_t variable_index) {
     auto &bucket = buckets_.at(loc.bucket_index);

     // Record real order of bucket being ready
-    if (!has_rebuilt_bucket_ && variable_index < ready_seen_this_iter_.size()
-        && !ready_seen_this_iter_[variable_index]) {
+    if (!has_rebuilt_bucket_ && !ready_seen_this_iter_[variable_index]) {
         grad_ready_order_indices_.push_back(variable_index);
         ready_seen_this_iter_[variable_index] = 1;
     }
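For readers new to this file, the greedy policy that ComputeBucketAssignmentBySize applies within each (device, dtype) group can be sketched in isolation like this (a simplified standalone sketch with hypothetical names; the assumption that the last cap repeats for all later buckets mirrors PyTorch's behavior and is not shown in this diff):

    #include <cstddef>
    #include <vector>

    // Greedily pack tensors (given their byte sizes, in traversal order)
    // into buckets: append to the current bucket until its byte total
    // reaches the active cap, then flush and advance to the next cap.
    std::vector<std::vector<size_t>> AssignBuckets(const std::vector<size_t> &tensor_bytes,
                                                   const std::vector<size_t> &limits) {
        std::vector<std::vector<size_t>> buckets;
        std::vector<size_t> current;
        size_t current_bytes = 0;
        size_t limit_idx = 0;
        for (size_t i = 0; i < tensor_bytes.size(); ++i) {
            current.push_back(i);
            current_bytes += tensor_bytes[i];
            if (current_bytes >= limits[limit_idx]) {
                buckets.push_back(std::move(current));
                current.clear();
                current_bytes = 0;
                if (limit_idx + 1 < limits.size()) {
                    ++limit_idx; // move from first-bucket cap to normal cap
                }
            }
        }
        if (!current.empty()) {
            buckets.push_back(std::move(current)); // flush the trailing partial bucket
        }
        return buckets;
    }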

infini_train/src/profiler.cc

Lines changed: 5 additions & 0 deletions
@@ -84,6 +84,11 @@ void Profiler::StartRecord(const std::string &name, DeviceType device) {
         cudaStream_t stream = GetCudaStream();
         CUDA_CHECK(cudaEventCreate(&start));
         CUDA_CHECK(cudaEventCreate(&stop));
+
+        // Make sure the compute stream has done waiting, and ready for the execution of next op
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+        // Start record after waiting
+        cpu_timing_map_[name] = std::chrono::high_resolution_clock::now();
         CUDA_CHECK(cudaEventRecord(start, stream));
         cuda_timing_map_[name] = {reinterpret_cast<void *>(start), reinterpret_cast<void *>(stop)};
         break;
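Note: the added cudaStreamSynchronize drains any work still queued on the compute stream, so neither the host timestamp nor the `start` event includes a previous op's backlog. The overall event-pair timing pattern, shown in isolation (a minimal sketch with error checking omitted; `LaunchOp` is a hypothetical placeholder for the op being profiled):

    #include <cuda_runtime.h>

    void LaunchOp(cudaStream_t stream); // hypothetical op launch

    float TimeOpOnStream(cudaStream_t stream) {
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaStreamSynchronize(stream); // drain pending work so only our op is measured
        cudaEventRecord(start, stream);
        LaunchOp(stream);
        cudaEventRecord(stop, stream);

        cudaEventSynchronize(stop);    // block until the op has finished
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return ms;
    }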

infini_train/src/tensor.cc

Lines changed: 2 additions & 2 deletions
@@ -544,12 +544,12 @@ std::shared_ptr<Tensor> Tensor::RequiresGrad() {
 }

 std::shared_ptr<Tensor> Tensor::grad() const { return grad_; };
-void Tensor::set_grad(std::shared_ptr<Tensor> grad) {
+void Tensor::set_grad(std::shared_ptr<Tensor> &grad) {
     if (grad) {
         CHECK(grad->GetDevice() == GetDevice());
         CHECK(grad->Dtype() == Dtype());
         CHECK(grad->Dims() == Dims());
-        grad_ = std::move(grad);
+        grad_ = grad;
     } else {
         grad_.reset();
     }
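Note: taking the shared_ptr by non-const lvalue reference and copying (rather than moving) means the caller's handle stays valid after the call, which is why accumulate.cc above drops its std::move. A side effect of the new signature is that temporaries can no longer be passed, e.g. `t->set_grad(std::make_shared<Tensor>(...))` will not compile; callers must bind the gradient to a named variable first.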
