Skip to content

Commit 9a05005

Browse files
committed
fix: allocate custom_ar comm buffer with hipMalloc on ROCm (was torch::empty)
1 parent 398bbaf commit 9a05005

File tree

2 files changed

+6
-6
lines changed

2 files changed

+6
-6
lines changed

rtp_llm/cpp/devices/rocm_impl/ROCmDistributedOp.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,11 @@ AllReduceOutput ROCmDevice::allReduce(const AllReduceParams& params) {
111111
if (use_custom_ar) {
112112
auto custom_ar_res_buf =
113113
allocateBuffer({buffer->type(), buffer->shape(), AllocationType::DEVICE}, {"custom_ar_buf"});
114+
printBufferData(*buffer, "ar_input_buffer");
114115
torch::Tensor input_tensor = Buffer2torchTensor(*buffer, false);
115116
torch::Tensor output_tensor = Buffer2torchTensor(*custom_ar_res_buf, false);
116117
custom_allreduce_comm_->allReduce(input_tensor, output_tensor);
118+
printBufferData(*custom_ar_res_buf, "ar_output_buffer_after");
117119
return AllReduceOutput{custom_ar_res_buf};
118120
}
119121

rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.cc

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ CustomAllReduceComm::~CustomAllReduceComm() {
2929

3030
bool CustomAllReduceComm::checkAllReduceAvailable(size_t elts_total_num, DataType data_type, size_t world_size) {
3131
size_t elts_total_size = elts_total_num * getTypeSize(data_type);
32-
3332
if (elts_total_size % 16 != 0) {
3433
return false;
3534
}
@@ -86,11 +85,10 @@ void CustomAllReduceComm::init(const NcclParam& nccl_para, hipStream_t stream) {
8685

8786
// meta data buffers need to be "uncached" for signal on MI200
8887
meta_ = aiter::allocate_meta_buffer(aiter::meta_size() + comm_buf_threshold_);
89-
buffer_ = torch::empty(
90-
{
91-
comm_buf_threshold_,
92-
},
93-
torch::dtype(torch::kUInt8).device(torch::kCUDA));
88+
void* raw_ptr;
89+
hipMalloc(&raw_ptr, comm_buf_threshold_);
90+
auto deleter = [](void* p) { hipFree(p); };
91+
buffer_ = torch::from_blob(raw_ptr, {comm_buf_threshold_}, deleter, torch::kCUDA);
9492
rank_data_ = torch::empty({16 * 1024 * 1024}, torch::dtype(torch::kUInt8).device(torch::kCUDA));
9593

9694
std::vector<torch::Tensor> meta_handles = prepareP2PBuffer_(nccl_para, meta_, stream);

0 commit comments

Comments (0)