From d9f16b85bcfe1d757a9c368e778af545494c6605 Mon Sep 17 00:00:00 2001 From: zhuyue Date: Fri, 31 Oct 2025 15:44:15 +0800 Subject: [PATCH] Add random_sample python interface and tests. --- include/infinicore/ops/random_sample.hpp | 24 ++ python/infinicore/ops/random_sample.py | 22 ++ .../ops/random_sample/random_sample.cc | 38 +++ .../random_sample/random_sample_infiniop.cc | 66 +++++ src/infinicore/pybind11/ops.hpp | 2 + src/infinicore/pybind11/ops/random_sample.hpp | 32 +++ test/infinicore/ops/random_sample.py | 228 ++++++++++++++++++ 7 files changed, 412 insertions(+) create mode 100644 include/infinicore/ops/random_sample.hpp create mode 100644 python/infinicore/ops/random_sample.py create mode 100644 src/infinicore/ops/random_sample/random_sample.cc create mode 100644 src/infinicore/ops/random_sample/random_sample_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/random_sample.hpp create mode 100644 test/infinicore/ops/random_sample.py diff --git a/include/infinicore/ops/random_sample.hpp b/include/infinicore/ops/random_sample.hpp new file mode 100644 index 000000000..3450c295c --- /dev/null +++ b/include/infinicore/ops/random_sample.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +#include "infinicore/tensor.hpp" + +namespace infinicore::op { + +class RandomSample { +public: + using schema = void (*)(Tensor, Tensor, float, float, int, float); + static void execute(Tensor indices, Tensor logits, float random_val, float topp, int topk, float temperature); + static common::OpDispatcher &dispatcher(); +}; + +// Out-of-place API +Tensor random_sample(Tensor logits, float random_val, float topp, int topk, float temperature); +// In-place API +void random_sample_(Tensor indices, Tensor logits, float random_val, float topp, int topk, float temperature); + +} // namespace infinicore::op + + diff --git a/python/infinicore/ops/random_sample.py b/python/infinicore/ops/random_sample.py new file mode 100644 index 000000000..d2f3c39da --- /dev/null +++ b/python/infinicore/ops/random_sample.py @@ -0,0 +1,22 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def random_sample(logits, random_val, topp, topk, temperature, *, out=None): + if out is None: + return Tensor( + _infinicore.random_sample( + logits._underlying, random_val, topp, topk, temperature + ) + ) + + _infinicore.random_sample_( + out._underlying, + logits._underlying, + random_val, + topp, + topk, + temperature, + ) + + diff --git a/src/infinicore/ops/random_sample/random_sample.cc b/src/infinicore/ops/random_sample/random_sample.cc new file mode 100644 index 000000000..75a8f77a3 --- /dev/null +++ b/src/infinicore/ops/random_sample/random_sample.cc @@ -0,0 +1,38 @@ +#include "infinicore/ops/random_sample.hpp" + +namespace infinicore::op { + +common::OpDispatcher &RandomSample::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void RandomSample::execute( + Tensor indices, Tensor logits, + float random_val, float topp, int topk, float temperature) { + dispatcher().lookup(context::getDevice().getType())( + indices, logits, random_val, topp, topk, temperature); +} + +Tensor random_sample( + Tensor logits, + float random_val, + float topp, + int topk, + float temperature) { + auto indices = Tensor::empty({}, DataType::I32, logits->device()); + random_sample_(indices, logits, random_val, topp, topk, temperature); + return indices; +} + +void random_sample_( + Tensor indices, + Tensor logits, + float random_val, + float topp, + int topk, + float temperature) { + RandomSample::execute(indices, logits, random_val, topp, topk, temperature); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/random_sample/random_sample_infiniop.cc b/src/infinicore/ops/random_sample/random_sample_infiniop.cc new file mode 100644 index 000000000..aab02c7c9 --- /dev/null +++ b/src/infinicore/ops/random_sample/random_sample_infiniop.cc @@ -0,0 +1,66 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/random_sample.hpp" +#include + +namespace infinicore::op::random_sample_impl::infiniop_backend { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopRandomSampleDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyRandomSampleDescriptor(desc)); + desc = nullptr; + } + }); + +static void calculate( + Tensor indices, + Tensor logits, + float random_val, + float topp, + int topk, + float temperature) { + // cache per (result desc + logits desc) on device + size_t seed = hash_combine(indices, logits); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopRandomSampleDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateRandomSampleDescriptor( + context::getInfiniopHandle(), &desc, + indices->desc(), logits->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetRandomSampleWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopRandomSample( + desc, + workspace->data(), workspace_size, + indices->data(), logits->data(), + random_val, topp, topk, temperature, + context::getStream())); +} + +} // namespace infinicore::op::random_sample_impl::infiniop_backend + +namespace infinicore::op { + +static bool registered = []() { + RandomSample::dispatcher().registerAll(&random_sample_impl::infiniop_backend::calculate, false); + return true; +}(); + +} // namespace infinicore::op diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 0036f49f6..7db9d27d2 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -6,6 +6,7 @@ #include "ops/attention.hpp" #include "ops/causal_softmax.hpp" #include "ops/matmul.hpp" +#include "ops/random_sample.hpp" #include "ops/rearrange.hpp" #include "ops/rms_norm.hpp" #include "ops/silu.hpp" @@ -19,6 +20,7 @@ inline void bind(py::module &m) { bind_add(m); bind_attention(m); bind_causal_softmax(m); + bind_random_sample(m); bind_matmul(m); bind_rearrange(m); bind_rms_norm(m); diff --git a/src/infinicore/pybind11/ops/random_sample.hpp b/src/infinicore/pybind11/ops/random_sample.hpp new file mode 100644 index 000000000..e5beb9e4e --- /dev/null +++ b/src/infinicore/pybind11/ops/random_sample.hpp @@ -0,0 +1,32 @@ +#pragma once + +#include + +#include "infinicore/ops/random_sample.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_random_sample(py::module &m) { + m.def("random_sample", + &op::random_sample, + py::arg("logits"), + py::arg("random_val"), + py::arg("topp"), + py::arg("topk"), + py::arg("temperature"), + R"doc(Random sampling: returns an int32 scalar index.)doc"); + + m.def("random_sample_", + &op::random_sample_, + py::arg("indices"), + py::arg("logits"), + py::arg("random_val"), + py::arg("topp"), + py::arg("topk"), + py::arg("temperature"), + R"doc(In-place random sampling into provided int32 scalar tensor.)doc"); +} + +} // namespace infinicore::ops diff --git a/test/infinicore/ops/random_sample.py b/test/infinicore/ops/random_sample.py new file mode 100644 index 000000000..e9a9d91bb --- /dev/null +++ b/test/infinicore/ops/random_sample.py @@ -0,0 +1,228 @@ +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +import infinicore +from infinicore.ops.random_sample import random_sample as ic_random_sample +from framework.base import BaseOperatorTest, TensorSpec, TestCase +from framework.runner import GenericTestRunner +from framework.tensor import TensorInitializer + +# ============================================================================== +# Operator-specific configuration +# ============================================================================== + +# Test cases: (voc, random_val, topp, topk, temperature) +# Aligned with test/infiniop/random_sample.py +_TEST_CASES_DATA = [ + (512, 0.8, 0.8, 3, 0.5), + (4096, 0.05, 0.9, 5, 1.0), + (16384, 0.15, 0.85, 10, 2.0), + (512, 0.08, 0.0, 3, 0.5), + (4096, 0.5, 0.9, 1, 1.0), + (16384, 0.15, 0.0, 1, 2.0), # Duplicate as in infiniop test + (32000, 0.08, 0.8, 50, 1.0), + (32000, 0.08, 1.0, 25, 1.0), + # (119696, 0.01, 1.0, 100, 1.0), # Commented out in infiniop test +] + + +def parse_test_cases(data): + voc, random_val, topp, topk, temperature = data + + inputs = [] + # logits: will be set in prepare_inputs to match infiniop pattern + # Use RANDOM as placeholder, will be replaced + inputs.append(TensorSpec.from_tensor((voc,))) + + # output: scalar int32 (required by backend), use zeros init to avoid torch.rand(int) error + output = TensorSpec.from_tensor( + (), dtype=infinicore.int32, init_mode=TensorInitializer.ZEROS + ) + return TestCase( + TestCase.BOTH, + inputs, + output, + voc=voc, + random_val=random_val, + topp=topp, + topk=topk, + temperature=temperature, + ) + + +_TEST_CASES = [parse_test_cases(d) for d in _TEST_CASES_DATA] + +# Data types - note: infiniop random_sample supports F16/BF16/F32/F64 for logits +# But NVIDIA backend may have restrictions, adjust based on actual device support +_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 0, "rtol": 0}, + infinicore.bfloat16: {"atol": 0, "rtol": 0}, +} + + +def torch_random_sample(data, random_val, topp, topk, voc, temperature): + if topp > 0 and topk > 1: + sorted_vals, sorted_indices = torch.sort(data, descending=True) + + scaled_vals = (sorted_vals - sorted_vals[0]) / temperature + try: + probs = torch.softmax(scaled_vals, dim=0) + except RuntimeError as e: + if "not implemented for 'Half'" in str(e): + scaled_vals = scaled_vals.to(torch.float32) + probs = torch.softmax(scaled_vals, dim=0) + else: + raise + cum_probs = torch.cumsum(probs, dim=0) + + k_index = min(topk, voc) - 1 + threshold = min(cum_probs[k_index], topp) * random_val + + try: + idx = torch.searchsorted(cum_probs, threshold) + except Exception: + indices = (cum_probs >= threshold).nonzero(as_tuple=True)[0] + idx = indices[0] if indices.numel() > 0 else torch.tensor(len(cum_probs) - 1, device=cum_probs.device) + return sorted_indices[idx] + + return torch.argmax(data) + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("RandomSample") + + def get_test_cases(self): + return _TEST_CASES + + def get_tensor_dtypes(self): + return _TENSOR_DTYPES + + def get_tolerance_map(self): + return _TOLERANCE_MAP + + def prepare_inputs(self, test_case, device, dtype_config): + """Create logits matching infiniop test pattern: torch.arange(voc)[_perm].float() * 0.0001""" + inputs, kwargs = super().prepare_inputs(test_case, device, dtype_config) + + voc = test_case.kwargs["voc"] + from framework.devices import torch_device_map + if device not in torch_device_map: + raise ValueError(f"Unsupported device: {device}") + torch_device = torch.device(torch_device_map[device]) + + # Get dtype for logits + if isinstance(dtype_config, dict) and "input_0" in dtype_config: + tensor_dtype = dtype_config["input_0"] + else: + tensor_dtype = dtype_config if not isinstance(dtype_config, (list, tuple)) else dtype_config[0] + + from framework.datatypes import to_torch_dtype + torch_dtype = to_torch_dtype(tensor_dtype) + + # Match infiniop test: torch.arange(voc)[_perm].float() * 0.0001 + _perm = torch.randperm(voc, device=torch_device) + inputs[0] = (torch.arange(voc, dtype=torch.float32, device=torch_device)[_perm] * 0.0001).to(torch_dtype) + + return inputs, kwargs + + def torch_operator(self, logits, out=None, **kwargs): + idx = torch_random_sample( + logits, + kwargs["random_val"], + kwargs["topp"], + kwargs["topk"], + kwargs["voc"], + kwargs["temperature"], + ).to(torch.int32) + if out is None: + return idx + out.copy_(idx) + return out + + def infinicore_operator(self, logits, out=None, **kwargs): + if out is None: + return ic_random_sample( + logits, + kwargs["random_val"], + kwargs["topp"], + kwargs["topk"], + kwargs["temperature"], + ) + return ic_random_sample( + logits, + kwargs["random_val"], + kwargs["topp"], + kwargs["topk"], + kwargs["temperature"], + out=out, + ) + + def _run_single_test(self, device, test_case, dtype_config, config, mode_name): + """Override to add fallback comparison: indices match OR logits values match (matches infiniop test)""" + from framework.utils import infinicore_tensor_from_torch, convert_infinicore_to_torch + + # Store logits for fallback comparison + inputs, kwargs = self.prepare_inputs(test_case, device, dtype_config) + logits_tensor = inputs[0] + + # Try parent comparison first + try: + super()._run_single_test(device, test_case, dtype_config, config, mode_name) + return # Success with normal comparison + except AssertionError: + # Fallback: check if logits values match when indices differ + infini_inputs = [infinicore_tensor_from_torch(inp) if isinstance(inp, torch.Tensor) else inp for inp in inputs] + + if test_case.operation_mode == TestCase.OUT_OF_PLACE: + torch_result = self.torch_operator(*inputs, **kwargs) + infini_result = self.infinicore_operator(*infini_inputs, **kwargs) + torch_result_from_infini = convert_infinicore_to_torch(infini_result, torch_result) + ic_idx = torch_result_from_infini.item() + ref_idx = torch_result.item() + else: # IN_PLACE - need to manually handle + from framework.tensor import TensorSpec + from framework.devices import torch_device_map + from framework.datatypes import to_torch_dtype + + output_dtype = self.get_output_dtype(test_case, dtype_config) + if test_case.output.is_contiguous or test_case.output.strides is None: + output_spec = TensorSpec.from_tensor(test_case.output.shape, output_dtype, init_mode=test_case.output.init_mode) + else: + output_spec = TensorSpec.from_strided_tensor(test_case.output.shape, test_case.output.strides, output_dtype, init_mode=test_case.output.init_mode) + + torch_output = output_spec.create_torch_tensor(device, output_dtype) + if not test_case.output.is_contiguous and test_case.output.strides is not None: + torch_output.zero_() + + torch_output_ref = torch_output.clone() + self.torch_operator(*inputs, out=torch_output_ref, **kwargs) + + torch_dummy = torch.zeros(test_case.output.shape, dtype=to_torch_dtype(output_dtype), device=torch_device_map[device]) + if not test_case.output.is_contiguous and test_case.output.strides is not None: + from framework.utils import rearrange_tensor + rearrange_tensor(torch_dummy, list(torch_output.stride())) + infini_output = infinicore_tensor_from_torch(torch_dummy) + self.infinicore_operator(*infini_inputs, out=infini_output, **kwargs) + + torch_result_from_infini = convert_infinicore_to_torch(infini_output, torch_output) + ic_idx = torch_result_from_infini.item() + ref_idx = torch_output_ref.item() + + # Fallback comparison: indices match OR logits values match + if ic_idx != ref_idx and logits_tensor[ic_idx] != logits_tensor[ref_idx]: + raise AssertionError(f"RandomSample {mode_name}: indices differ ({ic_idx} vs {ref_idx}) and logits values differ") + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main()