InfiniTensor · Ziminli · Jan 20, 2025 · Jan 20, 2025 · Jan 20, 2025 · Jan 22, 2025
diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py
@@ -15,10 +15,21 @@
     check_error,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import (
+    get_args, 
+    debug,
+    get_tolerance,
+)
 from enum import Enum, auto
 import torch
 
+DEBUG = False
+
+# Per-dtype absolute (atol) and relative (rtol) tolerances passed to torch.allclose
+tolerance_map = {
+    torch.float16: {'atol': 0, 'rtol': 1e-3},
+    torch.float32: {'atol': 0, 'rtol': 1e-5}, 
+}
 
 class Inplace(Enum):
     OUT_OF_PLACE = auto()
@@ -83,7 +94,11 @@ def test(
     check_error(
         lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None)
     )
-    assert torch.allclose(c, ans, atol=0, rtol=1e-3)
+
+    atol, rtol = get_tolerance(tolerance_map, tensor_dtype)
+    if DEBUG:
+        debug(c, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(c, ans, atol=atol, rtol=rtol)
     check_error(lib.infiniopDestroyAddDescriptor(descriptor))
 
 
@@ -157,6 +172,8 @@ def test_bang(lib, test_cases):
         infiniopAddDescriptor_t,
     ]
 
+    if args.debug:
+        DEBUG = True
     if args.cpu:
         test_cpu(lib, test_cases)
     if args.cuda:

diff --git a/operatorspy/tests/attention.py b/operatorspy/tests/attention.py
@@ -18,10 +18,11 @@
     create_workspace,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import get_args, debug
 import torch
 import torch.nn.functional as F
 
+DEBUG = False
 
 class AttentionDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -184,6 +185,8 @@ def test(
         )
     )
 
+    if DEBUG:
+        debug(out, ans, atol=1e-4, rtol=1e-2)
     assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
 
     check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
@@ -406,6 +409,8 @@ def test_bang(lib, test_cases):
         infiniopAttentionDescriptor_t,
     ]
 
+    if args.debug:
+        DEBUG = True
     if args.cpu:
         test_cpu(lib, test_cases)
     if args.cuda:

diff --git a/operatorspy/tests/avg_pool.py b/operatorspy/tests/avg_pool.py
@@ -16,17 +16,27 @@
     check_error,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import (
+    get_args, 
+    debug,
+    get_tolerance,
+)
 import torch
 from typing import Tuple
 
+DEBUG = False
 # constant for control whether profile the pytorch and lib functions
 # NOTE: need to manually add synchronization function to the lib function,
 #       e.g., cudaDeviceSynchronize() for CUDA
 PROFILE = False
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+# Per-dtype absolute (atol) and relative (rtol) tolerances passed to torch.allclose
+tolerance_map = {
+    torch.float16: {'atol': 0, 'rtol': 1e-3},
+    torch.float32: {'atol': 0, 'rtol': 1e-5}, 
+}
 
 class AvgPoolDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -156,7 +166,10 @@ def test(
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
 
-    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
+    atol, rtol = get_tolerance(tolerance_map, tensor_dtype)
+    if DEBUG:
+        debug(y, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y, ans, atol=atol, rtol=rtol)
     check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
 
 
@@ -228,6 +241,8 @@ def test_bang(lib, test_cases):
         infiniopAvgPoolDescriptor_t,
     ]
 
+    if args.debug:
+        DEBUG = True
     if args.cpu:
         test_cpu(lib, test_cases)
     if args.cuda:

diff --git a/operatorspy/tests/causal_softmax.py b/operatorspy/tests/causal_softmax.py
@@ -18,9 +18,10 @@
     create_workspace,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import get_args, debug
 import torch
 
+DEBUG = False
 
 class CausalSoftmaxDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -72,6 +73,9 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float1
             None,
         )
     )
+
+    if DEBUG:
+        debug(x, ans, atol=0, rtol=1e-2)
     assert torch.allclose(x, ans, atol=0, rtol=1e-2)
     check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
 
@@ -143,6 +147,8 @@ def test_ascend(lib, test_cases):
         infiniopCausalSoftmaxDescriptor_t,
     ]
 
+    if args.debug:
+        DEBUG = True
     if args.cpu:
         test_cpu(lib, test_cases)
     if args.cuda:

diff --git a/operatorspy/tests/conv.py b/operatorspy/tests/conv.py
@@ -16,20 +16,30 @@
     check_error,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import (
+    get_args, 
+    debug,
+    get_tolerance,
+)
 import torch
 import math
 import ctypes
 from torch.nn import functional as F
 from typing import List, Tuple
 
+DEBUG = False
 # constant for control whether profile the pytorch and lib functions
 # NOTE: need to manually add synchronization function to the lib function,
 #       e.g., cudaDeviceSynchronize() for CUDA
 PROFILE = False
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+# Per-dtype absolute (atol) and relative (rtol) tolerances passed to torch.allclose
+tolerance_map = {
+    torch.float16: {'atol': 0, 'rtol': 1e-2},
+    torch.float32: {'atol': 0, 'rtol': 1e-3}, 
+}
 
 class ConvDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -177,10 +187,10 @@ def test(
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
 
-    if (tensor_dtype == torch.float16):
-        assert torch.allclose(y, ans, atol=0, rtol=1e-2)
-    else:
-        assert torch.allclose(y, ans, atol=0, rtol=1e-3)
+    atol, rtol = get_tolerance(tolerance_map, tensor_dtype)
+    if DEBUG:
+        debug(y, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y, ans, atol=atol, rtol=rtol)
     check_error(lib.infiniopDestroyConvDescriptor(descriptor))
 
 
@@ -286,6 +296,8 @@ def test_bang(lib, test_cases):
         infiniopConvDescriptor_t,
     ]
 
+    if args.debug:
+        DEBUG = True
     if args.cpu:
         test_cpu(lib, test_cases)
     if args.cuda:

diff --git a/operatorspy/tests/expand.py b/operatorspy/tests/expand.py
@@ -17,9 +17,11 @@
     rearrange_tensor,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import get_args, debug
 import torch
 
+DEBUG = False
+
 # constant for control whether profile the pytorch and lib functions
 # NOTE: need to manually add synchronization function to the lib function,
 #       e.g., cudaDeviceSynchronize() for CUDA
@@ -101,7 +103,10 @@ def test(
             )
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
-    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
+
+    if DEBUG:
+        debug(y, ans, atol=0, rtol=0)
+    assert torch.allclose(y, ans, atol=0, rtol=0)
     check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
 
 
@@ -168,6 +173,8 @@ def test_bang(lib, test_cases):
         infiniopExpandDescriptor_t,
     ]
 
+    if args.debug:
+        DEBUG = True
     if args.cpu:
         test_cpu(lib, test_cases)
     if args.cuda:

diff --git a/operatorspy/tests/gemm.py b/operatorspy/tests/gemm.py
@@ -17,16 +17,28 @@
     rearrange_tensor,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import (
+    get_args, 
+    debug,
+    get_tolerance,
+)
 import torch
 
+DEBUG = False
+
 # constant for control whether profile the pytorch and lib functions
 # NOTE: need to manually add synchronization function to the lib function,
 #       e.g., cudaDeviceSynchronize() for CUDA
 PROFILE = False
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+# Per-dtype absolute (atol) and relative (rtol) tolerances passed to torch.allclose
+tolerance_map = {
+    torch.float16: {'atol': 0, 'rtol': 1e-2},
+    torch.float32: {'atol': 0, 'rtol': 1e-2}, 
+}
+
 class GEMMDescriptor(Structure):
     _fields_ = [("device", c_int32)]
 
@@ -161,7 +173,10 @@ def test(
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
 
-    assert torch.allclose(y, ans, atol=0, rtol=1e-2)
+    atol, rtol = get_tolerance(tolerance_map, dtype)
+    if DEBUG:
+        debug(y, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y, ans, atol=atol, rtol=rtol)
     check_error(lib.infiniopDestroyGEMMDescriptor(descriptor))
 
 
@@ -363,6 +378,8 @@ def test_bang(lib, test_cases):
         infiniopGEMMDescriptor_t,
     ]
 
+    if args.debug:
+        DEBUG = True
     if args.cpu:
         test_cpu(lib, test_cases)
     if args.cuda:

diff --git a/operatorspy/tests/global_avg_pool.py b/operatorspy/tests/global_avg_pool.py
@@ -16,16 +16,26 @@
     check_error,
 )
 
-from operatorspy.tests.test_utils import get_args
+from operatorspy.tests.test_utils import (
+    get_args, 
+    debug,
+    get_tolerance,
+)
 import torch, time
 
+DEBUG = False
 # constant for control whether profile the pytorch and lib functions
 # NOTE: need to manually add synchronization function to the lib function,
 #       e.g., cudaDeviceSynchronize() for CUDA
 PROFILE = False
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+# Per-dtype absolute (atol) and relative (rtol) tolerances passed to torch.allclose
+tolerance_map = {
+    torch.float16: {'atol': 0, 'rtol': 1e-3},
+    torch.float32: {'atol': 0, 'rtol': 1e-4}, 
+}
 
 class GlobalAvgPoolDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -118,7 +128,10 @@ def test(
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
 
-    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
+    atol, rtol = get_tolerance(tolerance_map, tensor_dtype)
+    if DEBUG:
+        debug(y, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y, ans, atol=atol, rtol=rtol)
     check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
 
 
@@ -197,6 +210,8 @@ def test_bang(lib, test_cases):
         infiniopGlobalAvgPoolDescriptor_t,
     ]
 
+    if args.debug:
+        DEBUG = True
     if args.cpu:
         test_cpu(lib, test_cases)
     if args.cuda:

diff --git a/operatorspy/tests/matmul.py b/operatorspy/tests/matmul.py
@@ -19,13 +19,25 @@
     create_workspace,
 )
 
-from operatorspy.tests.test_utils import get_args, synchronize_device
+from operatorspy.tests.test_utils import (
+    get_args,
+    synchronize_device,
+    debug,
+    get_tolerance,
+)
 import torch
 
+DEBUG = False
 PROFILE = False
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+# Per-dtype absolute (atol) and relative (rtol) tolerances passed to torch.allclose
+tolerance_map = {
+    torch.float16: {'atol': 0, 'rtol': 1e-2},
+    torch.float32: {'atol': 0, 'rtol': 1e-3}, 
+}
+
 class MatmulDescriptor(Structure):
     _fields_ = [("device", c_int32)]
 
@@ -115,7 +127,10 @@ def test(
         )
     )
 
-    assert torch.allclose(c, ans, atol=0, rtol=1e-2)
+    atol, rtol = get_tolerance(tolerance_map, dtype)
+    if DEBUG:
+        debug(c, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(c, ans, atol=atol, rtol=rtol)
 
     if PROFILE:
         for i in range(NUM_PRERUN):
@@ -343,6 +358,8 @@ def test_ascend(lib, test_cases):
         infiniopMatmulDescriptor_t,
     ]
 
+    if args.debug:
+        DEBUG = True
     if args.profile:
         PROFILE = True
     if args.cpu: