flashinfer-ai · zanderjiang · Nov 10, 2025 · Nov 10, 2025 · Nov 17, 2025 · Nov 17, 2025
diff --git a/.gitignore b/.gitignore
@@ -47,3 +47,5 @@ web/**/public
 
 tmp/
 AGENTS.md
+
+.env
diff --git a/examples/ffi/Example-FlashInfer-Trace/definitions/gemm_n4096_k4096.json b/examples/ffi/Example-FlashInfer-Trace/definitions/gemm_n4096_k4096.json
@@ -0,0 +1,48 @@
+{
+  "name": "gemm_n4096_k4096",
+  "description": "General matrix multiply (GEMM) C = A @ B.T. Captured from Llama 3.1 8B attn.o_proj.",
+  "op_type": "gemm",
+  "tags": [
+    "status:verified",
+    "model:llama-3.1-8b"
+  ],
+  "axes": {
+    "M": {
+      "type": "var"
+    },
+    "N": {
+      "type": "const",
+      "value": 4096
+    },
+    "K": {
+      "type": "const",
+      "value": 4096
+    }
+  },
+  "inputs": {
+    "A": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "float16"
+    },
+    "B": {
+      "shape": [
+        "N",
+        "K"
+      ],
+      "dtype": "float16"
+    }
+  },
+  "outputs": {
+    "C": {
+      "shape": [
+        "M",
+        "N"
+      ],
+      "dtype": "float16"
+    }
+  },
+  "reference": "import torch\n\ndef run(A, B):\n    C = torch.matmul(A, B.T)\n    return C"
+}
diff --git a/examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json b/examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json
@@ -0,0 +1,24 @@
+{
+  "name": "example_agent_solution",
+  "definition": "gemm_n4096_k4096",
+  "description": "Example agent vibe-coded kernel generated by gpt-5-2025-08-07 (reasoning effort: high)",
+  "author": "gpt-5-2025-08-07",
+  "spec": {
+    "language": "cuda",
+    "target_hardware": [
+      "B200"
+    ],
+    "entry_point": "kernel.cu::gemm_n_4096_k_4096",
+    "dependencies": []
+  },
+  "sources": [
+    {
+      "path": "kernel.h",
+      "content": "#ifndef GEMM_N_4096_K_4096_KERNEL_H\n#define GEMM_N_4096_K_4096_KERNEL_H\n\n#include <cuda_runtime.h>\n#include <cuda_fp16.h>\n#include <cstdint>\n\n// Constants fixed by specification\nconstexpr int GEMM_N_CONST = 4096;\nconstexpr int GEMM_K_CONST = 4096;\n\n// Utility: ceiling division\ninline int ceil_div(int a, int b) { return (a + b - 1) / b; }\n\n#endif // GEMM_N_4096_K_4096_KERNEL_H"
+    },
+    {
+      "path": "kernel.cu",
+      "content": "#include \"kernel.h\"\n#include <tvm/ffi/container/tensor.h>\n#include <tvm/ffi/extra/c_env_api.h>\n#include <tvm/ffi/function.h>\n#include <tvm/ffi/error.h>\n#include <mma.h>\n#include <cstdio>\n\nusing namespace nvcuda;\n\n// Error check macro\n#ifndef CUDA_CHECK\n#define CUDA_CHECK(expr)                                                     \\\n  do {                                                                       \\\n    cudaError_t _err = (expr);                                               \\\n    if (_err != cudaSuccess) {                                               \\\n      fprintf(stderr, \"CUDA Error %s at %s:%d: %s\\n\", #expr, __FILE__, __LINE__, cudaGetErrorString(_err)); \\\n      abort();                                                               \\\n    }                                                                        \\\n  } while (0)\n#endif\n\n// Kernel configuration tuned for B200\n// - Block tile: 128 x 256 (M x N)\n// - K tile: 64\n// - 8 warps per block (256 threads), each warp computes a 64x64 sub-tile via WMMA (4x4 tiles of 16x16)\n// - Accumulate in FP32, convert to FP16 on store\nconstexpr int BLOCK_M = 128;\nconstexpr int BLOCK_N = 256;\nconstexpr int BLOCK_K = 64;\n\nconstexpr int WARPS_PER_BLOCK = 8;\nconstexpr int THREADS_PER_BLOCK = WARPS_PER_BLOCK * 32;\n\nconstexpr int WARP_TILE_M = 64;\nconstexpr int WARP_TILE_N = 64;\n\nconstexpr int WMMA_M = 16;\nconstexpr int WMMA_N = 16;\nconstexpr int WMMA_K = 16;\n\n// Padding to avoid shared memory bank conflicts (in elements)\nconstexpr int SKEW_HALF  = 8; // for half elements\nconstexpr int SKEW_FLOAT = 8; // for float elements\n\n// Align pointer p up to 'alignment' bytes\n__device__ __forceinline__ char* align_up(char* p, size_t alignment) {\n  uintptr_t ip = reinterpret_cast<uintptr_t>(p);\n  ip = (ip + (alignment - 1)) & ~(alignment - 1);\n  return reinterpret_cast<char*>(ip);\n}\n\n__global__ __launch_bounds__(THREADS_PER_BLOCK, 2)\nvoid 
gemm_n_4096_k_4096_kernel(const __half* __restrict__ A,\n                               const __half* __restrict__ B,\n                               __half* __restrict__ C,\n                               int M) {\n  // Shared memory layout (dynamically allocated):\n  // [A_smem (half) | B_smem (half) | C_smem (float)]\n  extern __shared__ char smem_raw[];\n  char* smem_ptr = smem_raw;\n\n  // Compute sizes\n  const int A_smem_elems = BLOCK_M * (BLOCK_K + SKEW_HALF);\n  const int B_smem_elems = BLOCK_N * (BLOCK_K + SKEW_HALF);\n  const int C_smem_elems = BLOCK_M * (BLOCK_N + SKEW_FLOAT);\n\n  const size_t A_smem_bytes = A_smem_elems * sizeof(__half);\n  const size_t B_smem_bytes = B_smem_elems * sizeof(__half);\n  const size_t C_smem_bytes = C_smem_elems * sizeof(float);\n\n  __half* A_smem = reinterpret_cast<__half*>(smem_ptr);\n  smem_ptr = align_up(smem_ptr + A_smem_bytes, 16);\n  __half* B_smem = reinterpret_cast<__half*>(smem_ptr);\n  smem_ptr = align_up(smem_ptr + B_smem_bytes, 16);\n  float*  C_smem = reinterpret_cast<float*>(smem_ptr);\n\n  // Block coordinates\n  const int block_m = blockIdx.y; // along M\n  const int block_n = blockIdx.x; // along N\n  const int m0 = block_m * BLOCK_M;\n  const int n0 = block_n * BLOCK_N;\n\n  // Early exit if out of range (shouldn't happen due to gridDim.y, but guard anyway)\n  if (m0 >= M) return;\n\n  // Global strides (row-major)\n  const int lda = GEMM_K_CONST; // 4096\n  const int ldb = GEMM_K_CONST; // 4096\n  const int ldc = GEMM_N_CONST; // 4096\n\n  // Thread identifiers\n  const int tid     = threadIdx.x;\n  const int warp_id = tid / 32;\n  const int lane_id = tid % 32;\n\n  // Warp tile coordinates within the block\n  const int WARPS_N = BLOCK_N / WARP_TILE_N; // 256/64 = 4\n  const int warp_m_tile = warp_id / WARPS_N; // 0..1\n  const int warp_n_tile = warp_id % WARPS_N; // 0..3\n\n  // Initialize accumulators\n  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag[WARP_TILE_M / 
WMMA_M][WARP_TILE_N / WMMA_N];\n#pragma unroll\n  for (int i = 0; i < (WARP_TILE_M / WMMA_M); ++i) {\n#pragma unroll\n    for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n      wmma::fill_fragment(c_frag[i][j], 0.0f);\n    }\n  }\n\n  // Loop over K dimension in tiles of BLOCK_K\n  for (int k0 = 0; k0 < GEMM_K_CONST; k0 += BLOCK_K) {\n\n    // Load A tile into shared memory: [BLOCK_M x BLOCK_K] with stride (BLOCK_K + SKEW_HALF)\n    {\n      const int total_vec = (BLOCK_M * BLOCK_K) / 8; // 1024\n#pragma unroll\n      for (int v = 0; v < (total_vec / THREADS_PER_BLOCK); ++v) {\n        const int vec_idx = tid + v * THREADS_PER_BLOCK;\n        const int elem_idx = vec_idx * 8;\n        const int row = elem_idx / BLOCK_K;\n        const int col = elem_idx % BLOCK_K;\n        const int g_row = m0 + row;\n        const int g_col = k0 + col;\n\n        const __half* gptr = A + g_row * lda + g_col;\n        int4 data;\n\n        if (g_row < M) {\n          data = *reinterpret_cast<const int4*>(gptr);\n        } else {\n          data = {0, 0, 0, 0};\n        }\n\n        __half* sptr = A_smem + row * (BLOCK_K + SKEW_HALF) + col;\n        *reinterpret_cast<int4*>(sptr) = data;\n      }\n    }\n\n    // Load B tile into shared memory as [BLOCK_N x BLOCK_K] row-major with stride (BLOCK_K + SKEW_HALF)\n    {\n      const int total_vec = (BLOCK_N * BLOCK_K) / 8; // 2048\n#pragma unroll\n      for (int v = 0; v < (total_vec / THREADS_PER_BLOCK); ++v) {\n        const int vec_idx = tid + v * THREADS_PER_BLOCK;\n        const int elem_idx = vec_idx * 8;\n        const int n = elem_idx / BLOCK_K;\n        const int kk = elem_idx % BLOCK_K;\n\n        const __half* gptr = B + (n0 + n) * ldb + (k0 + kk);\n        int4 data = *reinterpret_cast<const int4*>(gptr);\n\n        __half* sptr = B_smem + n * (BLOCK_K + SKEW_HALF) + kk;\n        *reinterpret_cast<int4*>(sptr) = data;\n      }\n    }\n\n    __syncthreads();\n\n    // Compute using WMMA over BLOCK_K split into 16-wide 
k-steps\n#pragma unroll\n    for (int kk = 0; kk < BLOCK_K; kk += WMMA_K) {\n      // Preload 4 B fragments for this warp (across N within the warp tile)\n      wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, __half, wmma::col_major> b_frag[WARP_TILE_N / WMMA_N];\n#pragma unroll\n      for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n        const int n_off = warp_n_tile * WARP_TILE_N + j * WMMA_N;\n        const __half* b_tile_ptr = B_smem + n_off * (BLOCK_K + SKEW_HALF) + kk;\n        wmma::load_matrix_sync(b_frag[j], b_tile_ptr, (BLOCK_K + SKEW_HALF));\n      }\n\n      // For each of 4 A subtiles in M within the warp tile, multiply with 4 B fragments\n#pragma unroll\n      for (int i = 0; i < (WARP_TILE_M / WMMA_M); ++i) {\n        const int m_off = warp_m_tile * WARP_TILE_M + i * WMMA_M;\n        const __half* a_tile_ptr = A_smem + m_off * (BLOCK_K + SKEW_HALF) + kk;\n\n        wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, __half, wmma::row_major> a_frag;\n        wmma::load_matrix_sync(a_frag, a_tile_ptr, (BLOCK_K + SKEW_HALF));\n\n#pragma unroll\n        for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n          wmma::mma_sync(c_frag[i][j], a_frag, b_frag[j], c_frag[i][j]);\n        }\n      }\n    }\n\n    __syncthreads();\n  }\n\n  // Store accumulators to shared C_smem (float), then cooperatively convert/store to global as half\n#pragma unroll\n  for (int i = 0; i < (WARP_TILE_M / WMMA_M); ++i) {\n#pragma unroll\n    for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n      const int row = warp_m_tile * WARP_TILE_M + i * WMMA_M;\n      const int col = warp_n_tile * WARP_TILE_N + j * WMMA_N;\n      float* c_tile_ptr = C_smem + row * (BLOCK_N + SKEW_FLOAT) + col;\n      wmma::store_matrix_sync(c_tile_ptr, c_frag[i][j], (BLOCK_N + SKEW_FLOAT), wmma::mem_row_major);\n    }\n  }\n\n  __syncthreads();\n\n  // Cooperative conversion and store to global memory\n  const int total_elems = BLOCK_M * BLOCK_N; // 32768\n#pragma unroll 4\n  for 
(int idx = tid; idx < total_elems; idx += THREADS_PER_BLOCK) {\n    const int row = idx / BLOCK_N;\n    const int col = idx % BLOCK_N;\n    const int g_row = m0 + row;\n    const int g_col = n0 + col;\n\n    if (g_row < M) {\n      float val = C_smem[row * (BLOCK_N + SKEW_FLOAT) + col];\n      __half h = __float2half_rn(val);\n      C[g_row * ldc + g_col] = h;\n    }\n  }\n}\n\n// TVM FFI binding function\nvoid gemm_n_4096_k_4096(tvm::ffi::TensorView A, tvm::ffi::TensorView B, tvm::ffi::TensorView C) {\n  // Validate inputs\n  TVM_FFI_ICHECK_EQ(A.ndim(), 2) << \"A must be 2D [M, 4096]\";\n  TVM_FFI_ICHECK_EQ(B.ndim(), 2) << \"B must be 2D [4096, 4096]\";\n  TVM_FFI_ICHECK_EQ(C.ndim(), 2) << \"C must be 2D [M, 4096]\";\n  \n  TVM_FFI_ICHECK_EQ(A.size(1), GEMM_K_CONST) << \"A.shape[1] must be 4096 (K)\";\n  TVM_FFI_ICHECK_EQ(B.size(0), GEMM_N_CONST) << \"B.shape[0] must be 4096 (N)\";\n  TVM_FFI_ICHECK_EQ(B.size(1), GEMM_K_CONST) << \"B.shape[1] must be 4096 (K)\";\n  \n  const int64_t M = A.size(0);\n  TVM_FFI_ICHECK_EQ(C.size(0), M) << \"C.shape[0] must match A.shape[0]\";\n  TVM_FFI_ICHECK_EQ(C.size(1), GEMM_N_CONST) << \"C.shape[1] must be 4096 (N)\";\n  \n  // Check dtype\n  DLDataType dt_a = A.dtype();\n  DLDataType dt_b = B.dtype();\n  DLDataType dt_c = C.dtype();\n  \n  if (dt_a.code != kDLFloat || dt_a.bits != 16) {\n    TVM_FFI_THROW(TypeError) << \"A must be float16\";\n  }\n  if (dt_b.code != kDLFloat || dt_b.bits != 16) {\n    TVM_FFI_THROW(TypeError) << \"B must be float16\";\n  }\n  if (dt_c.code != kDLFloat || dt_c.bits != 16) {\n    TVM_FFI_THROW(TypeError) << \"C must be float16\";\n  }\n  \n  // Check contiguous\n  TVM_FFI_ICHECK(A.IsContiguous()) << \"A must be contiguous\";\n  TVM_FFI_ICHECK(B.IsContiguous()) << \"B must be contiguous\";\n  TVM_FFI_ICHECK(C.IsContiguous()) << \"C must be contiguous\";\n  \n  // Check device\n  DLDevice dev = A.device();\n  TVM_FFI_ICHECK_EQ(dev.device_type, kDLCUDA) << \"Tensors must be on CUDA device\";\n  
TVM_FFI_ICHECK_EQ(B.device().device_type, kDLCUDA) << \"Tensors must be on CUDA device\";\n  TVM_FFI_ICHECK_EQ(C.device().device_type, kDLCUDA) << \"Tensors must be on CUDA device\";\n  \n  if (M <= 0) return;\n  \n  // Get data pointers\n  const __half* A_ptr = reinterpret_cast<const __half*>(A.data_ptr());\n  const __half* B_ptr = reinterpret_cast<const __half*>(B.data_ptr());\n  __half* C_ptr = reinterpret_cast<__half*>(C.data_ptr());\n  \n  // Get CUDA stream from environment\n  cudaStream_t stream = static_cast<cudaStream_t>(\n      TVMFFIEnvGetStream(dev.device_type, dev.device_id));\n  \n  // Launch configuration\n  dim3 block(THREADS_PER_BLOCK, 1, 1);\n  dim3 grid(GEMM_N_CONST / BLOCK_N, ceil_div(static_cast<int>(M), BLOCK_M), 1);\n  \n  // Dynamic shared memory size\n  const int A_smem_elems = BLOCK_M * (BLOCK_K + SKEW_HALF);\n  const int B_smem_elems = BLOCK_N * (BLOCK_K + SKEW_HALF);\n  const int C_smem_elems = BLOCK_M * (BLOCK_N + SKEW_FLOAT);\n  \n  const size_t shmem_bytes =\n      A_smem_elems * sizeof(__half) +\n      B_smem_elems * sizeof(__half) +\n      C_smem_elems * sizeof(float);\n  \n  // Opt-in to large dynamic shared memory if needed\n  CUDA_CHECK(cudaFuncSetAttribute(gemm_n_4096_k_4096_kernel,\n                                  cudaFuncAttributeMaxDynamicSharedMemorySize,\n                                  (int)shmem_bytes));\n  \n  gemm_n_4096_k_4096_kernel<<<grid, block, shmem_bytes, stream>>>(A_ptr, B_ptr, C_ptr, static_cast<int>(M));\n  CUDA_CHECK(cudaGetLastError());\n}\n\n// Export the function with TVM FFI\nTVM_FFI_DLL_EXPORT_TYPED_FUNC(gemm_n_4096_k_4096, gemm_n_4096_k_4096);"
+    }
+  ]
+}