diff --git a/lightllm/common/basemodel/cuda_graph.py b/lightllm/common/basemodel/cuda_graph.py
index c754fabce..f09417dad 100644
--- a/lightllm/common/basemodel/cuda_graph.py
+++ b/lightllm/common/basemodel/cuda_graph.py
@@ -3,6 +3,7 @@
 import copy
 import bisect
 from typing import Optional
+from tqdm import tqdm
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.distributed import dist_group_manager, lightllm_capture_graph, CustomProcessGroup
@@ -191,7 +192,12 @@ def warmup(self, model):
         model: TpPartBaseModel = model
 
         # decode cuda graph init
-        for batch_size in self.cuda_graph_batch_sizes[::-1]:
+        progress_bar = tqdm(self.cuda_graph_batch_sizes[::-1], desc="Capturing CUDA graphs")
+        for batch_size in progress_bar:
+            # Get available memory info
+            avail_mem, total_mem = torch.cuda.mem_get_info()
+            avail_mem_gb = avail_mem / (1024 ** 3)
+            progress_bar.set_description(f"Capturing CUDA graphs - Batch: {batch_size}, AvailMem: {avail_mem_gb:.2f}GB")
             seq_len = 2
             total_token_num = batch_size * seq_len
             max_len_in_batch = self.graph_max_len_in_batch
@@ -246,7 +252,14 @@ def warmup_overlap(self, model):
 
         model: TpPartBaseModel = model
 
-        for batch_size in self.cuda_graph_batch_sizes[::-1]:
+        progress_bar = tqdm(self.cuda_graph_batch_sizes[::-1], desc="Capturing overlap CUDA graphs")
+        for batch_size in progress_bar:
+            # Get available memory info
+            avail_mem, total_mem = torch.cuda.mem_get_info()
+            avail_mem_gb = avail_mem / (1024 ** 3)
+            progress_bar.set_description(
+                f"Capturing overlap CUDA graphs - Batch: {batch_size}, AvailMem: {avail_mem_gb:.2f}GB"
+            )
             decode_batches = []
             for micro_batch_index in [0, 1]:
                 # dummy decoding, capture the cudagraph
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
index b3dab0614..cb815cd86 100644
--- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py
@@ -9,3 +9,4 @@
 from .norm_weight import NormWeight, GEMMANormWeight, TpNormWeight
 from .fused_moe_weight_tp import create_tp_moe_wegiht_obj
 from .fused_moe_weight_ep import FusedMoeWeightEP
+from .parameter_weight import ParameterWeight, TpParameterWeight
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/parameter_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/parameter_weight.py
new file mode 100644
index 000000000..65adcd469
--- /dev/null
+++ b/lightllm/common/basemodel/layer_weights/meta_weights/parameter_weight.py
@@ -0,0 +1,44 @@
+import torch
+from typing import Dict
+from .base_weight import BaseWeightTpl
+from lightllm.utils.dist_utils import get_current_device_id
+
+
+class ParameterWeight(BaseWeightTpl):  # one named tensor plus an optional bias, each loaded from a weights dict
+    def __init__(self, weight_name: str, data_type: torch.dtype, bias_name: str = None):
+        super().__init__()
+        self.weight_name = weight_name  # key looked up in the dict given to load_hf_weights
+        self.bias_name = bias_name  # optional key for the bias; None means verify_load does not require one
+        self.data_type_ = data_type  # dtype both tensors are cast to on load
+        self.weight = None  # stays None until load_hf_weights sees weight_name
+        self.bias = None  # stays None until load_hf_weights sees bias_name
+
+    def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:
+        if self.weight_name in weights:  # a dict without the key leaves self.weight as it was
+            self.weight = weights[self.weight_name].to(self.data_type_).cuda(get_current_device_id())
+        if self.bias_name in weights:  # same rule for the bias
+            self.bias = weights[self.bias_name].to(self.data_type_).cuda(get_current_device_id())
+
+    def verify_load(self):
+        load_ok = True
+        # The weight is mandatory: it must have been loaded.
+        load_ok = load_ok and self.weight is not None
+        # The bias is only mandatory when a bias_name was configured.
+        if self.bias_name is not None:
+            load_ok = load_ok and self.bias is not None
+        return load_ok
+
+
+class TpParameterWeight(ParameterWeight):  # keeps only this rank's first-dimension slice of each tensor
+    def __init__(self, weight_name: str, data_type: torch.dtype, split_n_embed: int, bias_name: str = None):
+        super().__init__(weight_name, data_type, bias_name)
+        self.split_n_embed = split_n_embed  # length of the first-dimension slice kept per rank
+
+    def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:
+        start = self.split_n_embed * self.tp_rank_  # tp_rank_ is not set here; presumably by BaseWeightTpl
+        end = self.split_n_embed * (self.tp_rank_ + 1)
+
+        if self.weight_name in weights:  # slice [start:end] first, then cast and move to the current device
+            self.weight = weights[self.weight_name][start:end].to(self.data_type_).cuda(get_current_device_id())
+        if self.bias_name in weights:  # the bias is sliced with the same bounds as the weight
+            self.bias = weights[self.bias_name][start:end].to(self.data_type_).cuda(get_current_device_id())
diff --git a/lightllm/common/kv_cache_mem_manager/mem_manager.py b/lightllm/common/kv_cache_mem_manager/mem_manager.py
index d8fd93009..23d5a5dfa 100755
--- a/lightllm/common/kv_cache_mem_manager/mem_manager.py
+++ b/lightllm/common/kv_cache_mem_manager/mem_manager.py
@@ -24,16 +24,9 @@
 logger = init_logger(__name__)
 
 
-class MemoryManager:
-    def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False, mem_fraction=0.9):
+class BaseAllocator:
+    def __init__(self, size, mem_manager_name=None):
         self.size = size
-        self.head_num = head_num
-        self.head_dim = head_dim
-        self.layer_num = layer_num
-        self.always_copy = always_copy
-        self.dtype = dtype
-        # profile the max total token num if the size is None
-        self.profile_size(mem_fraction)
 
         self.mem_state = torch.arange(
             0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True
@@ -48,14 +41,95 @@ def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False
         self.can_use_mem_size = self.size
 
         # 用共享内存进行共享，router 模块读取进行精确的调度估计, nccl port 作为一个单机中单实列的标记。防止冲突。
-        from lightllm.utils.envs_utils import get_unique_server_name
-
+        if mem_manager_name is None:
+            mem_manager_name = get_unique_server_name()
         rank_in_node = get_current_rank_in_node()
-        self.shared_can_use_token_num = SharedInt(
-            f"{get_unique_server_name()}_mem_manger_can_use_token_num_{rank_in_node}"
-        )
+        self.shared_can_use_token_num = SharedInt(f"{mem_manager_name}_mem_manger_can_use_token_num_{rank_in_node}")
 
         self.shared_can_use_token_num.set_value(self.can_use_mem_size)
+        self.HOLD_TOKEN_MEMINDEX = self.size
+
+    def alloc(self, need_size) -> torch.Tensor:
+        if need_size > self.mark_end - self.mark_start:  # more than the free span between the two marks
+            logger.error(f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}")
+            assert False, "error alloc state"
+
+        start = self.mark_start
+        end = self.mark_start + need_size
+        self.mark_start += need_size  # the entries mem_state[start:end] are now handed out
+
+        self.can_use_mem_size -= need_size
+        self.shared_can_use_token_num.set_value(self.can_use_mem_size)  # publish the new free count
+
+        # Return through a staging buffer to avoid memory races under asynchronous execution.
+        if self._return_start + need_size > self._mem_state_return.shape[0]:
+            self._return_start = 0  # the slice would overrun the staging buffer, so restart at 0
+        ans = self._mem_state_return[self._return_start : self._return_start + need_size]
+        ans.copy_(self.mem_state[start:end])
+        self._return_start += need_size
+        return ans
+
+    def free(self, free_index: Union[torch.Tensor, List[int]]):
+        """Give slot indices back by writing them just below mark_start in mem_state.
+
+        Args:
+            free_index (torch.Tensor | List[int]): the slot indices being released.
+        """
+        end = self.mark_start
+        start = self.mark_start - len(free_index)
+        assert start >= 0, f"error free state start: {self.mark_start} free len {len(free_index)}"
+
+        if isinstance(free_index, list):
+            self.mem_state.numpy()[start:end] = free_index
+        else:
+            # The GPU-to-CPU copy is a blocking operation within the stream.
+            self.mem_state[start:end] = free_index
+
+        self.mark_start -= len(free_index)
+
+        self.can_use_mem_size += len(free_index)
+        self.shared_can_use_token_num.set_value(self.can_use_mem_size)  # publish the new free count
+
+        if self.can_use_mem_size == len(self.mem_state):
+            logger.debug(f"freed all gpu mem size {self.can_use_mem_size}")
+        return
+
+    def free_all(self):  # reset to the fully-free state while reusing the existing mem_state tensor
+        self.can_use_mem_size = len(self.mem_state)
+        self.shared_can_use_token_num.set_value(self.can_use_mem_size)  # publish the new free count
+        self.mem_state.numpy()[:] = list(range(0, len(self.mem_state)))  # back to the identity table 0..len-1
+        self.mark_start = 0
+        self.mark_end = len(self.mem_state)
+
+    def resize_mem(self, new_size):
+        """
+        just for test code: rebuilds mem_state for new_size slots with every slot free
+        """
+        self.size = new_size  # NOTE(review): HOLD_TOKEN_MEMINDEX is left at its old value - confirm
+        self.mem_state = torch.arange(
+            0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True
+        )
+        self.mark_start = 0
+        self.mark_end = self.size
+        self.can_use_mem_size = self.size
+        self.shared_can_use_token_num.set_value(self.can_use_mem_size)  # publish the new free count
+        return
+
+
+class MemoryManager(BaseAllocator):
+    def __init__(
+        self, size, dtype, head_num, head_dim, layer_num, always_copy=False, mem_fraction=0.9, mem_manager_name=None
+    ):
+        self.size = size
+        self.head_num = head_num
+        self.head_dim = head_dim
+        self.layer_num = layer_num
+        self.always_copy = always_copy
+        self.dtype = dtype
+        # profile the max total token num if the size is None
+        self.profile_size(mem_fraction)
+        super().__init__(self.size, mem_manager_name)
+
         self._init_buffers(
             self.size,
             dtype,
@@ -63,7 +137,6 @@ def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False
             head_dim,
             layer_num,
         )
-        self.HOLD_TOKEN_MEMINDEX = self.size
 
     def get_cell_size(self):
         return 2 * self.head_num * self.head_dim * self.layer_num * torch._utils._element_size(self.dtype)
@@ -326,59 +399,13 @@ def _write_kv_move_data_p2p(self, token_indexes: torch.Tensor, buffer_tensor: to
     def _free_buffers(self):
         self.kv_buffer = None
 
-    def alloc(self, need_size) -> torch.Tensor:
-        if need_size > self.mark_end - self.mark_start:
-            logger.error(f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}")
-            assert False, "error alloc state"
-
-        start = self.mark_start
-        end = self.mark_start + need_size
-        self.mark_start += need_size
-
-        self.can_use_mem_size -= need_size
-        self.shared_can_use_token_num.set_value(self.can_use_mem_size)
-
-        # 利用缓冲区返回，避免异步情况下的内存竞争
-        if self._return_start + need_size > self._mem_state_return.shape[0]:
-            self._return_start = 0
-        ans = self._mem_state_return[self._return_start : self._return_start + need_size]
-        ans.copy_(self.mem_state[start:end])
-        self._return_start += need_size
-        return ans
-
-    def free(self, free_index: Union[torch.Tensor, List[int]]):
-        """_summary_
-
-        Args:
-            free_index (torch.Tensor): _description_
-        """
-
-        end = self.mark_start
-        start = self.mark_start - len(free_index)
-        assert start >= 0, f"error free state start: {self.mark_start} free len {len(free_index)}"
-
-        if isinstance(free_index, list):
-            self.mem_state.numpy()[start:end] = free_index
-        else:
-            # 从 gpu 到 cpu 的拷贝操作是流内阻塞操作
-            self.mem_state[start:end] = free_index
-
-        self.mark_start -= len(free_index)
-
-        self.can_use_mem_size += len(free_index)
-        self.shared_can_use_token_num.set_value(self.can_use_mem_size)
-
-        if self.can_use_mem_size == len(self.mem_state):
-            logger.debug(f"freed all gpu mem size {self.can_use_mem_size}")
-        return
+    def get_index_kv_buffer(self, index):  # wrap kv_buffer[:, index] in a dict keyed "kv_buffer"
+        return {"kv_buffer": self.kv_buffer[:, index]}
 
-    def free_all(self):
-        self.can_use_mem_size = len(self.mem_state)
-        self.shared_can_use_token_num.set_value(self.can_use_mem_size)
-        self.mem_state.numpy()[:] = list(range(0, len(self.mem_state)))
-        self.mark_start = 0
-        self.mark_end = len(self.mem_state)
+    def load_index_kv_buffer(self, index, load_tensor_dict):  # NOTE(review): copy_ lands only if [:, index] is a view
+        self.kv_buffer[:, index].copy_(load_tensor_dict["kv_buffer"])  # in-place write from the "kv_buffer" entry
 
+    # Override resize_mem to additionally call _free_buffers and _init_buffers.
     def resize_mem(self, new_size):
         """
         just for test code
@@ -389,14 +416,9 @@ def resize_mem(self, new_size):
         head_dim = self.head_dim
         layer_num = self.layer_num
 
-        self.size = new_size
-        self.mem_state = torch.arange(
-            0, self.size, dtype=torch.int32, device="cpu", requires_grad=False, pin_memory=True
-        )
-        self.mark_start = 0
-        self.mark_end = self.size
-        self.can_use_mem_size = self.size
-        self.shared_can_use_token_num.set_value(self.can_use_mem_size)
+        # Call the parent class's resize_mem.
+        super().resize_mem(new_size)
+
         self._free_buffers()
         self._init_buffers(size, dtype, head_num, head_dim, layer_num)
         return
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_fwd_o/{BT=16,H=8,K=128,V=128}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_fwd_o/{BT=16,H=8,K=128,V=128}_NVIDIA_H200.json
new file mode 100644
index 000000000..4b002622a
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_fwd_o/{BT=16,H=8,K=128,V=128}_NVIDIA_H200.json
@@ -0,0 +1,8 @@
+{
+  "4": {
+    "BK": 128,
+    "BV": 64,
+    "num_stages": 4,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_fwd_o/{BT=32,H=8,K=128,V=128}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_fwd_o/{BT=32,H=8,K=128,V=128}_NVIDIA_H200.json
new file mode 100644
index 000000000..cc5c68eb7
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_fwd_o/{BT=32,H=8,K=128,V=128}_NVIDIA_H200.json
@@ -0,0 +1,8 @@
+{
+  "4": {
+    "BK": 128,
+    "BV": 64,
+    "num_stages": 2,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_fwd_o/{BT=64,H=8,K=128,V=128}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_fwd_o/{BT=64,H=8,K=128,V=128}_NVIDIA_H200.json
new file mode 100644
index 000000000..7421097fa
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_fwd_o/{BT=64,H=8,K=128,V=128}_NVIDIA_H200.json
@@ -0,0 +1,8 @@
+{
+  "4": {
+    "BK": 64,
+    "BV": 128,
+    "num_stages": 3,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_gated_delta_rule_fwd_h/{BT=64,H=8,K=128,V=128}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_gated_delta_rule_fwd_h/{BT=64,H=8,K=128,V=128}_NVIDIA_H200.json
new file mode 100644
index 000000000..d831f32c4
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_gated_delta_rule_fwd_h/{BT=64,H=8,K=128,V=128}_NVIDIA_H200.json
@@ -0,0 +1,7 @@
+{
+  "4": {
+    "BV": 32,
+    "num_stages": 4,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_local_cumsum_scalar/{B=1,BT=64,H=8,IS_VARLEN=true,REVERSE=false}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_local_cumsum_scalar/{B=1,BT=64,H=8,IS_VARLEN=true,REVERSE=false}_NVIDIA_H200.json
new file mode 100644
index 000000000..354a6f93a
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_local_cumsum_scalar/{B=1,BT=64,H=8,IS_VARLEN=true,REVERSE=false}_NVIDIA_H200.json
@@ -0,0 +1,41 @@
+{
+  "1": {
+    "num_warps": 4
+  },
+  "100": {
+    "num_warps": 8
+  },
+  "1024": {
+    "num_warps": 4
+  },
+  "128": {
+    "num_warps": 1
+  },
+  "16": {
+    "num_warps": 4
+  },
+  "164096": {
+    "num_warps": 1
+  },
+  "2048": {
+    "num_warps": 2
+  },
+  "256": {
+    "num_warps": 1
+  },
+  "32": {
+    "num_warps": 8
+  },
+  "4096": {
+    "num_warps": 2
+  },
+  "64": {
+    "num_warps": 8
+  },
+  "8": {
+    "num_warps": 8
+  },
+  "8448": {
+    "num_warps": 1
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_scaled_dot_kkt_fwd/{BT=64,H=8,IS_VARLEN=true,K=128}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_scaled_dot_kkt_fwd/{BT=64,H=8,IS_VARLEN=true,K=128}_NVIDIA_H200.json
new file mode 100644
index 000000000..9fbae2414
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/chunk_scaled_dot_kkt_fwd/{BT=64,H=8,IS_VARLEN=true,K=128}_NVIDIA_H200.json
@@ -0,0 +1,7 @@
+{
+  "4": {
+    "BK": 64,
+    "num_stages": 3,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/fused_gdn_gating:v1/{NUM_HEADS=8,a_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/fused_gdn_gating:v1/{NUM_HEADS=8,a_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 000000000..d00af04ca
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/fused_gdn_gating:v1/{NUM_HEADS=8,a_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1,54 @@
+{
+  "1": {
+    "BLK_HEADS": 64,
+    "num_warps": 1
+  },
+  "100": {
+    "BLK_HEADS": 4,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLK_HEADS": 8,
+    "num_warps": 1
+  },
+  "128": {
+    "BLK_HEADS": 16,
+    "num_warps": 4
+  },
+  "16": {
+    "BLK_HEADS": 8,
+    "num_warps": 2
+  },
+  "164096": {
+    "BLK_HEADS": 8,
+    "num_warps": 1
+  },
+  "2048": {
+    "BLK_HEADS": 16,
+    "num_warps": 1
+  },
+  "256": {
+    "BLK_HEADS": 32,
+    "num_warps": 2
+  },
+  "32": {
+    "BLK_HEADS": 8,
+    "num_warps": 1
+  },
+  "4096": {
+    "BLK_HEADS": 16,
+    "num_warps": 4
+  },
+  "64": {
+    "BLK_HEADS": 64,
+    "num_warps": 2
+  },
+  "8": {
+    "BLK_HEADS": 8,
+    "num_warps": 2
+  },
+  "8448": {
+    "BLK_HEADS": 32,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gated_rmsnorm_forward:v1/{N=128,has_bias=false,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gated_rmsnorm_forward:v1/{N=128,has_bias=false,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 000000000..84c47d348
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gated_rmsnorm_forward:v1/{N=128,has_bias=false,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1,54 @@
+{
+  "1024": {
+    "BLOCK_N": 256,
+    "num_warps": 2
+  },
+  "128": {
+    "BLOCK_N": 256,
+    "num_warps": 1
+  },
+  "1312768": {
+    "BLOCK_N": 64,
+    "num_warps": 2
+  },
+  "16384": {
+    "BLOCK_N": 128,
+    "num_warps": 1
+  },
+  "2048": {
+    "BLOCK_N": 64,
+    "num_warps": 1
+  },
+  "256": {
+    "BLOCK_N": 256,
+    "num_warps": 1
+  },
+  "32768": {
+    "BLOCK_N": 256,
+    "num_warps": 2
+  },
+  "512": {
+    "BLOCK_N": 512,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_N": 256,
+    "num_warps": 1
+  },
+  "67584": {
+    "BLOCK_N": 64,
+    "num_warps": 1
+  },
+  "8": {
+    "BLOCK_N": 512,
+    "num_warps": 8
+  },
+  "800": {
+    "BLOCK_N": 64,
+    "num_warps": 1
+  },
+  "8192": {
+    "BLOCK_N": 128,
+    "num_warps": 2
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gemma_rmsnorm_forward:v1/{N=2048,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gemma_rmsnorm_forward:v1/{N=2048,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 000000000..3fd0050d7
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gemma_rmsnorm_forward:v1/{N=2048,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1,7 @@
+{
+  "2048": {
+    "BLOCK_SIZE": 4096,
+    "num_stages": 4,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gemma_rmsnorm_forward:v1/{N=256,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gemma_rmsnorm_forward:v1/{N=256,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 000000000..3863d48e8
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/gemma_rmsnorm_forward:v1/{N=256,weight_dtype=torch.bfloat16,x_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1,7 @@
+{
+  "256": {
+    "BLOCK_SIZE": 256,
+    "num_stages": 1,
+    "num_warps": 1
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=128,N=2048,expert_num=512,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=128,N=2048,expert_num=512,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json
new file mode 100644
index 000000000..fde50e757
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=128,N=2048,expert_num=512,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json
@@ -0,0 +1,110 @@
+{
+  "10": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "1000": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "10240": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "1280": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "160": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "20480": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "2560": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "320": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "40960": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "640": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "80": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "84480": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=256,expert_num=512,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=10,use_fp8_w8a8=false}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=256,expert_num=512,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=10,use_fp8_w8a8=false}_NVIDIA_H200.json
new file mode 100644
index 000000000..612f2b51e
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=256,expert_num=512,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=10,use_fp8_w8a8=false}_NVIDIA_H200.json
@@ -0,0 +1,110 @@
+{
+  "1": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "100": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "32": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "8448": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/moe_align_fused:v1/{topk_num=10}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/moe_align_fused:v1/{topk_num=10}_NVIDIA_H200.json
new file mode 100644
index 000000000..5923f3164
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/moe_align_fused:v1/{topk_num=10}_NVIDIA_H200.json
@@ -0,0 +1,54 @@
+{
+  "1": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "100": {
+    "BLOCK_SIZE": 256,
+    "num_warps": 8
+  },
+  "1024": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_SIZE": 256,
+    "num_warps": 8
+  },
+  "2048": {
+    "BLOCK_SIZE": 256,
+    "num_warps": 8
+  },
+  "256": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "32": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "32768": {
+    "BLOCK_SIZE": 256,
+    "num_warps": 8
+  },
+  "4096": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 8
+  },
+  "64": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 8
+  },
+  "8": {
+    "BLOCK_SIZE": 256,
+    "num_warps": 8
+  },
+  "8448": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=10}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=10}_NVIDIA_H200.json
new file mode 100644
index 000000000..4d6191579
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=10}_NVIDIA_H200.json
@@ -0,0 +1,74 @@
+{
+  "1": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 4
+  },
+  "100": {
+    "BLOCK_DIM": 1024,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 16
+  },
+  "1024": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 4,
+    "num_warps": 2
+  },
+  "128": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 16
+  },
+  "2048": {
+    "BLOCK_DIM": 1024,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "256": {
+    "BLOCK_DIM": 1024,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "32": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "4096": {
+    "BLOCK_DIM": 1024,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "8448": {
+    "BLOCK_DIM": 1024,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/silu_and_mul_fwd:v1/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/silu_and_mul_fwd:v1/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H200.json
new file mode 100644
index 000000000..0b3aa1e36
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/silu_and_mul_fwd:v1/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H200.json
@@ -0,0 +1,152 @@
+{
+  "1": {
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 4,
+    "num_warps": 4
+  },
+  "10": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 32,
+    "NUM_STAGES": 2,
+    "num_warps": 8
+  },
+  "100": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 8
+  },
+  "1000": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 1,
+    "num_warps": 1
+  },
+  "10240": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "128": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 4,
+    "num_warps": 4
+  },
+  "1280": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 2,
+    "num_warps": 8
+  },
+  "160": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 2,
+    "num_warps": 4
+  },
+  "164096": {
+    "BLOCK_M": 64,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "2048": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "20480": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 1,
+    "num_warps": 1
+  },
+  "256": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 8
+  },
+  "2560": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "32": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 32,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "320": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "40960": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "64": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 32,
+    "NUM_STAGES": 4,
+    "num_warps": 8
+  },
+  "640": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "8": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 32,
+    "NUM_STAGES": 2,
+    "num_warps": 4
+  },
+  "80": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 2,
+    "num_warps": 1
+  },
+  "8448": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "84480": {
+    "BLOCK_M": 32,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotuner.py b/lightllm/common/triton_utils/autotuner.py
index a919f7b28..c69147087 100644
--- a/lightllm/common/triton_utils/autotuner.py
+++ b/lightllm/common/triton_utils/autotuner.py
@@ -62,7 +62,7 @@ def autotune(
         as needed before invocation.
     """
 
-    def decorator(fn):
+    def decorator(fn: Callable) -> Callable:
         return Autotuner(
             fn=fn,
             kernel_name=kernel_name,
@@ -168,7 +168,7 @@ def __call__(self, *args, **kwargs):
             if (dist.is_initialized() and get_current_rank_in_node() == 0) or not dist.is_initialized():
                 logger.warning(
                     f"No kernel config for {self.kernel_name} in {KernelConfigs.get_config_file_name(static_key)},"
-                    f"the performance may be suboptimal!"
+                    f"the performance may be suboptimal! "
                     f"You can use LIGHTLLM_TRITON_AUTOTUNE_LEVEL=1 to enable autotune.",
                 )
             self.cached_configs[static_key] = {}
@@ -215,7 +215,7 @@ def _try_load_cache(self, static_key):
 
         cache_file = os.path.join(self.cache_dir, KernelConfigs.get_config_file_name(static_key))
         if os.path.exists(cache_file):
-            logger.info(f"Loading cached configs for {self.kernel_name} - {static_key}")
+            logger.info(f"Loading cached configs for {self.kernel_name} - {static_key.items()}")
             with open(cache_file, "rb") as f:
                 self.cached_configs[static_key] = orjson.loads(f.read())
         return True
diff --git a/lightllm/models/__init__.py b/lightllm/models/__init__.py
index 5237f8fd2..7798c10bf 100644
--- a/lightllm/models/__init__.py
+++ b/lightllm/models/__init__.py
@@ -8,6 +8,7 @@
 from lightllm.models.qwen2.model import Qwen2TpPartModel
 from lightllm.models.qwen3.model import Qwen3TpPartModel
 from lightllm.models.qwen3_moe.model import Qwen3MOEModel
+from lightllm.models.qwen3next.model import Qwen3NextTpPartModel
 from lightllm.models.chatglm2.model import ChatGlm2TpPartModel
 from lightllm.models.internlm.model import InternlmTpPartModel
 from lightllm.models.stablelm.model import StablelmTpPartModel
diff --git a/lightllm/models/qwen2/model.py b/lightllm/models/qwen2/model.py
index 5b756aadf..e64ea495c 100644
--- a/lightllm/models/qwen2/model.py
+++ b/lightllm/models/qwen2/model.py
@@ -17,7 +17,7 @@ def __init__(self, kvargs):
 
     def _init_config(self):
         super()._init_config()
-        if self.config["sliding_window"] is None:
+        if self.config.get("sliding_window") is None:
             self.config["sliding_window"] = self.max_total_token_num
         # rename key [SYM: to be confirmed]
         return
diff --git a/lightllm/models/qwen3next/layer_infer/post_layer_infer.py b/lightllm/models/qwen3next/layer_infer/post_layer_infer.py
new file mode 100644
index 000000000..73ea97457
--- /dev/null
+++ b/lightllm/models/qwen3next/layer_infer/post_layer_infer.py
@@ -0,0 +1,16 @@
+import os
+import torch
+import torch.functional as F
+import torch.distributed as dist
+import numpy as np
+
+from lightllm.models.llama.layer_infer.post_layer_infer import LlamaPostLayerInfer
+from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
+from lightllm.models.qwen3next.triton_kernel.gemma_rmsnorm import gemma_rmsnorm_forward
+
+
+class Qwen3NextPostLayerInfer(LlamaPostLayerInfer):
+    def _norm(self, input, infer_state, layer_weight: LlamaPreAndPostLayerWeight) -> torch.Tensor:
+        out = self.alloc_tensor(input.shape, input.dtype)
+        gemma_rmsnorm_forward(input, layer_weight.final_norm_weight_, self.eps_, out=out)
+        return out
diff --git a/lightllm/models/qwen3next/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3next/layer_infer/transformer_layer_infer.py
new file mode 100644
index 000000000..6b62128cf
--- /dev/null
+++ b/lightllm/models/qwen3next/layer_infer/transformer_layer_infer.py
@@ -0,0 +1,343 @@
+import torch
+import torch.nn.functional as F
+import torch.distributed as dist
+from lightllm.models.qwen3next.layer_weights.transformer_layer_weight import Qwen3NextTransformerLayerWeight
+from lightllm.models.qwen3_moe.layer_infer.transformer_layer_infer import Qwen3MOETransformerLayerInfer
+from functools import partial
+from lightllm.utils.log_utils import init_logger
+from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
+from lightllm.models.qwen3next.mem_manager import Qwen3NextMemoryManager
+from lightllm.models.llama.infer_struct import LlamaInferStateInfo
+from typing import Tuple
+from typing_extensions import override
+from einops import rearrange
+from lightllm.models.qwen3next.triton_kernel.gated_rmsnorm import gated_rmsnorm_forward
+from lightllm.models.qwen3next.triton_kernel.causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+from lightllm.models.qwen3next.triton_kernel.fused_gdn_gating import fused_gdn_gating
+from lightllm.models.qwen3next.triton_kernel.fla.ops import chunk_gated_delta_rule
+from lightllm.models.qwen3next.triton_kernel.fla.ops import fused_recurrent_gated_delta_rule
+
+from lightllm.distributed import all_reduce
+from lightllm.models.llama.triton_kernel.rotary_emb import rotary_emb_fwd
+from lightllm.models.qwen3next.triton_kernel.gemma_rmsnorm import gemma_rmsnorm_forward
+
+logger = init_logger(__name__)
+
+
+class Qwen3NextTransformerLayerInfer(Qwen3MOETransformerLayerInfer):
+    def __init__(self, layer_num, network_config, mode=[]):
+        super().__init__(layer_num, network_config, mode)
+        self.is_linear = (layer_num + 1) % network_config["full_attention_interval"] != 0
+        self.partial_rotary_factor = network_config.get("partial_rotary_factor", 1.0)
+
+        if self.is_linear:
+            self.linear_attn_infer = Qwen3NextGatedDeltaNetInfer(network_config, layer_num, self.tp_world_size_)
+        return
+
+    @override
+    def _bind_norm(self):
+        pass
+
+    def _ffn_with_shared_expert(
+        self, input, infer_state: LlamaInferStateInfo, layer_weight: Qwen3NextTransformerLayerWeight
+    ) -> torch.Tensor:
+        input = input.view(-1, self.embed_dim_)
+        up_gate_out = layer_weight.shared_expert_gate_up_proj.mm(input)
+        ffn1_out = self.alloc_tensor((input.size(0), up_gate_out.size(1) // 2), input.dtype)
+        silu_and_mul_fwd(up_gate_out, ffn1_out)
+        ffn2_out = layer_weight.shared_expert_down_proj.mm(ffn1_out)
+        shared_expert_out = F.sigmoid(layer_weight.shared_expert_gate.mm(input)) * ffn2_out
+        moe_out = self._ffn(input, infer_state, layer_weight)
+        return shared_expert_out + moe_out
+
+    @override
+    def _att_norm(
+        self, input, infer_state: LlamaInferStateInfo, layer_weight: Qwen3NextTransformerLayerWeight
+    ) -> torch.Tensor:
+        out = self.alloc_tensor(input.shape, input.dtype)
+        gemma_rmsnorm_forward(input, layer_weight.att_norm_weight_.weight, self.eps_, out=out)
+        return out
+
+    @override
+    def _ffn_norm(
+        self, input, infer_state: LlamaInferStateInfo, layer_weight: Qwen3NextTransformerLayerWeight
+    ) -> torch.Tensor:
+        out = self.alloc_tensor(input.shape, input.dtype)
+        gemma_rmsnorm_forward(input, layer_weight.ffn_norm_weight_.weight, self.eps_, out=out)
+        return out
+
+    @override
+    def _get_qkv(
+        self,
+        input: torch.Tensor,
+        infer_state: LlamaInferStateInfo,
+        layer_weight: Qwen3NextTransformerLayerWeight,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:  # returns (q, cache_kv) after q/k norm and rotary embedding
+        input = input.view(-1, self.embed_dim_)
+        q = layer_weight.q_proj.mm(input)
+        cache_kv = layer_weight.kv_proj.mm(
+            input.view(-1, self.embed_dim_),
+        ).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)  # K heads first, then V heads
+
+        gemma_rmsnorm_forward(  # q norm over head_dim-sized rows; out= is a view of q, so q is updated in place
+            q.view(-1, self.head_dim_),
+            layer_weight.q_norm_weight_.weight,
+            eps=self.eps_,
+            out=q.view(-1, self.head_dim_),
+        )
+
+        cache_kv[:, : self.tp_k_head_num_, :] = gemma_rmsnorm_forward(  # k norm: only the K slice is written back
+            cache_kv[:, : self.tp_k_head_num_, :].reshape(-1, cache_kv.shape[-1]),
+            layer_weight.k_norm_weight_.weight,
+            eps=self.eps_,
+        ).view(-1, self.tp_k_head_num_, cache_kv.shape[-1])
+
+        rotary_emb_fwd(  # return value unused; presumably rotates q and the K slice in place - TODO confirm
+            q.view(-1, self.tp_q_head_num_, self.head_dim_),
+            cache_kv[:, : self.tp_k_head_num_, :],
+            infer_state.position_cos,
+            infer_state.position_sin,
+            partial_rotary_factor=self.partial_rotary_factor,
+        )
+        return q, cache_kv
+
+    @override
+    def _get_o(
+        self, input, gate_value, infer_state: LlamaInferStateInfo, layer_weight: Qwen3NextTransformerLayerWeight
+    ) -> torch.Tensor:
+        # Handle different input shapes from different attention kernels
+        input = input.view(-1, gate_value.shape[-1])
+        gated_input = input * gate_value
+        o_tensor = layer_weight.o_proj.mm(gated_input)
+        return o_tensor
+
+    def _context_full_attn(
+        self, input, gate_value, infer_state: LlamaInferStateInfo, layer_weight: Qwen3NextTransformerLayerWeight
+    ):
+        q, cache_kv = self._get_qkv(input, infer_state, layer_weight)
+        input = None
+        self._post_cache_kv(cache_kv, infer_state, layer_weight)
+        o = self._context_attention_kernel(q, cache_kv, infer_state, layer_weight)
+        q = None
+        o = self._get_o(o, gate_value, infer_state, layer_weight)
+        if self.tp_world_size_ > 1:
+            all_reduce(o, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
+        return o
+
+    def context_forward(
+        self, input_embdings, infer_state: LlamaInferStateInfo, layer_weight: Qwen3NextTransformerLayerWeight
+    ):
+        input1 = self._att_norm(input_embdings, infer_state, layer_weight)
+        if self.is_linear:
+            o = self.linear_attn_infer._linear_attn(input1, infer_state, layer_weight, is_prefill=True, infer_cls=self)
+        else:
+            gate_value = torch.sigmoid(layer_weight.o_gate_proj.mm(input1))
+            o = self._context_full_attn(input1, gate_value, infer_state, layer_weight)
+        input_embdings.add_(o.view(-1, self.embed_dim_))
+        o = None
+
+        input1 = self._ffn_norm(input_embdings, infer_state, layer_weight)
+        ffn_out = self._ffn_with_shared_expert(input1, infer_state, layer_weight)
+        input1 = None
+        if self.tp_world_size_ > 1:
+            all_reduce(ffn_out, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
+        input_embdings.add_(ffn_out.view(-1, self.embed_dim_))
+        return input_embdings
+
+    def _token_full_attn(
+        self, input, gate_value, infer_state: LlamaInferStateInfo, layer_weight: Qwen3NextTransformerLayerWeight
+    ):
+        q, cache_kv = self._get_qkv(input, infer_state, layer_weight)
+        input = None
+        self._post_cache_kv(cache_kv, infer_state, layer_weight)
+        o = self._token_attention_kernel(q, infer_state, layer_weight)
+        q = None
+        o = self._get_o(o, gate_value, infer_state, layer_weight)
+        if self.tp_world_size_ > 1:
+            all_reduce(o, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
+        return o
+
+    def token_forward(
+        self, input_embdings, infer_state: LlamaInferStateInfo, layer_weight: Qwen3NextTransformerLayerWeight
+    ):
+        input1 = self._att_norm(input_embdings, infer_state, layer_weight)
+        if self.is_linear:
+            o = self.linear_attn_infer._linear_attn(input1, infer_state, layer_weight, is_prefill=False, infer_cls=self)
+        else:
+            gate_value = torch.sigmoid(layer_weight.o_gate_proj.mm(input1))
+            o = self._token_full_attn(input1, gate_value, infer_state, layer_weight)
+        input_embdings.add_(o.view(-1, self.embed_dim_))
+        o = None
+
+        input1 = self._ffn_norm(input_embdings, infer_state, layer_weight)
+        ffn_out = self._ffn_with_shared_expert(input1, infer_state, layer_weight)
+        input1 = None
+        if self.tp_world_size_ > 1:
+            all_reduce(ffn_out, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
+        input_embdings.add_(ffn_out.view(-1, self.embed_dim_))
+        return input_embdings
+
+
+class Qwen3NextGatedDeltaNetInfer:
+    def __init__(self, network_config, layer_idx, tp_world_size_):
+        self.network_config_ = network_config
+        self.layer_idx_ = layer_idx
+        self.tp_world_size_ = tp_world_size_
+        self.num_v_heads = self.network_config_["linear_num_value_heads"]
+        self.num_k_heads = self.network_config_["linear_num_key_heads"]
+        self.head_k_dim = self.network_config_["linear_key_head_dim"]
+        self.head_v_dim = self.network_config_["linear_value_head_dim"]
+        self.key_dim = self.head_k_dim * self.num_k_heads
+        self.value_dim = self.head_v_dim * self.num_v_heads
+        self.conv_kernel_dim = self.network_config_["linear_conv_kernel_dim"]
+        self.activation = self.network_config_["hidden_act"]
+        self.tp_qkvz_dim = (self.key_dim * 2 + self.value_dim * 2) // self.tp_world_size_
+        self.tp_ba_dim = (self.num_v_heads * 2) // self.tp_world_size_
+        self.tp_num_k_heads = self.num_k_heads // self.tp_world_size_
+        self.tp_num_v_heads = self.num_v_heads // self.tp_world_size_
+        self.tp_key_dim = self.key_dim // self.tp_world_size_
+        self.tp_value_dim = self.value_dim // self.tp_world_size_
+        assert self.num_v_heads % self.num_k_heads == 0, "num_v_heads must be divisible by num_k_heads"
+        self.num_v_heads_per_k_head = self.num_v_heads // self.num_k_heads
+
+    def _fix_query_key_value_ba_ordering(self, mixed_qkvzba):
+        """
+        Split `mixed_qkvzba` into flattened `query`, `key`, `value` plus `z`, `b` and `a` (returned in that order).
+        """
+        mixed_qkvz, mixed_ba = torch.split(mixed_qkvzba, [self.tp_qkvz_dim, self.tp_ba_dim], dim=-1)
+
+        mixed_qkvz = mixed_qkvz.view(  # one row per (token, local key head)
+            -1,
+            self.tp_num_k_heads,
+            self.head_k_dim + self.head_k_dim + (self.head_v_dim + self.head_v_dim) * self.num_v_heads_per_k_head,
+        )
+        mixed_ba = mixed_ba.view(-1, self.tp_num_k_heads, 2 * self.num_v_heads_per_k_head)
+
+        qkvz_split_list = [  # per key head: q, k, then the v and z of every value head in that key head's group
+            self.head_k_dim,
+            self.head_k_dim,
+            (self.num_v_heads_per_k_head * self.head_v_dim),
+            (self.num_v_heads_per_k_head * self.head_v_dim),
+        ]
+        (query, key, value, z) = torch.split(mixed_qkvz, qkvz_split_list, dim=2)
+        (b, a) = torch.split(mixed_ba, [self.num_v_heads_per_k_head, self.num_v_heads_per_k_head], dim=2)
+
+        query = query.reshape(-1, self.tp_num_k_heads * self.head_k_dim)  # flattened over heads
+        key = key.reshape(-1, self.tp_num_k_heads * self.head_k_dim)  # flattened over heads
+        value = value.reshape(-1, self.tp_num_v_heads * self.head_v_dim)  # flattened over heads
+        z = z.reshape(-1, self.tp_num_v_heads, self.head_v_dim)  # z keeps a separate head axis
+        b = b.reshape(-1, self.tp_num_v_heads)  # one scalar per value head
+        a = a.reshape(-1, self.tp_num_v_heads)  # one scalar per value head
+
+        return query, key, value, z, b, a
+
+    def _rearrange_mixed_qkv(self, mixed_qkv):
+        if mixed_qkv is None:
+            return None, None, None
+        query, key, value = torch.split(
+            mixed_qkv,
+            [self.tp_key_dim, self.tp_key_dim, self.tp_value_dim],
+            dim=-1,
+        )
+        query, key = map(lambda x: rearrange(x, "l (h d) -> 1 l h d", d=self.head_k_dim), (query, key))
+        value = rearrange(value, "l (h d) -> 1 l h d", d=self.head_v_dim)
+        return query, key, value
+
+    def _linear_attn(
+        self,
+        input: torch.Tensor,
+        infer_state: LlamaInferStateInfo,
+        layer_weight: Qwen3NextTransformerLayerWeight,
+        is_prefill: bool,
+        infer_cls: Qwen3NextTransformerLayerInfer,
+    ):  # linear-attention path for prefill (is_prefill=True) and decode; returns the linear_out_proj result
+        assert layer_weight.is_linear, "layer_weight must be linear"
+        assert isinstance(infer_state.mem_manager, Qwen3NextMemoryManager)
+
+        input = input.view(-1, infer_cls.embed_dim_)
+        buffer_idx = infer_state.req_manager.req_to_buffer_indexes[infer_state.b_req_idx]  # looked up by b_req_idx
+        conv_states, ssm_states = infer_state.mem_manager.get_buffer(self.layer_idx_)  # buffers for this layer index
+
+        mixed_qkvzba = layer_weight.linear_in_proj.mm(input)
+        q, k, v, z, b, a = self._fix_query_key_value_ba_ordering(mixed_qkvzba)
+        mixed_qkv = torch.cat([q, k, v], dim=-1)  # conv input: q, k, v concatenated on the last axis
+
+        if is_prefill:
+            mixed_qkv = mixed_qkv.transpose(0, 1)  # causal_conv1d_fn is fed the transposed layout
+            out_tensor = causal_conv1d_fn(
+                mixed_qkv,
+                layer_weight.linear_conv1d.mm_param.weight.transpose(0, 1),
+                bias=layer_weight.linear_conv1d.mm_param.bias,
+                query_start_loc=infer_state.b1_cu_q_seq_len,
+                cache_indices=buffer_idx,
+                has_initial_state=infer_state.b_ready_cache_len > 0,
+                conv_states=conv_states,
+                activation=self.activation,
+            )
+            mixed_qkv = out_tensor.transpose(0, 1)  # undo the transpose applied before the call
+        else:
+            mixed_qkv = causal_conv1d_update(  # decode: conv_states is passed together with conv_state_indices
+                mixed_qkv,
+                conv_states,
+                layer_weight.linear_conv1d.mm_param.weight.transpose(0, 1),
+                bias=layer_weight.linear_conv1d.mm_param.bias,
+                activation=self.activation,
+                conv_state_indices=buffer_idx,
+            )
+
+        # Rearrange mixed_qkv to query, key, value
+        query, key, value = self._rearrange_mixed_qkv(mixed_qkv)
+
+        g, beta = fused_gdn_gating(layer_weight.linear_A_log.weight, a, b, layer_weight.linear_dt_bias.weight)
+
+        if is_prefill:
+            initial_state = ssm_states[buffer_idx].contiguous()  # states selected by buffer_idx, made contiguous
+            (core_attn_out, last_recurrent_state,) = chunk_gated_delta_rule(
+                q=query,
+                k=key,
+                v=value,
+                g=g,
+                beta=beta,
+                initial_state=initial_state,
+                output_final_state=True,
+                cu_seqlens=infer_state.b1_cu_q_seq_len,
+                head_first=False,
+                use_qk_l2norm_in_kernel=True,
+            )
+            # Update SSM state with final state
+            ssm_states[buffer_idx, ...] = last_recurrent_state.to(ssm_states.dtype, copy=False)
+        else:
+            batch_size = input.shape[0]
+            cu_seqlens = torch.arange(0, batch_size + 1, dtype=torch.int32, device=input.device)  # step 1 per row
+            (core_attn_out, last_recurrent_state,) = fused_recurrent_gated_delta_rule(
+                q=query,
+                k=key,
+                v=value,
+                g=g,
+                beta=beta,
+                initial_state=ssm_states,
+                inplace_final_state=True,  # no explicit write-back of last_recurrent_state follows in this branch
+                cu_seqlens=cu_seqlens,
+                ssm_state_indices=buffer_idx,
+                use_qk_l2norm_in_kernel=True,
+            )
+
+        z_shape_og = z.shape  # remembered so the normalized output can be reshaped back to it
+        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+        z = z.reshape(-1, z.shape[-1])
+        norm_out = infer_cls.alloc_tensor(core_attn_out.shape, core_attn_out.dtype, device=core_attn_out.device)
+        gated_rmsnorm_forward(  # z is passed alongside core_attn_out; the result is written to norm_out
+            core_attn_out,
+            layer_weight.linear_norm.weight,
+            layer_weight.linear_norm.bias,
+            infer_cls.eps_,
+            z,
+            out=norm_out,
+        )
+        core_attn_out = norm_out.reshape(z_shape_og)
+        core_attn_out = rearrange(core_attn_out, "... h d -> ... (h d)")  # merge the head and head_dim axes
+
+        output = layer_weight.linear_out_proj.mm(core_attn_out)
+        if infer_cls.tp_world_size_ > 1:
+            all_reduce(output, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
+        return output
diff --git a/lightllm/models/qwen3next/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3next/layer_weights/transformer_layer_weight.py
new file mode 100644
index 000000000..611e2bc5a
--- /dev/null
+++ b/lightllm/models/qwen3next/layer_weights/transformer_layer_weight.py
@@ -0,0 +1,176 @@
+import os
+import torch
+import math
+import numpy as np
+from lightllm.common.basemodel import TransformerLayerWeight
+from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight
+from lightllm.utils.envs_utils import enable_env_vars
+from lightllm.common.basemodel.layer_weights.meta_weights import (
+    ROWMMWeight,
+    COLMMWeight,
+    NormWeight,
+)
+from functools import partial
+from typing_extensions import override
+from lightllm.common.basemodel.layer_weights.meta_weights import TpParameterWeight
+
+
+class Qwen3NextTransformerLayerWeight(Qwen3MOETransformerLayerWeight):
+    def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None):
+        super().__init__(layer_num, data_type, network_config, mode, quant_cfg)
+        return
+
+    @override
+    def _parse_config(self):
+        super()._parse_config()
+        self.full_attention_interval = self.network_config_["full_attention_interval"]
+        self.is_linear = (self.layer_num_ + 1) % self.full_attention_interval != 0
+        if self.is_linear:
+            self._parse_linear_config()
+        return
+
+    @override
+    def _init_weight(self):
+        self._init_moe()
+        self._init_shared_expert_weight()
+
+        self.att_norm_weight_ = NormWeight(
+            self._att_norm_weight_name, self.data_type_, bias_name=self._att_norm_bias_name
+        )
+        self.ffn_norm_weight_ = NormWeight(
+            self._ffn_norm_weight_name, self.data_type_, bias_name=self._ffn_norm_bias_name
+        )
+
+        if self.is_linear:
+            self._init_linear_weight()
+        else:
+            self._init_qkv()
+            self._init_o()
+            self.q_norm_weight_ = NormWeight(weight_name=self._q_norm_name, data_type=self.data_type_)
+            self.k_norm_weight_ = NormWeight(weight_name=self._k_norm_name, data_type=self.data_type_)
+            self._o_gate_weight_name = f"model.layers.{self.layer_num_}.self_attn.o_gate_proj.weight"
+            self.o_gate_proj = ROWMMWeight(
+                weight_names=self._o_gate_weight_name,
+                data_type=self.data_type_,
+                bias_names=None,
+                quant_cfg=self.quant_cfg,
+                layer_num=self.layer_num_,
+                name="o_gate_proj",
+            )
+        return
+
+    @override
+    def load_hf_weights(self, weights):
+        if self.is_linear:
+            linear_conv1d_weight_name = f"model.layers.{self.layer_num_}.linear_attn.conv1d.weight"
+            linear_conv1d_bias_name = f"model.layers.{self.layer_num_}.linear_attn.conv1d.bias"
+            if linear_conv1d_weight_name in weights:
+                weights[linear_conv1d_weight_name] = self._parse_linear_conv1d(
+                    weights[linear_conv1d_weight_name].squeeze(1)
+                )
+            if linear_conv1d_bias_name in weights:
+                weights[linear_conv1d_bias_name] = self._parse_linear_conv1d(weights[linear_conv1d_bias_name])
+        else:
+            self._split_q_with_gate(weights)
+        super().load_hf_weights(weights)
+
+    def _init_shared_expert_weight(self):
+        prefix = f"model.layers.{self.layer_num_}.mlp.shared_expert"
+        self.shared_expert_gate_up_proj = ROWMMWeight(
+            weight_names=[f"{prefix}.gate_proj.weight", f"{prefix}.up_proj.weight"],
+            data_type=self.data_type_,
+            quant_cfg=self.quant_cfg,
+            layer_num=self.layer_num_,
+            name="shared_expert_gate_up_proj",
+        )
+        self.shared_expert_down_proj = COLMMWeight(
+            weight_names=f"{prefix}.down_proj.weight",
+            data_type=self.data_type_,
+            quant_cfg=self.quant_cfg,
+            layer_num=self.layer_num_,
+            name="shared_expert_down_proj",
+        )
+        self.shared_expert_gate = ROWMMWeight(
+            weight_names=f"model.layers.{self.layer_num_}.mlp.shared_expert_gate.weight",
+            data_type=self.data_type_,
+            bias_names=None,
+            quant_cfg=self.quant_cfg,
+            layer_num=self.layer_num_,
+            name="shared_expert_gate",
+            tp_rank=0,
+            tp_world_size=1,
+        )
+
+    def _split_q_with_gate(self, weights):
+        if self._q_weight_name in weights:
+            weight = weights[self._q_weight_name]
+            num_heads = self.tp_q_head_num_ * self.tp_world_size_
+            weight = weight.view(num_heads * 2, self.head_dim, -1)
+            _q_proj = weight[0::2].reshape(-1, weight.shape[-1])
+            _gate_proj = weight[1::2].reshape(-1, weight.shape[-1])
+            weights[self._q_weight_name] = _q_proj
+            weights[self._o_gate_weight_name] = _gate_proj
+
+    def _parse_linear_conv1d(self, weight):
+        qk_dim = self.linear_num_k_heads * self.linear_k_head_dim
+        v_dim = self.linear_num_v_heads * self.linear_v_head_dim
+
+        q_bias, k_bias, v_bias = torch.split(weight, [qk_dim, qk_dim, v_dim], dim=0)
+        q_splits = q_bias.chunk(self.tp_world_size_, dim=0)
+        k_splits = k_bias.chunk(self.tp_world_size_, dim=0)
+        v_splits = v_bias.chunk(self.tp_world_size_, dim=0)
+
+        new_weight = torch.cat(
+            [torch.cat([q_splits[i], k_splits[i], v_splits[i]], dim=0) for i in range(self.tp_world_size_)], dim=0
+        )
+
+        return new_weight
+
+    def _parse_linear_config(self):
+        self.linear_num_v_heads = self.network_config_["linear_num_value_heads"]
+        self.linear_num_k_heads = self.network_config_["linear_num_key_heads"]
+        self.linear_k_head_dim = self.network_config_["linear_key_head_dim"]
+        self.linear_v_head_dim = self.network_config_["linear_value_head_dim"]
+
+    def _init_linear_weight(self):
+        prefix = f"model.layers.{self.layer_num_}.linear_attn"
+        self.linear_conv1d = ROWMMWeight(
+            weight_names=f"{prefix}.conv1d.weight",
+            data_type=self.data_type_,
+            quant_cfg=self.quant_cfg,
+            layer_num=self.layer_num_,
+            name="conv1d_weight",
+        )
+
+        self.linear_in_proj = ROWMMWeight(
+            weight_names=[f"{prefix}.in_proj_qkvz.weight", f"{prefix}.in_proj_ba.weight"],
+            data_type=self.data_type_,
+            quant_cfg=self.quant_cfg,
+            layer_num=self.layer_num_,
+            name="in_proj_weight",
+        )
+
+        self.linear_out_proj = COLMMWeight(
+            weight_names=f"{prefix}.out_proj.weight",
+            data_type=self.data_type_,
+            quant_cfg=self.quant_cfg,
+            layer_num=self.layer_num_,
+            name="out_proj_weight",
+        )
+
+        self.linear_dt_bias = TpParameterWeight(
+            weight_name=f"{prefix}.dt_bias",
+            data_type=torch.float32,
+            split_n_embed=self.linear_num_v_heads // self.tp_world_size_,
+        )
+
+        self.linear_A_log = TpParameterWeight(
+            weight_name=f"{prefix}.A_log",
+            data_type=torch.float32,
+            split_n_embed=self.linear_num_v_heads // self.tp_world_size_,
+        )
+
+        self.linear_norm = NormWeight(
+            weight_name=f"{prefix}.norm.weight",
+            data_type=self.data_type_,
+        )
diff --git a/lightllm/models/qwen3next/mem_manager.py b/lightllm/models/qwen3next/mem_manager.py
new file mode 100644
index 000000000..399ef487c
--- /dev/null
+++ b/lightllm/models/qwen3next/mem_manager.py
@@ -0,0 +1,138 @@
+import torch
+import numpy as np
+from typing import Dict, List, Protocol, Set, Union, Tuple, Optional
+from typing_extensions import override
+from lightllm.utils.log_utils import init_logger
+from lightllm.common.kv_cache_mem_manager.mem_manager import BaseAllocator, MemoryManager
+from lightllm.utils.envs_utils import get_env_start_args
+from lightllm.server.router.model_infer.infer_batch import InferReq
+from lightllm.utils.envs_utils import get_unique_server_name
+from lightllm.utils.dist_utils import get_current_rank_in_node
+from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
+from lightllm.server.router.dynamic_prompt.hybrid_radix_cache import HybridMemManager
+
+logger = init_logger(__name__)
+
+
+class LayerCacheMemoryManager(BaseAllocator):
+    """Slot allocator backed by one zero-initialised CUDA buffer that spans every layer."""
+
+    def __init__(self, size: int, dtype: torch.dtype, shape: Tuple[int, ...], layer_num: int, mem_manager_nmae: str):
+        super().__init__(size, mem_manager_nmae)
+        self.dtype, self.shape, self.layer_num = dtype, shape, layer_num
+        # layer_num has to be assigned first: _init_buffers reads it from self.
+        self._init_buffers(self.size, dtype, shape)
+
+    def _init_buffers(self, size, dtype, shape):
+        """Allocate ``self.buffer`` as (layer_num, size + 1, *shape) zeros on the CUDA device."""
+        buffer_shape = (self.layer_num, size + 1, *shape)
+        self.buffer = torch.zeros(buffer_shape, dtype=dtype, device="cuda")
+
+    def get_cell_size(self):
+        """Return the bytes one slot occupies summed over all layers."""
+        bytes_per_element = torch._utils._element_size(self.dtype)
+        per_layer_elements = np.prod(self.shape)
+        return per_layer_elements * self.layer_num * bytes_per_element
+
+
+class Qwen3NextMemoryManager(HybridMemManager):  # full-attention KV cache + linear-attention conv/ssm state caches
+    def __init__(
+        self,
+        full_attn_cache_size,
+        linear_attn_cache_size,
+        dtype,
+        num_kv_heads,
+        head_dim,
+        layer_num,
+        mtp_layer_num,
+        full_attention_interval: int,
+        conv_state_dtype: torch.dtype,
+        conv_state_shape: Tuple[int, ...],
+        ssm_state_dtype: torch.dtype,
+        ssm_state_shape: Tuple[int, ...],
+        max_req_num: int,
+        always_copy=False,
+        mem_fraction=0.9,
+    ):
+        self.full_attention_interval = full_attention_interval
+        # The layer count must be a whole number of full-attention intervals.
+        assert layer_num % full_attention_interval == 0
+        self.layer_num = layer_num
+        self.mtp_layer_num = mtp_layer_num
+        self.full_attn_layer_num = layer_num // full_attention_interval
+        self.linear_attn_layer_num = layer_num - self.full_attn_layer_num
+        # Per-layer, per-slot dtype and shape of the two linear-attention state caches.
+        self.conv_state_dtype = conv_state_dtype
+        self.conv_state_shape = conv_state_shape
+        self.ssm_state_dtype = ssm_state_dtype
+        self.ssm_state_shape = ssm_state_shape
+        # Buffers hold size + 1 slots; the last index is used as the hold slot (presumably never allocated).
+        assert linear_attn_cache_size is not None
+        self.HOLD_BUFFER_INDEX = linear_attn_cache_size
+        self.conv_state_mem_manager = LayerCacheMemoryManager(
+            linear_attn_cache_size, conv_state_dtype, conv_state_shape, self.linear_attn_layer_num, "conv_state"
+        )
+        self.ssm_state_mem_manager = LayerCacheMemoryManager(
+            linear_attn_cache_size, ssm_state_dtype, ssm_state_shape, self.linear_attn_layer_num, "ssm_state"
+        )
+        logger.info(
+            f"Linear attention state cache size: {linear_attn_cache_size}\n"
+            f"Conv state use : "
+            f"{self.conv_state_mem_manager.get_cell_size() * linear_attn_cache_size / 1024 ** 3} GB Memory.\n"
+            f"Ssm state use : "
+            f"{self.ssm_state_mem_manager.get_cell_size() * linear_attn_cache_size / 1024 ** 3} GB Memory.\n"
+        )
+        super().__init__(full_attn_cache_size, dtype, num_kv_heads, head_dim, layer_num, always_copy, mem_fraction)
+
+    @override
+    def _init_buffers(self, size, dtype, head_num, head_dim, layer_num):
+        # kv_buffer = [None, None, None, kv_cache, None, None, None, kv_cache, ...,
+        #                None, kv_cache, mtp_kv_cache, mtp_kv_cache]
+        self.kv_buffer = [None for _ in range(self.layer_num)]
+        for layer_id in range(self.full_attn_layer_num):
+            self.kv_buffer[(layer_id + 1) * self.full_attention_interval - 1] = torch.empty(
+                (size + 1, 2 * head_num, head_dim), dtype=dtype, device="cuda"
+            )
+        # One extra KV buffer per MTP layer, appended after the regular layers.
+        for _ in range(self.mtp_layer_num):
+            self.kv_buffer.append(torch.empty((size + 1, 2 * head_num, head_dim), dtype=dtype, device="cuda"))
+
+    @override
+    def free_all(self):
+        super().free_all()
+        self.conv_state_mem_manager.free_all()
+        self.ssm_state_mem_manager.free_all()
+        return
+
+    @override
+    def get_buffer(self, layer_index) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert layer_index < self.layer_num, "layer_index is out of range"
+        assert (layer_index + 1) % self.full_attention_interval != 0, "layer_index is not linear attention layer"
+        real_layer_index = layer_index - layer_index // self.full_attention_interval  # index among linear layers
+        return self.conv_state_mem_manager.buffer[real_layer_index], self.ssm_state_mem_manager.buffer[real_layer_index]
+
+    @override
+    def free_buffer(self, free_buffer_indexes: List[int], reset=True):
+        # conv_state and ssm_state share buffer_idx, so only the conv allocator is updated
+        self.conv_state_mem_manager.free(free_buffer_indexes)
+        if reset:
+            self.conv_state_mem_manager.buffer[:, free_buffer_indexes] = 0
+            self.ssm_state_mem_manager.buffer[:, free_buffer_indexes] = 0
+
+    @override
+    def alloc_buffer(self, need_size):
+        # conv_state and ssm_state share buffer_idx, so the conv allocator hands out the indexes
+        return self.conv_state_mem_manager.alloc(need_size)
+
+    @override
+    def get_buffer_can_use_size(self):
+        return self.conv_state_mem_manager.can_use_mem_size
+
+    @override
+    def copy_buffer(self, src_idx, tgt_idx):
+        assert src_idx is not None and tgt_idx is not None
+        assert src_idx != tgt_idx
+        # Use slice operation and in-place copy for better performance
+        self.conv_state_mem_manager.buffer[:, tgt_idx].copy_(self.conv_state_mem_manager.buffer[:, src_idx])
+        self.ssm_state_mem_manager.buffer[:, tgt_idx].copy_(self.ssm_state_mem_manager.buffer[:, src_idx])
+        return
diff --git a/lightllm/models/qwen3next/model.py b/lightllm/models/qwen3next/model.py
new file mode 100644
index 000000000..b83222027
--- /dev/null
+++ b/lightllm/models/qwen3next/model.py
@@ -0,0 +1,117 @@
+import os
+import torch
+from typing import Optional
+from typing_extensions import override
+import triton
+from lightllm.models.registry import ModelRegistry
+from lightllm.models.qwen3_moe.model import Qwen3MOEModel
+from lightllm.models.qwen3next.layer_weights.transformer_layer_weight import Qwen3NextTransformerLayerWeight
+from lightllm.models.qwen3next.layer_infer.transformer_layer_infer import Qwen3NextTransformerLayerInfer
+from lightllm.models.qwen3next.layer_infer.post_layer_infer import Qwen3NextPostLayerInfer
+from lightllm.utils.log_utils import init_logger
+from lightllm.distributed.communication_op import dist_group_manager
+from lightllm.utils.envs_utils import get_env_start_args
+from lightllm.models.qwen3next.mem_manager import Qwen3NextMemoryManager
+from lightllm.server.core.objs.start_args_type import StartArgs
+from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
+from lightllm.models.qwen3next.req_manager import Qwen3NextReqManager
+
+logger = init_logger(__name__)
+
+
+def _triton_allocator(size: int, alignment: int, stream: Optional[int]) -> torch.Tensor:
+    return torch.empty((size,), dtype=torch.int8, device="cuda")  # alignment and stream are ignored
+
+
+@ModelRegistry("qwen3_next")
+class Qwen3NextTpPartModel(Qwen3MOEModel):
+    """Qwen3MOEModel subclass wired to the Qwen3-Next weight, layer-infer and memory/request manager classes."""
+
+    # weight class
+    transformer_weight_class = Qwen3NextTransformerLayerWeight
+
+    # infer class
+    transformer_layer_infer_class = Qwen3NextTransformerLayerInfer
+    post_layer_infer_class = Qwen3NextPostLayerInfer
+
+    def __init__(self, kvargs) -> None:
+        self.mem_manager: Qwen3NextMemoryManager = None
+        # Set Triton allocator for TMA descriptors
+        # This is required for kernels in qwen3next/triton_kernel/fla/ops/solve_tril.py
+        triton.set_allocator(_triton_allocator)
+        logger.info("Triton allocator set for Qwen3Next model")
+        super().__init__(kvargs)
+
+    @override
+    def autotune_layers(self):
+        """Return the configured ``full_attention_interval``."""
+        return self.config["full_attention_interval"]
+
+    @override
+    def _init_config(self):
+        """Run the parent config setup, then derive the per-rank KV head count (at least 1)."""
+        super()._init_config()
+        self.num_kv_heads = max(self.config["num_key_value_heads"] // self.tp_world_size_, 1)
+
+    @override
+    def _init_custom(self):
+        """Run the parent setup, then call ``new_deepep_group`` with the expert count and hidden size."""
+        super()._init_custom()
+        dist_group_manager.new_deepep_group(self.config["num_experts"], self.config["hidden_size"])
+
+    @override
+    def _init_mem_manager(self):
+        """Build the Qwen3NextMemoryManager from the model config and the server start arguments."""
+        assert self.config["num_attention_heads"] % self.tp_world_size_ == 0
+        start_args: StartArgs = get_env_start_args()
+        mtp_step = start_args.mtp_step
+        mamba_cache_size = start_args.mamba_cache_size
+        if mamba_cache_size is not None:
+            assert (
+                mamba_cache_size >= start_args.running_max_req_size
+            ), "mamba_cache_size must be greater than or equal to running_max_req_size"
+
+        self.num_linear_k_heads = self.config["linear_num_key_heads"]
+        self.num_linear_v_heads = self.config["linear_num_value_heads"]
+        self.head_linear_k_dim = self.config["linear_key_head_dim"]
+        self.head_linear_v_dim = self.config["linear_value_head_dim"]
+
+        conv_kernel_size = self.config["linear_conv_kernel_dim"]
+        conv_dim = (
+            self.head_linear_k_dim * self.num_linear_k_heads * 2 + self.head_linear_v_dim * self.num_linear_v_heads
+        )
+
+        ssm_dtype_dict = {"bfloat16": torch.bfloat16, "float32": torch.float32}
+        assert start_args.mamba_ssm_data_type in ssm_dtype_dict
+
+        self.mem_manager = Qwen3NextMemoryManager(
+            full_attn_cache_size=self.max_total_token_num,
+            linear_attn_cache_size=mamba_cache_size,
+            dtype=self.data_type,
+            num_kv_heads=self.num_kv_heads,
+            head_dim=self.config["head_dim"],
+            layer_num=self.config["n_layer"],
+            mtp_layer_num=mtp_step,
+            full_attention_interval=self.config["full_attention_interval"],
+            conv_state_dtype=self.data_type,
+            conv_state_shape=(conv_dim // self.tp_world_size_, conv_kernel_size - 1 + mtp_step),
+            ssm_state_dtype=ssm_dtype_dict[start_args.mamba_ssm_data_type],
+            ssm_state_shape=(
+                # mtp_step + 1,
+                self.num_linear_v_heads // self.tp_world_size_,
+                self.head_linear_k_dim,
+                self.head_linear_v_dim,
+            ),
+            max_req_num=self.max_req_num,
+            mem_fraction=self.mem_fraction,
+        )
+
+    @override
+    def _init_req_manager(self):
+        """Create the request manager sized for the larger of batch_max_tokens and max_seq_length."""
+        create_max_seq_len = 0
+        if self.batch_max_tokens is not None:
+            create_max_seq_len = max(create_max_seq_len, self.batch_max_tokens)
+        if self.max_seq_length is not None:
+            create_max_seq_len = max(create_max_seq_len, self.max_seq_length)
+        self.req_manager = Qwen3NextReqManager(self.max_req_num, create_max_seq_len, self.mem_manager)
diff --git a/lightllm/models/qwen3next/req_manager.py b/lightllm/models/qwen3next/req_manager.py
new file mode 100644
index 000000000..ae1e961c6
--- /dev/null
+++ b/lightllm/models/qwen3next/req_manager.py
@@ -0,0 +1,50 @@
+from typing import List
+from typing_extensions import override
+import torch
+
+from lightllm.common.req_manager import ReqManager
+from lightllm.models.qwen3next.mem_manager import Qwen3NextMemoryManager
+
+
+class Qwen3NextReqManager(ReqManager):
+    """ReqManager that also maps every request slot to a linear-attention state buffer index."""
+
+    def __init__(self, max_request_num, max_sequence_length, mem_manager: Qwen3NextMemoryManager):
+        super().__init__(max_request_num, max_sequence_length, mem_manager)
+        self.EMPTY_BUFFER_INDEX = -1
+        self.req_to_buffer_indexes = torch.zeros((self.max_request_num + 1), dtype=torch.int32, device="cuda")
+        self.req_to_buffer_indexes[:-1] = self.EMPTY_BUFFER_INDEX
+        self.req_to_buffer_indexes[-1] = self.mem_manager.HOLD_BUFFER_INDEX  # extra last slot -> hold buffer
+
+    @override
+    def free(self, free_req_indexes: List[int], free_token_index):
+        self.free_buffer(free_req_indexes)  # clear the buffer mapping before the base class frees the slots
+        super().free(free_req_indexes, free_token_index)
+
+    @override
+    def free_all(self):
+        self.req_to_buffer_indexes[:-1] = self.EMPTY_BUFFER_INDEX  # the last (hold) entry is kept
+        super().free_all()
+
+    @override
+    def alloc(self):
+        """Allocate a request slot plus one state buffer and return the request index."""
+        from lightllm.common.basemodel.infer_lock import g_infer_state_lock
+        from lightllm.server.router.model_infer.infer_batch import g_infer_context
+
+        req_index = super().alloc()
+        g_infer_state_lock.acquire()
+        try:  # release the lock even if eviction or buffer allocation raises
+            if g_infer_context.radix_cache is not None:
+                g_infer_context.radix_cache.free_radix_cache_to_get_enough_buffer(1)
+            self.req_to_buffer_indexes[req_index] = self.mem_manager.alloc_buffer(1)
+        finally:
+            g_infer_state_lock.release()
+        return req_index
+
+    def free_buffer(self, free_req_indexes: List[int]):
+        """Clear the buffer mapping of the given requests; free the buffers only without a radix cache."""
+        from lightllm.server.router.model_infer.infer_batch import g_infer_context
+        if g_infer_context.radix_cache is None:
+            self.mem_manager.free_buffer(self.req_to_buffer_indexes[free_req_indexes])
+        self.req_to_buffer_indexes[free_req_indexes] = self.EMPTY_BUFFER_INDEX
diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
new file mode 100644
index 000000000..c6d099a2d
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
@@ -0,0 +1,122 @@
+# Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
+
+from typing import Optional
+
+import torch
+
+from sgl_kernel import causal_conv1d_fwd
+from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
+
+def causal_conv1d_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    query_start_loc: Optional[torch.Tensor] = None,
+    cache_indices: Optional[torch.Tensor] = None,
+    has_initial_state: Optional[torch.Tensor] = None,
+    conv_states: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+    pad_slot_id: int = -1,
+    **kwargs,
+):
+    """
+    Causal 1-D convolution forward pass via ``sgl_kernel.causal_conv1d_fwd``.
+
+    x: (batch, dim, seqlen), or (dim, cu_seq_len) for varlen input where the
+        sequences are concatenated from left to right
+    weight: (dim, width)
+    bias: (dim,)
+    query_start_loc: (batch + 1) int32
+        Cumulative sequence lengths with a leading 0, used to locate each
+        sequence inside x. Example: query_start_loc = torch.Tensor([0,10,16,17])
+        goes with x.shape=(dim,17)
+    cache_indices: (batch) int32
+        State slot of every sequence, i.e.
+        conv_state = conv_states[cache_indices[batch_id]]
+    has_initial_state: (batch) bool
+        Whether the kernel should start from the state currently stored
+        for that sequence
+    conv_states: (...,dim,width - 1) itype
+        updated inplace if provided
+    activation: either None or "silu" or "swish"
+    pad_slot_id: int
+        Value in cache_indices that marks a padded entry the kernel skips,
+        for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
+        leaves the entries at indices 0 and 3 unprocessed
+    kwargs: accepted and ignored
+
+    Returns x, made contiguous first when its last stride is not 1
+    out: (batch, dim, seqlen)
+    """
+    use_activation = activation in ["silu", "swish"]
+    if not use_activation and activation is not None:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    x = x if x.stride(-1) == 1 else x.contiguous()
+    if bias is not None:
+        bias = bias.contiguous()
+    causal_conv1d_fwd(
+        x,
+        weight,
+        bias,
+        conv_states,
+        query_start_loc,
+        cache_indices,
+        has_initial_state,
+        use_activation,
+        pad_slot_id,
+    )
+    return x
+
+
+def causal_conv1d_update(
+    x: torch.Tensor,
+    conv_state: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    activation: Optional[str] = None,
+    cache_seqlens: Optional[torch.Tensor] = None,
+    conv_state_indices: Optional[torch.Tensor] = None,
+    pad_slot_id: int = -1,
+):
+    """
+    Single-step causal convolution update via ``sgl_kernel.causal_conv1d_update``.
+
+    x: (batch, dim) or (batch, dim, seqlen)
+    conv_state: (batch, dim, state_len), where state_len >= width - 1
+    weight: (dim, width)
+    bias: (dim,)
+    activation: either None or "silu" or "swish"
+    cache_seqlens: (batch,), dtype int32.
+        If not None, conv_state is treated as a circular buffer and x is
+        copied into it starting at index @cache_seqlens % state_len.
+    conv_state_indices: (batch,), dtype int32
+        If not None, conv_state is a larger tensor along the batch dim and
+        these are the batch coordinates to use (continuous batching).
+    pad_slot_id: int
+        Value in conv_state_indices that marks a padded entry the kernel
+        skips, for example: conv_state_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
+        leaves the entries at indices 0 and 3 unprocessed
+
+    Returns x with the same rank it was passed in with
+    out: (batch, dim) or (batch, dim, seqlen)
+    """
+    use_activation = activation in ["silu", "swish"]
+    if not use_activation and activation is not None:
+        raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}")
+    # A 2-D x gets a trailing length-1 axis for the kernel call and loses it again on return.
+    was_2d = x.dim() == 2
+    kernel_x = x.unsqueeze(-1) if was_2d else x
+    causal_conv1d_update_kernel(
+        kernel_x,
+        conv_state,
+        weight,
+        bias,
+        use_activation,
+        cache_seqlens,
+        conv_state_indices,
+        pad_slot_id,
+    )
+    if was_2d:
+        return kernel_x.squeeze(-1)
+    return kernel_x
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/__init__.py b/lightllm/models/qwen3next/triton_kernel/fla/__init__.py
new file mode 100644
index 000000000..2bde70bb9
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/__init__.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+# Adapted from
+# https://github.com/vllm-project/vllm
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/__init__.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/__init__.py
new file mode 100644
index 000000000..cd3b0962a
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/__init__.py
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+from .chunk import chunk_gated_delta_rule
+from .fused_recurrent import fused_recurrent_gated_delta_rule
+
+__all__ = [
+    "chunk_gated_delta_rule",
+    "fused_recurrent_gated_delta_rule",
+]
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk.py
new file mode 100644
index 000000000..db4969cb0
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk.py
@@ -0,0 +1,235 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+import warnings
+
+import torch
+from einops import rearrange
+
+from .chunk_delta_h import chunk_gated_delta_rule_fwd_h
+from .chunk_o import chunk_fwd_o
+from .chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd
+from .cumsum import chunk_local_cumsum
+from .l2norm import l2norm_fwd
+from .solve_tril import solve_tril
+from .utils import SUPPRESS_LEVEL, input_guard
+from .wy_fast import recompute_w_u_fwd
+
+
+def chunk_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    output_final_state: bool,
+    cu_seqlens: torch.LongTensor | None = None,
+):
+    """Run the chunked gated-delta-rule forward pipeline with a fixed chunk size of 64.
+
+    Returns ``(g, o, A, final_state, w, h, v_new)`` where ``g`` is the output of
+    ``chunk_local_cumsum``; the last three entries are None while ``SUPPRESS_LEVEL < 3``.
+    """
+    chunk_size = 64
+    g = chunk_local_cumsum(g, chunk_size=chunk_size, cu_seqlens=cu_seqlens)
+    # obtain WY representation. u is actually the new v.
+    A = chunk_scaled_dot_kkt_fwd(
+        k=k, beta=beta, g=g, cu_seqlens=cu_seqlens, chunk_size=chunk_size, output_dtype=torch.float32
+    )
+    A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype)
+    w, u = recompute_w_u_fwd(k=k, v=v, beta=beta, A=A, g_cumsum=g, cu_seqlens=cu_seqlens)
+    h, v_new, final_state = chunk_gated_delta_rule_fwd_h(
+        k=k,
+        w=w,
+        u=u,
+        g=g,
+        initial_state=initial_state,
+        output_final_state=output_final_state,
+        cu_seqlens=cu_seqlens,
+        chunk_size=chunk_size,
+    )
+    o = chunk_fwd_o(
+        q=q,
+        k=k,
+        v=v_new,
+        h=h,
+        g=g,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_size=chunk_size,
+    )
+    if SUPPRESS_LEVEL < 3:
+        return g, o, A, final_state, None, None, None
+    return g, o, A, final_state, w, h, v_new
+
+
+class ChunkGatedDeltaRuleFunction(torch.autograd.Function):
+    """Forward-only wrapper: optional l2norm_fwd on q/k, then chunk_gated_delta_rule_fwd -> (o, final_state)."""
+
+    @staticmethod
+    @input_guard
+    @torch.amp.custom_fwd(device_type="cuda")
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        output_final_state: bool,
+        cu_seqlens: torch.LongTensor | None = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
+        if use_qk_l2norm_in_kernel:
+            q, k = l2norm_fwd(q), l2norm_fwd(k)
+        _, o, _, final_state, _, _, _ = chunk_gated_delta_rule_fwd(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            scale=scale,
+            initial_state=initial_state,
+            output_final_state=output_final_state,
+            cu_seqlens=cu_seqlens,
+        )
+        ctx.scale = scale
+        ctx.use_qk_l2norm_in_kernel = use_qk_l2norm_in_kernel
+        return o.to(q.dtype), final_state
+
+
+@torch.compiler.disable
+def chunk_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    output_final_state: bool = False,
+    cu_seqlens: torch.LongTensor | None = None,
+    head_first: bool = False,
+    use_qk_l2norm_in_kernel: bool = False,
+):
+    r"""
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+        v (torch.Tensor):
+            values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+        g (torch.Tensor):
+            (forget) gating tensor (in log space!) of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+        beta (torch.Tensor):
+            betas of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+        scale (Optional[float]):
+            Scale factor for the attention scores.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        initial_state (Optional[torch.Tensor]):
+            Initial state of shape `[N, H, K, V]` for `N` input sequences.
+            For equal-length input sequences, `N` equals the batch size `B`.
+            Default: `None`.
+        output_final_state (Optional[bool]):
+            Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
+        cu_seqlens (torch.LongTensor):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+        head_first (Optional[bool]):
+            Deprecated: passing `True` raises `DeprecationWarning`. Default: `False`.
+        use_qk_l2norm_in_kernel (Optional[bool]): apply `l2norm_fwd` to q and k first. Default: `False`.
+
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+        final_state (torch.Tensor):
+            Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
+
+    Examples::
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> from einops import rearrange
+        >>> from fla.ops.gated_delta_rule import chunk_gated_delta_rule
+        # inputs with equal lengths
+        >>> B, T, H, K, V = 4, 2048, 4, 512, 512
+        >>> q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda')
+        >>> k = F.normalize(torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda'), p=2, dim=-1)
+        >>> v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device='cuda')
+        >>> beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid()
+        >>> g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda'))
+        >>> h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda')
+        >>> o, ht = chunk_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True
+        )
+        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+        >>> q, k, v, beta, g = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, beta, g))
+        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+        >>> o_var, ht_var = chunk_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True,
+            cu_seqlens=cu_seqlens
+        )
+    """
+    assert q.dtype == k.dtype == v.dtype
+    assert q.dtype != torch.float32, "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16."
+    assert len(beta.shape) == 3, "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."
+
+    if head_first:
+        # `stacklevel` belongs to warnings.warn(); exception constructors reject keyword
+        # arguments, so passing it here raised TypeError instead of the intended error.
+        raise DeprecationWarning(
+            "head_first is deprecated and will be removed in a future version. "
+            "Please use head_first=False for now instead."
+        )
+    if q.shape[1] < q.shape[2]:
+        warnings.warn(
+            "Input tensor shape suggests potential"
+            f" format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
+            "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
+            "when head_first=False was specified. "
+            "Please verify your input tensor format matches the expected shape [B, T, H, ...].",
+            stacklevel=2,
+        )
+    if cu_seqlens is not None:
+        if q.shape[0] != 1:
+            raise ValueError(
+                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+                f"Please flatten variable-length inputs before processing."
+            )
+        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
+            raise ValueError(
+                f"The number of initial states is expected to be equal to the number of input sequences, "
+                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
+            )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    o, final_state = ChunkGatedDeltaRuleFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state,
+        output_final_state,
+        cu_seqlens,
+        use_qk_l2norm_in_kernel,
+    )
+    # head_first=True is rejected above, so the output is always in
+    # [B, T, H, ...] layout and needs no rearrangement.
+    return o, final_state
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_delta_h.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_delta_h.py
new file mode 100644
index 000000000..b27fe7ada
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_delta_h.py
@@ -0,0 +1,325 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+
+import torch
+
+import triton
+import triton.language as tl
+
+from .index import prepare_chunk_indices, prepare_chunk_offsets
+from .op import exp, safe_exp
+from .utils import use_cuda_graph
+from lightllm.common.triton_utils.autotuner import autotune
+
+NUM_WARPS = [2, 4, 8, 16]
+
+
+@triton.heuristics(  # constexpr flags derived from which optional pointers are None
+    {
+        "USE_G": lambda args: args["g"] is not None,
+        "USE_GK": lambda args: args["gk"] is not None,
+        "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
+        "STORE_FINAL_STATE": lambda args: args["ht"] is not None,
+        "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
+    k,
+    v,
+    w,
+    v_new,
+    g,
+    gk,
+    h,
+    h0,
+    ht,
+    cu_seqlens,
+    chunk_offsets,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BV: tl.constexpr,
+    USE_G: tl.constexpr,
+    USE_GK: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,
+    STORE_FINAL_STATE: tl.constexpr,
+    SAVE_NEW_VALUE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_v, i_nh = tl.program_id(0), tl.program_id(1)  # V-block index, fused (sequence, head) index
+    i_n, i_h = i_nh // H, i_nh % H
+    if IS_VARLEN:
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos  # per-sequence length replaces the T argument
+        NT = tl.cdiv(T, BT)
+        boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        NT = tl.cdiv(T, BT)
+        boh = i_n * NT
+
+    # running state tiles: up to four [64, BV] fp32 blocks, one per 64 rows of K
+    b_h1 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 64:
+        b_h2 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 128:
+        b_h3 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 192:
+        b_h4 = tl.zeros([64, BV], dtype=tl.float32)
+
+    # advance the base pointers to this sequence (bos / boh) and head (i_h)
+    h += ((boh * H + i_h) * K * V).to(tl.int64)
+    v += ((bos * H + i_h) * V).to(tl.int64)
+    k += ((bos * Hg + i_h // (H // Hg)) * K).to(tl.int64)
+    w += ((bos * H + i_h) * K).to(tl.int64)
+    if SAVE_NEW_VALUE:
+        v_new += ((bos * H + i_h) * V).to(tl.int64)
+    stride_v = H * V
+    stride_h = H * K * V
+    stride_k = Hg * K
+    stride_w = H * K
+    if USE_INITIAL_STATE:
+        h0 = h0 + i_nh * K * V
+    if STORE_FINAL_STATE:
+        ht = ht + i_nh * K * V
+
+    # seed the state tiles from h0 when an initial state was passed
+    if USE_INITIAL_STATE:
+        p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+        b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32)
+        if K > 64:
+            p_h0_2 = tl.make_block_ptr(h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0))
+            b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32)
+        if K > 128:
+            p_h0_3 = tl.make_block_ptr(h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0))
+            b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32)
+        if K > 192:
+            p_h0_4 = tl.make_block_ptr(h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0))
+            b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32)
+
+    # main recurrence: per BT-sized chunk, store the incoming state to h, then update it
+    for i_t in range(NT):
+        p_h1 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+        tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1))
+        if K > 64:
+            p_h2 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1))
+        if K > 128:
+            p_h3 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1))
+        if K > 192:
+            p_h4 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1))
+
+        p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 0), (BT, 64), (1, 0))
+        b_w = tl.load(p_w, boundary_check=(0, 1))
+        b_v = tl.dot(b_w, b_h1.to(b_w.dtype))
+        if K > 64:
+            p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 64), (BT, 64), (1, 0))
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            b_v += tl.dot(b_w, b_h2.to(b_w.dtype))
+        if K > 128:
+            p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 128), (BT, 64), (1, 0))
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            b_v += tl.dot(b_w, b_h3.to(b_w.dtype))
+        if K > 192:
+            p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 192), (BT, 64), (1, 0))
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            b_v += tl.dot(b_w, b_h4.to(b_w.dtype))
+        p_v = tl.make_block_ptr(v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+        b_v = tl.load(p_v, boundary_check=(0, 1)) - b_v  # v - w @ h
+
+        if SAVE_NEW_VALUE:
+            p_v = tl.make_block_ptr(v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+            tl.store(p_v, b_v.to(p_v.dtype.element_ty), boundary_check=(0, 1))
+
+        last_idx = min((i_t + 1) * BT, T) - 1  # last valid token index within this chunk
+        if USE_G:
+            b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
+            p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+            b_g = tl.load(p_g, boundary_check=(0,))
+            b_v = b_v * safe_exp(b_g_last - b_g)[:, None]
+            b_g_last = exp(b_g_last)
+            b_h1 = b_h1 * b_g_last
+            if K > 64:
+                b_h2 = b_h2 * b_g_last
+            if K > 128:
+                b_h3 = b_h3 * b_g_last
+            if K > 192:
+                b_h4 = b_h4 * b_g_last
+
+        if USE_GK:
+            o_k1 = tl.arange(0, 64)
+            b_gk_last1 = tl.load(
+                gk + (bos + last_idx) * H * K + i_h * K + o_k1,
+                mask=(o_k1 < K),
+                other=0.0,
+            )
+            b_h1 *= exp(b_gk_last1)[:, None]
+            if K > 64:
+                o_k2 = 64 + o_k1
+                b_gk_last2 = tl.load(
+                    gk + (bos + last_idx) * H * K + i_h * K + o_k2,
+                    mask=(o_k2 < K),
+                    other=0.0,
+                )
+                b_h2 *= exp(b_gk_last2)[:, None]
+            if K > 128:
+                o_k3 = 128 + o_k1
+                b_gk_last3 = tl.load(
+                    gk + (bos + last_idx) * H * K + i_h * K + o_k3,
+                    mask=(o_k3 < K),
+                    other=0.0,
+                )
+                b_h3 *= exp(b_gk_last3)[:, None]
+            if K > 192:
+                o_k4 = 192 + o_k1
+                b_gk_last4 = tl.load(
+                    gk + (bos + last_idx) * H * K + i_h * K + o_k4,
+                    mask=(o_k4 < K),
+                    other=0.0,
+                )
+                b_h4 *= exp(b_gk_last4)[:, None]
+        b_v = b_v.to(k.dtype.element_ty)
+
+        p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1))
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_h1 += tl.dot(b_k, b_v)
+        if K > 64:
+            p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1))
+            b_k = tl.load(p_k, boundary_check=(0, 1))
+            b_h2 += tl.dot(b_k, b_v)
+        if K > 128:
+            p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1))
+            b_k = tl.load(p_k, boundary_check=(0, 1))
+            b_h3 += tl.dot(b_k, b_v)
+        if K > 192:
+            p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1))
+            b_k = tl.load(p_k, boundary_check=(0, 1))
+            b_h4 += tl.dot(b_k, b_v)
+    # epilogue: write the state left after the last chunk to ht
+    if STORE_FINAL_STATE:
+        p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+        tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 64:
+            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 128:
+            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 192:
+            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+
+
+def _get_chunk_delta_h_configs():
+    return [  # full num_warps x num_stages x BV grid, BV varying fastest
+        dict(BV=block_v, num_warps=warps, num_stages=stages)
+        for warps in (2, 4)
+        for stages in (2, 3, 4)
+        for block_v in (32, 64)
+    ]
+
+
+def _get_chunk_delta_h_static_key(k, u, chunk_size):
+    # Static autotune key: head count and value dim from u, key dim from k, plus the chunk size.
+    _, _, _, key_dim = k.shape  # k must be 4-D
+    value_dim, num_heads = u.shape[-1], u.shape[-2]
+    return {"H": num_heads, "K": key_dim, "V": value_dim, "BT": chunk_size}
+
+
+def _get_chunk_delta_h_run_key(k, u):
+    batch, key_heads = k.shape[0], k.shape[2]  # u is accepted but unused
+    return batch * key_heads
+
+
+@autotune(
+    kernel_name="chunk_gated_delta_rule_fwd_h",
+    configs_gen_func=_get_chunk_delta_h_configs,
+    static_key_func=_get_chunk_delta_h_static_key,
+    run_key_func=_get_chunk_delta_h_run_key,
+)
+def chunk_gated_delta_rule_fwd_h(
+    k: torch.Tensor,
+    w: torch.Tensor,
+    u: torch.Tensor,
+    g: torch.Tensor | None = None,
+    gk: torch.Tensor | None = None,
+    initial_state: torch.Tensor | None = None,
+    output_final_state: bool = False,
+    chunk_size: int = 64,  # SY: remove this argument and force chunk size 64?
+    save_new_value: bool = True,
+    cu_seqlens: torch.LongTensor | None = None,
+    run_config=None,
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+    # This kernel is slightly different from fla to support Q/K with different head numbers.
+    # In fla, Q/K always have the same head number, so Hg is always equal to H.
+    B, T, Hg, K, V = *k.shape, u.shape[-1]
+    H = u.shape[-2]
+    BT = chunk_size
+
+    chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size) if cu_seqlens is not None else None
+    # N: the actual number of sequences in the batch with either equal or variable lengths
+    if cu_seqlens is None:
+        N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+    else:
+        N, NT, chunk_offsets = (
+            len(cu_seqlens) - 1,
+            len(chunk_indices),
+            prepare_chunk_offsets(cu_seqlens, BT),
+        )
+    assert K <= 256, "current kernel does not support head dimension larger than 256."
+
+    h = k.new_empty(B, NT, H, K, V)  # one [K, V] state slot per chunk and head
+    final_state = k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
+
+    v_new = torch.empty_like(u) if save_new_value else None
+
+    # Launch parameters: fall back to a fixed config when none was supplied
+    if run_config is None:
+        run_config = {"BV": 64, "num_warps": 2, "num_stages": 2}
+
+    BV = run_config.get("BV", 64)
+    num_warps = run_config.get("num_warps", 2)
+    num_stages = run_config.get("num_stages", 2)
+
+    grid = (triton.cdiv(V, BV), N * H)  # one program per (V block, sequence * head)
+
+    chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid](
+        k=k,
+        v=u,
+        w=w,
+        v_new=v_new,
+        g=g,
+        gk=gk,
+        h=h,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        chunk_offsets=chunk_offsets,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BV=BV,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    return h, v_new, final_state  # v_new / final_state are None when not requested
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_o.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_o.py
new file mode 100644
index 000000000..fc49763ec
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_o.py
@@ -0,0 +1,205 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+# ruff: noqa: E501
+
+
+import torch
+
+import triton
+import triton.language as tl
+
+from .index import prepare_chunk_indices
+from .op import exp, safe_exp
+from .utils import FLA_GDN_FIX_BT, check_shared_mem, is_nvidia_hopper
+from lightllm.common.triton_utils.autotuner import autotune
+
+BKV_LIST = [64, 128] if check_shared_mem() else [32, 64]
+NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8]
+
+
+@triton.heuristics(  # constexpr flags derived from which optional pointers are None
+    {
+        "USE_G": lambda args: args["g"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_fwd_kernel_o(
+    q,
+    k,
+    v,
+    h,
+    g,
+    o,
+    cu_seqlens,
+    chunk_indices,
+    scale,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_G: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+
+    if IS_VARLEN:
+        i_tg = i_t  # global chunk index, used to address h
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos  # per-sequence length replaces the T argument
+        NT = tl.cdiv(T, BT)
+    else:
+        NT = tl.cdiv(T, BT)
+        i_tg = i_b * NT + i_t
+        bos, eos = i_b * T, i_b * T + T
+
+    # advance the base pointers to this sequence and head (q/k are indexed by the Hg head group)
+    q += (bos * Hg + i_h // (H // Hg)) * K
+    k += (bos * Hg + i_h // (H // Hg)) * K
+    v += (bos * H + i_h) * V
+    o += (bos * H + i_h) * V
+    h += (i_tg * H + i_h).to(tl.int64) * K * V
+
+    b_o = tl.zeros([BT, BV], dtype=tl.float32)
+    b_A = tl.zeros([BT, BT], dtype=tl.float32)
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_q = tl.make_block_ptr(q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+        p_k = tl.make_block_ptr(k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+        p_h = tl.make_block_ptr(h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+        # [BT, BK]
+        b_q = tl.load(p_q, boundary_check=(0, 1))
+        # [BK, BT]
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        # [BK, BV]
+        b_h = tl.load(p_h, boundary_check=(0, 1))
+
+        # [BT, BK] @ [BK, BV] -> [BT, BV]
+        b_o += tl.dot(b_q, b_h)
+        # [BT, BK] @ [BK, BT] -> [BT, BT]
+        b_A += tl.dot(b_q, b_k)
+
+    if USE_G:
+        g += bos * H + i_h
+        p_g = tl.make_block_ptr(g, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        b_g = tl.load(p_g, boundary_check=(0,))
+        b_o = b_o * exp(b_g)[:, None]
+        b_A = b_A * safe_exp(b_g[:, None] - b_g[None, :])
+
+    o_t = i_t * BT + tl.arange(0, BT)
+    m_t = o_t < T
+    m_A = (o_t[:, None] >= o_t[None, :]) & (m_t[:, None] & m_t)  # lower-triangular incl. diagonal, valid tokens only
+    b_A = tl.where(m_A, b_A, 0)
+
+    p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+    p_o = tl.make_block_ptr(o, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+    b_v = tl.load(p_v, boundary_check=(0, 1))
+
+    # to fix mma -> mma layout conversion
+    # already solved by triton v3.2 or higher
+    b_o = b_o * scale + tl.dot(b_A.to(b_v.dtype), b_v) * scale
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def _get_chunk_o_configs():
+    return [  # full BK x BV x num_warps x num_stages grid, BK varying slowest
+        dict(BK=block_k, BV=block_v, num_warps=warps, num_stages=stages)
+        for block_k in BKV_LIST
+        for block_v in BKV_LIST
+        for warps in NUM_WARPS
+        for stages in (2, 3, 4)
+    ]
+
+
+def _get_chunk_o_static_key(q, v, chunk_size):
+    _, seq_len, _, key_dim = q.shape  # q must be 4-D
+    value_dim, num_heads = v.shape[-1], v.shape[-2]
+    # BT is fixed at 64 under FLA_GDN_FIX_BT, else chunk_size capped by the padded sequence length.
+    block_t = 64 if FLA_GDN_FIX_BT else min(chunk_size, max(16, triton.next_power_of_2(seq_len)))
+    return {"H": num_heads, "K": key_dim, "V": value_dim, "BT": block_t}
+
+
+def _get_chunk_o_run_key(q, v):
+    batch, q_heads = q.shape[0], q.shape[2]  # v is accepted but unused
+    return batch * q_heads
+
+
+@autotune(
+    kernel_name="chunk_fwd_o",
+    configs_gen_func=_get_chunk_o_configs,
+    static_key_func=_get_chunk_o_static_key,
+    run_key_func=_get_chunk_o_run_key,
+)
+def chunk_fwd_o(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    h: torch.Tensor,
+    g: torch.Tensor | None = None,  # cumsum of log decay
+    scale: float | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    chunk_size: int = 64,
+    run_config=None,
+) -> torch.Tensor:
+    B, T, Hg, K, V = *q.shape, v.shape[-1]  # q must be 4-D; V is v's last dim
+    H = v.shape[-2]
+    BT = 64 if FLA_GDN_FIX_BT else min(chunk_size, max(16, triton.next_power_of_2(T)))
+    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    if scale is None:
+        scale = k.shape[-1] ** -0.5  # default scale: 1 / sqrt(last dim of k)
+
+    o = torch.empty_like(v)  # output has the same shape and dtype as v
+
+    # Launch parameters: fall back to a fixed config when none was supplied
+    if run_config is None:
+        run_config = {"BK": 64, "BV": 64, "num_warps": 2, "num_stages": 2}
+
+    BK = run_config.get("BK", 64)
+    BV = run_config.get("BV", 64)
+    num_warps = run_config.get("num_warps", 2)
+    num_stages = run_config.get("num_stages", 2)
+
+    grid = (triton.cdiv(V, BV), NT, B * H)  # (V blocks, chunks, batch * heads)
+
+    chunk_fwd_kernel_o[grid](
+        q,
+        k,
+        v,
+        h,
+        g,
+        o,
+        cu_seqlens,
+        chunk_indices,
+        scale,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    return o
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_scaled_dot_kkt.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_scaled_dot_kkt.py
new file mode 100644
index 000000000..715d52dfa
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/chunk_scaled_dot_kkt.py
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+
+import torch
+
+import triton
+import triton.language as tl
+
+from .index import prepare_chunk_indices
+from .op import exp, safe_exp
+from lightllm.common.triton_utils.autotuner import autotune
+
+# NOTE: no Triton allocator is configured here; the kernel below only uses block pointers.
+
+
+@triton.heuristics(  # constexpr flags derived from which optional pointers are None
+    {
+        "USE_G": lambda args: args["g"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_scaled_dot_kkt_fwd_kernel(
+    k,
+    beta,
+    g,
+    A,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    USE_G: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)  # chunk index, fused (batch, head) index
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos  # per-sequence length replaces the T argument
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    o_t = i_t * BT + tl.arange(0, BT)
+    m_t = o_t < T  # True for token positions inside the sequence
+
+    p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+    b_beta = tl.load(p_beta, boundary_check=(0,))
+
+    b_A = tl.zeros([BT, BT], dtype=tl.float32)
+    for i_k in range(tl.cdiv(K, BK)):  # accumulate k @ k^T over BK-wide slices of K
+        p_k = tl.make_block_ptr(
+            k + (bos * Hg + i_h // (H // Hg)) * K,
+            (T, K),
+            (Hg * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_A += tl.dot(b_k, tl.trans(b_k))
+
+    if USE_G:
+        p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        b_g = tl.load(p_g, boundary_check=(0,))
+        b_g_diff = b_g[:, None] - b_g[None, :]
+        b_A = b_A * safe_exp(b_g_diff)
+
+    b_A *= b_beta[:, None]  # scale row i by beta[i]
+    m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t)  # strictly lower-triangular, valid tokens only
+    b_A = tl.where(m_A, b_A, 0)
+    p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1), (i_t * BT, 0), (BT, BT), (1, 0))
+    tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))
+
+
+def _get_chunk_scaled_dot_kkt_configs():
+    return [  # full BK x num_warps x num_stages grid, BK varying slowest
+        dict(BK=block_k, num_warps=warps, num_stages=stages)
+        for block_k in (32, 64, 128)
+        for warps in (2, 4, 8)
+        for stages in (2, 3, 4)
+    ]
+
+
+def _get_chunk_scaled_dot_kkt_static_key(k, beta, chunk_size=64, cu_seqlens=None):
+    # Static autotune key: beta's last dim as H, k's last dim as K, chunk size and the varlen flag.
+    _, _, _, key_dim = k.shape  # k must be 4-D
+    num_heads, varlen = beta.shape[-1], cu_seqlens is not None
+    return {"H": num_heads, "K": key_dim, "BT": chunk_size, "IS_VARLEN": varlen}
+
+
+def _get_chunk_scaled_dot_kkt_run_key(k, beta):
+    batch, key_heads = k.shape[0], k.shape[2]  # beta is accepted but unused
+    return batch * key_heads
+
+
+@autotune(
+    kernel_name="chunk_scaled_dot_kkt_fwd",
+    configs_gen_func=_get_chunk_scaled_dot_kkt_configs,
+    static_key_func=_get_chunk_scaled_dot_kkt_static_key,
+    run_key_func=_get_chunk_scaled_dot_kkt_run_key,
+)
+def chunk_scaled_dot_kkt_fwd(
+    k: torch.Tensor,
+    g: torch.Tensor | None = None,
+    beta: torch.Tensor | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    chunk_size: int = 64,
+    output_dtype: torch.dtype = torch.float32,
+    run_config=None,
+) -> torch.Tensor:
+    r"""
+    Compute beta * K * K^T.
+
+    Args:
+        k (torch.Tensor):
+            The key tensor of shape `[B, T, Hg, K]`.
+        beta (torch.Tensor):
+            The beta tensor of shape `[B, T, H]`. Must be given despite the `None` default.
+        g (torch.Tensor):
+            The cumulative sum of the gate tensor of shape `[B, T, H]`. Default: `None`.
+        cu_seqlens (torch.LongTensor):
+            The cumulative sequence lengths of the input tensor.
+            Default: None
+        chunk_size (int):
+            The chunk size. Default: 64.
+        output_dtype (torch.dtype):
+            The dtype of the output tensor. Default: `torch.float32`
+
+    Returns:
+        beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size.
+    """
+    # This kernel is slightly different from fla to support Q/K with different head numbers.
+    # In fla, Q/K always have the same head number, so Hg is always equal to H.
+    B, T, Hg, K = k.shape
+    H = beta.shape[-1]
+    BT = chunk_size
+    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    # Launch parameters: fall back to a fixed config when none was supplied
+    if run_config is None:
+        run_config = {"BK": 64, "num_warps": 2, "num_stages": 2}
+
+    BK = run_config.get("BK", 64)
+    num_warps = run_config.get("num_warps", 2)
+    num_stages = run_config.get("num_stages", 2)
+
+    A = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype)
+    chunk_scaled_dot_kkt_fwd_kernel[(NT, B * H)](  # grid: (chunks, batch * heads)
+        k=k,
+        g=g,
+        beta=beta,
+        A=A,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        BT=BT,
+        BK=BK,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    return A
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/cumsum.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/cumsum.py
new file mode 100644
index 000000000..64ec2d6cd
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/cumsum.py
@@ -0,0 +1,317 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+import warnings
+
+import torch
+
+import triton
+import triton.language as tl
+
+from .index import prepare_chunk_indices
+from .utils import check_shared_mem, input_guard
+from lightllm.common.triton_utils.autotuner import autotune
+
+BS_LIST = [32, 64] if check_shared_mem() else [16, 32]
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.jit(do_not_specialize=["T"])
+def chunk_local_cumsum_scalar_kernel(
+    s,
+    o,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    REVERSE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    HEAD_FIRST: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)  # chunk index, fused (batch, head) index
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos  # per-sequence length replaces the T argument
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    if HEAD_FIRST:  # consecutive tokens of one head are 1 element apart
+        p_s = tl.make_block_ptr(s + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,))
+        p_o = tl.make_block_ptr(o + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,))
+    else:  # consecutive tokens of one head are H elements apart
+        p_s = tl.make_block_ptr(s + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        p_o = tl.make_block_ptr(o + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+    # [BT]
+    b_s = tl.load(p_s, boundary_check=(0,)).to(tl.float32)
+    b_o = tl.cumsum(b_s, axis=0)
+    if REVERSE:
+        b_z = tl.sum(b_s, axis=0)
+        b_o = -b_o + b_z[None] + b_s  # inclusive suffix sum = total - inclusive prefix + element
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0,))
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.jit(do_not_specialize=["T"])
+def chunk_local_cumsum_vector_kernel(
+    s,
+    o,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    S: tl.constexpr,
+    BT: tl.constexpr,
+    BS: tl.constexpr,
+    REVERSE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    HEAD_FIRST: tl.constexpr,
+):
+    i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos  # per-sequence length replaces the T argument
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    o_i = tl.arange(0, BT)
+    if REVERSE:
+        m_s = tl.where(o_i[:, None] <= o_i[None, :], 1.0, 0.0)  # upper-triangular ones -> suffix sums
+    else:
+        m_s = tl.where(o_i[:, None] >= o_i[None, :], 1.0, 0.0)  # lower-triangular ones -> prefix sums
+
+    if HEAD_FIRST:
+        p_s = tl.make_block_ptr(
+            s + (bos * H + i_h * T) * S,
+            (T, S),
+            (S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+        p_o = tl.make_block_ptr(
+            o + (bos * H + i_h * T) * S,
+            (T, S),
+            (S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+    else:
+        p_s = tl.make_block_ptr(
+            s + (bos * H + i_h) * S,
+            (T, S),
+            (H * S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+        p_o = tl.make_block_ptr(
+            o + (bos * H + i_h) * S,
+            (T, S),
+            (H * S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+    # [BT, BS]
+    b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32)
+    b_o = tl.dot(m_s, b_s, allow_tf32=False)  # cumsum expressed as a matmul with the 0/1 mask
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def _get_cumsum_scalar_configs():
+    return [dict(num_warps=warps) for warps in (1, 2, 4, 8)]  # only the warp count is tuned
+
+
+def _get_cumsum_scalar_static_key(g, chunk_size, reverse, cu_seqlens, head_first):
+    # g is unpacked as [B, H, T] when head_first, else [B, T, H]; T is not part of the key.
+    dim0, dim1, dim2 = g.shape
+    batch = dim0
+    heads = dim1 if head_first else dim2
+    varlen = cu_seqlens is not None
+    return {"B": batch, "H": heads, "BT": chunk_size, "IS_VARLEN": varlen, "REVERSE": reverse}
+
+
+def _get_cumsum_scalar_run_key(g):
+    dim0, dim1 = g.shape[0], g.shape[1]  # first two dims of g only, not its element count
+    return dim0 * dim1
+
+
+@autotune(
+    kernel_name="chunk_local_cumsum_scalar",
+    configs_gen_func=_get_cumsum_scalar_configs,
+    static_key_func=_get_cumsum_scalar_static_key,
+    run_key_func=_get_cumsum_scalar_run_key,
+)
+def chunk_local_cumsum_scalar(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    cu_seqlens: torch.Tensor | None = None,
+    head_first: bool = False,
+    output_dtype: torch.dtype | None = torch.float,
+    run_config=None,
+) -> torch.Tensor:
+    if head_first:  # g is 3-D: [B, H, T] here, [B, T, H] otherwise
+        B, H, T = g.shape
+    else:
+        B, T, H = g.shape
+    assert chunk_size == 2 ** (chunk_size.bit_length() - 1), "chunk_size must be a power of 2"
+    BT = chunk_size
+    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)  # g now names the output buffer
+
+    # Launch parameters: fall back to a fixed config when none was supplied
+    if run_config is None:
+        run_config = {"num_warps": 2}
+
+    num_warps = run_config.get("num_warps", 2)
+
+    grid = (NT, B * H)  # (chunks, batch * heads)
+    chunk_local_cumsum_scalar_kernel[grid](
+        g_org,
+        g,
+        cu_seqlens,
+        chunk_indices,
+        T=T,
+        B=B,
+        H=H,
+        BT=BT,
+        HEAD_FIRST=head_first,
+        REVERSE=reverse,
+        num_warps=num_warps,
+    )
+    return g
+
+
+def _get_cumsum_vector_configs():
+    return [dict(BS=block_s, num_warps=warps) for block_s in BS_LIST for warps in (2, 4, 8)]
+
+
+def _get_cumsum_vector_static_key(g, chunk_size, reverse, cu_seqlens, head_first):
+    # g is unpacked as [B, H, T, S] when head_first, else [B, T, H, S]; T is not part of the key.
+    batch, dim1, dim2, state_dim = g.shape
+    heads = dim1 if head_first else dim2
+    varlen = cu_seqlens is not None
+    key = {"B": batch, "H": heads, "S": state_dim, "BT": chunk_size}
+    return {**key, "IS_VARLEN": varlen, "REVERSE": reverse}
+
+
+def _get_cumsum_vector_run_key(g):
+    # 4-D input: dim0 * dim2; any other rank: dim0 alone.
+    return g.shape[0] if len(g.shape) != 4 else g.shape[0] * g.shape[2]
+
+
+@autotune(
+    kernel_name="chunk_local_cumsum_vector",
+    configs_gen_func=_get_cumsum_vector_configs,
+    static_key_func=_get_cumsum_vector_static_key,
+    run_key_func=_get_cumsum_vector_run_key,
+)
+def chunk_local_cumsum_vector(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    cu_seqlens: torch.Tensor | None = None,
+    head_first: bool = False,
+    output_dtype: torch.dtype | None = torch.float,
+    run_config=None,
+) -> torch.Tensor:
+    if head_first:  # g is 4-D: [B, H, T, S] here, [B, T, H, S] otherwise
+        B, H, T, S = g.shape
+    else:
+        B, T, H, S = g.shape
+    BT = chunk_size
+    chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    assert chunk_size == 2 ** (chunk_size.bit_length() - 1), "chunk_size must be a power of 2"
+
+    g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)  # g now names the output buffer
+
+    # Launch parameters: fall back to a fixed config when none was supplied
+    if run_config is None:
+        run_config = {"BS": 32, "num_warps": 2}
+
+    BS = run_config.get("BS", 32)
+    num_warps = run_config.get("num_warps", 2)
+
+    grid = (triton.cdiv(S, BS), NT, B * H)  # (S blocks, chunks, batch * heads)
+
+    # keep cumulative normalizer in fp32
+    # this kernel is equivalent to
+    # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)
+    chunk_local_cumsum_vector_kernel[grid](
+        g_org,
+        g,
+        cu_seqlens,
+        chunk_indices,
+        T=T,
+        B=B,
+        H=H,
+        S=S,
+        BT=BT,
+        BS=BS,
+        HEAD_FIRST=head_first,
+        REVERSE=reverse,
+        num_warps=num_warps,
+    )
+    return g
+
+
+@input_guard
+def chunk_local_cumsum(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    cu_seqlens: torch.Tensor | None = None,
+    head_first: bool = False,
+    output_dtype: torch.dtype | None = torch.float,
+    **kwargs,  # accepted but not used in this function body
+) -> torch.Tensor:
+    if not head_first and g.shape[1] < g.shape[2]:  # heuristic layout check: warns only, never raises
+        warnings.warn(
+            f"Input tensor shape suggests potential format mismatch: "
+            f"seq_len ({g.shape[1]}) < num_heads ({g.shape[2]}). "
+            "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
+            "when head_first=False was specified. "
+            "Please verify your input tensor format matches the expected shape [B, T, H, ...].",
+            stacklevel=2,
+        )
+    if cu_seqlens is not None:
+        assert g.shape[0] == 1, "Only batch size 1 is supported when cu_seqlens are provided"
+    if len(g.shape) == 3:  # 3-D input goes to the scalar kernel wrapper
+        return chunk_local_cumsum_scalar(g, chunk_size, reverse, cu_seqlens, head_first, output_dtype)
+    elif len(g.shape) == 4:  # 4-D input goes to the vector kernel wrapper
+        return chunk_local_cumsum_vector(g, chunk_size, reverse, cu_seqlens, head_first, output_dtype)
+    else:
+        raise ValueError(
+            f"Unsupported input shape {g.shape}. "
+            f"which should be (B, T, H, D) if `head_first=False` "
+            f"or (B, H, T, D) otherwise"
+        )
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/fused_recurrent.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/fused_recurrent.py
new file mode 100644
index 000000000..e399b3c0a
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/fused_recurrent.py
@@ -0,0 +1,383 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+
+import torch
+
+import triton
+import triton.language as tl
+
+from .op import exp
+
+
+@triton.heuristics(
+    {
+        "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+        "IS_CONTINUOUS_BATCHING": lambda args: args["ssm_state_indices"] is not None,
+        "IS_SPEC_DECODING": lambda args: args["num_accepted_tokens"] is not None,
+    }
+)
+@triton.jit(do_not_specialize=["N", "T"])
+def fused_recurrent_gated_delta_rule_fwd_kernel(
+    q,
+    k,
+    v,
+    g,
+    beta,
+    o,
+    h0,
+    ht,
+    cu_seqlens,
+    ssm_state_indices,
+    num_accepted_tokens,
+    scale,
+    N: tl.int64,  # num of sequences
+    T: tl.int64,  # num of tokens
+    B: tl.constexpr,
+    H: tl.constexpr,
+    HV: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    stride_init_state_token: tl.constexpr,
+    stride_final_state_token: tl.constexpr,
+    stride_indices_seq: tl.constexpr,
+    stride_indices_tok: tl.constexpr,  # NOTE(review): never read below; index loads add a bare `+ i_t`
+    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
+    INPLACE_FINAL_STATE: tl.constexpr,  # whether to store final state inplace
+    IS_BETA_HEADWISE: tl.constexpr,  # whether beta is headwise vector or scalar,
+    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    IS_SPEC_DECODING: tl.constexpr,
+    IS_KDA: tl.constexpr,
+):
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_hv = i_nh // HV, i_nh % HV  # sequence index and value-head index packed in grid axis 2
+    i_h = i_hv // (HV // H)  # q/k head shared by this value head (HV // H value heads per q/k head)
+    if IS_VARLEN:
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int64),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int64),
+        )
+        all = T  # total token count; note this local shadows the Python builtin name
+        T = eos - bos  # from here on T is the length of this sequence only
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        all = B * T
+
+    if T == 0:
+        # no tokens to process for this sequence
+        return
+
+    o_k = i_k * BK + tl.arange(0, BK)
+    o_v = i_v * BV + tl.arange(0, BV)
+
+    p_q = q + (bos * H + i_h) * K + o_k  # pointers to the first token of this sequence
+    p_k = k + (bos * H + i_h) * K + o_k
+    p_v = v + (bos * HV + i_hv) * V + o_v
+    if IS_BETA_HEADWISE:
+        p_beta = beta + (bos * HV + i_hv) * V + o_v
+    else:
+        p_beta = beta + bos * HV + i_hv
+
+    if not IS_KDA:
+        p_g = g + bos * HV + i_hv  # one gate value per (token, value head)
+    else:
+        p_gk = g + (bos * HV + i_hv) * K + o_k  # one gate value per key channel
+
+    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v  # i_k selects a slab of `all` tokens in o
+
+    mask_k = o_k < K
+    mask_v = o_v < V
+    mask_h = mask_k[:, None] & mask_v[None, :]
+
+    b_h = tl.zeros([BK, BV], dtype=tl.float32)  # running state tile, accumulated in fp32
+    if USE_INITIAL_STATE:
+        if IS_CONTINUOUS_BATCHING:
+            if IS_SPEC_DECODING:
+                i_t = tl.load(num_accepted_tokens + i_n).to(tl.int64) - 1
+            else:
+                i_t = 0
+            p_h0 = (
+                h0 + tl.load(ssm_state_indices + i_n * stride_indices_seq + i_t).to(tl.int64) * stride_init_state_token
+            )
+        else:
+            p_h0 = h0 + bos * HV * K * V
+        p_h0 = p_h0 + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
+        b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
+
+    for i_t in range(0, T):  # strictly sequential over tokens: each step reads the state left by the previous one
+        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
+        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
+        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
+
+        if USE_QK_L2NORM_IN_KERNEL:
+            b_q = b_q / tl.sqrt(tl.sum(b_q * b_q) + 1e-6)
+            b_k = b_k / tl.sqrt(tl.sum(b_k * b_k) + 1e-6)
+        b_q = b_q * scale
+        # [BK, BV]
+        if not IS_KDA:
+            b_g = tl.load(p_g).to(tl.float32)
+            b_h *= exp(b_g)  # decay the whole state tile by exp(g)
+        else:
+            b_gk = tl.load(p_gk).to(tl.float32)
+            b_h *= exp(b_gk[:, None])  # decay each key row by its own gate
+        # [BV]
+        b_v -= tl.sum(b_h * b_k[:, None], 0)  # subtract what the decayed state already yields for this key
+        if IS_BETA_HEADWISE:
+            b_beta = tl.load(p_beta, mask=mask_v, other=0).to(tl.float32)
+        else:
+            b_beta = tl.load(p_beta).to(tl.float32)
+        b_v *= b_beta
+        # [BK, BV]
+        b_h += b_k[:, None] * b_v[None, :]  # rank-1 update of the state
+        # [BV]
+        b_o = tl.sum(b_h * b_q[:, None], 0)
+        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
+
+        # keep the states for multi-query tokens
+        if INPLACE_FINAL_STATE:
+            p_ht = (
+                ht + tl.load(ssm_state_indices + i_n * stride_indices_seq + i_t).to(tl.int64) * stride_final_state_token
+            )
+        else:
+            p_ht = ht + (bos + i_t) * stride_final_state_token  # one state slot per token, at global row bos + i_t
+        p_ht = p_ht + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
+        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)
+
+        p_q += H * K  # advance every pointer by one token
+        p_k += H * K
+        p_o += HV * V
+        p_v += HV * V
+        if not IS_KDA:
+            p_g += HV
+        else:
+            p_gk += HV * K
+        p_beta += HV * (V if IS_BETA_HEADWISE else 1)
+
+
+def fused_recurrent_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    inplace_final_state: bool = True,
+    cu_seqlens: torch.LongTensor | None = None,
+    ssm_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Allocate the outputs and launch the fused recurrent gated-delta-rule forward kernel; returns `(o, final_state)`."""
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8)
+    NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+
+    o = q.new_empty(NK, *v.shape)  # leading NK axis is squeezed away before returning
+    if inplace_final_state:
+        final_state = initial_state
+    else:
+        # The kernel stores one state per token at row `bos + i_t`; for fixed-length batches bos = i_n * T, so B * T rows.
+        final_state = q.new_empty(B * T, HV, K, V, dtype=initial_state.dtype)
+
+    stride_init_state_token = initial_state.stride(0)
+    stride_final_state_token = final_state.stride(0)
+
+    if ssm_state_indices is None:
+        stride_indices_seq, stride_indices_tok = 1, 1
+    elif ssm_state_indices.ndim == 1:
+        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride(0), 1
+    else:
+        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride()
+
+    grid = (NK, NV, N * HV)  # one program per (key block, value block, sequence x value head)
+    fused_recurrent_gated_delta_rule_fwd_kernel[grid](
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        o=o,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        ssm_state_indices=ssm_state_indices,
+        num_accepted_tokens=num_accepted_tokens,
+        scale=scale,
+        N=N,
+        T=T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        stride_init_state_token=stride_init_state_token,
+        stride_final_state_token=stride_final_state_token,
+        stride_indices_seq=stride_indices_seq,
+        stride_indices_tok=stride_indices_tok,
+        IS_BETA_HEADWISE=beta.ndim == v.ndim,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        INPLACE_FINAL_STATE=inplace_final_state,
+        IS_KDA=False,
+        num_warps=1,
+        num_stages=3,
+    )
+    o = o.squeeze(0)
+    return o, final_state
+
+
+class FusedRecurrentFunction(torch.autograd.Function):
+    """Forward-only autograd wrapper around `fused_recurrent_gated_delta_rule_fwd`."""
+
+    @staticmethod
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        inplace_final_state: bool = True,
+        cu_seqlens: torch.LongTensor | None = None,
+        ssm_state_indices: torch.Tensor | None = None,
+        num_accepted_tokens: torch.Tensor | None = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
+        """Make q/k/v/g/beta contiguous, run the forward launcher and return `(o, final_state)`."""
+        # `ctx` is left untouched: nothing is saved because this class defines no backward.
+        q, k, v, g, beta = (t.contiguous() for t in (q, k, v, g, beta))
+        # The five tensors go positionally; every remaining argument is forwarded unchanged by keyword.
+        return fused_recurrent_gated_delta_rule_fwd(
+            q, k, v, g, beta,
+            scale=scale,
+            initial_state=initial_state,
+            inplace_final_state=inplace_final_state,
+            cu_seqlens=cu_seqlens,
+            ssm_state_indices=ssm_state_indices,
+            num_accepted_tokens=num_accepted_tokens,
+            use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+        )
+
+
+def fused_recurrent_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor = None,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    inplace_final_state: bool = True,
+    cu_seqlens: torch.LongTensor | None = None,
+    ssm_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    r"""
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, T, H, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]`.
+        v (torch.Tensor):
+            values of shape `[B, T, HV, V]`.
+            GVA is applied if `HV > H`.
+        g (torch.Tensor):
+            g (decays) of shape `[B, T, HV]`.
+        beta (torch.Tensor):
+            betas of shape `[B, T, HV]`. Defaults to all ones when `None`.
+        scale (Optional[float]):
+            Scale factor applied to the queries. Must be positive when given.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        initial_state (Optional[torch.Tensor]):
+            Initial state of shape `[N, HV, K, V]` for `N` input sequences.
+            For equal-length input sequences, `N` equals the batch size `B`.
+            Default: `None`.
+        inplace_final_state: bool:
+            Whether to store the final state in-place to save memory.
+            Default: `True`.
+        cu_seqlens (torch.LongTensor):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+        ssm_state_indices (Optional[torch.Tensor]):
+            Indices to map the input sequences to the initial/final states.
+        num_accepted_tokens (Optional[torch.Tensor]):
+            Number of accepted tokens for each sequence during decoding.
+
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, HV, V]`.
+        final_state (torch.Tensor):
+            Final state of shape `[N, HV, K, V]`.
+
+    Examples::
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> from einops import rearrange
+        >>> from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule
+        # inputs with equal lengths
+        >>> B, T, H, HV, K, V = 4, 2048, 4, 8, 512, 512
+        >>> q = torch.randn(B, T, H, K, device='cuda')
+        >>> k = F.normalize(torch.randn(B, T, H, K, device='cuda'), p=2, dim=-1)
+        >>> v = torch.randn(B, T, HV, V, device='cuda')
+        >>> g = F.logsigmoid(torch.rand(B, T, HV, device='cuda'))
+        >>> beta = torch.rand(B, T, HV, device='cuda').sigmoid()
+        >>> h0 = torch.randn(B, HV, K, V, device='cuda')
+        >>> o, ht = fused_recurrent_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+        )
+        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+        >>> q, k, v, g, beta = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, g, beta))
+        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+        >>> o_var, ht_var = fused_recurrent_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            cu_seqlens=cu_seqlens
+        )
+    """
+    if cu_seqlens is not None and q.shape[0] != 1:
+        raise ValueError(
+            f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`. "
+            f"Please flatten variable-length inputs before processing."
+        )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    else:
+        assert scale > 0, "scale must be positive"
+    if beta is None:
+        beta = q.new_ones(v.shape[:-1])  # [B, T, HV]: the kernel indexes beta per value head, not per q/k head
+    o, final_state = FusedRecurrentFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state,
+        inplace_final_state,
+        cu_seqlens,
+        ssm_state_indices,
+        num_accepted_tokens,
+        use_qk_l2norm_in_kernel,
+    )
+    return o, final_state
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/index.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/index.py
new file mode 100644
index 000000000..8b1d59fc6
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/index.py
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+import torch
+
+import triton
+
+from .utils import tensor_cache
+
+
+@tensor_cache
+def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
+    return torch.diff(cu_seqlens)  # per-sequence lengths: cu_seqlens[i + 1] - cu_seqlens[i]
+
+
+@tensor_cache
+def prepare_chunk_indices(cu_seqlens: torch.LongTensor, chunk_size: int) -> torch.LongTensor:  # rows are (sequence index, chunk index within that sequence)
+    indices = torch.cat([torch.arange(n) for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()])  # 0..n_chunks-1 per sequence, concatenated
+    return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens)  # every 0 starts a new sequence, so the running count of zeros minus 1 is the sequence index
+
+
+@tensor_cache
+def prepare_chunk_offsets(cu_seqlens: torch.LongTensor, chunk_size: int) -> torch.LongTensor:  # cumulative chunk counts with a leading 0
+    return torch.cat([cu_seqlens.new_tensor([0]), triton.cdiv(prepare_lens(cu_seqlens), chunk_size)]).cumsum(-1)  # entry i = number of chunks in sequences before i
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/l2norm.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/l2norm.py
new file mode 100644
index 000000000..29f892ef2
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/l2norm.py
@@ -0,0 +1,173 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import os
+
+import torch
+
+import triton
+import triton.language as tl
+from lightllm.common.triton_utils.autotuner import autotune
+
+BT_LIST = [8, 16, 32, 64, 128]
+
+USE_DEFAULT_FLA_NORM = int(os.getenv("USE_DEFAULT_FLA_NORM", "0"))
+
+
+@triton.jit
+def l2norm_fwd_kernel1(
+    x,
+    y,
+    D,
+    BD: tl.constexpr,
+    eps,
+):
+    i_t = tl.program_id(0)  # one program per row
+    x += i_t * D
+    y += i_t * D
+    # Sum of squares over the row (no mean is subtracted)
+    cols = tl.arange(0, BD)
+    mask = cols < D
+    b_x = tl.load(x + cols, mask=mask, other=0.0).to(tl.float32)
+    b_var = tl.sum(b_x * b_x, axis=0)
+    b_rstd = 1 / tl.sqrt(b_var + eps)
+    # The reciprocal norm is used only locally; it is not written out
+    # Scale the row by the reciprocal L2 norm
+    b_y = b_x * b_rstd
+    tl.store(y + cols, b_y, mask=mask)
+
+
+@triton.jit(do_not_specialize=["NB"])
+def l2norm_fwd_kernel(
+    x,
+    y,
+    eps,
+    NB,  # NOTE(review): accepted but never read in this kernel body
+    T,
+    D: tl.constexpr,
+    BT: tl.constexpr,
+    BD: tl.constexpr,
+):
+    i_t = tl.program_id(0)  # each program handles a tile of BT rows
+    p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32)  # both axes are boundary-checked against (T, D)
+    b_var = tl.sum(b_x * b_x, axis=1)  # per-row sum of squares
+    b_y = b_x / tl.sqrt(b_var + eps)[:, None]
+    p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.jit
+def l2norm_fwd_kernel2(X, Y, eps, M, N: tl.constexpr, MBLOCK: tl.constexpr):  # X, Y are addressed as row-major M x N
+    xoffset = tl.program_id(0) * MBLOCK  # first row of this program's MBLOCK-row tile
+    row_idx = xoffset + tl.arange(0, MBLOCK)[:, None]
+    xmask = row_idx < M  # only rows are masked; columns are not
+    rindex = tl.arange(0, N)[None, :]  # NOTE(review): no column mask, so N presumably has to be a power of two - confirm
+    xs = tl.load(X + (rindex + N * row_idx), xmask).to(tl.float32)
+    square = tl.broadcast_to(xs * xs, [MBLOCK, N])
+    square_sum = tl.sum(tl.where(xmask, square, 0), 1)[:, None]  # rows past M contribute 0
+    rsqrt = tl.rsqrt(square_sum + eps)
+    tl.store(Y + (rindex + N * row_idx), xs * rsqrt, xmask)
+
+
+def _get_l2norm_kernel1_configs():
+    return [{"num_warps": 1 << shift} for shift in range(6)]  # num_warps candidates: 1, 2, 4, 8, 16, 32
+
+
+def _get_l2norm_kernel1_static_key(x):
+    # Static autotune key: only the size of the last dimension of `x`.
+    return {"D": x.shape[-1]}
+
+
+def _get_l2norm_kernel1_run_key(x):
+    return x.shape[0]  # T: size of the leading dimension of x, used as the run key
+
+
+@autotune(
+    kernel_name="l2norm_fwd_kernel1",
+    configs_gen_func=_get_l2norm_kernel1_configs,
+    static_key_func=_get_l2norm_kernel1_static_key,
+    run_key_func=_get_l2norm_kernel1_run_key,
+)
+def _l2norm_fwd_kernel1_wrapper(x, y, eps, D, BD, run_config=None):
+    """Launch `l2norm_fwd_kernel1` with one program per row of `x`; `num_warps` falls back to 4."""
+    launch_warps = 4 if run_config is None else run_config.get("num_warps", 4)
+    grid = (x.shape[0],)
+    l2norm_fwd_kernel1[grid](
+        x, y, eps=eps, D=D, BD=BD,
+        num_warps=launch_warps,
+    )
+
+
+def _get_l2norm_kernel_configs():
+    return [{"BT": bt, "num_warps": nw} for nw in (1, 2, 4, 8, 16) for bt in BT_LIST]  # BT varies fastest
+
+
+def _get_l2norm_kernel_static_key(x):
+    # Static autotune key: only the size of the last dimension of `x`.
+    return {"D": x.shape[-1]}
+
+
+def _get_l2norm_kernel_run_key(x):
+    return x.shape[0]  # T: size of the leading dimension of x, used as the run key
+
+
+@autotune(
+    kernel_name="l2norm_fwd_kernel",
+    configs_gen_func=_get_l2norm_kernel_configs,
+    static_key_func=_get_l2norm_kernel_static_key,
+    run_key_func=_get_l2norm_kernel_run_key,
+)
+def _l2norm_fwd_kernel_wrapper(x, y, eps, T, D, BD, NB, run_config=None):
+    """Launch `l2norm_fwd_kernel` over `cdiv(T, BT)` programs; `BT` falls back to 32 and `num_warps` to 4."""
+    cfg = {"BT": 32, "num_warps": 4} if run_config is None else run_config
+    block_rows, launch_warps = cfg.get("BT", 32), cfg.get("num_warps", 4)
+
+    n_programs = triton.cdiv(T, block_rows)
+    l2norm_fwd_kernel[(n_programs,)](
+        x, y, eps, NB=NB, T=T, D=D, BT=block_rows, BD=BD, num_warps=launch_warps
+    )
+
+
+def l2norm_fwd(x: torch.Tensor, eps: float = 1e-6, output_dtype: torch.dtype | None = None):
+    """L2-normalize `x` along its last dimension and return a tensor with the original shape of `x`."""
+    x_shape_og = x.shape
+    x = x.view(-1, x.shape[-1])
+    # allocate output
+    if output_dtype is None:
+        y = torch.empty_like(x)
+    else:
+        y = torch.empty_like(x, dtype=output_dtype)
+    assert y.stride(-1) == 1
+    T, D = x.shape[0], x.shape[-1]
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BD = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BD:
+        raise RuntimeError("This layer doesn't support feature dim >= 64KB.")
+
+    if not USE_DEFAULT_FLA_NORM and D == BD:
+        MBLOCK = 32
+        # kernel2 indexes columns with tl.arange(0, D), so it is only taken when D is a power of two (D == BD).
+        l2norm_fwd_kernel2[(triton.cdiv(T, MBLOCK),)](
+            x,
+            y,
+            eps,
+            T,
+            D,
+            MBLOCK,
+        )
+    else:
+        if D <= 512:
+            NB = triton.cdiv(T, 2048)
+            _l2norm_fwd_kernel_wrapper(x, y, eps, T, D, BD, NB)
+        else:
+            _l2norm_fwd_kernel1_wrapper(x, y, eps, D, BD)
+
+    return y.view(x_shape_og)
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/op.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/op.py
new file mode 100644
index 000000000..f288b1f71
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/op.py
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import os
+
+import triton
+import triton.language as tl
+
+from .utils import is_gather_supported
+
+exp = tl.exp
+log = tl.log
+log2 = tl.log2
+
+
+@triton.jit
+def safe_exp(x):
+    """
+    Exponential restricted to non-positive inputs: exp(x) where x <= 0, and 0 elsewhere.
+    Positive entries are replaced by -inf before exponentiation so they cannot overflow.
+    """
+    clamped = tl.where(x <= 0, x, float("-inf"))
+    return exp(clamped)
+
+
+if not is_gather_supported:  # flag imported from .utils
+
+    @triton.jit
+    def gather(src, index, axis, _builder=None):
+        """
+        Placeholder bound to `gather` when `is_gather_supported` is false.
+        It performs no gathering and always returns None.
+        It exists only so that the name resolves for the triton compiler.
+        """
+        return None
+
+else:
+    gather = tl.gather  # use the built-in implementation when it is available
+
+if hasattr(triton.language, "_experimental_make_tensor_descriptor"):  # the experimental name is probed first
+    # For Triton 3.3.x
+    make_tensor_descriptor = triton.language._experimental_make_tensor_descriptor
+elif hasattr(triton.language, "make_tensor_descriptor"):
+    # For Triton 3.4.x and later
+    make_tensor_descriptor = triton.language.make_tensor_descriptor
+else:
+    """
+    Fallback implementation when TMA is not supported.
+    Returns None to indicate TMA descriptors are unavailable.
+    Just make triton compiler happy.
+    """
+
+    @triton.jit
+    def make_tensor_descriptor(
+        base,
+        shape,
+        strides,
+        block_shape,
+        _builder=None,
+    ):
+        return None  # stub: ignores every argument and never builds a descriptor
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/solve_tril.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/solve_tril.py
new file mode 100644
index 000000000..9b1cde861
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/solve_tril.py
@@ -0,0 +1,448 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+
+import os
+
+import torch
+
+import triton
+import triton.language as tl
+
+from .index import prepare_chunk_indices
+from .op import make_tensor_descriptor
+from .utils import input_guard, is_amd, is_tma_supported
+from lightllm.common.triton_utils.autotuner import autotune
+
+FLA_TRIL_PRECISION = os.environ.get("FLA_TRIL_PRECISION", "ieee")  # read once at import; presumably forwarded as DOT_PRECISION - confirm at the call site
+ALLOWED_TRIL_PRECISIONS = ["ieee", "tf32"] if is_amd else ["ieee", "tf32", "tf32x3"]  # "tf32x3" is not offered when is_amd is set
+assert (
+    FLA_TRIL_PRECISION in ALLOWED_TRIL_PRECISIONS
+), f"FLA_TRIL_PRECISION must be one of {ALLOWED_TRIL_PRECISIONS}, but got {FLA_TRIL_PRECISION}"
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.jit(do_not_specialize=["T"])
+def solve_tril_16x16_kernel(
+    A,
+    Ai,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    USE_TMA: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    DOT_PRECISION: tl.constexpr,  # not read in this kernel (it contains no tl.dot)
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )  # chunk_indices is read as (sequence index, block index within the sequence) pairs
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos  # from here on T is the length of this sequence only
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    o_i = tl.arange(0, 16)
+    m_A = o_i[:, None] > o_i[None, :]  # strictly-lower-triangular mask
+    m_I = o_i[:, None] == o_i[None, :]  # diagonal mask
+
+    A = A + (bos * H + i_h) * BT
+    Ai = Ai + (bos * H + i_h) * 16
+
+    offset = (i_t * 16) % BT  # column at which this 16x16 diagonal block starts inside its BT-wide row
+    if not USE_TMA:
+        p_A = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * 16, offset), (16, 16), (1, 0))
+        # [16, 16]
+        b_A = tl.load(p_A, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        desc = make_tensor_descriptor(A, [T, BT], [H * BT, 1], [16, 16])
+        desc_o = make_tensor_descriptor(Ai, [T, 16], [H * 16, 1], [16, 16])
+        b_A = desc.load([i_t * 16, offset]).to(tl.float32)
+    b_A = -tl.where(m_A, b_A, 0)  # keep only the strictly lower part, negated
+
+    for i in range(2, min(16, T - i_t * 16)):  # rows are rebuilt one at a time from row 2 on, each using the rows already stored in b_A
+        # [16]
+        b_a = -tl.load(A + (i_t * 16 + i) * H * BT + o_i + offset)
+        b_a = b_a + tl.sum(b_a[:, None] * b_A, 0)
+        b_A = tl.where((o_i == i)[:, None], b_a, b_A)  # overwrite row i only
+    b_A += m_I  # add the identity on the diagonal
+    if not USE_TMA:
+        p_Ai = tl.make_block_ptr(Ai, (T, 16), (H * 16, 1), (i_t * 16, 0), (16, 16), (1, 0))
+        tl.store(
+            p_Ai,
+            b_A.to(p_Ai.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+    else:
+        desc_o.store([i_t * 16, 0], b_A.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.jit(do_not_specialize=["T"])
+def merge_16x16_to_32x32_inverse_kernel(
+    A,
+    Ai,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    USE_TMA: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    DOT_PRECISION: tl.constexpr,  # input_precision for the two tl.dot calls below
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )  # chunk_indices is read as (sequence index, chunk index within the sequence) pairs
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos  # from here on T is the length of this sequence only
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    o_i = tl.arange(0, 16)
+    m_A = o_i[:, None] > o_i[None, :]  # strictly-lower-triangular mask
+    m_I = o_i[:, None] == o_i[None, :]  # diagonal mask
+    A += (bos * H + i_h) * BT
+    Ai += (bos * H + i_h) * BT
+
+    if not USE_TMA:
+        p_A_11 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0))
+        p_A_22 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0))
+        b_Ai_11 = tl.load(p_A_11, boundary_check=(0, 1)).to(tl.float32)
+        b_Ai_22 = tl.load(p_A_22, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        desc = make_tensor_descriptor(A, [T, BT], [H * BT, 1], [16, 16])
+        desc_o = make_tensor_descriptor(Ai, [T, BT], [H * BT, 1], [16, 16])
+        b_Ai_11 = desc.load([i_t * BT + 0, 0]).to(tl.float32)
+        b_Ai_22 = desc.load([i_t * BT + 16, 16]).to(tl.float32)
+
+    # [16, 16]
+    b_Ai_11 = -tl.where(m_A, b_Ai_11, 0)  # keep only the strictly lower part of each diagonal block, negated
+    b_Ai_22 = -tl.where(m_A, b_Ai_22, 0)
+
+    for i in range(2, min(16, T - i_t * BT)):  # rebuild rows 2..15 of the first diagonal block, one row per step
+        b_a_11 = -tl.load(A + (i_t * BT + i) * H * BT + o_i)
+        b_a_11 += tl.sum(b_a_11[:, None] * b_Ai_11, 0)
+        b_Ai_11 = tl.where((o_i == i)[:, None], b_a_11, b_Ai_11)
+    for i in range(16 + 2, min(32, T - i_t * BT)):  # same procedure for the second diagonal block (rows 18..31)
+        b_a_22 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 16)
+        b_a_22 += tl.sum(b_a_22[:, None] * b_Ai_22, 0)
+        b_Ai_22 = tl.where((o_i == i - 16)[:, None], b_a_22, b_Ai_22)
+
+    b_Ai_11 += m_I  # add the identity on each diagonal block
+    b_Ai_22 += m_I
+
+    if not USE_TMA:
+        p_A_21 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0))
+        b_A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        b_A_21 = desc.load([i_t * BT + 16, 0]).to(tl.float32)
+
+    b_Ai_21 = -tl.dot(
+        tl.dot(b_Ai_22, b_A_21, input_precision=DOT_PRECISION),
+        b_Ai_11,
+        input_precision=DOT_PRECISION,
+    )  # off-diagonal block: -(Ai_22 @ A_21 @ Ai_11)
+
+    if not USE_TMA:
+        p_Ai_11 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0))
+        p_Ai_21 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0))
+        p_Ai_22 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0))
+        tl.store(
+            p_Ai_11,
+            b_Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_22,
+            b_Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_21,
+            b_Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+    else:
+        desc_o.store([i_t * BT + 0, 0], b_Ai_11.to(desc_o.dtype, fp_downcast_rounding="rtne"))  # the upper-right block of Ai is never written here
+        desc_o.store([i_t * BT + 16, 0], b_Ai_21.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 16, 16], b_Ai_22.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.jit(do_not_specialize=["T"])
+def merge_16x16_to_64x64_inverse_kernel(
+    A,
+    Ai,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    USE_TMA: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    DOT_PRECISION: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    o_i = tl.arange(0, 16)
+    m_A = o_i[:, None] > o_i[None, :]
+    m_I = o_i[:, None] == o_i[None, :]
+    A += (bos * H + i_h) * BT
+    Ai += (bos * H + i_h) * BT
+
+    if not USE_TMA:
+        p_A_11 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0))
+        p_A_22 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0))
+        p_A_33 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 32, 32), (16, 16), (1, 0))
+        p_A_44 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 48, 48), (16, 16), (1, 0))
+        b_Ai_11 = tl.load(p_A_11, boundary_check=(0, 1)).to(tl.float32)
+        b_Ai_22 = tl.load(p_A_22, boundary_check=(0, 1)).to(tl.float32)
+        b_Ai_33 = tl.load(p_A_33, boundary_check=(0, 1)).to(tl.float32)
+        b_Ai_44 = tl.load(p_A_44, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        desc = make_tensor_descriptor(A, [T, BT], [H * BT, 1], [16, 16])
+        desc_o = make_tensor_descriptor(Ai, [T, BT], [H * BT, 1], [16, 16])
+        b_Ai_11 = desc.load([i_t * BT + 0, 0]).to(tl.float32)
+        b_Ai_22 = desc.load([i_t * BT + 16, 16]).to(tl.float32)
+        b_Ai_33 = desc.load([i_t * BT + 32, 32]).to(tl.float32)
+        b_Ai_44 = desc.load([i_t * BT + 48, 48]).to(tl.float32)
+
+    # [16, 16]
+    b_Ai_11 = -tl.where(m_A, b_Ai_11, 0)
+    b_Ai_22 = -tl.where(m_A, b_Ai_22, 0)
+    b_Ai_33 = -tl.where(m_A, b_Ai_33, 0)
+    b_Ai_44 = -tl.where(m_A, b_Ai_44, 0)
+
+    for i in range(2, min(16, T - i_t * BT)):
+        b_a_11 = -tl.load(A + (i_t * BT + i) * H * BT + o_i)
+        b_a_11 += tl.sum(b_a_11[:, None] * b_Ai_11, 0)
+        b_Ai_11 = tl.where((o_i == i)[:, None], b_a_11, b_Ai_11)
+    for i in range(16 + 2, min(32, T - i_t * BT)):
+        b_a_22 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 16)
+        b_a_22 += tl.sum(b_a_22[:, None] * b_Ai_22, 0)
+        b_Ai_22 = tl.where((o_i == i - 16)[:, None], b_a_22, b_Ai_22)
+    for i in range(32 + 2, min(48, T - i_t * BT)):
+        b_a_33 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 32)
+        b_a_33 += tl.sum(b_a_33[:, None] * b_Ai_33, 0)
+        b_Ai_33 = tl.where((o_i == i - 32)[:, None], b_a_33, b_Ai_33)
+    for i in range(48 + 2, min(64, T - i_t * BT)):
+        b_a_44 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 48)
+        b_a_44 += tl.sum(b_a_44[:, None] * b_Ai_44, 0)
+        b_Ai_44 = tl.where((o_i == i - 48)[:, None], b_a_44, b_Ai_44)
+    b_Ai_11 += m_I
+    b_Ai_22 += m_I
+    b_Ai_33 += m_I
+    b_Ai_44 += m_I
+
+    if not USE_TMA:
+        p_A_21 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0))
+        p_A_31 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 32, 0), (16, 16), (1, 0))
+        p_A_32 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 32, 16), (16, 16), (1, 0))
+        p_A_41 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 48, 0), (16, 16), (1, 0))
+        p_A_42 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 48, 16), (16, 16), (1, 0))
+        p_A_43 = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * BT + 48, 32), (16, 16), (1, 0))
+        b_A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+        b_A_31 = tl.load(p_A_31, boundary_check=(0, 1)).to(tl.float32)
+        b_A_32 = tl.load(p_A_32, boundary_check=(0, 1)).to(tl.float32)
+        b_A_41 = tl.load(p_A_41, boundary_check=(0, 1)).to(tl.float32)
+        b_A_42 = tl.load(p_A_42, boundary_check=(0, 1)).to(tl.float32)
+        b_A_43 = tl.load(p_A_43, boundary_check=(0, 1)).to(tl.float32)
+    else:
+        b_A_21 = desc.load([i_t * BT + 16, 0]).to(tl.float32)
+        b_A_31 = desc.load([i_t * BT + 32, 0]).to(tl.float32)
+        b_A_32 = desc.load([i_t * BT + 32, 16]).to(tl.float32)
+        b_A_41 = desc.load([i_t * BT + 48, 0]).to(tl.float32)
+        b_A_42 = desc.load([i_t * BT + 48, 16]).to(tl.float32)
+        b_A_43 = desc.load([i_t * BT + 48, 32]).to(tl.float32)
+
+    b_Ai_21 = -tl.dot(
+        tl.dot(b_Ai_22, b_A_21, input_precision=DOT_PRECISION),
+        b_Ai_11,
+        input_precision=DOT_PRECISION,
+    )
+    b_Ai_32 = -tl.dot(
+        tl.dot(b_Ai_33, b_A_32, input_precision=DOT_PRECISION),
+        b_Ai_22,
+        input_precision=DOT_PRECISION,
+    )
+    b_Ai_43 = -tl.dot(
+        tl.dot(b_Ai_44, b_A_43, input_precision=DOT_PRECISION),
+        b_Ai_33,
+        input_precision=DOT_PRECISION,
+    )
+
+    b_Ai_31 = -tl.dot(
+        b_Ai_33,
+        tl.dot(b_A_31, b_Ai_11, input_precision=DOT_PRECISION) + tl.dot(b_A_32, b_Ai_21, input_precision=DOT_PRECISION),
+        input_precision=DOT_PRECISION,
+    )
+    b_Ai_42 = -tl.dot(
+        b_Ai_44,
+        tl.dot(b_A_42, b_Ai_22, input_precision=DOT_PRECISION) + tl.dot(b_A_43, b_Ai_32, input_precision=DOT_PRECISION),
+        input_precision=DOT_PRECISION,
+    )
+    b_Ai_41 = -tl.dot(
+        b_Ai_44,
+        tl.dot(b_A_41, b_Ai_11, input_precision=DOT_PRECISION)
+        + tl.dot(b_A_42, b_Ai_21, input_precision=DOT_PRECISION)
+        + tl.dot(b_A_43, b_Ai_31, input_precision=DOT_PRECISION),
+        input_precision=DOT_PRECISION,
+    )
+
+    if not USE_TMA:
+        p_Ai_11 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0))
+        p_Ai_22 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0))
+        p_Ai_33 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 32, 32), (16, 16), (1, 0))
+        p_Ai_44 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 48), (16, 16), (1, 0))
+        p_Ai_21 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0))
+        p_Ai_31 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 32, 0), (16, 16), (1, 0))
+        p_Ai_32 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 32, 16), (16, 16), (1, 0))
+        p_Ai_41 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 0), (16, 16), (1, 0))
+        p_Ai_42 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 16), (16, 16), (1, 0))
+        p_Ai_43 = tl.make_block_ptr(Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 32), (16, 16), (1, 0))
+        tl.store(
+            p_Ai_11,
+            b_Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_22,
+            b_Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_33,
+            b_Ai_33.to(p_Ai_33.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_44,
+            b_Ai_44.to(p_Ai_44.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_21,
+            b_Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_31,
+            b_Ai_31.to(p_Ai_31.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_32,
+            b_Ai_32.to(p_Ai_32.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_41,
+            b_Ai_41.to(p_Ai_41.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_42,
+            b_Ai_42.to(p_Ai_42.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+        tl.store(
+            p_Ai_43,
+            b_Ai_43.to(p_Ai_43.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+    else:
+        desc_o.store([i_t * BT + 0, 0], b_Ai_11.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 16, 16], b_Ai_22.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 32, 32], b_Ai_33.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 48, 48], b_Ai_44.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 16, 0], b_Ai_21.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 32, 0], b_Ai_31.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 32, 16], b_Ai_32.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 48, 0], b_Ai_41.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 48, 16], b_Ai_42.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+        desc_o.store([i_t * BT + 48, 32], b_Ai_43.to(desc_o.dtype, fp_downcast_rounding="rtne"))
+
+
+@input_guard
+def solve_tril(
+    A: torch.Tensor,
+    cu_seqlens: torch.Tensor | None = None,
+    output_dtype: torch.dtype | None = torch.float,
+) -> torch.Tensor:
+    """
+    Compute the inverse of the matrix I + A
+    A should be strictly lower triangular, i.e., A.triu() == 0.
+
+    Args:
+        A (torch.Tensor):
+            [B, T, H, BT], where BT should only be 16, 32, or 64.
+        cu_seqlens (torch.Tensor | None):
+            The cumulative sequence lengths of the input tensor. Default: `None`.
+        output_dtype (torch.dtype | None):
+            The dtype of the output tensor. Default: `torch.float`.
+            If `None`, the output dtype will be the same as the input dtype.
+
+    Returns:
+        (I + A)^-1 with the same shape as A
+    """
+    assert A.shape[-1] in [16, 32, 64]
+    output_dtype = A.dtype if output_dtype is None else output_dtype
+
+    B, T, H, BT = A.shape
+    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, BT)  # number of chunks to launch
+
+    Ai = torch.zeros_like(A, dtype=output_dtype)  # output buffer starts zeroed; the selected kernel writes into it
+    if BT == 16:  # pick the kernel matching the chunk width (the assert above restricts BT to these three values)
+        merge_fn = solve_tril_16x16_kernel
+    elif BT == 32:
+        merge_fn = merge_16x16_to_32x32_inverse_kernel
+    elif BT == 64:
+        merge_fn = merge_16x16_to_64x64_inverse_kernel
+
+    merge_fn[NT, B * H](  # grid: one program per (chunk, batch * head) pair
+        A=A,
+        Ai=Ai,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        BT=BT,
+        USE_TMA=is_tma_supported,
+        DOT_PRECISION=FLA_TRIL_PRECISION,
+    )
+    return Ai
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/utils.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/utils.py
new file mode 100644
index 000000000..a890d7010
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/utils.py
@@ -0,0 +1,179 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+import contextlib
+import functools
+import logging
+import os
+from collections.abc import Callable
+from enum import Enum
+from typing import Any, Literal
+
+import torch
+
+import triton
+
+logger = logging.getLogger(__name__)
+
+COMPILER_MODE = os.getenv("FLA_COMPILER_MODE") == "1"  # env flags are read once at import; only the exact value "1" enables them
+FLA_CI_ENV = os.getenv("FLA_CI_ENV") == "1"
+FLA_GDN_FIX_BT = os.getenv("FLA_GDN_FIX_BT", "0") == "1"
+
+SUPPRESS_LEVEL = int(os.getenv("GDN_RECOMPUTE_SUPPRESS_LEVEL", "0"))  # integer level, 0 when the variable is unset
+
+
+def tensor_cache(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+    """
+    A decorator that caches the most recent results of a function with tensor inputs.
+
+    The wrapper keeps the results for the most recently used argument sets; arguments are matched by identity (`is`).
+    The cache is limited to a fixed size of 8 entries. When it is full, the least recently used entry is removed.
+
+    Args:
+        fn (Callable[..., torch.Tensor]):
+            The function to be decorated. It should take tensor inputs and return tensor outputs.
+
+    Returns:
+        Callable[..., torch.Tensor]:
+            A wrapped version of the input function with a small least-recently-used cache.
+    """
+
+    cache_entries: list[tuple[tuple, dict, Any]] = []
+    cache_size = 8
+
+    @functools.wraps(fn)
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        nonlocal cache_entries, cache_size
+        for i, entry in enumerate(cache_entries):
+            last_args, last_kwargs, last_result = entry
+            if (
+                len(args) == len(last_args)
+                and len(kwargs) == len(last_kwargs)
+                and all(a is b for a, b in zip(args, last_args))
+                and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items())
+            ):
+                cache_entries = cache_entries[:i] + cache_entries[i + 1 :] + [(args, kwargs, last_result)]  # hit: move to the end
+                return last_result
+
+        result = fn(*args, **kwargs)
+
+        if len(cache_entries) >= cache_size:
+            cache_entries = cache_entries[1:]  # full: drop the entry at the front (least recently used)
+        cache_entries.append((args, kwargs, result))
+        return result
+
+    return wrapper
+
+
+def input_guard(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+    """
+    A decorator to make sure all input tensors are contiguous and set the device based on input tensors.
+    """
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        contiguous_args = (i if not isinstance(i, torch.Tensor) else i.contiguous() for i in args)  # lazy; consumed at the call below
+        contiguous_kwargs = {k: (v if not isinstance(v, torch.Tensor) else v.contiguous()) for k, v in kwargs.items()}
+
+        tensor = None  # first tensor found: positional arguments are searched before keyword values
+        for arg in args:
+            if isinstance(arg, torch.Tensor):
+                tensor = arg
+                break
+        if tensor is None:
+            for value in kwargs.values():
+                if isinstance(value, torch.Tensor):
+                    tensor = value
+                    break
+
+        if tensor is not None:
+            ctx = torch.cuda.device(tensor.device.index)  # run fn with that tensor's device index selected
+        else:
+            ctx = contextlib.nullcontext()  # no tensor inputs: leave the current device untouched
+
+        with ctx:
+            return fn(*contiguous_args, **contiguous_kwargs)
+
+    return wrapper
+
+
+@functools.cache
+def get_available_device() -> str:  # backend name of Triton's active target, or "cpu" when it cannot be queried
+    try:
+        return triton.runtime.driver.active.get_current_target().backend
+    except Exception:  # not BaseException: KeyboardInterrupt/SystemExit must propagate instead of being cached as "cpu"
+        return "cpu"
+
+
+@functools.cache
+def _check_platform() -> Literal["nvidia", "amd", "intel", "musa"]:
+    device = get_available_device()
+    mapping = {
+        "cuda": "nvidia",
+        "hip": "amd",
+        "xpu": "intel",
+    }
+    # return the mapped vendor name, or the raw backend string (e.g. "cpu") if unmapped, so the result can fall outside the Literal above
+    return mapping.get(device, device)
+
+
+# For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'.
+# However, the torch backend is 'cuda' for both Nvidia and AMD GPUs.
+# Therefore, we need to check the triton backend to determine the actual GPU vendor.
+device = "cuda"  # torch device namespace is fixed to "cuda" here
+device_torch_lib = getattr(torch, device, None)  # i.e. torch.cuda
+device_platform = _check_platform()
+
+is_amd = device_platform == "amd"
+is_intel = device_platform == "intel"
+is_nvidia = device_platform == "nvidia"
+is_intel_alchemist = is_intel and "Intel(R) Arc(TM) A" in torch.xpu.get_device_name(0)  # device is only queried when is_intel
+is_nvidia_hopper = is_nvidia and (  # all flags in this group are evaluated once, at import time
+    "NVIDIA H" in torch.cuda.get_device_name(0) or torch.cuda.get_device_capability()[0] >= 9
+)
+use_cuda_graph = True
+is_gather_supported = hasattr(triton.language, "gather")
+is_tma_supported = (is_nvidia and torch.cuda.get_device_capability(0)[0] >= 9) and (  # capability >= 9 plus a descriptor API in Triton
+    hasattr(triton.language, "_experimental_make_tensor_descriptor")
+    or hasattr(triton.language, "make_tensor_descriptor")
+)
+
+
+def get_all_max_shared_mem():  # per-device "max_shared_mem" as reported by Triton, or [-1] when the query fails
+    try:
+        return [
+            triton.runtime.driver.active.utils.get_device_properties(i)["max_shared_mem"]
+            for i in range(device_torch_lib.device_count())
+        ]
+    except Exception:  # not BaseException: let KeyboardInterrupt/SystemExit propagate
+        return [-1]
+
+
+class Backend(Enum):  # shared-memory thresholds keyed by architecture name (units presumably bytes — TODO confirm)
+    ADA = 101376  # RTX 4090
+    AMPERE = 166912  # A100
+    HOPPER = 232448  # H100
+    DEFAULT = 102400  # Default
+
+    @classmethod
+    def get_shared_memory(cls, arch: str) -> int:  # case-insensitive lookup by member name
+        try:
+            return cls[arch.upper()].value
+        except KeyError:
+            return cls.DEFAULT.value  # unknown architecture names fall back to DEFAULT
+
+
+@functools.cache
+def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool:  # True if device `tensor_idx` meets the threshold for `arch`
+    try:
+        device_shared_mem_list = get_all_max_shared_mem()
+        max_shared_memory = device_shared_mem_list[tensor_idx]
+        return max_shared_memory >= Backend.get_shared_memory(arch)  # the default "none" is not a member, so DEFAULT is used
+    except Exception:
+        return False  # e.g. tensor_idx out of range for the device list
diff --git a/lightllm/models/qwen3next/triton_kernel/fla/ops/wy_fast.py b/lightllm/models/qwen3next/triton_kernel/fla/ops/wy_fast.py
new file mode 100644
index 000000000..fb67297e4
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fla/ops/wy_fast.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+# ruff: noqa: E501
+
+import torch
+
+import triton
+import triton.language as tl
+
+from .index import prepare_chunk_indices
+from lightllm.common.triton_utils.autotuner import autotune
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+@triton.jit(do_not_specialize=["T"])
+def recompute_w_u_fwd_kernel(
+    k,
+    v,
+    beta,
+    w,
+    u,
+    A,
+    g,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)  # grid axis 0: chunk, axis 1: flattened (batch, head)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (  # i_n indexes cu_seqlens; i_t becomes the chunk offset relative to bos
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos  # T is rebound to the length of this sequence
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+    p_g = tl.make_block_ptr(g + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,))
+    p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
+    b_beta = tl.load(p_beta, boundary_check=(0,))
+    b_A = tl.load(p_A, boundary_check=(0, 1))
+    b_g = tl.exp(tl.load(p_g, boundary_check=(0,)))  # g is exponentiated on load
+
+    for i_v in range(tl.cdiv(V, BV)):  # u = A @ (v * beta), one BV-wide column tile at a time
+        p_v = tl.make_block_ptr(
+            v + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_u = tl.make_block_ptr(
+            u + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_vb = (b_v * b_beta[:, None]).to(b_v.dtype)
+        b_u = tl.dot(b_A, b_vb, allow_tf32=False)
+        tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
+
+    for i_k in range(tl.cdiv(K, BK)):  # w = A @ (k * beta * exp(g)), one BK-wide column tile at a time
+        p_k = tl.make_block_ptr(
+            k + (bos * Hg + i_h // (H // Hg)) * K,  # k has Hg heads; H // Hg consecutive heads read the same one
+            (T, K),
+            (Hg * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_w = tl.make_block_ptr(
+            w + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_kb = (b_k * b_beta[:, None] * b_g[:, None]).to(b_k.dtype)
+        b_w = tl.dot(b_A, b_kb)
+        tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))
+
+
+def recompute_w_u_fwd(  # launches recompute_w_u_fwd_kernel and returns the freshly allocated (w, u)
+    k: torch.Tensor,
+    v: torch.Tensor,
+    beta: torch.Tensor,
+    g_cumsum: torch.Tensor,
+    A: torch.Tensor,
+    cu_seqlens: torch.LongTensor | None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, T, Hg, K, V = *k.shape, v.shape[-1]  # k is unpacked as [B, T, Hg, K]; V is the last dim of v
+    H = v.shape[-2]  # head count is taken from v and may differ from k's Hg
+    BT = A.shape[-1]  # chunk size is the last dim of A
+
+    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    BK = 64
+    BV = 64
+    u = torch.empty_like(v)
+    w = k.new_empty(B, T, H, K)  # w uses v's head count H with k's feature size K
+    recompute_w_u_fwd_kernel[(NT, B * H)](
+        k=k,
+        v=v,
+        beta=beta,
+        w=w,
+        u=u,
+        A=A,
+        g=g_cumsum,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+    )
+    return w, u
diff --git a/lightllm/models/qwen3next/triton_kernel/fused_gdn_gating.py b/lightllm/models/qwen3next/triton_kernel/fused_gdn_gating.py
new file mode 100644
index 000000000..e1a112c5a
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/fused_gdn_gating.py
@@ -0,0 +1,89 @@
+# Adapted from https://github.com/sgl-project/sglang/python/sglang/srt/layers/attention/fla/fused_gdn_gating.py
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from lightllm.common.triton_utils.autotuner import autotune
+
+# g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
+# beta_output = b.sigmoid()
+@triton.jit
+def fused_gdn_gating_kernel(
+    g,  # output: gate values
+    beta_output,  # output: sigmoid(b)
+    A_log,
+    a,
+    b,
+    dt_bias,
+    seq_len,
+    NUM_HEADS: tl.constexpr,
+    beta: tl.constexpr,  # softplus beta
+    threshold: tl.constexpr,  # softplus switches to the identity once beta * x exceeds this
+    BLK_HEADS: tl.constexpr,
+):
+    i_b, i_s, i_d = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    head_off = i_d * BLK_HEADS + tl.arange(0, BLK_HEADS)
+    off = i_b * seq_len * NUM_HEADS + i_s * NUM_HEADS + head_off
+    mask = head_off < NUM_HEADS  # the last head block may run past NUM_HEADS
+    blk_A_log = tl.load(A_log + head_off, mask=mask)
+    blk_a = tl.load(a + off, mask=mask)
+    blk_b = tl.load(b + off, mask=mask)
+    blk_bias = tl.load(dt_bias + head_off, mask=mask)
+    x = blk_a.to(tl.float32) + blk_bias.to(tl.float32)
+    softplus_x = tl.where(beta * x <= threshold, (1 / beta) * tl.log(1 + tl.exp(beta * x)), x)
+    blk_g = -tl.exp(blk_A_log.to(tl.float32)) * softplus_x
+    tl.store(g + off, blk_g.to(g.dtype.element_ty), mask=mask)
+    blk_beta_output = tl.sigmoid(blk_b.to(tl.float32))
+    tl.store(beta_output + off, blk_beta_output.to(beta_output.dtype.element_ty), mask=mask)  # cast to the output's dtype, not b's
+
+
+def _get_fused_gdn_gating_configs():  # autotune search space: every BLK_HEADS x num_warps combination (15 configs)
+    return [{"BLK_HEADS": bh, "num_warps": nw} for bh in [4, 8, 16, 32, 64] for nw in [1, 2, 4]]
+
+
+def _get_fused_gdn_gating_static_key(a: torch.Tensor):
+    # group by number of heads (second dim of `a`) and input dtype
+    return {"NUM_HEADS": a.shape[1], "a_dtype": str(a.dtype)}
+
+
+@autotune(
+    kernel_name="fused_gdn_gating:v1",
+    configs_gen_func=_get_fused_gdn_gating_configs,
+    static_key_func=_get_fused_gdn_gating_static_key,
+    run_key_func=lambda a: a.shape[0],
+)
+def fused_gdn_gating(
+    A_log: torch.Tensor,
+    a: torch.Tensor,
+    b: torch.Tensor,
+    dt_bias: torch.Tensor,
+    beta: float = 1.0,
+    threshold: float = 20.0,
+    run_config: Optional[dict] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Returns (g, beta_output), both float32 with shape (1, batch, num_heads); `a` must be 2-D (batch, num_heads).
+    if run_config is None:
+        run_config = {"BLK_HEADS": 8, "num_warps": 1}  # default used when no tuned config is supplied
+
+    batch, num_heads = a.shape
+    seq_len = 1  # a single position per batch row is processed
+    grid = (batch, seq_len, triton.cdiv(num_heads, run_config["BLK_HEADS"]))
+    g = torch.empty(1, batch, num_heads, dtype=torch.float32, device=a.device)
+    beta_output = torch.empty(1, batch, num_heads, dtype=torch.float32, device=b.device)
+    fused_gdn_gating_kernel[grid](
+        g,
+        beta_output,
+        A_log,
+        a,
+        b,
+        dt_bias,
+        seq_len,
+        num_heads,
+        beta,
+        threshold,
+        run_config["BLK_HEADS"],
+        num_warps=run_config["num_warps"],
+    )
+    return g, beta_output
diff --git a/lightllm/models/qwen3next/triton_kernel/gated_rmsnorm.py b/lightllm/models/qwen3next/triton_kernel/gated_rmsnorm.py
new file mode 100644
index 000000000..89db5e00c
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/gated_rmsnorm.py
@@ -0,0 +1,174 @@
+import triton
+import triton.language as tl
+import torch
+from lightllm.common.triton_utils.autotuner import autotune
+
+
+@triton.heuristics(
+    {
+        "HAS_BIAS": lambda args: args["B"] is not None,
+    }
+)
+@triton.jit
+def gated_rmsnorm_forward_kernel(
+    X,  # pointer to the input
+    Y,  # pointer to the output
+    W,  # pointer to the weights
+    B,  # pointer to the biases
+    Z,  # pointer to the other branch (required, not optional)
+    Rstd,  # pointer to the per-row 1/rms values, indexed as group * M + row
+    stride_x_row,  # how much to increase the pointer when moving by 1 row
+    stride_y_row,  # row stride of Y
+    stride_z_row,  # row stride of Z
+    M,  # number of rows in X
+    N,  # number of columns normalized together (the launcher passes group_size)
+    eps,  # epsilon to avoid division by zero
+    BLOCK_N: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    NORM_BEFORE_GATE: tl.constexpr,
+):
+    # Map the program id to the row of X and Y it should compute.
+    row = tl.program_id(0)
+    group = tl.program_id(1)
+    X += row * stride_x_row + group * N
+    Y += row * stride_y_row + group * N
+    Z += row * stride_z_row + group * N
+    Rstd += group * M
+    W += group * N
+    if HAS_BIAS:
+        B += group * N
+    # Compute variance (RMS norm doesn't use mean)
+    cols = tl.arange(0, BLOCK_N)  # a single block covers the whole group, so BLOCK_N must be >= N
+    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
+    if not NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
+        x *= z * tl.sigmoid(z)  # gate first: x * z * sigmoid(z), then normalize
+    # RMS norm: compute variance directly without mean subtraction
+    xbar = tl.where(cols < N, x, 0.0)
+    var = tl.sum(xbar * xbar, axis=0) / N
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Rstd + row, rstd)
+    # Normalize and apply linear transformation
+    mask = cols < N
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    if HAS_BIAS:
+        b = tl.load(B + cols, mask=mask).to(tl.float32)
+    # RMS norm: normalize without mean subtraction
+    x_hat = x * rstd
+    y = x_hat * w + b if HAS_BIAS else x_hat * w
+    if NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=mask).to(tl.float32)
+        y *= z * tl.sigmoid(z)  # normalize first, then gate the result
+    # Write output
+    tl.store(Y + cols, y, mask=mask)
+
+
+def _get_gated_rmsnorm_configs():
+    """Generate the BLOCK_N / num_warps combinations searched when autotuning the gated RMSNorm kernel."""
+    configs = []
+    # Different BLOCK_N sizes (powers of 2)
+    for block_n in [64, 128, 256, 512, 1024, 2048, 4096]:
+        # Different number of warps
+        for num_warps in [1, 2, 4, 8]:
+            # Skip configurations that are likely to be inefficient
+            if block_n >= 2048 and num_warps > 4:  # large blocks: at most 4 warps
+                continue
+            if block_n <= 128 and num_warps > 2:  # small blocks: at most 2 warps
+                continue
+            configs.append({"BLOCK_N": block_n, "num_warps": num_warps})
+    return configs
+
+
+def _get_gated_rmsnorm_static_key(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor):
+    """Generate static key for caching autotuned configurations: dtypes, column count and bias presence."""
+    M, N = x.shape  # x must be 2-D; only N enters the key
+    return {
+        "x_dtype": str(x.dtype),
+        "weight_dtype": str(weight.dtype),
+        "N": N,
+        "has_bias": bias is not None,
+    }
+
+
+@autotune(
+    kernel_name="gated_rmsnorm_forward:v1",
+    configs_gen_func=_get_gated_rmsnorm_configs,
+    static_key_func=_get_gated_rmsnorm_static_key,
+    run_key_func=lambda x: x.shape[0],
+)
+def gated_rmsnorm_forward(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    z: torch.Tensor,
+    out: torch.Tensor = None,
+    group_size: int = None,
+    norm_before_gate: bool = True,
+    run_config: dict = None,
+):
+    M, N = x.shape  # x must be 2-D: (rows, features)
+    if group_size is None:
+        group_size = N  # default: a single group spanning the whole row
+    assert N % group_size == 0
+    ngroups = N // group_size
+    assert x.stride(-1) == 1
+    # z is required for gated_rmsnorm
+    assert z is not None, "z cannot be None for gated_rmsnorm_forward"
+    assert z.stride(-1) == 1
+    assert z.shape == (M, N)
+    assert weight.shape == (N,)
+    assert weight.stride(-1) == 1
+    if bias is not None:
+        assert bias.stride(-1) == 1
+        assert bias.shape == (N,)
+    # allocate output
+    if out is not None:
+        assert out.shape == x.shape
+    else:
+        out = torch.empty_like(x)
+    assert out.stride(-1) == 1
+    # The kernel also writes 1/rms per (group, row); the buffer is allocated here and not returned
+    rstd = torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+
+    # Default heuristic when autotune is disabled or no config provided
+    if not run_config:
+        # Less than 64KB per feature: enqueue fused kernel
+        MAX_FUSED_SIZE = 65536 // x.element_size()
+        BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
+        if group_size > BLOCK_N:
+            raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+        # heuristics for number of warps
+        num_warps = min(max(BLOCK_N // 256, 1), 8)
+        run_config = {"BLOCK_N": BLOCK_N, "num_warps": num_warps}
+
+    BLOCK_N = run_config["BLOCK_N"]
+    num_warps = run_config["num_warps"]
+
+    # Validate BLOCK_N against group_size (the kernel handles one group with a single block)
+    if group_size > BLOCK_N:
+        # Supplied BLOCK_N is too small: recompute it from group_size; num_warps from the config is kept
+        MAX_FUSED_SIZE = 65536 // x.element_size()
+        BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
+        if group_size > BLOCK_N:
+            raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+    grid = (M, ngroups)  # one program per (row, group)
+    gated_rmsnorm_forward_kernel[grid](
+        x,
+        out,
+        weight,
+        bias,
+        z,
+        rstd,
+        x.stride(0),
+        out.stride(0),
+        z.stride(0),
+        M,
+        group_size,
+        eps,
+        BLOCK_N=BLOCK_N,
+        NORM_BEFORE_GATE=norm_before_gate,
+        num_warps=num_warps,
+    )
+    return out
diff --git a/lightllm/models/qwen3next/triton_kernel/gemma_rmsnorm.py b/lightllm/models/qwen3next/triton_kernel/gemma_rmsnorm.py
new file mode 100644
index 000000000..210e78db1
--- /dev/null
+++ b/lightllm/models/qwen3next/triton_kernel/gemma_rmsnorm.py
@@ -0,0 +1,144 @@
+import torch
+
+import triton
+import triton.language as tl
+import os
+
+from lightllm.common.triton_utils.autotuner import autotune
+
+
+@triton.jit
+def _gemma_rmsnorm_fwd_kernel(
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    x_stride0,
+    x_stride1,
+    y_stride0,
+    y_stride1,
+    N: tl.constexpr,
+    EPS: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row = tl.program_id(0)  # one program per row
+    x_ptr = x_ptr + row * x_stride0
+    y_ptr = y_ptr + row * y_stride0
+
+    _sum = tl.zeros([BLOCK_SIZE], dtype=tl.float32)  # first pass: accumulate x^2 in float32
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        x = tl.load(x_ptr + cols * x_stride1, mask=cols < N, other=0.0).to(tl.float32)
+        _sum += x * x
+
+    var = tl.sum(_sum, axis=0) / N
+    rstd = 1 / tl.sqrt(var + EPS)
+    # Second pass: normalize and apply the (1 + w) scale
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        w = tl.load(w_ptr + cols, mask=mask).to(tl.float32)  # w is read with unit stride
+        x = tl.load(x_ptr + cols * x_stride1, mask=mask, other=0.0).to(tl.float32)
+        x_hat = x * rstd
+        w = w + 1.0  # the stored weight is an offset from 1
+        y = x_hat * w
+        # Write output
+        tl.store(y_ptr + cols * y_stride1, y.to(y_ptr.dtype.element_ty), mask=mask)
+
+
+def _get_gemma_rmsnorm_configs():
+    """Generate the BLOCK_SIZE / num_warps / num_stages grid searched when autotuning the gemma RMSNorm kernel."""
+    configs = []
+    # Different BLOCK_SIZE values (powers of 2)
+    for block_size in [128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 65536 * 2]:
+        # Different number of warps
+        for num_warps in [1, 2, 4, 8]:
+            for num_stages in [1, 2, 3, 4, 5]:  # full cross product, nothing is pruned
+                configs.append({"BLOCK_SIZE": block_size, "num_warps": num_warps, "num_stages": num_stages})
+    return configs
+
+
+def _get_gemma_rmsnorm_static_key(x: torch.Tensor, w: torch.Tensor):
+    """Generate static key for caching autotuned configurations: input/weight dtypes and feature size."""
+    N = x.shape[-1]  # size of the last dim of x
+    return {
+        "x_dtype": str(x.dtype),
+        "weight_dtype": str(w.dtype),
+        "N": N,
+    }
+
+
+@autotune(
+    kernel_name="gemma_rmsnorm_forward:v1",
+    configs_gen_func=_get_gemma_rmsnorm_configs,
+    static_key_func=_get_gemma_rmsnorm_static_key,
+    run_key_func=lambda x: x.shape[-1],
+)
+def gemma_rmsnorm_forward(x, w, eps, out=None, run_config: dict = None):
+    # Gemma RMS Norm over the last dim of x; the result goes into `out` when given, otherwise into a new tensor.
+    # Llama does x.to(float16) * w whilst Gemma is (x * w).to(float16)
+    # See https://github.com/huggingface/transformers/pull/29402
+    N = x.shape[-1]
+    y = torch.empty_like(x) if out is None else out
+    x_arg = x.view(-1, N)  # flatten all leading dims into rows
+    y_arg = y.view(-1, N)
+
+    M, _ = x_arg.shape
+
+    # Default heuristic when autotune is disabled or no config provided
+    if not run_config:
+        # Less than 64KB per feature: enqueue fused kernel
+        MAX_FUSED_SIZE = 65536 // x.element_size()
+        BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
+        if N > BLOCK_SIZE:
+            raise RuntimeError("This gemma rmsnorm doesn't support feature dim >= 64KB.")
+        # heuristics for number of warps
+        num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
+        run_config = {"BLOCK_SIZE": BLOCK_SIZE, "num_warps": num_warps, "num_stages": 1}
+
+    BLOCK_SIZE = run_config["BLOCK_SIZE"]
+    num_warps = run_config["num_warps"]
+    num_stages = run_config["num_stages"]
+
+    _gemma_rmsnorm_fwd_kernel[(M,)](  # strides must come from the 2-D views the kernel indexes, not from x / y
+        x_arg,
+        w,
+        y_arg,
+        x_stride0=x_arg.stride(0),
+        x_stride1=x_arg.stride(1),
+        y_stride0=y_arg.stride(0),
+        y_stride1=y_arg.stride(1),
+        N=N,
+        EPS=eps,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+    return y
+
+
+def _gemma_rmsnorm_fwd_torch(x, weight, eps):  # pure-PyTorch reference for the Triton kernel
+    original_dtype = x.dtype
+    x = x.to(torch.float32)  # compute in float32, cast back at the end
+    x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+    x = x * (1.0 + weight.float())  # scale is (1 + weight)
+    return x.to(original_dtype)
+
+
+def test_rms_norm(M, N, dtype, eps=1e-5, device="cuda"):
+    """Check gemma_rmsnorm_forward against the PyTorch reference on random (M, N) input created on `device`."""
+    # create data
+    x_shape = (M, N)
+    w_shape = (x_shape[-1],)
+    weight = torch.rand(w_shape, dtype=dtype, device=device)
+    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device=device)
+    # forward pass
+    y_tri = gemma_rmsnorm_forward(x, weight, eps)
+    y_ref = _gemma_rmsnorm_fwd_torch(x, weight, eps)
+
+    # compare
+    print("type:", y_tri.dtype, y_ref.dtype)
+    print("max delta:", torch.max(torch.abs(y_tri - y_ref)))
+    # Use appropriate tolerance based on dtype
+    atol = 1e-2 if dtype == torch.float32 else 5e-2
+    assert torch.allclose(y_tri, y_ref, atol=atol, rtol=0)
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index bf0e89887..4f0c3cace 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -572,4 +572,12 @@ def make_argument_parser() -> argparse.ArgumentParser:
         default=False,
         help="""Enable prefix prompt cache fetch for data parallel inference, disabled by default.""",
     )
+    parser.add_argument("--mamba_cache_size", type=int, default=3000, help="""The size of linear attn cache. """)
+    parser.add_argument(
+        "--mamba_ssm_data_type",
+        type=str,
+        choices=["bfloat16", "float32"],
+        default="float32",
+        help="the data type of the model weight",
+    )
     return parser
diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py
index 8bda50fb7..2dab18dda 100755
--- a/lightllm/server/api_http.py
+++ b/lightllm/server/api_http.py
@@ -33,7 +33,7 @@
 import uuid
 from PIL import Image
 import multiprocessing as mp
-from typing import AsyncGenerator, Union
+from typing import Any, AsyncGenerator, Union
 from typing import Callable
 from lightllm.server import TokenLoad
 from fastapi import BackgroundTasks, FastAPI, Request, WebSocket, WebSocketDisconnect
diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py
index abd29dc92..d378dd6c5 100644
--- a/lightllm/server/api_models.py
+++ b/lightllm/server/api_models.py
@@ -72,7 +72,7 @@ class CompletionRequest(BaseModel):
     # prompt: string or tokens
     prompt: Union[str, List[str], List[int], List[List[int]]]
     suffix: Optional[str] = None
-    max_tokens: Optional[int] = 16
+    max_tokens: Optional[int] = 16000
     temperature: Optional[float] = 1.0
     top_p: Optional[float] = 1.0
     n: Optional[int] = 1
@@ -112,7 +112,7 @@ class ChatCompletionRequest(BaseModel):
     stream: Optional[bool] = False
     stream_options: Optional[StreamOptions] = None
     stop: Optional[Union[str, List[str]]] = None
-    max_tokens: Optional[int] = 16
+    max_tokens: Optional[int] = 16000
     presence_penalty: Optional[float] = 0.0
     frequency_penalty: Optional[float] = 0.0
     logit_bias: Optional[Dict[str, float]] = None
diff --git a/lightllm/server/api_server.py b/lightllm/server/api_server.py
index b4447d808..b0a1189d3 100755
--- a/lightllm/server/api_server.py
+++ b/lightllm/server/api_server.py
@@ -1,15 +1,33 @@
 import torch
 from .api_cli import make_argument_parser
+from lightllm.server.core.objs.start_args_type import StartArgs
+from lightllm.utils.log_utils import init_logger
 
-if __name__ == "__main__":
-    torch.multiprocessing.set_start_method("spawn")  # this code will not be ok for settings to fork to subprocess
-    parser = make_argument_parser()
-    args = parser.parse_args()
+logger = init_logger(__name__)
+
+
+def launch_server(args: StartArgs):
     from .api_start import pd_master_start, normal_or_p_d_start, config_server_start
 
+    try:
+        # this code will not be ok for settings to fork to subprocess
+        torch.multiprocessing.set_start_method("spawn")
+    except RuntimeError as e:
+        logger.warning(f"Failed to set start method: {e}")
+    except Exception as e:
+        logger.error(f"Failed to set start method: {e}")
+        raise e
+
     if args.run_mode == "pd_master":
         pd_master_start(args)
     elif args.run_mode == "config_server":
         config_server_start(args)
     else:
         normal_or_p_d_start(args)
+
+
+if __name__ == "__main__":
+    parser = make_argument_parser()
+    args = parser.parse_args()
+
+    launch_server(StartArgs(**vars(args)))
diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py
index 9cc3d38c2..ddff04615 100644
--- a/lightllm/server/api_start.py
+++ b/lightllm/server/api_start.py
@@ -16,6 +16,7 @@
 from lightllm.utils.process_check import is_process_active
 from lightllm.utils.multinode_utils import send_and_receive_node_ip
 from lightllm.utils.shm_size_check import check_recommended_shm_size
+from lightllm.server.core.objs.start_args_type import StartArgs
 
 logger = init_logger(__name__)
 
@@ -51,20 +52,38 @@ def signal_handler(sig, frame):
             process_manager.terminate_all_processes()
             logger.info("All processes have been terminated gracefully.")
             sys.exit(0)
+        elif sig == signal.SIGHUP:
+            logger.info("Received SIGHUP (terminal closed), shutting down gracefully...")
+            if http_server_process and http_server_process.poll() is None:
+                http_server_process.send_signal(signal.SIGTERM)
+
+                start_time = time.time()
+                while (time.time() - start_time) < 60:
+                    if not is_process_active(http_server_process.pid):
+                        logger.info("httpserver exit")
+                        break
+                    time.sleep(1)
+
+                if time.time() - start_time < 60:
+                    logger.info("HTTP server has exited gracefully")
+                else:
+                    logger.warning("HTTP server did not exit in time, killing it...")
+                    kill_recursive(http_server_process)
+
+            process_manager.terminate_all_processes()
+            logger.info("All processes have been terminated gracefully due to terminal closure.")
+            sys.exit(0)
 
     signal.signal(signal.SIGTERM, signal_handler)
     signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGHUP, signal_handler)
 
     logger.info(f"start process pid {os.getpid()}")
     logger.info(f"http server pid {http_server_process.pid}")
     return
 
 
-def normal_or_p_d_start(args):
-    from lightllm.server.core.objs.start_args_type import StartArgs
-
-    args: StartArgs = args
-
+def normal_or_p_d_start(args: StartArgs):
     set_unique_server_name(args)
 
     if not args.disable_shm_warning:
@@ -376,7 +395,7 @@ def normal_or_p_d_start(args):
     return
 
 
-def pd_master_start(args):
+def pd_master_start(args: StartArgs):
     set_unique_server_name(args)
     if args.run_mode != "pd_master":
         return
@@ -439,7 +458,7 @@ def pd_master_start(args):
     http_server_process.wait()
 
 
-def config_server_start(args):
+def config_server_start(args: StartArgs):
     set_unique_server_name(args)
     if args.run_mode != "config_server":
         return
diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py
index 71cafd6c4..6a4ca401d 100644
--- a/lightllm/server/core/objs/start_args_type.py
+++ b/lightllm/server/core/objs/start_args_type.py
@@ -1,37 +1,42 @@
 from dataclasses import dataclass, field
 from typing import List, Optional, Tuple
 
-# 只是为了更好的编程提示
+# Server start-up arguments
 
 
 @dataclass
 class StartArgs:
     run_mode: str = field(
         default="normal",
-        metadata={"choices": ["normal", "prefill", "decode", "pd_master", "nixl_prefill", "nixl_decode"]},
+        metadata={
+            "choices": ["normal", "prefill", "decode", "nixl_prefill", "nixl_decode", "pd_master", "config_server"]
+        },
     )
     host: str = field(default="127.0.0.1")
     port: int = field(default=8000)
+    httpserver_workers: int = field(default=1)
     zmq_mode: str = field(
         default="ipc:///tmp/",
         metadata={"help": "use socket mode or ipc mode, only can be set in ['tcp://', 'ipc:///tmp/']"},
     )
-    pd_master_ip: str = field(default="127.0.0.1")
+    pd_master_ip: str = field(default="0.0.0.0")
     pd_master_port: int = field(default=1212)
     config_server_host: str = field(default=None)
     config_server_port: int = field(default=None)
     pd_decode_rpyc_port: int = field(default=42000)
-    select_p_d_node_strategy: str = field(default=None)
+    select_p_d_node_strategy: str = field(
+        default="round_robin", metadata={"choices": ["random", "round_robin", "adaptive_load"]}
+    )
     model_name: str = field(default="default_model_name")
     model_dir: Optional[str] = field(default=None)
-    tokenizer_mode: str = field(default="slow")
+    tokenizer_mode: str = field(default="fast")
     load_way: str = field(default="HF")
     max_total_token_num: Optional[int] = field(default=None)
     mem_fraction: float = field(default=0.9)
     batch_max_tokens: Optional[int] = field(default=None)
-    eos_id: List[int] = field(default_factory=list)
+    eos_id: Optional[List[int]] = field(default=None)
     tool_call_parser: Optional[str] = field(
-        default=None, metadata={"choices": ["llama3", "qwen25", "mistral", "deepseekv3", "kimi_k2", "qwen"]}
+        default=None, metadata={"choices": ["qwen25", "llama3", "mistral", "deepseekv3", "qwen"]}
     )
     chat_template: Optional[str] = field(default=None)
     running_max_req_size: int = field(default=1000)
@@ -39,11 +44,11 @@ class StartArgs:
     dp: int = field(default=1)
     nnodes: int = field(default=1)
     node_rank: int = field(default=0)
-    max_req_total_len: int = field(default=2048 + 1024)
+    max_req_total_len: int = field(default=16384)
     nccl_host: str = field(default="127.0.0.1")
     nccl_port: int = field(default=28765)
     use_config_server_to_init_nccl: bool = field(default=False)
-    mode: List[str] = field(default_factory=list)
+    mode: List[str] = field(default_factory=lambda: [])
     trust_remote_code: bool = field(default=False)
     disable_log_stats: bool = field(default=False)
     log_stats_interval: int = field(default=10)
@@ -52,14 +57,12 @@ class StartArgs:
     router_max_wait_tokens: int = field(default=1)
     disable_aggressive_schedule: bool = field(default=False)
     disable_dynamic_prompt_cache: bool = field(default=False)
-    chunked_prefill_size: int = field(default=8192)
+    chunked_prefill_size: int = field(default=4096)
     disable_chunked_prefill: bool = field(default=False)
     diverse_mode: bool = field(default=False)
     token_healing_mode: bool = field(default=False)
-    output_constraint_mode: str = field(default="none", metadata={"choices": ["none", "simple", "xgrammar"]})
+    output_constraint_mode: str = field(default="none", metadata={"choices": ["outlines", "xgrammar", "none"]})
     first_token_constraint_mode: bool = field(default=False)
-    enable_multimodal: bool = field(default=False)
-    enable_multimodal_audio: bool = field(default=False)
     enable_tpsp_mix_mode: bool = field(default=False)
     enable_dp_prefill_balance: bool = field(default=False)
     enable_decode_microbatch_overlap: bool = field(default=False)
@@ -75,11 +78,11 @@ class StartArgs:
     health_monitor: bool = field(default=False)
     metric_gateway: Optional[str] = field(default=None)
     job_name: str = field(default="lightllm")
-    grouping_key: List[str] = field(default_factory=list)
+    grouping_key: List[str] = field(default_factory=lambda: [])
     push_interval: int = field(default=10)
     visual_infer_batch_size: int = field(default=1)
     visual_send_batch_size: int = field(default=1)
-    visual_gpu_ids: List[int] = field(default_factory=lambda: [0])
+    visual_gpu_ids: Optional[List[int]] = field(default=None)
     visual_tp: int = field(default=1)
     visual_dp: int = field(default=1)
     visual_nccl_ports: List[int] = field(default_factory=lambda: [29500])
@@ -88,10 +91,10 @@ class StartArgs:
     graph_max_batch_size: int = field(default=256)
     graph_split_batch_size: int = field(default=32)
     graph_grow_step_size: int = field(default=16)
-    graph_max_len_in_batch: int = field(default=8192)
-    quant_type: Optional[str] = field(default=None)
+    graph_max_len_in_batch: int = field(default=0)
+    quant_type: Optional[str] = field(default="none")
     quant_cfg: Optional[str] = field(default=None)
-    vit_quant_type: Optional[str] = field(default=None)
+    vit_quant_type: Optional[str] = field(default="none")
     vit_quant_cfg: Optional[str] = field(default=None)
     enable_flashinfer_prefill: bool = field(default=False)
     enable_flashinfer_decode: bool = field(default=False)
@@ -101,7 +104,9 @@ class StartArgs:
     )
     ep_redundancy_expert_config_path: Optional[str] = field(default=None)
     auto_update_redundancy_expert: bool = field(default=False)
-    mtp_mode: Optional[str] = field(default=None)
+    mtp_mode: Optional[str] = field(
+        default=None, metadata={"choices": ["deepseekv3_vanilla", "deepseekv3_eagle", None]}
+    )
     mtp_draft_model_dir: Optional[str] = field(default=None)
     mtp_step: int = field(default=0)
     kv_quant_calibration_config_path: Optional[str] = field(default=None)
@@ -110,7 +115,7 @@ class StartArgs:
     pd_node_id: int = field(default=-1)
     enable_cpu_cache: bool = field(default=False)
     cpu_cache_storage_size: float = field(default=2)
-    cpu_cache_token_page_size: int = field(default=64)
+    cpu_cache_token_page_size: int = field(default=256)
     enable_disk_cache: bool = field(default=False)
     disk_cache_storage_size: float = field(default=10)
     disk_cache_dir: Optional[str] = field(default=None)
@@ -131,3 +136,18 @@ class StartArgs:
 
     # kernel setting
     enable_fa3: bool = field(default=False)
+
+    # httpserver_workers is already declared earlier in this class, so it is not repeated here
+    disable_shm_warning: bool = field(default=False)
+    dp_balancer: str = field(default="bs_balancer", metadata={"choices": ["round_robin", "bs_balancer"]})
+    enable_custom_allgather: bool = field(default=False)
+    enable_fused_shared_experts: bool = field(default=False)
+    enable_mps: bool = field(default=False)
+    multinode_router_gloo_port: int = field(default=20001)
+    schedule_time_interval: float = field(default=0.03)
+    use_dynamic_prompt_cache: bool = field(default=False)
+    disable_custom_allreduce: bool = field(default=False)
+
+    # hybrid attention model
+    mamba_cache_size: int = field(default=3000)  # same default as --mamba_cache_size in api_cli.py
+    mamba_ssm_data_type: Optional[str] = field(default="float32", metadata={"choices": ["bfloat16", "float32"]})
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index e1cb32b88..434848f09 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -599,7 +599,7 @@ async def _wait_to_token_package(
                             (out_token_counter - metadata["mtp_accepted_token_num"]), 1
                         )
                         format_start_time = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S")
-                        logger.info(
+                        logger.debug(
                             f"X-Request-Id:{x_request_id} "
                             f"X-Session-Id:{x_session_id} start_time:{format_start_time} "
                             f"lightllm_req_id:{group_request_id} first_token_cost:{first_token_cost_ms}ms "
diff --git a/lightllm/server/router/dynamic_prompt/hybrid_radix_cache.py b/lightllm/server/router/dynamic_prompt/hybrid_radix_cache.py
new file mode 100644
index 000000000..e77363c76
--- /dev/null
+++ b/lightllm/server/router/dynamic_prompt/hybrid_radix_cache.py
@@ -0,0 +1,143 @@
+from typing import Set, Protocol, List, Optional, Tuple
+
+import torch
+from sortedcontainers import SortedSet
+
+from lightllm.server.router.dynamic_prompt.radix_cache import RadixCache, TreeNode
+from lightllm.common.kv_cache_mem_manager.mem_manager import MemoryManager
+
+
+class HybridMemManager(MemoryManager):
+    def alloc_buffer(self, need_size):
+        """Stub: reserve ``need_size`` buffers; HybridRadixCache indexes the returned value per buffer."""
+
+    def free_buffer(self, free_buffer_indexes):
+        """Stub: give the listed buffer indexes back to the manager."""
+
+    def get_buffer(self, layer_index):
+        """Stub: presumably returns the buffer of layer ``layer_index`` (not called in this file; confirm)."""
+
+    def get_buffer_can_use_size(self):
+        """Stub: number of buffers currently available for allocation."""
+
+    def copy_buffer(self, src_idx, tgt_idx):
+        """Stub: copy the contents of buffer ``src_idx`` into buffer ``tgt_idx``."""
+
+
+class HybridRadixCache(RadixCache):
+    def __init__(self, unique_name, total_token_num, rank_in_node, mem_manager=None):
+        self.mem_manager: HybridMemManager = mem_manager
+        super().__init__(unique_name, total_token_num, rank_in_node, mem_manager)
+        # nodes ordered by time_id; evict_buffer() pops from the front of this set
+        self.evict_buffer_set: Set[TreeNode] = SortedSet(key=lambda x: (x.time_id,))
+
+    def free_radix_cache_to_get_enough_buffer(self, need_buffer_num):
+        """Evict cached nodes until the mem manager can hand out ``need_buffer_num`` buffers.
+
+        No-op when enough buffers are free; evicted buffers and token indexes go back to the mem manager.
+        """
+        if need_buffer_num > self.mem_manager.get_buffer_can_use_size():
+            need_evict_buffer_num = need_buffer_num - self.mem_manager.get_buffer_can_use_size()
+            # evict_buffer() reports every evicted buffer index / token index tensor through the callbacks
+            release_mems = []
+            release_buffers = []
+            self.evict_buffer(need_evict_buffer_num, release_buffers.append, release_mems.append)
+            self.mem_manager.free_buffer(release_buffers)
+            if len(release_mems) > 0:
+                mem_index = torch.concat(release_mems)
+                self.mem_manager.free(mem_index)
+
+    def evict_buffer(self, need_evict_buffer_num, evict_buffer_callback, evict_token_callback):
+        """Pop ``need_evict_buffer_num`` nodes from ``evict_buffer_set`` and pass their indexes to the callbacks."""
+        while need_evict_buffer_num > 0:
+            node = self.evict_buffer_set.pop(0)
+            assert node.buffer_idx is not None
+            evict_buffer_callback(node.buffer_idx)
+            evict_token_callback(node.token_mem_index_value)
+            need_evict_buffer_num -= 1
+            self._remove_leaf_node(node)
+
+    def insert_for_hybrid_radix_cache(self, reqs):
+        """Insert each request's current KV prefix into the cache together with a copy of its buffer."""
+        from lightllm.server.router.model_infer.infer_batch import g_infer_context
+        from lightllm.common.basemodel.infer_lock import g_infer_state_lock
+
+        # make sure there is enough room for the new buffers
+        g_infer_state_lock.acquire()
+        self.free_radix_cache_to_get_enough_buffer(len(reqs))
+        new_buffer_indexes = self.mem_manager.alloc_buffer(len(reqs))
+        g_infer_state_lock.release()
+
+        for i, req in enumerate(reqs):
+            input_token_ids = req.get_input_token_ids()
+            key = torch.tensor(input_token_ids[0 : req.cur_kv_len], dtype=torch.int64, device="cpu")
+            value = g_infer_context.req_manager.req_to_token_indexs[req.req_idx][: req.cur_kv_len].cpu()
+            cur_buffer_idx = g_infer_context.req_manager.req_to_buffer_indexes[req.req_idx]
+            # copy the content of the request's current buffer into its newly allocated buffer
+            self.mem_manager.copy_buffer(cur_buffer_idx, new_buffer_indexes[i])
+
+            _, new_shared_kv_node = self.insert(key, value)
+            new_shared_kv_node.buffer_idx = new_buffer_indexes[i]
+            self.dec_node_ref_counter(req.shared_kv_node)
+            self.add_node_ref_counter(new_shared_kv_node)
+            if req.shared_kv_node is not None and req.shared_kv_node.buffer_idx is not None:
+                self.update_buffer_evict_set(req.shared_kv_node)
+            req.shared_kv_node = new_shared_kv_node
+
+    def match_prefix(self, key, update_refs=False):
+        """Return (node, length, token_indexes) for the deepest matched buffered node, else (None, 0, None)."""
+        assert len(key) != 0
+        ans_value_list = []
+        tree_node = self._match_prefix_helper(self.root_node, key, ans_value_list, update_refs=update_refs)
+
+        while tree_node != self.root_node and tree_node.buffer_idx is None:
+            self.dec_node_ref_counter(tree_node)
+            if tree_node.is_leaf() and tree_node.ref_counter == 0:
+                tree_node = self._remove_leaf_node(tree_node)
+            else:
+                tree_node = tree_node.parent
+            ans_value_list.pop()
+
+        if tree_node == self.root_node:
+            return None, 0, None
+
+        value = torch.concat(ans_value_list)
+        self.update_buffer_evict_set(tree_node)
+        return tree_node, len(value), value
+
+    def _remove_leaf_node(self, node: TreeNode):
+        """Detach ``node`` from the tree and from both evict sets; return its parent."""
+        self.evict_tree_set.discard(node)
+        self.evict_buffer_set.discard(node)
+        self.tree_total_tokens_num.arr[0] -= len(node.token_mem_index_value)
+        parent_node: TreeNode = node.parent
+        parent_node.remove_child(node)
+        if parent_node.is_leaf():
+            self.evict_tree_set.add(parent_node)
+            if parent_node.buffer_idx is not None:
+                self.update_buffer_evict_set(parent_node)
+        return parent_node
+
+    def insert(self, key, value=None) -> Tuple[int, Optional[TreeNode]]:
+        """Insert via the base class and (re-)register the returned node in ``evict_buffer_set``."""
+        prefix_len, node = super().insert(key, value)
+        if node is not None:  # nothing to track when the base insert returns no node
+            self.evict_buffer_set.discard(node)  # as in update_buffer_evict_set: remove before the time update
+            node.update_buffer_time()
+            self.evict_buffer_set.add(node)
+        return prefix_len, node
+
+    def update_buffer_evict_set(self, node: TreeNode):
+        """Add ``node`` to ``evict_buffer_set``; if already there, refresh its time and recurse to its parent."""
+        if node is None or node.buffer_idx is None:
+            return
+
+        if node not in self.evict_buffer_set:
+            self.evict_buffer_set.add(node)
+            return
+
+        self.evict_buffer_set.discard(node)
+        node.update_buffer_time()
+        self.evict_buffer_set.add(node)
+
+        self.update_buffer_evict_set(node.parent)
diff --git a/lightllm/server/router/dynamic_prompt/radix_cache.py b/lightllm/server/router/dynamic_prompt/radix_cache.py
index c51774898..12b15e7dc 100644
--- a/lightllm/server/router/dynamic_prompt/radix_cache.py
+++ b/lightllm/server/router/dynamic_prompt/radix_cache.py
@@ -31,6 +31,12 @@ def __init__(self):
         self.node_value_len = 0
         self.node_prefix_total_len = 0
 
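+        # Used only by hybrid-attention models (e.g. Qwen3Next): each request of such a
+        # model has to manage its own unique buffer_idx. The field lives here so these
+        # models can reuse the existing radix_cache code.
+        # For pure-attention models buffer_idx always stays None.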
+        self.buffer_idx = None
+
     def get_compare_key(self):
         return (0 if self.ref_counter == 0 else 1, len(self.children), self.time_id)
 
diff --git a/lightllm/server/router/model_infer/infer_batch.py b/lightllm/server/router/model_infer/infer_batch.py
index ab2965887..c2df005fe 100644
--- a/lightllm/server/router/model_infer/infer_batch.py
+++ b/lightllm/server/router/model_infer/infer_batch.py
@@ -111,7 +111,9 @@ def free_a_req_mem(self, free_token_index: List, req: "InferReq"):
             # .cpu() 是 流内阻塞操作
             value = self.req_manager.req_to_token_indexs[req.req_idx][: req.cur_kv_len].detach().cpu()
 
-            prefix_len, _ = self.radix_cache.insert(key, value)
+            prefix_len, node = self.radix_cache.insert(key, value)
+            if hasattr(self.req_manager, "req_to_buffer_indexes"):
+                node.buffer_idx = self.req_manager.req_to_buffer_indexes[req.req_idx]
             old_prefix_len = 0 if req.shared_kv_node is None else req.shared_kv_node.node_prefix_total_len
             free_token_index.append(self.req_manager.req_to_token_indexs[req.req_idx][old_prefix_len:prefix_len])
             if req.shared_kv_node is not None:
@@ -180,8 +182,10 @@ def pause_reqs(self, pause_reqs: List["InferReq"], is_master_in_dp: bool):
         if pause_reqs:
             g_infer_state_lock.acquire()
 
+            pause_req_ids = []
             free_token_index = []
             for req in pause_reqs:
+                pause_req_ids.append(req.req_id)
                 if self.args.diverse_mode:
                     # 发生暂停的时候，需要清除 diverse 模式下的主从关系
                     req.clear_master_slave_state()
@@ -198,6 +202,9 @@ def pause_reqs(self, pause_reqs: List["InferReq"], is_master_in_dp: bool):
                 free_token_index = custom_cat(free_token_index)
                 self.req_manager.free_token(free_token_index)
 
+            if hasattr(self.req_manager, "free_buffer"):
+                self.req_manager.free_buffer(pause_req_ids)
+
             g_infer_state_lock.release()
         return self
 
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
index a780c4da0..caba3e36b 100644
--- a/lightllm/server/router/model_infer/mode_backend/base_backend.py
+++ b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -10,6 +10,7 @@
 from lightllm.utils.log_utils import init_logger
 from lightllm.models import get_model
 from lightllm.server.router.dynamic_prompt.radix_cache import RadixCache
+from lightllm.server.router.dynamic_prompt.hybrid_radix_cache import HybridRadixCache
 from lightllm.server.router.model_infer.infer_batch import InferReq, InferReqUpdatePack
 from lightllm.server.router.token_load import TokenLoad
 from lightllm.common.basemodel.infer_lock import g_infer_state_lock, InferStateLock
@@ -40,6 +41,8 @@
 from lightllm.server.pd_io_struct import NIXLChunckedTransTaskRet
 from .multi_level_kv_cache import MultiLevelKvCacheModule
 
+logger = init_logger(__name__)
+
 
 class ModeBackend:
     def __init__(self) -> None:
@@ -138,6 +141,7 @@ def init_model(self, kvargs):
             wait_events.append(self.multi_level_cache_module)
 
         model_cfg, _ = PretrainedConfig.get_config_dict(self.weight_dir)
+        self.is_hybrid_model = model_cfg.get("model_type", "") in ["qwen3_next"]
 
         model_kvargs = {
             "weight_dir": self.weight_dir,
@@ -163,8 +167,9 @@ def init_model(self, kvargs):
         self.model, self.is_multimodal = get_model(model_cfg, model_kvargs)
         self.model: TpPartBaseModel = self.model  # for easy typing
         set_random_seed(2147483647)
+        radix_cache_class = HybridRadixCache if self.is_hybrid_model else RadixCache
         self.radix_cache = (
-            RadixCache(
+            radix_cache_class(
                 get_unique_server_name(),
                 self.model.mem_manager.size,
                 self.rank_in_node,
@@ -186,7 +191,6 @@ def init_model(self, kvargs):
             shm_req_manager=self.shm_req_manager,
             vocab_size=self.model.vocab_size,
         )
-
         # 初始化 dp 模式使用的通信 tensor, 对于非dp模式，不会使用到
         if self.dp_size > 1:
             self.dp_reduce_tensor = torch.tensor([0], dtype=torch.int32, device="cuda", requires_grad=False)
diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
index 2c3cfaf11..23586d92f 100644
--- a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
+++ b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
@@ -24,6 +24,7 @@
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.utils.envs_utils import get_env_start_args
+from lightllm.server.router.dynamic_prompt.hybrid_radix_cache import HybridRadixCache
 from .control_state import ControlState
 
 logger = init_logger(__name__)
@@ -127,6 +128,9 @@ def prefill_normal(
         event_pack.notify_post_handle_and_wait_pre_post_handle()
         update_packs = self._pre_post_handle(run_reqs, is_chuncked_mode=not self.disable_chunked_prefill)
 
+        if isinstance(g_infer_context.radix_cache, HybridRadixCache):
+            g_infer_context.radix_cache.insert_for_hybrid_radix_cache(run_reqs)
+
         # 第三阶段
         event_pack.notify_forward_and_wait_post_handle()
         sync_event.synchronize()
diff --git a/lightllm/utils/log_utils.py b/lightllm/utils/log_utils.py
index f15309d5c..799786fba 100644
--- a/lightllm/utils/log_utils.py
+++ b/lightllm/utils/log_utils.py
@@ -10,7 +10,7 @@
 _FORMAT = "%(levelname)s %(asctime)s [%(filename)s:%(lineno)d] %(message)s"
 _DATE_FORMAT = "%m-%d %H:%M:%S"
 
-_LOG_LEVEL = os.environ.get("LIGHTLLM_LOG_LEVEL", "debug")
+_LOG_LEVEL = os.environ.get("LIGHTLLM_LOG_LEVEL", "info")
 _LOG_LEVEL = getattr(logging, _LOG_LEVEL.upper(), 0)
 _LOG_DIR = os.environ.get("LIGHTLLM_LOG_DIR", None)
 
diff --git a/test/benchmark/service/benchmark_gsm8k.py b/test/benchmark/service/benchmark_gsm8k.py
new file mode 100644
index 000000000..def3fbcb5
--- /dev/null
+++ b/test/benchmark/service/benchmark_gsm8k.py
@@ -0,0 +1,231 @@
+# Adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
+import argparse
+import ast
+import json
+import os
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Optional
+
+import numpy as np
+import requests
+from tqdm import tqdm
+
+INVALID = -9999999
+
+
+def read_jsonl(filename: str):
+    """Yield one parsed JSON object per line of *filename*, skipping blank and ``#``-prefixed lines."""
+    with open(filename, encoding="utf-8") as fin:
+        for line in fin:
+            # blank lines (e.g. an empty line at the end of the file) would make json.loads raise
+            if line.strip() and not line.startswith("#"):
+                yield json.loads(line)
+
+
+def dump_state_text(filename: str, states: list, mode: str = "w"):
+    """Write every entry of *states* to *filename*, each under a ``==== <index> ====`` header line.
+
+    Entries that are not already strings are converted with ``str()``; *mode* is passed to ``open``.
+    """
+    with open(filename, mode) as out_file:
+        for index, state in enumerate(states):
+            out_file.write(f"==== {index} ====\n{state if isinstance(state, str) else str(state)}\n")
+
+
+def download_and_cache_file(url: str, filename: Optional[str] = None):
+    """Download *url* to *filename* unless that file already exists, and return the local path.
+
+    When *filename* is None the file is cached under /tmp using the last path component of *url*.
+    Raises ``requests.HTTPError`` (via ``raise_for_status``) when the server answers with an error status.
+    """
+    if filename is None:
+        filename = os.path.join("/tmp", url.split("/")[-1])
+
+    # Check if the cache file already exists
+    if os.path.exists(filename):
+        return filename
+
+    print(f"Downloading from {url} to {filename}")
+
+    # Stream the response to show the progress bar; the timeout keeps a dead server from hanging the run
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()  # Check for request errors
+
+    # Total size of the file in bytes
+    total_size = int(response.headers.get("content-length", 0))
+    chunk_size = 1024  # Download in chunks of 1KB
+
+    # Write to a temporary name first so an interrupted download is never mistaken for a valid cache entry
+    tmp_filename = filename + ".part"
+    with open(tmp_filename, "wb") as file, tqdm(
+        desc="Downloading", total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
+    ) as bar:
+        for chunk in response.iter_content(chunk_size=chunk_size):
+            bar.update(file.write(chunk))
+    os.replace(tmp_filename, filename)
+    return filename
+
+
+def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
+    """POST *prompt* to the LightLLM generate endpoint at *url* and return the first generated text.
+
+    Raises AssertionError when *url* is None or the HTTP status is not 200, and ValueError when the
+    JSON reply lacks a non-empty ``generated_text`` list.
+    """
+    assert url is not None
+
+    sampling_params = {
+        "temperature": temperature,
+        "max_new_tokens": max_tokens,
+        "stop_sequences": stop,
+    }
+    res = requests.post(url, json={"inputs": prompt, "parameters": sampling_params})
+    assert res.status_code == 200, f"API request failed with status code {res.status_code}: {res.text}"
+
+    response_json = res.json()
+    if "generated_text" not in response_json:
+        raise ValueError(f"Invalid API response format. Expected 'generated_text' key, got: {response_json.keys()}")
+    texts = response_json["generated_text"]
+    if not isinstance(texts, list) or len(texts) == 0:
+        raise ValueError(
+            f"Invalid API response format. 'generated_text' should be a non-empty list,"
+            f" got: {response_json['generated_text']}"
+        )
+    return texts[0]
+
+
+def get_one_example(lines, i, include_answer):
+    """Render ``lines[i]`` as a question prompt; the reference answer is appended when *include_answer*."""
+    sample = lines[i]
+    answer_suffix = " " + sample["answer"] if include_answer else ""
+    return "Question: " + sample["question"] + "\nAnswer:" + answer_suffix
+
+
+def get_few_shot_examples(lines, k):
+    """Concatenate the first *k* samples, answers included, each followed by a blank line."""
+    # build the pieces first and join once; every example keeps its trailing "\n\n" separator
+    shots = [get_one_example(lines, idx, True) + "\n\n" for idx in range(k)]
+    return "".join(shots)
+
+
+def get_answer_value(answer_str):
+    """Parse the last run of digits in *answer_str* (commas removed first); INVALID if that fails."""
+    digit_runs = re.findall(r"\d+", answer_str.replace(",", ""))
+    if not digit_runs:
+        return INVALID
+    try:
+        return ast.literal_eval(digit_runs[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def parse_args():
+    """Parse the benchmark's command line options (server address, shot/question counts, output paths)."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--parallel", type=int, default=64)
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--num-shots", type=int, default=5)
+    parser.add_argument("--num-questions", type=int, default=200)
+    parser.add_argument("--result-file", type=str, default="result.jsonl")
+    parser.add_argument("--data-path", type=str, default="test.jsonl")  # NOTE(review): not read by main()
+    return parser.parse_args()
+
+
+def main(args):
+    """Run the few-shot GSM8K benchmark against a LightLLM server and append a summary to ``args.result_file``."""
+    # LightLLM API URL
+    url = f"{args.host}:{args.port}/generate"
+
+    # Read data
+    url_data = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+    filename = download_and_cache_file(url_data)
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    # Ensure we have enough samples and avoid data leakage
+    # Test questions should start after few-shot examples
+    max_available = len(lines) - num_shots
+    if num_questions > max_available:
+        print(
+            f"Warning: Requested {num_questions} questions, "
+            f"but only {max_available} available after reserving {num_shots} for few-shot. "
+            f"Using {max_available} questions."
+        )
+        num_questions = max_available
+
+    questions = []
+    labels = []
+    for i in range(num_shots, num_shots + num_questions):
+        questions.append(get_one_example(lines, i, False))
+        labels.append(get_answer_value(lines[i]["answer"]))
+    # assert all(l != INVALID for l in labels)
+
+    states = [None] * len(labels)
+
+    # Run requests using thread pool
+    def get_one_answer(i):
+        answer = call_generate_lightllm(
+            prompt=few_shot_examples + questions[i],
+            temperature=0,
+            max_tokens=256,
+            stop=["Question", "Assistant:", "<|separator|>"],
+            url=url,
+        )
+        states[i] = answer
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(questions))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(questions)))),
+                    total=len(questions),
+                )
+            )
+
+    latency = time.perf_counter() - tic
+
+    preds = [get_answer_value(state) for state in states]
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Latency: {latency:.3f} s")
+
+    # Dump results
+    dump_state_text("tmp_output_lightllm.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        # report the number of questions actually sent; it may have been clamped to max_available above
+        value = {
+            "task": "gsm8k",
+            "backend": "lightllm",
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": num_questions,
+            "other": {
+                "num_questions": num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/test/test_api/test_chat.py b/test/test_api/test_chat.py
new file mode 100644
index 000000000..8ad7d04fe
--- /dev/null
+++ b/test/test_api/test_chat.py
@@ -0,0 +1,183 @@
+from openai import OpenAI
+from datetime import datetime
+import argparse
+import threading
+import random
+from typing import List
+
+
+class OpenAIMultiTurnChat:
+    def __init__(self, api_key: str, model: str = "gpt-3.5-turbo", base_url: str = None, client_id: int = 0):
+        """
+        Set up a multi-turn chat session against an OpenAI-compatible API.
+
+        Args:
+            api_key: API key passed to the OpenAI client
+            model: name of the model to request
+            base_url: API base URL (when going through a proxy or another service)
+            client_id: client ID (used for concurrent testing)
+        """
+        self.client = OpenAI(api_key=api_key, base_url=base_url)
+        self.model = model
+        self.conversation_history = []
+        self.client_id = client_id
+
+    def add_message(self, role: str, content: str):
+        """Append a message with the given role and content to the conversation history."""
+        self.conversation_history.append({"role": role, "content": content})
+
+    def get_response(self, user_message: str, verbose: bool = True) -> str:
+        """Stream the reply to ``user_message``; on failure roll the turn back and return an error string."""
+        turn_start = len(self.conversation_history)
+        self.add_message("user", user_message)
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model, messages=self.conversation_history, max_tokens=1000, stream=True
+            )
+            assistant_reply = ""
+            if verbose:
+                print(f"AI (用户_{self.client_id}): ", end="", flush=True)
+            for chunk in response:
+                # A chunk may carry no choices (or no text); skip it instead of indexing blindly.
+                content = chunk.choices[0].delta.content if chunk.choices else None
+                if content is not None:
+                    assistant_reply += content
+                    if verbose:
+                        print(content, end="", flush=True)
+
+            if verbose:
+                print()  # end the streamed line
+            self.add_message("assistant", assistant_reply)
+            return assistant_reply
+
+        except Exception as e:
+            # Drop this turn so a failed request does not leave an unanswered user message behind.
+            del self.conversation_history[turn_start:]
+            if verbose:
+                print(f"请求失败: {e}")
+            return "请求失败，请检查网络连接或 API 密钥"
+
+    def start_conversation(self, system_prompt: str = None):
+        """Reset the history and run an interactive prompt loop until the user quits."""
+        self.conversation_history = []
+
+        if system_prompt:
+            self.add_message("system", system_prompt)
+
+        print(f"开始多轮对话 - 用户_{self.client_id} (输入 'quit' 或 'exit' 退出)")
+        print("-" * 50)
+
+        while True:
+            user_input = input(f"用户_{self.client_id}: ").strip()
+
+            if user_input.lower() in ["quit", "exit", "退出"]:
+                print("对话结束")
+                break
+
+            if not user_input:
+                continue
+
+            self.get_response(user_input)
+            print()
+
+
+class ParallelChatManager:
+    """Fans one user input out to several chat clients that run concurrently."""
+
+    def __init__(self, api_key: str, model: str, base_url: str, parallel: int, system_prompt: str = None):
+        """
+        Create ``parallel`` independent chat clients that share the same settings.
+
+        Args:
+            api_key: API key passed to every client
+            model: name of the model to request
+            base_url: API base URL
+            parallel: number of concurrent clients
+            system_prompt: optional system prompt added to each client's history
+        """
+        self.parallel = parallel
+        self.system_prompt = system_prompt
+        self.clients: List[OpenAIMultiTurnChat] = []
+
+        # One client per slot; the slot index is also used as the client id.
+        for slot in range(parallel):
+            chat = OpenAIMultiTurnChat(api_key=api_key, model=model, base_url=base_url, client_id=slot)
+            if system_prompt:
+                chat.add_message("system", system_prompt)
+            self.clients.append(chat)
+
+    def parallel_request(self, user_message: str):
+        """
+        Send ``user_message`` through every client at once.
+
+        Returns the replies as a list indexed by client position.
+        """
+        # Only one randomly picked client prints its output.
+        printing_slot = random.randint(0, self.parallel - 1)
+        replies = [None] * self.parallel
+
+        def worker(client_idx: int):
+            # Each thread fills in only its own slot of the shared list.
+            echo = client_idx == printing_slot
+            replies[client_idx] = self.clients[client_idx].get_response(user_message, verbose=echo)
+
+        # One thread per client: start them all ...
+        pool = [threading.Thread(target=worker, args=(slot,)) for slot in range(self.parallel)]
+        for thread in pool:
+            thread.start()
+
+        # ... then block until every one has finished.
+        for thread in pool:
+            thread.join()
+
+        return replies
+
+    def start_conversation(self):
+        """Read user input in a loop and broadcast each non-empty line until a quit word is entered."""
+        print(f"开始并发多轮对话 (并发数: {self.parallel})")
+        print("所有客户端输入相同内容，随机显示其中一个客户端的输出")
+        print("输入 'quit' 或 'exit' 退出")
+        print("-" * 50)
+
+        while True:
+            text = input("用户输入: ").strip()
+
+            if text.lower() in ("quit", "exit", "退出"):
+                print("对话结束")
+                return
+
+            # Blank input is ignored; anything else goes to all clients.
+            if text:
+                print(f"\n[并发请求中... 并发数: {self.parallel}]")
+                self.parallel_request(text)
+                print()
+
+
+# Example usage
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser(description="OpenAI 多轮对话客户端")
+    cli.add_argument("--port", type=int, default=13688, help="服务端口号 (默认: 13688)")
+    cli.add_argument("--host", type=str, default="localhost", help="服务主机地址 (默认: localhost)")
+    cli.add_argument("--api-key", type=str, default="", help="API 密钥 (默认: 空)")
+    cli.add_argument("--model", type=str, default="gpt-3.5-turbo", help="模型名称 (默认: gpt-3.5-turbo)")
+    cli.add_argument("--system-prompt", type=str, default="你是一个有用的助手。", help="系统提示词")
+    cli.add_argument("--parallel", type=int, default=1, help="并发客户端数量 (默认: 1, 不并发)")
+
+    opts = cli.parse_args()
+
+    # Both modes use the same endpoint and credentials.
+    endpoint = f"http://{opts.host}:{opts.port}/v1"
+    shared = {"api_key": opts.api_key, "model": opts.model, "base_url": endpoint}
+
+    if opts.parallel <= 1:
+        # Single-client mode
+        single = OpenAIMultiTurnChat(**shared)
+        single.start_conversation(opts.system_prompt)
+    else:
+        # Concurrent mode
+        session = ParallelChatManager(
+            parallel=opts.parallel,
+            system_prompt=opts.system_prompt,
+            **shared,
+        )
+        session.start_conversation()
diff --git a/test/test_api/test_gsmk.py b/test/test_api/test_gsmk.py
new file mode 100644
index 000000000..866dd0f01
--- /dev/null
+++ b/test/test_api/test_gsmk.py
@@ -0,0 +1,230 @@
+# Adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
+import argparse
+import ast
+import json
+import os
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Optional
+
+import numpy as np
+import requests
+from tqdm import tqdm
+
+INVALID = -9999999
+
+
+def read_jsonl(filename: str):
+    """Yield one parsed JSON object per line, skipping blank lines and lines starting with '#'."""
+    with open(filename, encoding="utf-8") as fin:
+        for line in fin:
+            if not line.strip() or line.startswith("#"):
+                continue
+            yield json.loads(line)
+
+
+def dump_state_text(filename: str, states: list, mode: str = "w"):
+    """Write every state to ``filename`` under a numbered ``==== i ====`` header.
+
+    Each entry is converted with ``str()``; ``mode`` is handed to ``open()`` as-is.
+    """
+    with open(filename, mode) as out_file:
+        for index, state in enumerate(states):
+            out_file.write(f"==== {index} ====\n{str(state)}\n")
+
+
+def download_and_cache_file(url: str, filename: Optional[str] = None):
+    """Download ``url`` once and return the local path, reusing an existing file if present.
+
+    ``filename`` defaults to the last path component of ``url`` placed under /tmp.
+    """
+    if filename is None:
+        filename = os.path.join("/tmp", url.split("/")[-1])
+
+    # Check if the cache file already exists
+    if os.path.exists(filename):
+        return filename
+
+    print(f"Downloading from {url} to {filename}")
+
+    # Stream the response to show the progress bar
+    response = requests.get(url, stream=True)
+    response.raise_for_status()  # Check for request errors
+    total_size = int(response.headers.get("content-length", 0))  # 0 when the header is absent
+    chunk_size = 1024  # Download in chunks of 1KB
+
+    # Write to a side file and rename only after a complete download, so an interrupted
+    # transfer can never be mistaken for a valid cache entry on the next run.
+    partial = filename + ".part"
+    with open(partial, "wb") as file, tqdm(
+        desc="Downloading", total=total_size, unit="iB", unit_scale=True, unit_divisor=1024
+    ) as bar:
+        for chunk in response.iter_content(chunk_size=chunk_size):
+            size = file.write(chunk)
+            bar.update(size)
+    os.replace(partial, filename)
+
+    return filename
+
+
+def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
+    """POST one generation request to ``url`` and return the first generated text.
+
+    Asserts that ``url`` is given and that the server replies with HTTP 200; raises
+    ``ValueError`` when the JSON body lacks a non-empty ``generated_text`` list.
+    """
+    assert url is not None
+
+    sampling = {
+        "temperature": temperature,
+        "max_new_tokens": max_tokens,
+        "stop_sequences": stop,
+    }
+    reply = requests.post(url, json={"inputs": prompt, "parameters": sampling})
+    assert reply.status_code == 200, f"API request failed with status code {reply.status_code}: {reply.text}"
+
+    payload = reply.json()
+    if "generated_text" not in payload:
+        raise ValueError(f"Invalid API response format. Expected 'generated_text' key, got: {payload.keys()}")
+    texts = payload["generated_text"]
+    if not (isinstance(texts, list) and texts):
+        raise ValueError(
+            f"Invalid API response format. 'generated_text' should be a non-empty list, got: {texts}"
+        )
+
+    return texts[0]
+
+
+def get_one_example(lines, i, include_answer):
+    """Render entry ``i`` as a Question/Answer prompt; append the answer text if requested."""
+    sample = lines[i]
+    suffix = " " + sample["answer"] if include_answer else ""
+    return "Question: " + sample["question"] + "\nAnswer:" + suffix
+
+
+def get_few_shot_examples(lines, k):
+    """Concatenate the first ``k`` answered examples, each followed by a blank line."""
+    # include_answer=True: every shot carries its reference answer.
+    shots = (get_one_example(lines, i, True) + "\n\n" for i in range(k))
+    return "".join(shots)
+
+
+def get_answer_value(answer_str):
+    """Return the last digit run in ``answer_str`` (commas removed) as an int, else ``INVALID``."""
+    digit_runs = re.findall(r"\d+", answer_str.replace(",", ""))
+    if not digit_runs:
+        return INVALID
+    try:
+        return int(digit_runs[-1])  # int() accepts leading zeros ("007"); literal_eval does not
+    except ValueError:
+        return INVALID
+
+
+def parse_args():
+    """Declare the script's command-line flags and return the parsed namespace."""
+    flag_specs = [
+        ("--parallel", int, 256), ("--host", str, "http://127.0.0.1"), ("--port", int, 8000),
+        ("--num-shots", int, 5), ("--num-questions", int, 200),
+        ("--result-file", str, "result.jsonl"), ("--data-path", str, "test.jsonl"),
+    ]
+    cli = argparse.ArgumentParser()
+    for flag, kind, default in flag_specs:
+        cli.add_argument(flag, type=kind, default=default)
+    return cli.parse_args()
+
+
+def main(args):
+    """Run the GSM8K few-shot benchmark against a LightLLM server and record the results.
+
+    Reads ``host``, ``port``, ``num_shots``, ``num_questions``, ``parallel`` and ``result_file``
+    from ``args``. NOTE(review): ``--data-path`` is parsed but never read here; the test set is
+    always fetched through ``download_and_cache_file``.
+    """
+    # LightLLM API URL
+    url = f"{args.host}:{args.port}/generate"
+
+    # Read data
+    url_data = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+    filename = download_and_cache_file(url_data)
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    # Ensure we have enough samples and avoid data leakage
+    # Test questions should start after few-shot examples
+    max_available = len(lines) - num_shots
+    if num_questions > max_available:
+        print(
+            "Warning: Requested {} questions, but only {} available after reserving {} for few-shot. "
+            "Using {} questions.".format(num_questions, max_available, num_shots, max_available)
+        )
+        num_questions = max_available
+
+    questions = []
+    labels = []
+    for i in range(num_shots, num_shots + num_questions):
+        questions.append(get_one_example(lines, i, False))
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(label != INVALID for label in labels)
+
+    states = [None] * len(labels)
+
+    # Run requests using thread pool
+    def get_one_answer(i):
+        answer = call_generate_lightllm(
+            prompt=few_shot_examples + questions[i],
+            temperature=0,
+            max_tokens=1024,
+            stop=["Question", "Assistant:", "<|separator|>"],
+            url=url,
+        )
+        states[i] = answer
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(questions))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            list(tqdm(executor.map(get_one_answer, range(len(questions))), total=len(questions)))
+
+    latency = time.perf_counter() - tic
+
+    preds = [get_answer_value(state) for state in states]
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Latency: {latency:.3f} s")
+
+    # Dump results
+    dump_state_text("tmp_output_lightllm.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        # Record the number of questions actually run (after clamping), not the requested count.
+        value = {
+            "task": "gsm8k",
+            "backend": "lightllm",
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": len(questions),
+            "other": {
+                "num_questions": len(questions),
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":  # script entry point: parse the CLI flags and pass them to main()
+    args = parse_args()
+    main(args)