From 1c454e51b4f7de39f5f58f39ca9cc28945259f13 Mon Sep 17 00:00:00 2001
From: Orr Zohar
Date: Tue, 14 Apr 2026 18:20:07 -0700
Subject: [PATCH] fix: apply QK norm in MultiheadAttention hook methods

q_norm and k_norm modules were instantiated from attention_qk_norm_layer
config but never invoked in the forward pass. Apply them in
process_q_before_logits_calc and process_k_before_logits_calc so that
QK normalization actually affects attention logits when configured.

Made-with: Cursor
---
 src/cerebras/modelzoo/layers/AttentionLayer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/cerebras/modelzoo/layers/AttentionLayer.py b/src/cerebras/modelzoo/layers/AttentionLayer.py
index 782285c5..23c57239 100644
--- a/src/cerebras/modelzoo/layers/AttentionLayer.py
+++ b/src/cerebras/modelzoo/layers/AttentionLayer.py
@@ -590,11 +590,13 @@ def apply_rotary_position_embedding(
         return vector

     def process_q_before_logits_calc(self, q):
-        # May get overriden by other attention schemas
+        if self.q_norm is not None:
+            q = self.q_norm(q)
         return q

     def process_k_before_logits_calc(self, k):
-        # May get overriden by other attention schemas
+        if self.k_norm is not None:
+            k = self.k_norm(k)
         return k

     def process_v_before_logits_calc(self, v):