Apply the optional q_norm / k_norm callables inside
process_q_before_logits_calc and process_k_before_logits_calc; each call is
skipped when the corresponding attribute is None.
NOTE(review): line breaks and indentation below were reconstructed from the
hunk header counts (11 old / 13 new lines); confirm against the source commit.

diff --git a/src/cerebras/modelzoo/layers/AttentionLayer.py b/src/cerebras/modelzoo/layers/AttentionLayer.py
index 782285c5..23c57239 100644
--- a/src/cerebras/modelzoo/layers/AttentionLayer.py
+++ b/src/cerebras/modelzoo/layers/AttentionLayer.py
@@ -590,11 +590,13 @@ def apply_rotary_position_embedding(
         return vector
 
     def process_q_before_logits_calc(self, q):
-        # May get overriden by other attention schemas
+        if self.q_norm is not None:
+            q = self.q_norm(q)
         return q
 
     def process_k_before_logits_calc(self, k):
-        # May get overriden by other attention schemas
+        if self.k_norm is not None:
+            k = self.k_norm(k)
         return k
 
     def process_v_before_logits_calc(self, v):