
Commit 7bf21b7

stollem authored and copybara-github committed
Added access to flash attention internals to regular attention
PiperOrigin-RevId: 833353546
1 parent 49d420a commit 7bf21b7

File tree

7 files changed: +100 additions, −23 deletions

  gemma/activations.h
  gemma/attention.cc
  gemma/flash_attention_test.cc
  gemma/flash_structs.h
  ops/ops-inl.h
  ops/ops_test.cc
  util/basics.h

gemma/activations.h

Lines changed: 14 additions & 3 deletions
@@ -46,7 +46,8 @@ static inline float ChooseQueryScale(const ModelConfig& config) {
 struct AttentionActivations {
   AttentionActivations(
       const ModelConfig& config, const LayerConfig& layer_config,
-      size_t batch_size, size_t seq_len, const Allocator& allocator,
+      size_t batch_size, size_t seq_len, AttentionImpl attention_impl,
+      const Allocator& allocator,
       std::vector<hwy::AlignedFreeUniquePtr<uint8_t*[]>>& row_ptrs)
       : // `vocab_size == 0` means it is for Vit part, VitAttention is still
         // MHA and does not use an external KV cache.
@@ -74,13 +75,16 @@ struct AttentionActivations {
             allocator)),
         att_sums(
             MatFactory("att_sums", batch_size, config.model_dim, allocator)),
+        softmax_state(MatFactory("softmax_state", batch_size,
+                                 layer_config.heads, allocator)),
 
         inv_timescale(
             CreateInvTimescale(allocator, layer_config.qkv_dim,
                                layer_config.post_qk == PostQKType::HalfRope)),
         inv_timescale_global(CreateInvTimescale(
             allocator, layer_config.qkv_dim,
-            layer_config.post_qk == PostQKType::HalfRope, 1000000.0)) {
+            layer_config.post_qk == PostQKType::HalfRope, 1000000.0))
+  {
     // Batch size can be 0 in experimental code so do not assert.
     if (batch_size == 0) {
       static std::atomic_flag warned = ATOMIC_FLAG_INIT;
@@ -108,6 +112,7 @@ struct AttentionActivations {
     att.OverrideRows(batch_size);
     att_out.OverrideRows(batch_size);
     att_sums.OverrideRows(batch_size);
+    softmax_state.OverrideRows(batch_size);
 
     // `inv_timescale*` are not batched.
   }
@@ -121,6 +126,7 @@ struct AttentionActivations {
   MatStorageT<float> att_out;  // attention output
   // Accumulation of attention outputs over heads
   MatStorageT<BF16> att_sums;
+  MatStorageT<OnlineSoftmaxState> softmax_state;
 
   // Rope
   MatStorageT<float> inv_timescale;
@@ -145,6 +151,7 @@ struct AttentionActivationsPtrs {
     att = activations.att;
     att_out = activations.att_out;
     att_sums = activations.att_sums;
+    softmax_state = activations.softmax_state;
     inv_timescale = activations.inv_timescale;
     inv_timescale_global = activations.inv_timescale_global;
   }
@@ -157,6 +164,7 @@ struct AttentionActivationsPtrs {
     att.OverrideRows(batch_size);
     att_out.OverrideRows(batch_size);
     att_sums.OverrideRows(batch_size);
+    softmax_state.OverrideRows(batch_size);
     // `inv_timescale*` are not batched.
   }
 
@@ -182,6 +190,8 @@ struct AttentionActivationsPtrs {
   // Accumulation of attention outputs over heads, size batch_size x
   // model_dim.
   MatPtrT<BF16> att_sums;
+  // State for online softmax computation, size batch_size x q_heads.
+  MatPtrT<OnlineSoftmaxState> softmax_state;
   // Inverse timescales for RoPE computation.
   MatPtrT<float> inv_timescale;
   // Inverse timescales for global RoPE computation.
@@ -217,7 +227,8 @@ struct Activations {
 
         attention_impl(runtime_config.attention_impl),
         attention_storage(config, layer_config, batch_size, seq_len,
-                          ctx.allocator, row_ptrs),
+                          runtime_config.attention_impl, ctx.allocator,
+                          row_ptrs),
         attention(config, seq_len, attention_storage) {
     HWY_ASSERT(batch_size != 0);
 
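The new softmax_state tensor stores one OnlineSoftmaxState per (query row, head): batch_size rows of layer_config.heads entries. Below is a minimal standalone sketch of that layout, assuming row-major storage; the std::vector and the StateAt helper are hypothetical stand-ins for MatStorageT and Row().

#include <cstddef>
#include <vector>

// Stand-in for the struct this commit moves into util/basics.h.
struct OnlineSoftmaxState {
  float max;  // running maximum of the logits seen so far
  float d;    // sum of exp(logit - max)
};

// Hypothetical helper: row-major indexing equivalent to
// activations.softmax_state.Row(tq_idx) + head in gemma/attention.cc below.
OnlineSoftmaxState& StateAt(std::vector<OnlineSoftmaxState>& states,
                            size_t num_heads, size_t tq_idx, size_t head) {
  return states[tq_idx * num_heads + head];
}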

gemma/attention.cc

Lines changed: 6 additions & 3 deletions
@@ -123,7 +123,8 @@ void SingleDotSoftmaxWeightedSum(
     float* HWY_RESTRICT q, const MatPtrT<KV_t>& k, const MatPtrT<KV_t>& v,
     const MatPtr& query_norm_scale, const size_t layer_idx,
     const AttentionActivationsPtrs& activations, float* HWY_RESTRICT att,
-    float* HWY_RESTRICT att_out, ThreadingContext& ctx, const size_t worker) {
+    float* HWY_RESTRICT att_out, OnlineSoftmaxState* state_out,
+    ThreadingContext& ctx, const size_t worker) {
   const float att_cap = activations.config.att_cap;
   const float query_scale = activations.query_scale;
   // --seq_len must be large enough to avoid wraparound.
@@ -146,7 +147,7 @@ void SingleDotSoftmaxWeightedSum(
   // SoftMax with optional SoftCap yields "probabilities" in att.
   const Logits logits(att, last_pos + 1);
   MaybeLogitsSoftCap(att_cap, logits, ctx, worker);
-  Softmax(logits, ctx, worker, /*temperature=*/1.0f);
+  Softmax(logits, ctx, worker, /*temperature=*/1.0f, state_out);
 
   WeightedSumV(start_pos, last_pos, activations.div_seq_len, att, v, att_out,
                ctx, worker);
@@ -203,6 +204,8 @@ void DotSoftmaxWeightedSum(const size_t num_tokens, const size_t layer_idx,
     float* HWY_RESTRICT att = activations.att.Row(tq_idx) + head * seq_len;
     float* HWY_RESTRICT att_out =
         activations.att_out.Row(tq_idx) + head * qkv_dim;
+    OnlineSoftmaxState* state_out =
+        activations.softmax_state.Row(tq_idx) + head;
 
     // Make strided read-only views into the kv cache for
     // this query and head.
@@ -215,7 +218,7 @@ void DotSoftmaxWeightedSum(const size_t num_tokens, const size_t layer_idx,
 
     SingleDotSoftmaxWeightedSum(pos, start_pos, last_pos, q, k, v,
                                 query_norm_scale, layer_idx, activations, att,
-                                att_out, ctx, worker);
+                                att_out, state_out, ctx, worker);
   };
 
   {
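With state_out plumbed through, regular (non-flash) attention now records, for every query row and head, the running maximum and softmax denominator that flash attention keeps internally. Below is a hedged sketch, not part of this commit, of the standard combine rule such state enables: merging attention outputs computed over two disjoint KV ranges. It assumes out_a and out_b are each chunk's already-normalized weighted sums of V (length qkv_dim), paired with their OnlineSoftmaxState.

#include <algorithm>
#include <cmath>
#include <cstddef>

struct OnlineSoftmaxState {  // mirrors util/basics.h
  float max;  // running maximum of the logits seen so far
  float d;    // sum of exp(logit - max)
};

// Hypothetical helper: rescale the two partial results to a common maximum
// and renormalize, as flash attention does when combining KV tiles.
void MergePartialAttention(const OnlineSoftmaxState& a, const float* out_a,
                           const OnlineSoftmaxState& b, const float* out_b,
                           size_t qkv_dim, float* out_merged) {
  const float m = std::max(a.max, b.max);
  const float wa = std::exp(a.max - m) * a.d;  // unnormalized weight of chunk a
  const float wb = std::exp(b.max - m) * b.d;  // unnormalized weight of chunk b
  const float denom = wa + wb;
  for (size_t i = 0; i < qkv_dim; ++i) {
    out_merged[i] = (wa * out_a[i] + wb * out_b[i]) / denom;
  }
}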

gemma/flash_attention_test.cc

Lines changed: 2 additions & 1 deletion
@@ -124,7 +124,8 @@ void TestFlashAttention(size_t target_parallelism) {
   const size_t batch_size = kOuter;
   std::vector<hwy::AlignedFreeUniquePtr<uint8_t*[]>> row_ptrs;
   AttentionActivations attention_storage(config, layer_config, batch_size,
-                                         kOuter, ctx.allocator, row_ptrs);
+                                         kOuter, AttentionImpl::kFlash,
+                                         ctx.allocator, row_ptrs);
   AttentionActivationsPtrs attention(config, kOuter, attention_storage);
   const size_t qkv_dim = layer_config.qkv_dim;
   ASSERT_EQ(qkv_dim, kInner);

gemma/flash_structs.h

Lines changed: 1 addition & 14 deletions
@@ -3,23 +3,10 @@
 
 #include <stddef.h>
 
-#include <limits>
+#include "util/basics.h"
 
 namespace gcpp {
 
-// State for computing softmax in a streaming ("online") manner,
-// avoiding large intermediate values by subtracting the running maximum.
-// For a sequence x_1, ..., x_n:
-// m_i = max(m_{i-1}, x_i)
-// d_i = d_{i-1} * exp(m_{i-1} - m_i) + exp(x_i - m_i)
-// softmax_i = exp(x_i - m_i) / d_i
-struct OnlineSoftmaxState {
-  // Maximum logit value encountered so far.
-  float max = -std::numeric_limits<float>::max() / 2.0f;
-  // Sum of exponentials scaled by exp(-max).
-  float d = 0.0f;
-};
-
 static constexpr size_t kVTileSize4 = 4;
 
 struct Tile4FlashState {

ops/ops-inl.h

Lines changed: 20 additions & 2 deletions
@@ -1125,9 +1125,23 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddVector(
 
 // See below for a specialized version for top-1 sampling.
 // TODO: support bf16 logits using Decompress2.
+// Computes softmax probabilities for the given logits, normalizing in-place.
+// The calculation is numerically stable, using the max-subtraction trick to
+// compute exp(logits[i] - max(logits)) before normalizing by the sum.
+// If temperature is provided and not 1.0, each intermediate exp() result is
+// divided by temperature before normalization; however, this division by
+// temperature cancels out during the final normalization step, meaning
+// temperature currently has no effect on the output probabilities.
+// @param logits In-out: on input, contains logits; on output, overwritten with
+//   probabilities.
+// @param state Optional output: if not null, stores the max logit and sum of
+//   exp(logit - max) for use in online softmax computation.
+// @param ctx Input: threading context for parallelism and profiling.
+// @param worker Input: worker thread index.
+// @param temperature Input: softmax temperature.
 static HWY_NOINLINE void Softmax(Logits logits, ThreadingContext& ctx,
-                                 const size_t worker,
-                                 float temperature = 1.0f) {
+                                 const size_t worker, float temperature = 1.0f,
+                                 OnlineSoftmaxState* state = nullptr) {
   GCPP_ZONE(ctx, worker, Zones::kOpsSoftmax);
   HWY_DASSERT(logits.size() != 0);
 
@@ -1171,6 +1185,10 @@ static HWY_NOINLINE void Softmax(Logits logits, ThreadingContext& ctx,
   // Double-precision reciprocal does not appear to affect the results.
   const float mul = 1.0f / sum_exp;
   MulByConst(mul, logits.data(), logits.size());
+  if (state) {
+    state->max = hn::GetLane(vmax);
+    state->d = sum_exp;
+  }
 }
 
 // Note: https://arxiv.org/pdf/2001.04438 proposes to replace the three max /
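For reference, here is a scalar sketch (not the SIMD path above) of what Softmax computes and what the optional state carries: after the call, every output probability equals exp(original_logit - state->max) / state->d, which is also what the new TestSoftmaxState test below checks.

#include <algorithm>
#include <cmath>
#include <vector>

struct OnlineSoftmaxState {  // mirrors util/basics.h
  float max;
  float d;
};

// Scalar reference; assumes logits is non-empty, as the real Softmax asserts.
void SoftmaxReference(std::vector<float>& logits, OnlineSoftmaxState* state) {
  const float m = *std::max_element(logits.begin(), logits.end());
  float d = 0.0f;
  for (float x : logits) d += std::exp(x - m);      // sum of exp(logit - max)
  for (float& x : logits) x = std::exp(x - m) / d;  // in-place probabilities
  if (state) {
    state->max = m;
    state->d = d;
  }
}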

ops/ops_test.cc

Lines changed: 44 additions & 0 deletions
@@ -346,6 +346,49 @@ void TestAllSoftmax() {
   hn::ForPartialVectors<ForeachCountAndMisalign<TestSoftmax>>()(float());
 }
 
+class TestSoftmaxState {
+ public:
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+                  hwy::RandomState& rng) {
+    if (count == 0) return;  // *Softmax would assert
+    if (misalign_b == 0) return;
+    using T = hn::TFromD<D>;
+
+    hwy::AlignedFreeUniquePtr<T[]> px =
+        hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    hwy::AlignedFreeUniquePtr<T[]> pe =
+        hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    HWY_ASSERT(px && pe);
+
+    T* x = px.get() + misalign_a;
+    T* initial_logits = pe.get() + misalign_a;
+
+    for (size_t i = 0; i < count; ++i) {
+      x[i] = Random<T>(rng);
+      initial_logits[i] = x[i];
+    }
+
+    OnlineSoftmaxState state;
+    Softmax(Logits(x, count), Ctx(), /*worker=*/0, /*temperature=*/1.0f, &state);
+
+    const float maxval =
+        *std::max_element(initial_logits, initial_logits + count);
+
+    float sum_exp = 0.0f;
+    for (size_t i = 0; i < count; ++i) {
+      sum_exp += std::exp(initial_logits[i] - maxval);
+    }
+
+    ASSERT_NEAR(state.max, maxval, 1e-6);
+    ASSERT_NEAR(state.d, sum_exp, 1e-6);
+  }
+};
+
+void TestAllSoftmaxState() {
+  hn::ForPartialVectors<ForeachCountAndMisalign<TestSoftmaxState>>()(float());
+}
+
 template <size_t k>
 struct TestCreateDistribution {
   void operator()(hwy::RandomState& rng) {
@@ -769,6 +812,7 @@ HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConst);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstTo);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstAndAdd);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmax);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmaxState);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSigmoid);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllGelu);

util/basics.h

Lines changed: 13 additions & 0 deletions
@@ -89,6 +89,19 @@ struct TokenAndProb {
 };
 #pragma pack(pop)
 
+// State for computing softmax in a streaming ("online") manner,
+// avoiding large intermediate values by subtracting the running maximum.
+// For a sequence x_1, ..., x_n:
+// m_i = max(m_{i-1}, x_i)
+// d_i = d_{i-1} * exp(m_{i-1} - m_i) + exp(x_i - m_i)
+// softmax_i = exp(x_i - m_i) / d_i
+struct OnlineSoftmaxState {
+  // Maximum logit value encountered so far.
+  float max = -std::numeric_limits<float>::max() / 2.0f;
+  // Sum of exponentials scaled by exp(-max).
+  float d = 0.0f;
+};
+
 // Entire size of a 2D array.
 struct Extents2D {
   constexpr Extents2D() : rows(0), cols(0) {}
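A minimal sketch, not in this commit, of the streaming recurrence documented in the comment above: OnlineSoftmaxUpdate is a hypothetical helper that folds in one logit at a time, keeping m_i and d_i exactly as described, so that exp(x_j - state.max) / state.d is the softmax probability of any previously seen logit x_j.

#include <algorithm>
#include <cmath>
#include <limits>

struct OnlineSoftmaxState {  // mirrors the struct added above
  float max = -std::numeric_limits<float>::max() / 2.0f;
  float d = 0.0f;
};

// Hypothetical helper implementing the documented recurrence:
//   m_i = max(m_{i-1}, x_i)
//   d_i = d_{i-1} * exp(m_{i-1} - m_i) + exp(x_i - m_i)
void OnlineSoftmaxUpdate(OnlineSoftmaxState& s, float x) {
  const float new_max = std::max(s.max, x);
  s.d = s.d * std::exp(s.max - new_max) + std::exp(x - new_max);
  s.max = new_max;
}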
