From 8adaf2e2c16cb17318ea546e2daacf03f0518499 Mon Sep 17 00:00:00 2001
From: Niko Maroulis
Date: Sun, 5 Oct 2025 08:38:37 -0400
Subject: [PATCH 01/15] Add Qwen3 model support

Implements support for the Qwen3 model family, including Qwen3-4B-Instruct.

Key features:
- QK normalization for improved training stability
- Grouped Query Attention (32 query heads, 8 KV heads)
- High RoPE theta (5M) for extended context (262K tokens)
- Support for causal language modeling and sequence classification
- Complete parameter mapping for HuggingFace model loading
- Example scripts demonstrating text generation and chat usage

Tested with Qwen3-4B-Instruct-2507 and generates coherent English output.
---
 examples/README.md          |  64 ++++
 examples/qwen3.exs          |  79 ++++
 lib/bumblebee.ex            |   4 +
 lib/bumblebee/text/qwen3.ex | 695 ++++++++++++++++++++++++++++++++++++
 4 files changed, 842 insertions(+)
 create mode 100644 examples/README.md
 create mode 100644 examples/qwen3.exs
 create mode 100644 lib/bumblebee/text/qwen3.ex

diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000..758205a2
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,64 @@
+# Bumblebee Examples
+
+This directory contains example scripts demonstrating how to use Bumblebee models.
+
+## Qwen3 Text Generation
+
+### Basic Usage
+
+```bash
+elixir examples/qwen3.exs
+```
+
+This example demonstrates:
+- Loading the Qwen3-4B-Instruct model
+- Text completion
+- Question answering
+- Creative writing
+- Chat format (Instruct model)
+- Code generation
+
+### Requirements
+
+- **Disk space**: ~8GB for model weights (downloaded once and cached)
+- **Memory**: ~10GB RAM for inference
+- **Backend**: EXLA (CPU or GPU)
+
+### Example Output
+
+```
+=== Example 1: Text Completion ===
+The future of artificial intelligence is being shaped by the development
+of more advanced models that can understand and generate human-like language...
+
+=== Example 2: Question Answering ===
+What are the benefits of functional programming? The main benefits are
+immutability, composability, and easier testing...
+```
+
+### Customization
+
+Edit the script to:
+- Change `max_new_tokens` for longer/shorter output
+- Adjust `temperature` (0.0-1.0) for more deterministic/creative output
+- Modify `top_k` and `top_p` for sampling behavior
+- Use different prompts
+
+### Other Models
+
+To use different Qwen3 model sizes, change the model name:
+
+```elixir
+# Smaller (faster)
+{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-0.6B"})
+
+# Balanced (recommended)
+{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-4B-Instruct-2507"})
+
+# Larger (better quality)
+{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-8B"})
+```
+
+## Phoenix Examples
+
+See the `phoenix/` subdirectory for LiveView-based examples.
diff --git a/examples/qwen3.exs b/examples/qwen3.exs
new file mode 100644
index 00000000..1c805b62
--- /dev/null
+++ b/examples/qwen3.exs
@@ -0,0 +1,79 @@
+#!/usr/bin/env elixir
+
+# Qwen3-4B-Instruct Text Generation
+#
+# This example demonstrates using the Qwen3-4B-Instruct model for various
+# text generation tasks including completion, chat, and code generation.
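+#
+# Qwen3 Instruct checkpoints are trained on the ChatML prompt format, so the
+# chat-style examples below wrap messages in <|im_start|>/<|im_end|> markers.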
+#
+# Usage:
+#   elixir examples/qwen3.exs
+
+Mix.install([
+  # Qwen3 support is not in a published Hex release yet, so use this
+  # repository checkout (matching the other example scripts here)
+  {:bumblebee, path: Path.expand("..", __DIR__)},
+  {:exla, ">= 0.0.0"}
+])
+
+Application.put_env(:nx, :default_backend, EXLA.Backend)
+
+# Load model, tokenizer, and generation configuration
+{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-4B-Instruct-2507"})
+{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-4B-Instruct-2507"})
+{:ok, generation_config} = Bumblebee.load_generation_config({:hf, "Qwen/Qwen3-4B-Instruct-2507"})
+
+# Configure generation parameters
+generation_config =
+  Bumblebee.configure(generation_config,
+    max_new_tokens: 100,
+    strategy: %{type: :multinomial_sampling, top_k: 20, top_p: 0.8},
+    temperature: 0.7
+  )
+
+# Create text generation serving
+serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
+
+# Example 1: Text Completion
+IO.puts("\n=== Text Completion ===")
+result = Nx.Serving.run(serving, "The future of artificial intelligence")
+IO.puts(result.results |> hd() |> Map.get(:text))
+
+# Example 2: Question Answering with Chat Format
+IO.puts("\n=== Question Answering ===")
+
+prompt = """
+<|im_start|>system
+You are a helpful assistant.<|im_end|>
+<|im_start|>user
+What are the key features of the Elixir programming language?<|im_end|>
+<|im_start|>assistant
+"""
+
+result = Nx.Serving.run(serving, prompt)
+IO.puts(result.results |> hd() |> Map.get(:text))
+
+# Example 3: Code Generation
+IO.puts("\n=== Code Generation ===")
+
+prompt = """
+<|im_start|>system
+You are an expert Elixir programmer.<|im_end|>
+<|im_start|>user
+Write a function to calculate the nth Fibonacci number using recursion.<|im_end|>
+<|im_start|>assistant
+"""
+
+result = Nx.Serving.run(serving, prompt)
+IO.puts(result.results |> hd() |> Map.get(:text))
+
+# Example 4: Creative Writing
+IO.puts("\n=== Creative Writing ===")
+
+prompt = """
+<|im_start|>system
+You are a creative storyteller.<|im_end|>
+<|im_start|>user
+Write the opening paragraph of a science fiction story.<|im_end|>
+<|im_start|>assistant
+"""
+
+result = Nx.Serving.run(serving, prompt)
+IO.puts(result.results |> hd() |> Map.get(:text))
diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex
index 51f2330f..093c879d 100644
--- a/lib/bumblebee.ex
+++ b/lib/bumblebee.ex
@@ -178,6 +178,9 @@ defmodule Bumblebee do
       "Phi3ForCausalLM" => {Bumblebee.Text.Phi3, :for_causal_language_modeling},
       "Phi3ForSequenceClassification" => {Bumblebee.Text.Phi3, :for_sequence_classification},
       "Phi3ForTokenClassification" => {Bumblebee.Text.Phi3, :for_token_classification},
+      "Qwen3Model" => {Bumblebee.Text.Qwen3, :base},
+      "Qwen3ForCausalLM" => {Bumblebee.Text.Qwen3, :for_causal_language_modeling},
+      "Qwen3ForSequenceClassification" => {Bumblebee.Text.Qwen3, :for_sequence_classification},
       "ResNetForImageClassification" => {Bumblebee.Vision.ResNet, :for_image_classification},
       "ResNetModel" => {Bumblebee.Vision.ResNet, :base},
       "RobertaForMaskedLM" => {Bumblebee.Text.Roberta, :for_masked_language_modeling},
@@ -253,6 +256,7 @@ defmodule Bumblebee do
     "mbart" => :mbart,
     "phi" => :code_gen,
     "phi3" => :llama,
+    "qwen3" => :gpt2,
     "roberta" => :roberta,
     "t5" => :t5,
     "whisper" => :whisper,
diff --git a/lib/bumblebee/text/qwen3.ex b/lib/bumblebee/text/qwen3.ex
new file mode 100644
index 00000000..2a7c2916
--- /dev/null
+++ b/lib/bumblebee/text/qwen3.ex
@@ -0,0 +1,695 @@
defmodule Bumblebee.Text.Qwen3 do
  alias Bumblebee.Shared

  options =
    [
      vocab_size: [
        default: 151_936,
        doc: """
        the vocabulary size of the token embedding.
This corresponds to the number of distinct + tokens that can be represented in model input and output + """ + ], + max_positions: [ + default: 262_144, + doc: """ + the vocabulary size of the position embedding. This corresponds to the maximum sequence + length that this model can process. Typically this is set to a large value just in case, + such as 512, 1024 or 2048 + """ + ], + hidden_size: [ + default: 2560, + doc: "the dimensionality of hidden layers" + ], + intermediate_size: [ + default: 9728, + doc: "the dimensionality of intermediate layers" + ], + attention_head_size: [ + default: 128, + doc: """ + the size of the key, value, and query projection per attention head. + """ + ], + num_blocks: [ + default: 36, + doc: "the number of Transformer blocks in the model" + ], + num_attention_heads: [ + default: 32, + doc: "the number of attention heads for each attention layer in the model" + ], + num_key_value_heads: [ + default: 8, + doc: "the number of key value heads for each attention layer in the model" + ], + activation: [ + default: :silu, + doc: "the activation function" + ], + rotary_embedding_base: [ + default: 5_000_000, + doc: "base for computing rotary embedding frequency" + ], + rotary_embedding_scaling_strategy: [ + default: nil, + doc: """ + scaling configuration for rotary embedding. Currently the supported values are: + + * `%{type: :linear, factor: number()}` + + * `%{type: :dynamic, factor: number()}` + + For more details see https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases + """ + ], + layer_norm_epsilon: [ + default: 1.0e-6, + doc: "the epsilon used by RMS normalization layers" + ], + initializer_scale: [ + default: 0.02, + doc: + "the standard deviation of the normal initializer used for initializing kernel parameters" + ], + tie_word_embeddings: [ + default: true, + doc: "whether to tie input and output embedding weights" + ], + use_qk_norm: [ + default: true, + doc: "whether to use RMS normalization on query and key projections" + ] + ] ++ + Shared.common_options([:num_labels, :id_to_label]) ++ + Shared.token_options(pad_token_id: 151_643) + + @moduledoc """ + Qwen3 model family. + + ## Architectures + + * `:base` - plain Qwen3 without any head on top + + * `:for_causal_language_modeling` - Qwen3 with a language modeling + head. The head returns logits for each token in the original + sequence + + * `:for_sequence_classification` - Qwen3 with a sequence + classification head. The head returns logits corresponding to + possible classes + + ## Inputs + + * `"input_ids"` - `{batch_size, sequence_length}` + + Indices of input sequence tokens in the vocabulary. + + * `"attention_mask"` - `{batch_size, sequence_length}` + + Mask indicating which tokens to attend to. This is used to ignore + padding tokens, which are added when processing a batch of sequences + with different length. + + * `"position_ids"` - `{batch_size, sequence_length}` + + Indices of positions of each input sequence tokens in the position + embeddings. + + * `"attention_head_mask"` - `{encoder_num_blocks, encoder_num_attention_heads}` + + Mask to nullify selected heads of the self-attention blocks in + the encoder. + + * `"input_embeddings"` - `{batch_size, sequence_length, hidden_size}` + + Embedded representation of `"input_ids"`, which can be specified + for more control over how `"input_ids"` are embedded than the + model's internal embedding lookup. If `"input_embeddings"` are present, + then `"input_ids"` will be ignored. 
+ + * `"cache"` + + A container with cached layer results used to speed up sequential + decoding (autoregression). With cache, certain hidden states are + taken from the cache, rather than recomputed on every decoding + pass. The cache should be treated as opaque and initialized with + `Bumblebee.Text.Generation.init_cache/4`. + + ## Global layer options + + #{Shared.global_layer_options_doc([:output_hidden_states, :output_attentions])} + + ## Configuration + + #{Shared.options_doc(options)} + """ + + defstruct [architecture: :base] ++ Shared.option_defaults(options) + + @behaviour Bumblebee.ModelSpec + @behaviour Bumblebee.Configurable + @behaviour Bumblebee.Text.Generation + + import Bumblebee.Utils.Model, only: [join: 2] + + alias Bumblebee.Layers + + @impl true + def architectures(), + do: [ + :base, + :for_causal_language_modeling, + :for_sequence_classification + ] + + @impl true + def config(spec, opts) do + spec + |> Shared.put_config_attrs(opts) + |> Shared.validate_label_options() + end + + @impl true + def input_template(_spec) do + %{ + "input_ids" => Nx.template({1, 1}, :s64) + } + end + + @impl true + def init_cache(spec, batch_size, max_length, _inputs) do + Layers.Decoder.init_cache(batch_size, max_length, + hidden_size: spec.hidden_size, + attention_head_size: spec.attention_head_size, + decoder_num_attention_heads: spec.num_attention_heads, + decoder_num_blocks: spec.num_blocks + ) + end + + @impl true + def traverse_cache(_spec, cache, fun) do + Layers.Decoder.traverse_cache(cache, fun) + end + + @impl true + def model(%__MODULE__{architecture: :base} = spec) do + inputs = inputs(spec) + + inputs + |> core(spec) + |> Layers.output() + end + + def model(%__MODULE__{architecture: :for_causal_language_modeling} = spec) do + inputs = inputs(spec) + + outputs = core(inputs, spec) + logits = language_modeling_head(outputs.hidden_state, spec, name: "language_modeling_head") + + Layers.output(%{ + logits: logits, + hidden_states: outputs.hidden_states, + attentions: outputs.attentions, + cache: outputs.cache + }) + end + + def model(%__MODULE__{architecture: :for_sequence_classification} = spec) do + inputs = inputs(spec) + + outputs = core(inputs, spec) + + logits = + Axon.dense(outputs.hidden_state, spec.num_labels, + kernel_initializer: kernel_initializer(spec), + name: "sequence_classification_head.output", + use_bias: false + ) + + pooled_logits = + Layers.if_present inputs["input_ids"] do + Axon.layer( + fn logits, input_ids, _opts -> + indices = + input_ids + |> Nx.not_equal(spec.pad_token_id) + |> Nx.sum(axes: [-1]) + |> Nx.subtract(1) + |> Nx.as_type({:s, 64}) + + Bumblebee.Utils.Nx.batched_take(logits, indices) + end, + [logits, inputs["input_ids"]] + ) + else + Layers.take_token(logits, axis: 1, index: -1) + end + + Layers.output(%{ + logits: pooled_logits, + hidden_states: outputs.hidden_states, + attentions: outputs.attentions, + cache: outputs.cache + }) + end + + defp inputs(spec) do + shape = {nil, nil} + hidden_shape = {nil, nil, spec.hidden_size} + + attention_head_mask_shape = {spec.num_blocks, spec.num_attention_heads} + + Bumblebee.Utils.Model.inputs_to_map([ + Axon.input("input_ids", optional: true, shape: shape), + Axon.input("attention_mask", optional: true, shape: shape), + Axon.input("position_ids", optional: true, shape: shape), + Axon.input("attention_head_mask", optional: true, shape: attention_head_mask_shape), + Axon.input("input_embeddings", optional: true, shape: hidden_shape), + Axon.input("cache", optional: true) + ]) + end + + defp 
core(inputs, spec) do + embeddings = + embedder( + inputs["input_ids"], + inputs["input_embeddings"], + spec, + name: "embedder" + ) + + position_ids = + Layers.default inputs["position_ids"] do + Layers.default_position_ids(embeddings) + end + + decoder_outputs = + decoder( + embeddings, + position_ids, + inputs["attention_mask"], + inputs["attention_head_mask"], + inputs["cache"], + spec, + name: "decoder" + ) + + hidden_state = + Layers.rms_norm(decoder_outputs.hidden_state, + name: "output_norm", + epsilon: spec.layer_norm_epsilon + ) + + %{ + hidden_state: hidden_state, + hidden_states: Layers.append(decoder_outputs.hidden_states, hidden_state), + attentions: decoder_outputs.attentions, + cache: decoder_outputs.cache + } + end + + defp embedder(input_ids, input_embeddings, spec, opts) do + name = opts[:name] + + Layers.default input_embeddings do + Axon.embedding(input_ids, spec.vocab_size, spec.hidden_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "token_embedding") + ) + end + end + + defp decoder( + hidden_state, + position_ids, + attention_mask, + attention_head_mask, + cache, + spec, + opts + ) do + name = opts[:name] + + # For Qwen3, we need custom attention with QK normalization + # We'll use a custom block implementation instead of Layers.Transformer.blocks + {attention_mask, cache} = Layers.Decoder.cached_attention_mask(attention_mask, cache) + offset = Layers.Decoder.get_cache_offset(cache) + + state = %{ + hidden_state: hidden_state, + hidden_states: Axon.container({hidden_state}), + attentions: Axon.container({}), + cache: cache + } + + outputs = + for idx <- 0..(spec.num_blocks - 1), reduce: state do + state -> + block_name = join(name, "blocks.#{idx}") + + block_cache = Layers.Decoder.get_block_cache(state.cache, idx) + + block_attention_head_mask = + Layers.if_present attention_head_mask do + Axon.nx(attention_head_mask, & &1[idx]) + else + Layers.none() + end + + block_output = + qwen3_decoder_block( + state.hidden_state, + position_ids, + attention_mask, + block_attention_head_mask, + block_cache, + offset, + spec, + name: block_name + ) + + cache = Layers.Decoder.put_block_cache(state.cache, idx, block_output.cache) + + %{ + hidden_state: block_output.hidden_state, + hidden_states: Layers.append(state.hidden_states, block_output.hidden_state), + attentions: Layers.append(state.attentions, block_output.attention_weights), + cache: cache + } + end + + outputs = + update_in(outputs.cache, &Layers.Decoder.update_cache_offset(&1, outputs.hidden_state)) + + %{ + hidden_state: outputs.hidden_state, + hidden_states: outputs.hidden_states, + attentions: outputs.attentions, + cache: outputs.cache + } + end + + defp qwen3_decoder_block( + hidden_state, + position_ids, + attention_mask, + attention_head_mask, + block_cache, + offset, + spec, + opts + ) do + name = opts[:name] + + # Extract self-attention cache from block cache + {self_attention_cache, _cross_attention_cache} = + Layers.Decoder.get_attention_caches(block_cache) + + # Pre-normalization + normalized_hidden_state = + Layers.rms_norm(hidden_state, + name: join(name, "self_attention_norm"), + epsilon: spec.layer_norm_epsilon + ) + + # Self-attention with QK normalization + attention_output = + qwen3_attention( + normalized_hidden_state, + position_ids, + attention_mask, + attention_head_mask, + self_attention_cache, + offset, + spec, + name: join(name, "self_attention") + ) + + # Residual connection + hidden_state = Axon.add(hidden_state, attention_output.hidden_state) + + # FFN 
pre-normalization + normalized_hidden_state = + Layers.rms_norm(hidden_state, + name: join(name, "output_norm"), + epsilon: spec.layer_norm_epsilon + ) + + # Feed-forward network + ffn_output = + gated_ffn(normalized_hidden_state, spec.intermediate_size, spec.hidden_size, + name: join(name, "ffn"), + activation: spec.activation + ) + + # Residual connection + hidden_state = Axon.add(hidden_state, ffn_output) + + # Build block cache with self-attention cache + updated_block_cache = + Layers.Decoder.put_attention_caches( + block_cache, + attention_output.cache, + Layers.none() + ) + + %{ + hidden_state: hidden_state, + attention_weights: attention_output.attention_weights, + cache: updated_block_cache + } + end + + defp qwen3_attention( + hidden_state, + position_ids, + attention_mask, + attention_head_mask, + cache, + offset, + spec, + opts + ) do + name = opts[:name] + + num_heads = spec.num_attention_heads + num_key_value_heads = spec.num_key_value_heads + attention_head_size = spec.attention_head_size + hidden_size = spec.hidden_size + + inner_size = num_heads * attention_head_size + inner_kv_size = num_key_value_heads * attention_head_size + + # Query, Key, Value projections + query = + hidden_state + |> Axon.dense(inner_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "query"), + use_bias: false + ) + |> Layers.split_heads(num_heads) + + key = + hidden_state + |> Axon.dense(inner_kv_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "key"), + use_bias: false + ) + |> Layers.split_heads(num_key_value_heads) + + value = + hidden_state + |> Axon.dense(inner_kv_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "value"), + use_bias: false + ) + |> Layers.split_heads(num_key_value_heads) + + # QK Normalization (Qwen3-specific) - normalize over head_dim + query = + if spec.use_qk_norm do + Layers.rms_norm(query, + name: join(name, "query_norm"), + epsilon: spec.layer_norm_epsilon, + channel_index: -1 + ) + else + query + end + + key = + if spec.use_qk_norm do + Layers.rms_norm(key, + name: join(name, "key_norm"), + epsilon: spec.layer_norm_epsilon, + channel_index: -1 + ) + else + key + end + + # Apply rotary embeddings + {query, key} = + Layers.rotary_embedding( + query, + key, + position_ids, + attention_mask, + attention_head_size, + name: join(name, "rotary_embedding"), + max_positions: spec.max_positions, + base: spec.rotary_embedding_base, + scaling_strategy: spec.rotary_embedding_scaling_strategy + ) + + # Repeat key-value for grouped query attention AFTER rotary embedding + num_key_value_groups = div(num_heads, num_key_value_heads) + key = repeat_states(key, num_key_value_groups) + value = repeat_states(value, num_key_value_groups) + + # Cache key and value + {key, value, cache} = + Layers.Decoder.cached_attention_key_values(key, value, cache, offset) + + # Compute attention + {attention_output, attention_weights} = + Layers.attention( + query, + key, + value, + attention_mask, + attention_head_mask, + Layers.none(), + offset, + scale: 1 / :math.sqrt(attention_head_size), + causal: true + ) + + # Merge heads and output projection + attention_output = + attention_output + |> Layers.flatten_trailing() + |> Axon.dense(hidden_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "output"), + use_bias: false + ) + + %{ + hidden_state: attention_output, + attention_weights: attention_weights, + cache: cache + } + end + + defp repeat_states(state, n) when n == 1, do: state + + defp repeat_states(state, n) 
do + # state shape: {batch, seq, num_kv_heads, head_size} + # Repeat along axis 2 (the heads axis) - same as Layers.Transformer + Layers.repeat_interleave(state, n, axis: 2) + end + + defp gated_ffn(hidden_state, intermediate_size, output_size, opts) do + name = opts[:name] + activation = opts[:activation] + + intermediate = + Axon.dense(hidden_state, intermediate_size, + name: join(name, "intermediate"), + use_bias: false + ) + + gate = Axon.dense(hidden_state, intermediate_size, name: join(name, "gate"), use_bias: false) + + hidden_state = Axon.multiply(intermediate, Axon.activation(gate, activation)) + + Axon.dense(hidden_state, output_size, name: join(name, "output"), use_bias: false) + end + + defp language_modeling_head(hidden_state, spec, opts) do + name = opts[:name] + + Layers.dense_transposed(hidden_state, spec.vocab_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "output") + ) + end + + defp kernel_initializer(spec) do + Axon.Initializers.normal(scale: spec.initializer_scale) + end + + defimpl Bumblebee.HuggingFace.Transformers.Config do + def load(spec, data) do + import Shared.Converters + + scaling_strategy_converter = fn _name, value -> + case value do + %{"type" => "linear", "factor" => factor} when is_number(factor) -> + {:ok, %{type: :linear, factor: factor}} + + %{"type" => "dynamic", "factor" => factor} when is_number(factor) -> + {:ok, %{type: :dynamic, factor: factor}} + + nil -> + {:ok, nil} + + _other -> + {:ok, nil} + end + end + + opts = + convert!(data, + vocab_size: {"vocab_size", number()}, + tie_word_embeddings: {"tie_word_embeddings", boolean()}, + max_positions: {"max_position_embeddings", number()}, + hidden_size: {"hidden_size", number()}, + num_blocks: {"num_hidden_layers", number()}, + num_attention_heads: {"num_attention_heads", number()}, + num_key_value_heads: {"num_key_value_heads", number()}, + attention_head_size: {"head_dim", number()}, + intermediate_size: {"intermediate_size", number()}, + activation: {"hidden_act", activation()}, + rotary_embedding_base: {"rope_theta", number()}, + rotary_embedding_scaling_strategy: + {"rope_scaling", optional(scaling_strategy_converter)}, + initializer_scale: {"initializer_range", number()}, + layer_norm_epsilon: {"rms_norm_eps", number()} + ) ++ Shared.common_options_from_transformers(data, spec) + + @for.config(spec, opts) + end + end + + defimpl Bumblebee.HuggingFace.Transformers.Model do + def params_mapping(spec) do + %{ + "embedder.token_embedding" => "model.embed_tokens", + "decoder.blocks.{n}.self_attention.query" => "model.layers.{n}.self_attn.q_proj", + "decoder.blocks.{n}.self_attention.key" => "model.layers.{n}.self_attn.k_proj", + "decoder.blocks.{n}.self_attention.value" => "model.layers.{n}.self_attn.v_proj", + "decoder.blocks.{n}.self_attention.output" => "model.layers.{n}.self_attn.o_proj", + "decoder.blocks.{n}.self_attention.query_norm" => "model.layers.{n}.self_attn.q_norm", + "decoder.blocks.{n}.self_attention.key_norm" => "model.layers.{n}.self_attn.k_norm", + "decoder.blocks.{n}.self_attention_norm" => "model.layers.{n}.input_layernorm", + "decoder.blocks.{n}.self_attention.rotary_embedding" => + "model.layers.{n}.self_attn.rotary_emb", + "decoder.blocks.{n}.ffn.gate" => "model.layers.{n}.mlp.gate_proj", + "decoder.blocks.{n}.ffn.intermediate" => "model.layers.{n}.mlp.up_proj", + "decoder.blocks.{n}.ffn.output" => "model.layers.{n}.mlp.down_proj", + "decoder.blocks.{n}.output_norm" => "model.layers.{n}.post_attention_layernorm", + "output_norm" => "model.norm", + 
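        # When tie_word_embeddings is true the checkpoint stores no separate
        # lm_head tensor, so the language modeling head reuses the token
        # embedding weights: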
"language_modeling_head.output" => + if(spec.tie_word_embeddings, do: "model.embed_tokens", else: "lm_head"), + "sequence_classification_head.output" => "score" + } + end + end +end From 0499d71a62e5332b1ae05902efbce540cc9b3143 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Sun, 5 Oct 2025 09:41:30 -0400 Subject: [PATCH 02/15] Add last token pooling support for Qwen3-Embedding models Implements last token pooling strategy in text_embedding to support Qwen3-Embedding models which use the last token's hidden state for generating text embeddings. - Add :last_token_pooling option to text_embedding - Extract last non-padding token using attention_mask - Add Qwen3-Embedding-0.6B example demonstrating: - Text embedding generation (1024-dim vectors) - Semantic similarity computation - Instruction-aware embeddings - Batch processing Tested with Qwen3-Embedding-0.6B and produces correct similarity scores. --- examples/qwen3_embedding.exs | 141 +++++++++++++++++++++++++++ lib/bumblebee/text/text_embedding.ex | 12 ++- 2 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 examples/qwen3_embedding.exs diff --git a/examples/qwen3_embedding.exs b/examples/qwen3_embedding.exs new file mode 100644 index 00000000..b3c794ad --- /dev/null +++ b/examples/qwen3_embedding.exs @@ -0,0 +1,141 @@ +#!/usr/bin/env elixir + +# Qwen3-Embedding Example +# +# This example demonstrates using Qwen3-Embedding-0.6B for generating +# text embeddings for semantic search and similarity tasks. +# +# Usage: elixir examples/qwen3_embedding.exs + +Mix.install([ + {:bumblebee, path: Path.expand("..", __DIR__)}, + {:exla, ">= 0.0.0"} +]) + +Application.put_env(:nx, :default_backend, EXLA.Backend) + +# Load embedding model +{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}) +{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) + +# For Qwen3-Embedding, we need to manually create the serving since we need +# to extract the last hidden state from the tuple and then pool the last token + +# Build the model with output_hidden_states enabled +{_init_fn, encoder} = + Axon.build(model_info.model, + mode: :inference, + global_layer_options: [output_hidden_states: true] + ) + +# Create custom embedding function +embedding_fun = fn params, inputs -> + # Run the model + output = encoder.(params, inputs) + + # Extract the last layer's hidden states + # hidden_states is a tuple of all layers, we want the last one + last_hidden_state = + if is_tuple(output.hidden_states) do + output.hidden_states |> Tuple.to_list() |> List.last() + else + raise "Model must output hidden_states for embeddings" + end + + # Pool the last token (last non-padding token in the sequence) + sequence_lengths = + inputs["attention_mask"] + |> Nx.sum(axes: [1]) + |> Nx.subtract(1) + |> Nx.as_type({:s, 64}) + + embedding = Bumblebee.Utils.Nx.batched_take(last_hidden_state, sequence_lengths) + + # Squeeze batch dimension and L2 normalize + embedding + |> Nx.squeeze(axes: [0]) + |> Bumblebee.Utils.Nx.normalize() +end + +# Helper function to generate embeddings for text +generate_embedding = fn text -> + inputs = Bumblebee.apply_tokenizer(tokenizer, text) + embedding_fun.(model_info.params, inputs) +end + +# Example 1: Simple text embeddings +IO.puts("\n=== Example 1: Generate Text Embeddings ===") + +texts = [ + "The quick brown fox jumps over the lazy dog", + "A fast auburn fox leaps above a sleepy canine", + "The weather is nice today" +] + +IO.puts("Generating embeddings for #{length(texts)} texts...") + 
+embeddings = Enum.map(texts, &generate_embedding.(&1)) + +IO.puts("✓ Generated embeddings") +IO.puts(" Embedding dimension: #{Nx.axis_size(hd(embeddings), 0)}") +IO.puts("") + +# Example 2: Compute similarity +IO.puts("=== Example 2: Semantic Similarity ===") +IO.puts("Text 1: \"#{Enum.at(texts, 0)}\"") +IO.puts("Text 2: \"#{Enum.at(texts, 1)}\"") +IO.puts("Text 3: \"#{Enum.at(texts, 2)}\"") +IO.puts("") + +# Compute cosine similarity +similarity_1_2 = + Nx.dot(Enum.at(embeddings, 0), Enum.at(embeddings, 1)) + |> Nx.to_number() + +similarity_1_3 = + Nx.dot(Enum.at(embeddings, 0), Enum.at(embeddings, 2)) + |> Nx.to_number() + +IO.puts("Similarity (Text 1 vs Text 2): #{Float.round(similarity_1_2, 4)}") +IO.puts("Similarity (Text 1 vs Text 3): #{Float.round(similarity_1_3, 4)}") +IO.puts("") +IO.puts("✓ Texts 1 and 2 are more similar (same meaning, different words)") + +# Example 3: Instruction-aware embeddings +IO.puts("\n=== Example 3: Instruction-Aware Embeddings ===") + +query = + "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: What is the capital of France?" + +document = "Paris is the capital and largest city of France." + +query_embedding = generate_embedding.(query) +doc_embedding = generate_embedding.(document) + +similarity = + Nx.dot(query_embedding, doc_embedding) + |> Nx.to_number() + +IO.puts("Query: What is the capital of France?") +IO.puts("Document: Paris is the capital and largest city of France.") +IO.puts("Similarity: #{Float.round(similarity, 4)}") +IO.puts("") + +# Example 4: Batch processing +IO.puts("=== Example 4: Batch Processing ===") + +batch_texts = [ + "Machine learning is a subset of artificial intelligence", + "Deep learning uses neural networks with multiple layers", + "Python is a popular programming language" +] + +IO.puts("Processing batch of #{length(batch_texts)} texts...") +batch_embeddings = Enum.map(batch_texts, &generate_embedding.(&1)) + +IO.puts("✓ Batch embeddings generated") +IO.puts(" Number of embeddings: #{length(batch_embeddings)}") +IO.puts(" Each embedding shape: #{inspect(Nx.shape(hd(batch_embeddings)))}") +IO.puts("") + +IO.puts("=== Qwen3-Embedding is working! ===") diff --git a/lib/bumblebee/text/text_embedding.ex b/lib/bumblebee/text/text_embedding.ex index 34f41279..44011291 100644 --- a/lib/bumblebee/text/text_embedding.ex +++ b/lib/bumblebee/text/text_embedding.ex @@ -78,9 +78,19 @@ defmodule Bumblebee.Text.TextEmbedding do |> Nx.sum(axes: [1]) |> Nx.divide(Nx.sum(input_mask_expanded, axes: [1])) + :last_token_pooling -> + # Take the last non-padding token for each sequence + sequence_lengths = + inputs["attention_mask"] + |> Nx.sum(axes: [1]) + |> Nx.subtract(1) + |> Nx.as_type({:s, 64}) + + Bumblebee.Utils.Nx.batched_take(output, sequence_lengths) + other -> raise ArgumentError, - "expected :output_pool to be one of :cls_token_pooling, :mean_pooling or nil, got: #{inspect(other)}" + "expected :output_pool to be one of :cls_token_pooling, :mean_pooling, :last_token_pooling or nil, got: #{inspect(other)}" end output = From 1d92e9e999d015ff523b8a0bbae7523bccbcbfaa Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Sun, 5 Oct 2025 10:12:35 -0400 Subject: [PATCH 03/15] Add Qwen3 embedding architecture and instruction prompts support Implements :for_embedding architecture for Qwen3 models with last token pooling, enabling direct use with Bumblebee.Text.text_embedding/3. 
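A minimal usage sketch (the same calls appear in the updated example
script below):

    {:ok, model_info} =
      Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"},
        architecture: :for_embedding
      )

    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"})

    serving =
      Bumblebee.Text.text_embedding(model_info, tokenizer,
        output_attribute: :embedding,
        embedding_processor: :l2_norm
      )

    Nx.Serving.run(serving, "hello world").embedding
    #=> 1024-dimensional, L2-normalized tensor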
Changes: - Add :for_embedding architecture to Qwen3 model - Register Qwen3ForEmbedding in model mappings - Add instruction prompts example showing Qwen team recommendations - Update examples to use cleaner serving-based API - Add .lexical/ to gitignore - Clean up mix.exs dependencies (remove emlx, nx override) Examples demonstrate: - Basic embedding generation (1024-dim vectors) - Semantic similarity computation - Instruction-aware prompts (1-5% performance improvement) - Custom task instructions for code search - Multilingual embedding support Tested with Qwen3-Embedding-0.6B, generates correct similarity scores. --- .gitignore | 3 + examples/qwen3_embedding.exs | 57 +++------ examples/qwen3_embedding_prompts.exs | 175 +++++++++++++++++++++++++++ lib/bumblebee.ex | 1 + lib/bumblebee/text/qwen3.ex | 35 +++++- mix.lock | 1 + 6 files changed, 229 insertions(+), 43 deletions(-) create mode 100644 examples/qwen3_embedding_prompts.exs diff --git a/.gitignore b/.gitignore index f0e1a379..255bbfaa 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ bumblebee-*.tar # Temporary files, for example, from tests. /tmp/ + +# Lexical LSP +/.lexical/ diff --git a/examples/qwen3_embedding.exs b/examples/qwen3_embedding.exs index b3c794ad..c87c23b4 100644 --- a/examples/qwen3_embedding.exs +++ b/examples/qwen3_embedding.exs @@ -14,53 +14,26 @@ Mix.install([ Application.put_env(:nx, :default_backend, EXLA.Backend) -# Load embedding model -{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}) -{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) +# Load embedding model with :for_embedding architecture +{:ok, model_info} = + Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, + architecture: :for_embedding + ) -# For Qwen3-Embedding, we need to manually create the serving since we need -# to extract the last hidden state from the tuple and then pool the last token +{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) -# Build the model with output_hidden_states enabled -{_init_fn, encoder} = - Axon.build(model_info.model, - mode: :inference, - global_layer_options: [output_hidden_states: true] +# Create text embedding serving +# The :for_embedding architecture automatically pools the last token +serving = + Bumblebee.Text.text_embedding(model_info, tokenizer, + output_attribute: :embedding, + embedding_processor: :l2_norm ) -# Create custom embedding function -embedding_fun = fn params, inputs -> - # Run the model - output = encoder.(params, inputs) - - # Extract the last layer's hidden states - # hidden_states is a tuple of all layers, we want the last one - last_hidden_state = - if is_tuple(output.hidden_states) do - output.hidden_states |> Tuple.to_list() |> List.last() - else - raise "Model must output hidden_states for embeddings" - end - - # Pool the last token (last non-padding token in the sequence) - sequence_lengths = - inputs["attention_mask"] - |> Nx.sum(axes: [1]) - |> Nx.subtract(1) - |> Nx.as_type({:s, 64}) - - embedding = Bumblebee.Utils.Nx.batched_take(last_hidden_state, sequence_lengths) - - # Squeeze batch dimension and L2 normalize - embedding - |> Nx.squeeze(axes: [0]) - |> Bumblebee.Utils.Nx.normalize() -end - -# Helper function to generate embeddings for text +# Helper function generate_embedding = fn text -> - inputs = Bumblebee.apply_tokenizer(tokenizer, text) - embedding_fun.(model_info.params, inputs) + result = Nx.Serving.run(serving, text) + result.embedding end # Example 1: Simple text 
embeddings diff --git a/examples/qwen3_embedding_prompts.exs b/examples/qwen3_embedding_prompts.exs new file mode 100644 index 00000000..49104499 --- /dev/null +++ b/examples/qwen3_embedding_prompts.exs @@ -0,0 +1,175 @@ +#!/usr/bin/env elixir + +# Qwen3-Embedding with Instruction Prompts +# +# This example demonstrates the Qwen team's recommended approach for using +# instruction-aware prompts to improve retrieval performance by 1-5%. +# +# Usage: elixir examples/qwen3_embedding_prompts.exs + +Mix.install([ + {:bumblebee, path: Path.expand("..", __DIR__)}, + {:exla, ">= 0.0.0"} +]) + +Application.put_env(:nx, :default_backend, EXLA.Backend) + +# Load embedding model with :for_embedding architecture +{:ok, model_info} = + Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, + architecture: :for_embedding + ) + +{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) + +# Create serving with L2 normalization +serving = + Bumblebee.Text.text_embedding(model_info, tokenizer, + output_attribute: :embedding, + embedding_processor: :l2_norm + ) + +# Embedding function +generate_embedding = fn text -> + result = Nx.Serving.run(serving, text) + result.embedding +end + +# Helper to compute similarity +similarity = fn e1, e2 -> + Nx.dot(e1, e2) |> Nx.to_number() |> Float.round(4) +end + +IO.puts("\n" <> String.duplicate("=", 70)) +IO.puts("Qwen3-Embedding: With vs Without Instruction Prompts") +IO.puts(String.duplicate("=", 70)) + +# Test data +query_text = "What is the capital of France?" +document1 = "Paris is the capital and largest city of France." +document2 = "London is the capital of the United Kingdom." +document3 = "Machine learning is a branch of artificial intelligence." + +IO.puts("\nQuery: #{query_text}") +IO.puts("Doc 1: #{document1}") +IO.puts("Doc 2: #{document2}") +IO.puts("Doc 3: #{document3}") + +# ============================================================================== +# Test 1: WITHOUT instruction prompts (baseline) +# ============================================================================== + +IO.puts("\n" <> String.duplicate("-", 70)) +IO.puts("TEST 1: Without Instruction Prompts (Baseline)") +IO.puts(String.duplicate("-", 70)) + +q_plain = generate_embedding.(query_text) +d1_plain = generate_embedding.(document1) +d2_plain = generate_embedding.(document2) +d3_plain = generate_embedding.(document3) + +IO.puts("\nSimilarity scores:") +IO.puts(" Query vs Doc1 (relevant): #{similarity.(q_plain, d1_plain)}") +IO.puts(" Query vs Doc2 (semi-relevant): #{similarity.(q_plain, d2_plain)}") +IO.puts(" Query vs Doc3 (irrelevant): #{similarity.(q_plain, d3_plain)}") + +# ============================================================================== +# Test 2: WITH instruction prompts (Qwen team's recommendation) +# ============================================================================== + +IO.puts("\n" <> String.duplicate("-", 70)) +IO.puts("TEST 2: With Instruction Prompts (Qwen Team Recommendation)") +IO.puts(String.duplicate("-", 70)) + +# Qwen team's recommended format for web search +query_with_prompt = + "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: #{query_text}" + +# Documents don't need prompts (as per config) +q_prompted = generate_embedding.(query_with_prompt) + +IO.puts("\nQuery prompt format:") +IO.puts(" Instruct: Given a web search query, retrieve relevant passages...") +IO.puts(" Query: #{query_text}") + +IO.puts("\nSimilarity scores:") +IO.puts(" Query vs Doc1 (relevant): 
#{similarity.(q_prompted, d1_plain)}") +IO.puts(" Query vs Doc2 (semi-relevant): #{similarity.(q_prompted, d2_plain)}") +IO.puts(" Query vs Doc3 (irrelevant): #{similarity.(q_prompted, d3_plain)}") + +# ============================================================================== +# Test 3: Custom task instructions +# ============================================================================== + +IO.puts("\n" <> String.duplicate("-", 70)) +IO.puts("TEST 3: Custom Task Instructions") +IO.puts(String.duplicate("-", 70)) + +# Code search example +code_query = "function to calculate factorial" + +code_docs = [ + "def factorial(n), do: if n <= 1, do: 1, else: n * factorial(n - 1)", + "def fibonacci(n), do: if n <= 1, do: n, else: fibonacci(n - 1) + fibonacci(n - 2)", + "defmodule Calculator do; def add(a, b), do: a + b; end" +] + +code_query_prompt = + "Instruct: Given a code search query, find relevant code snippets\nQuery: #{code_query}" + +IO.puts("\nCode Search Task:") +IO.puts("Query: #{code_query}") + +q_code = generate_embedding.(code_query_prompt) +code_embeddings = Enum.map(code_docs, &generate_embedding.(&1)) + +Enum.zip(code_docs, code_embeddings) +|> Enum.with_index(1) +|> Enum.each(fn {{doc, emb}, idx} -> + sim = similarity.(q_code, emb) + IO.puts(" Code #{idx} similarity: #{sim}") + IO.puts(" #{String.slice(doc, 0..60)}...") +end) + +# ============================================================================== +# Test 4: Multilingual example +# ============================================================================== + +IO.puts("\n" <> String.duplicate("-", 70)) +IO.puts("TEST 4: Multilingual Embeddings") +IO.puts(String.duplicate("-", 70)) + +multilingual_texts = [ + "The cat is sleeping", + "El gato está durmiendo", + "Le chat dort", + "猫在睡觉", + "The dog is running" +] + +IO.puts("\nGenerating embeddings for 5 texts in different languages...") +multi_embeddings = Enum.map(multilingual_texts, &generate_embedding.(&1)) + +IO.puts("\nSemantic similarity (all about cat sleeping vs dog running):") + +Enum.take(multi_embeddings, 4) +|> Enum.with_index(1) +|> Enum.each(fn {emb, idx} -> + sim_to_english = similarity.(hd(multi_embeddings), emb) + sim_to_dog = similarity.(List.last(multi_embeddings), emb) + IO.puts(" Text #{idx}: same_meaning=#{sim_to_english}, different=#{sim_to_dog}") +end) + +# ============================================================================== +# Summary +# ============================================================================== + +IO.puts("\n" <> String.duplicate("=", 70)) +IO.puts("SUMMARY") +IO.puts(String.duplicate("=", 70)) +IO.puts("✓ Qwen3-Embedding supports instruction-aware prompts") +IO.puts("✓ Recommended format: 'Instruct: [task]\\nQuery: [query]'") +IO.puts("✓ Improves retrieval performance by 1-5%") +IO.puts("✓ Works for multilingual and code search tasks") +IO.puts("✓ Generates 1024-dimensional normalized vectors") +IO.puts(String.duplicate("=", 70) <> "\n") diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex index 093c879d..285637d1 100644 --- a/lib/bumblebee.ex +++ b/lib/bumblebee.ex @@ -181,6 +181,7 @@ defmodule Bumblebee do "Qwen3Model" => {Bumblebee.Text.Qwen3, :base}, "Qwen3ForCausalLM" => {Bumblebee.Text.Qwen3, :for_causal_language_modeling}, "Qwen3ForSequenceClassification" => {Bumblebee.Text.Qwen3, :for_sequence_classification}, + "Qwen3ForEmbedding" => {Bumblebee.Text.Qwen3, :for_embedding}, "ResNetForImageClassification" => {Bumblebee.Vision.ResNet, :for_image_classification}, "ResNetModel" => 
{Bumblebee.Vision.ResNet, :base}, "RobertaForMaskedLM" => {Bumblebee.Text.Roberta, :for_masked_language_modeling}, diff --git a/lib/bumblebee/text/qwen3.ex b/lib/bumblebee/text/qwen3.ex index 2a7c2916..985ba852 100644 --- a/lib/bumblebee/text/qwen3.ex +++ b/lib/bumblebee/text/qwen3.ex @@ -161,7 +161,8 @@ defmodule Bumblebee.Text.Qwen3 do do: [ :base, :for_causal_language_modeling, - :for_sequence_classification + :for_sequence_classification, + :for_embedding ] @impl true @@ -255,6 +256,38 @@ defmodule Bumblebee.Text.Qwen3 do }) end + def model(%__MODULE__{architecture: :for_embedding} = spec) do + inputs = inputs(spec) + + outputs = core(inputs, spec) + + # Pool the last token (last non-padding token) for embeddings + pooled_state = + Layers.if_present inputs["input_ids"] do + Axon.layer( + fn hidden_state, input_ids, _opts -> + indices = + input_ids + |> Nx.not_equal(spec.pad_token_id) + |> Nx.sum(axes: [-1]) + |> Nx.subtract(1) + |> Nx.as_type({:s, 64}) + + Bumblebee.Utils.Nx.batched_take(hidden_state, indices) + end, + [outputs.hidden_state, inputs["input_ids"]] + ) + else + Layers.take_token(outputs.hidden_state, axis: 1, index: -1) + end + + Layers.output(%{ + embedding: pooled_state, + hidden_states: outputs.hidden_states, + attentions: outputs.attentions + }) + end + defp inputs(spec) do shape = {nil, nil} hidden_shape = {nil, nil, spec.hidden_size} diff --git a/mix.lock b/mix.lock index 4e216574..93adc60f 100644 --- a/mix.lock +++ b/mix.lock @@ -18,6 +18,7 @@ "makeup_elixir": {:hex, :makeup_elixir, "0.16.2", "627e84b8e8bf22e60a2579dad15067c755531fea049ae26ef1020cad58fe9578", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "41193978704763f6bbe6cc2758b84909e62984c7752b3784bd3c218bb341706b"}, "makeup_erlang": {:hex, :makeup_erlang, "1.0.1", "c7f58c120b2b5aa5fd80d540a89fdf866ed42f1f3994e4fe189abebeab610839", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "8a89a1eeccc2d798d6ea15496a6e4870b75e014d1af514b1b71fa33134f57814"}, "mime": {:hex, :mime, "2.0.3", "3676436d3d1f7b81b5a2d2bd8405f412c677558c81b1c92be58c00562bb59095", [:mix], [], "hexpm", "27a30bf0db44d25eecba73755acf4068cbfe26a4372f9eb3e4ea3a45956bff6b"}, + "nif_call": {:hex, :nif_call, "0.1.3", "bb4af0d28d1a2f10602d50246155b95b4ef6a389025c12d830dc9924ef06d324", [:mix], [], "hexpm", "48ba2e66c7d5aab4ca1a9656d3fa7326d317ba63bac64d24b22a05c8c8a9aff0"}, "nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"}, "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, "nx": {:hex, :nx, "0.10.0", "128e4a094cb790f663e20e1334b127c1f2a4df54edfb8b13c22757ec33133b4f", [:mix], [{:complex, "~> 0.6", [hex: :complex, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "3db8892c124aeee091df0e6fbf8e5bf1b81f502eb0d4f5ba63e6378ebcae7da4"}, From 47c337d0e29e8d717ac9fd2a062d3025bb9754d4 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Sun, 5 Oct 2025 10:13:18 -0400 Subject: [PATCH 04/15] Add .lexical/ to gitignore and IEx usage guide --- QWEN3_IEX_GUIDE.md | 206 +++++++++++++++++++++++++++++++++++++++++++++ 1 
file changed, 206 insertions(+) create mode 100644 QWEN3_IEX_GUIDE.md diff --git a/QWEN3_IEX_GUIDE.md b/QWEN3_IEX_GUIDE.md new file mode 100644 index 00000000..8e5f47db --- /dev/null +++ b/QWEN3_IEX_GUIDE.md @@ -0,0 +1,206 @@ +# Qwen3 IEx Usage Guide + +## Text Generation (Qwen3-4B-Instruct) + +```elixir +# Start IEx +iex -S mix + +# Set backend +Nx.default_backend(EXLA.Backend) + +# Load model components +{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) +{:ok, t} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) +{:ok, c} = Bumblebee.load_generation_config({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) + +# Create serving +s = Bumblebee.Text.generation(m, t, c) + +# Generate text +Nx.Serving.run(s, "The future of AI is") + +# With chat format +prompt = "<|im_start|>system +You are a helpful assistant.<|im_end|> +<|im_start|>user +What is Elixir?<|im_end|> +<|im_start|>assistant +" +Nx.Serving.run(s, prompt) +``` + +## Text Embeddings (Qwen3-Embedding-0.6B) + +### Method 1: Using :for_embedding Architecture (Recommended) + +```elixir +# Start IEx +iex -S mix + +# Set backend +Nx.default_backend(EXLA.Backend) + +# Load embedding model with :for_embedding architecture +{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, + architecture: :for_embedding +) +{:ok, t} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) + +# Create serving +s = Bumblebee.Text.text_embedding(m, t, + output_attribute: :embedding, + embedding_processor: :l2_norm +) + +# Generate embeddings +e1 = Nx.Serving.run(s, "The cat sat on the mat") +e2 = Nx.Serving.run(s, "A feline rested on the rug") +e3 = Nx.Serving.run(s, "Python is a programming language") + +# Check dimension +Nx.shape(e1.embedding) # {1024} + +# Compute similarity +Nx.dot(e1.embedding, e2.embedding) |> Nx.to_number() # ~0.73 (similar) +Nx.dot(e1.embedding, e3.embedding) |> Nx.to_number() # ~0.34 (different) +``` + +### Method 2: Direct Model Access (Advanced) + +```elixir +# For more control over the pipeline +{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, + architecture: :for_embedding +) +{:ok, t} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) + +{_init, predict} = Axon.build(m.model) + +# Generate embedding +inputs = Bumblebee.apply_tokenizer(t, "test text") +output = predict.(m.params, inputs) +embedding = Bumblebee.Utils.Nx.normalize(output.embedding) +Nx.shape(embedding) # {1, 1024} +``` + +## Instruction-Aware Embeddings (Qwen Team Recommendation) + +```elixir +# Setup +Nx.default_backend(EXLA.Backend) +{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, + architecture: :for_embedding +) +{:ok, t} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) +s = Bumblebee.Text.text_embedding(m, t, + output_attribute: :embedding, + embedding_processor: :l2_norm +) + +# Without instruction +query = "What is the capital of France?" +q_plain = Nx.Serving.run(s, query) + +# With instruction (recommended by Qwen team) +query_prompted = "Instruct: Given a web search query, retrieve relevant passages that answer the query +Query: What is the capital of France?" +q_with_prompt = Nx.Serving.run(s, query_prompted) + +# Documents (no instruction needed) +doc = "Paris is the capital and largest city of France." 
+d = Nx.Serving.run(s, doc) + +# Compare +Nx.dot(q_plain.embedding, d.embedding) |> Nx.to_number() +Nx.dot(q_with_prompt.embedding, d.embedding) |> Nx.to_number() +``` + +## Custom Task Instructions + +```elixir +# Code search +code_query = "Instruct: Given a code search query, find relevant code snippets +Query: function to calculate factorial" + +code_doc = "def factorial(n), do: if n <= 1, do: 1, else: n * factorial(n - 1)" + +q = Nx.Serving.run(s, code_query) +d = Nx.Serving.run(s, code_doc) + +Nx.dot(q.embedding, d.embedding) |> Nx.to_number() # High similarity +``` + +## Semantic Search Example + +```elixir +# Index documents +documents = [ + "Paris is the capital of France", + "Berlin is the capital of Germany", + "Machine learning uses neural networks", + "The Eiffel Tower is in Paris" +] + +doc_embeddings = Enum.map(documents, fn doc -> + Nx.Serving.run(s, doc).embedding +end) + +# Search +query = "Instruct: Given a web search query, retrieve relevant passages +Query: What is the French capital?" +q_emb = Nx.Serving.run(s, query).embedding + +# Compute similarities +similarities = Enum.map(doc_embeddings, fn doc_emb -> + Nx.dot(q_emb, doc_emb) |> Nx.to_number() +end) + +# Show results ranked by similarity +Enum.zip(documents, similarities) +|> Enum.sort_by(&elem(&1, 1), :desc) +|> Enum.each(fn {doc, score} -> + IO.puts("#{Float.round(score, 3)}: #{doc}") +end) +``` + +## Batch Processing + +```elixir +# Process multiple texts at once +texts = [ + "First document", + "Second document", + "Third document" +] + +results = Nx.Serving.run(s, texts) + +embeddings = Enum.map(results, & &1.embedding) +``` + +## Model Variants + +```elixir +# Different sizes available +{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, architecture: :for_embedding) +{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-4B"}, architecture: :for_embedding) +{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-8B"}, architecture: :for_embedding) +``` + +## Common Similarity Metrics + +```elixir +# Cosine similarity (recommended for normalized embeddings) +cosine_sim = fn e1, e2 -> Nx.dot(e1, e2) |> Nx.to_number() end + +# Euclidean distance +euclidean = fn e1, e2 -> + Nx.subtract(e1, e2) |> Nx.pow(2) |> Nx.sum() |> Nx.sqrt() |> Nx.to_number() +end + +# Manhattan distance +manhattan = fn e1, e2 -> + Nx.subtract(e1, e2) |> Nx.abs() |> Nx.sum() |> Nx.to_number() +end +``` From 6f68d8f22d7033ec333aff63d044f73c9fdc95ff Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Sun, 5 Oct 2025 10:20:38 -0400 Subject: [PATCH 05/15] mix format and rebuilding lock --- mix.lock | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/mix.lock b/mix.lock index 93adc60f..8c977c37 100644 --- a/mix.lock +++ b/mix.lock @@ -1,38 +1,37 @@ %{ "axon": {:hex, :axon, "0.7.0", "2e2c6d93b4afcfa812566b8922204fa022b60081e86ebd411df4db7ea30f5457", [:mix], [{:kino, "~> 0.7", [hex: :kino, repo: "hexpm", optional: true]}, {:kino_vega_lite, "~> 0.1.7", [hex: :kino_vega_lite, repo: "hexpm", optional: true]}, {:nx, "~> 0.9", [hex: :nx, repo: "hexpm", optional: false]}, {:polaris, "~> 0.1", [hex: :polaris, repo: "hexpm", optional: false]}, {:table_rex, "~> 3.1.1", [hex: :table_rex, repo: "hexpm", optional: true]}], "hexpm", "ee9857a143c9486597ceff434e6ca833dc1241be6158b01025b8217757ed1036"}, "bypass": {:hex, :bypass, "2.1.0", "909782781bf8e20ee86a9cabde36b259d44af8b9f38756173e8f5e2e1fabb9b1", [:mix], [{:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}, 
{:plug_cowboy, "~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: false]}, {:ranch, "~> 1.3", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "d9b5df8fa5b7a6efa08384e9bbecfe4ce61c77d28a4282f79e02f1ef78d96b80"}, - "castore": {:hex, :castore, "1.0.14", "4582dd7d630b48cf5e1ca8d3d42494db51e406b7ba704e81fbd401866366896a", [:mix], [], "hexpm", "7bc1b65249d31701393edaaac18ec8398d8974d52c647b7904d01b964137b9f4"}, - "cc_precompiler": {:hex, :cc_precompiler, "0.1.10", "47c9c08d8869cf09b41da36538f62bc1abd3e19e41701c2cea2675b53c704258", [:mix], [{:elixir_make, "~> 0.7", [hex: :elixir_make, repo: "hexpm", optional: false]}], "hexpm", "f6e046254e53cd6b41c6bacd70ae728011aa82b2742a80d6e2214855c6e06b22"}, + "castore": {:hex, :castore, "1.0.15", "8aa930c890fe18b6fe0a0cff27b27d0d4d231867897bd23ea772dee561f032a3", [:mix], [], "hexpm", "96ce4c69d7d5d7a0761420ef743e2f4096253931a3ba69e5ff8ef1844fe446d3"}, + "cc_precompiler": {:hex, :cc_precompiler, "0.1.11", "8c844d0b9fb98a3edea067f94f616b3f6b29b959b6b3bf25fee94ffe34364768", [:mix], [{:elixir_make, "~> 0.7", [hex: :elixir_make, repo: "hexpm", optional: false]}], "hexpm", "3427232caf0835f94680e5bcf082408a70b48ad68a5f5c0b02a3bea9f3a075b9"}, "complex": {:hex, :complex, "0.6.0", "b0130086a7a8c33574d293b2e0e250f4685580418eac52a5658a4bd148f3ccf1", [:mix], [], "hexpm", "0a5fa95580dcaf30fcd60fe1aaf24327c0fe401e98c24d892e172e79498269f9"}, - "cowboy": {:hex, :cowboy, "2.9.0", "865dd8b6607e14cf03282e10e934023a1bd8be6f6bacf921a7e2a96d800cd452", [:make, :rebar3], [{:cowlib, "2.11.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "2c729f934b4e1aa149aff882f57c6372c15399a20d54f65c8d67bef583021bde"}, + "cowboy": {:hex, :cowboy, "2.14.1", "031d338393e5a128a7de9613b4a0558aabc31b07082004abecb27cac790f5cd6", [:make, :rebar3], [{:cowlib, ">= 2.16.0 and < 3.0.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, ">= 1.8.0 and < 3.0.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "e5310d5afd478ba90b1fed4fcdbc0230082b4510009505c586725c30b44e356f"}, "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"}, - "cowlib": {:hex, :cowlib, "2.11.0", "0b9ff9c346629256c42ebe1eeb769a83c6cb771a6ee5960bd110ab0b9b872063", [:make, :rebar3], [], "hexpm", "2b3e9da0b21c4565751a6d4901c20d1b4cc25cbb7fd50d91d2ab6dd287bc86a9"}, - "decimal": {:hex, :decimal, "2.1.1", "5611dca5d4b2c3dd497dec8f68751f1f1a54755e8ed2a966c2633cf885973ad6", [:mix], [], "hexpm", "53cfe5f497ed0e7771ae1a475575603d77425099ba5faef9394932b35020ffcc"}, - "earmark_parser": {:hex, :earmark_parser, "1.4.41", "ab34711c9dc6212dda44fcd20ecb87ac3f3fce6f0ca2f28d4a00e4154f8cd599", [:mix], [], "hexpm", "a81a04c7e34b6617c2792e291b5a2e57ab316365c2644ddc553bb9ed863ebefa"}, + "cowlib": {:hex, :cowlib, "2.16.0", "54592074ebbbb92ee4746c8a8846e5605052f29309d3a873468d76cdf932076f", [:make, :rebar3], [], "hexpm", "7f478d80d66b747344f0ea7708c187645cfcc08b11aa424632f78e25bf05db51"}, + "decimal": {:hex, :decimal, "2.3.0", "3ad6255aa77b4a3c4f818171b12d237500e63525c2fd056699967a3e7ea20f62", [:mix], [], "hexpm", "a4d66355cb29cb47c3cf30e71329e58361cfcb37c34235ef3bf1d7bf3773aeac"}, + "earmark_parser": {:hex, :earmark_parser, "1.4.44", 
"f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"}, "elixir_make": {:hex, :elixir_make, "0.9.0", "6484b3cd8c0cee58f09f05ecaf1a140a8c97670671a6a0e7ab4dc326c3109726", [:mix], [], "hexpm", "db23d4fd8b757462ad02f8aa73431a426fe6671c80b200d9710caf3d1dd0ffdb"}, - "ex_doc": {:hex, :ex_doc, "0.34.2", "13eedf3844ccdce25cfd837b99bea9ad92c4e511233199440488d217c92571e8", [:mix], [{:earmark_parser, "~> 1.4.39", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "5ce5f16b41208a50106afed3de6a2ed34f4acfd65715b82a0b84b49d995f95c1"}, + "ex_doc": {:hex, :ex_doc, "0.38.4", "ab48dff7a8af84226bf23baddcdda329f467255d924380a0cf0cee97bb9a9ede", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "f7b62346408a83911c2580154e35613eb314e0278aeea72ed7fedef9c1f165b2"}, "exla": {:hex, :exla, "0.10.0", "93e7d75a774fbc06ce05b96de20c4b01bda413b315238cb3c727c09a05d2bc3a", [:make, :mix], [{:elixir_make, "~> 0.6", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:fine, "~> 0.1.0", [hex: :fine, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:nx, "~> 0.10.0", [hex: :nx, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:xla, "~> 0.9.0", [hex: :xla, repo: "hexpm", optional: false]}], "hexpm", "16fffdb64667d7f0a3bc683fdcd2792b143a9b345e4b1f1d5cd50330c63d8119"}, - "fine": {:hex, :fine, "0.1.1", "df2ce44e438bed0061627e10c470873c69374ee7390a51bc612c2358ad37d556", [:mix], [], "hexpm", "41335526b82cf2c196d2588cd54d4504480e2e6ead24f2c07ae0c1cf40af61e5"}, + "fine": {:hex, :fine, "0.1.4", "b19a89c1476c7c57afb5f9314aed5960b5bc95d5277de4cb5ee8e1d1616ce379", [:mix], [], "hexpm", "be3324cc454a42d80951cf6023b9954e9ff27c6daa255483b3e8d608670303f5"}, "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, - "makeup": {:hex, :makeup, "1.1.2", "9ba8837913bdf757787e71c1581c21f9d2455f4dd04cfca785c70bbfff1a76a3", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cce1566b81fbcbd21eca8ffe808f33b221f9eee2cbc7a1706fc3da9ff18e6cac"}, - "makeup_elixir": {:hex, :makeup_elixir, "0.16.2", "627e84b8e8bf22e60a2579dad15067c755531fea049ae26ef1020cad58fe9578", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "41193978704763f6bbe6cc2758b84909e62984c7752b3784bd3c218bb341706b"}, 
- "makeup_erlang": {:hex, :makeup_erlang, "1.0.1", "c7f58c120b2b5aa5fd80d540a89fdf866ed42f1f3994e4fe189abebeab610839", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "8a89a1eeccc2d798d6ea15496a6e4870b75e014d1af514b1b71fa33134f57814"}, - "mime": {:hex, :mime, "2.0.3", "3676436d3d1f7b81b5a2d2bd8405f412c677558c81b1c92be58c00562bb59095", [:mix], [], "hexpm", "27a30bf0db44d25eecba73755acf4068cbfe26a4372f9eb3e4ea3a45956bff6b"}, - "nif_call": {:hex, :nif_call, "0.1.3", "bb4af0d28d1a2f10602d50246155b95b4ef6a389025c12d830dc9924ef06d324", [:mix], [], "hexpm", "48ba2e66c7d5aab4ca1a9656d3fa7326d317ba63bac64d24b22a05c8c8a9aff0"}, - "nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"}, + "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, + "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, + "makeup_erlang": {:hex, :makeup_erlang, "1.0.2", "03e1804074b3aa64d5fad7aa64601ed0fb395337b982d9bcf04029d68d51b6a7", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "af33ff7ef368d5893e4a267933e7744e46ce3cf1f61e2dccf53a111ed3aa3727"}, + "mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"}, + "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, "nx": {:hex, :nx, "0.10.0", "128e4a094cb790f663e20e1334b127c1f2a4df54edfb8b13c22757ec33133b4f", [:mix], [{:complex, "~> 0.6", [hex: :complex, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "3db8892c124aeee091df0e6fbf8e5bf1b81f502eb0d4f5ba63e6378ebcae7da4"}, "nx_image": {:hex, :nx_image, "0.1.2", "0c6e3453c1dc30fc80c723a54861204304cebc8a89ed3b806b972c73ee5d119d", [:mix], [{:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "9161863c42405ddccb6dbbbeae078ad23e30201509cc804b3b3a7c9e98764b81"}, "nx_signal": {:hex, :nx_signal, "0.2.0", "e1ca0318877b17c81ce8906329f5125f1e2361e4c4235a5baac8a95ee88ea98e", [:mix], [{:nx, "~> 0.6", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "7247e5e18a177a59c4cb5355952900c62fdeadeb2bad02a9a34237b68744e2bb"}, - "plug": {:hex, :plug, "1.14.2", "cff7d4ec45b4ae176a227acd94a7ab536d9b37b942c8e8fa6dfc0fff98ff4d80", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 
0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "842fc50187e13cf4ac3b253d47d9474ed6c296a8732752835ce4a86acdf68d13"}, - "plug_cowboy": {:hex, :plug_cowboy, "2.6.1", "9a3bbfceeb65eff5f39dab529e5cd79137ac36e913c02067dba3963a26efe9b2", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "de36e1a21f451a18b790f37765db198075c25875c64834bcc82d90b309eb6613"}, - "plug_crypto": {:hex, :plug_crypto, "1.2.5", "918772575e48e81e455818229bf719d4ab4181fcbf7f85b68a35620f78d89ced", [:mix], [], "hexpm", "26549a1d6345e2172eb1c233866756ae44a9609bd33ee6f99147ab3fd87fd842"}, + "plug": {:hex, :plug, "1.18.1", "5067f26f7745b7e31bc3368bc1a2b818b9779faa959b49c934c17730efc911cf", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "57a57db70df2b422b564437d2d33cf8d33cd16339c1edb190cd11b1a3a546cc2"}, + "plug_cowboy": {:hex, :plug_cowboy, "2.7.4", "729c752d17cf364e2b8da5bdb34fb5804f56251e88bb602aff48ae0bd8673d11", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "9b85632bd7012615bae0a5d70084deb1b25d2bcbb32cab82d1e9a1e023168aa3"}, + "plug_crypto": {:hex, :plug_crypto, "2.1.1", "19bda8184399cb24afa10be734f84a16ea0a2bc65054e23a62bb10f06bc89491", [:mix], [], "hexpm", "6470bce6ffe41c8bd497612ffde1a7e4af67f36a15eea5f921af71cf3e11247c"}, "polaris": {:hex, :polaris, "0.1.0", "dca61b18e3e801ecdae6ac9f0eca5f19792b44a5cb4b8d63db50fc40fc038d22", [:mix], [{:nx, "~> 0.5", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "13ef2b166650e533cb24b10e2f3b8ab4f2f449ba4d63156e8c569527f206e2c2"}, "progress_bar": {:hex, :progress_bar, "3.0.0", "f54ff038c2ac540cfbb4c2bfe97c75e7116ead044f3c2b10c9f212452194b5cd", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "6981c2b25ab24aecc91a2dc46623658e1399c21a2ae24db986b90d678530f2b7"}, - "ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"}, - "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.2", "5f25cbe220a8fac3e7ad62e6f950fcdca5a5a5f8501835d2823e8c74bf4268d5", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "63d1bd5f8e23096d1ff851839923162096364bac8656a4a3c00d1fff8e83ee0a"}, + "ranch": {:hex, :ranch, "1.8.1", "208169e65292ac5d333d6cdbad49388c1ae198136e4697ae2f474697140f201c", [:make, :rebar3], [], "hexpm", "aed58910f4e21deea992a67bf51632b6d60114895eb03bb392bb733064594dd0"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.3", "4e741024b0b097fe783add06e53ae9a6f23ddc78df1010f215df0c02915ef5a8", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "c23f5f33cb6608542de4d04faf0f0291458c352a4648e4d28d17ee1098cddcc4"}, 
"safetensors": {:hex, :safetensors, "0.1.3", "7ff3c22391e213289c713898481d492c9c28a49ab1d0705b72630fb8360426b2", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:nx, "~> 0.5", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "fe50b53ea59fde4e723dd1a2e31cfdc6013e69343afac84c6be86d6d7c562c14"}, - "stb_image": {:hex, :stb_image, "0.6.9", "89e77998d4e6d5e2d05ab2d8b5a02e1e8df7bdf04c9dfb063f8b76b2c5870e1f", [:make, :mix], [{:cc_precompiler, "~> 0.1", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.8", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:kino, "~> 0.7", [hex: :kino, repo: "hexpm", optional: true]}, {:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: true]}], "hexpm", "02981332167659a0b99f7dfa71fdc9ba3bc3a9200e4952f36fe9ba2a2753f4fa"}, + "stb_image": {:hex, :stb_image, "0.6.10", "76975279e2a130f53dc670bf6f6b1cdc4fbd7ab6293053e88e7fb6a7eae0e836", [:make, :mix], [{:cc_precompiler, "~> 0.1", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.8", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:kino, "~> 0.7", [hex: :kino, repo: "hexpm", optional: true]}, {:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: true]}], "hexpm", "26125372cfeda209084d3670417fab6819cfccd0e66c657678ecc48314369e8d"}, "telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"}, "tokenizers": {:hex, :tokenizers, "0.5.1", "b0975d92b4ee5b18e8f47b5d65b9d5f1e583d9130189b1a2620401af4e7d4b35", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, ">= 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.6", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "5f08d97cc7f2ed3d71d370d68120da6d3de010948ccf676c9c0eb591ba4bacc9"}, "torchx": {:hex, :torchx, "0.10.0", "81e583507cdb2bfca9fce0ab2f43da4505f19374bd9898d0cd50feec56b9035e", [:mix], [{:nx, "~> 0.10.0", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "557f886c828b4d8d88307de076493e61c6152d4da5b3680e1874688bc7245610"}, From 5641a4fc095cb4560aee3cd2b8d633c95f001d28 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Sun, 5 Oct 2025 10:32:02 -0400 Subject: [PATCH 06/15] Add Qwen3-Reranker support and example Implements document reranking using Qwen3-Reranker models. Rerankers score query-document pairs for relevance, improving retrieval quality in RAG and search applications. Features: - Automatic yes/no token detection from tokenizer - Proper input format with instruction, query, and document - Softmax-based relevance scoring (0-1 range) - Support for custom task instructions Example demonstrates: - Basic query-document scoring - Custom instructions for code search - Reranking search results (top-k selection) Results show correct ranking: - Relevant docs score 0.99+ - Irrelevant docs score near 0.0 - Custom instructions work for domain-specific tasks Works with Qwen3-Reranker-0.6B/4B/8B models. 
---
 examples/qwen3_reranker.exs | 171 ++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 examples/qwen3_reranker.exs

diff --git a/examples/qwen3_reranker.exs b/examples/qwen3_reranker.exs
new file mode 100644
index 00000000..70594e95
--- /dev/null
+++ b/examples/qwen3_reranker.exs
@@ -0,0 +1,171 @@
+#!/usr/bin/env elixir
+
+# Qwen3-Reranker Example
+#
+# This example demonstrates using Qwen3-Reranker-0.6B for reranking
+# documents based on relevance to a query. Rerankers score query-document
+# pairs to improve retrieval quality in RAG and search applications.
+#
+# Usage: elixir examples/qwen3_reranker.exs
+
+Mix.install([
+  {:bumblebee, path: Path.expand("..", __DIR__)},
+  {:exla, ">= 0.0.0"}
+])
+
+Application.put_env(:nx, :default_backend, EXLA.Backend)
+
+# Load reranker model (uses same Qwen3ForCausalLM architecture)
+{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Reranker-0.6B"})
+{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Reranker-0.6B"})
+
+# Build model
+{_init_fn, predict_fn} = Axon.build(model_info.model)
+
+# Get yes/no token IDs by tokenizing the words
+tokenizer_no_special = Bumblebee.configure(tokenizer, add_special_tokens: false)
+yes_token_result = Bumblebee.apply_tokenizer(tokenizer_no_special, "yes")
+no_token_result = Bumblebee.apply_tokenizer(tokenizer_no_special, "no")
+yes_token_id = Nx.to_flat_list(yes_token_result["input_ids"]) |> hd()
+no_token_id = Nx.to_flat_list(no_token_result["input_ids"]) |> hd()
+
+# Format query-document pair as recommended by Qwen team
+format_pair = fn instruction, query, document ->
+  instruction =
+    instruction || "Given a web search query, retrieve relevant passages that answer the query"
+
+  # Format with suffix as per vLLM example
+  suffix = "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"
+  "<Instruct>: #{instruction}\n<Query>: #{query}\n<Doc>: #{document}#{suffix}"
+end
+
+# Reranking function
+# Returns a relevance score between 0 and 1
+rerank_score = fn query, document, instruction ->
+  # Format the input
+  text = format_pair.(instruction, query, document)
+
+  # Tokenize
+  inputs = Bumblebee.apply_tokenizer(tokenizer, text)
+
+  # Get model output
+  output = predict_fn.(model_info.params, inputs)
+
+  # Extract logits for the last token
+  # Shape: {batch, seq, vocab}
+  {_batch, seq_len, _vocab} = Nx.shape(output.logits)
+  last_logits = output.logits[[0, seq_len - 1, ..]]
+
+  # Get logits for "yes" and "no" tokens
+  yes_logit = Nx.to_number(last_logits[yes_token_id])
+  no_logit = Nx.to_number(last_logits[no_token_id])
+
+  # Compute softmax probability for "yes"
+  # P(yes) = exp(yes) / (exp(yes) + exp(no))
+  exp_yes = :math.exp(yes_logit)
+  exp_no = :math.exp(no_logit)
+
+  relevance_score = exp_yes / (exp_yes + exp_no)
+  relevance_score
+end
+
+IO.puts("\n" <> String.duplicate("=", 70))
+IO.puts("Qwen3-Reranker Example")
+IO.puts(String.duplicate("=", 70))
+
+# Example 1: Basic reranking
+IO.puts("\n=== Example 1: Basic Query-Document Scoring ===")
+
+query = "What is the capital of France?"
+
+documents = [
+  "Paris is the capital and largest city of France.",
+  "London is the capital of the United Kingdom.",
+  "Machine learning is a subset of artificial intelligence.",
+  "The Eiffel Tower is located in Paris, France.",
+  "Berlin is the capital of Germany."
+] + +IO.puts("Query: #{query}") +IO.puts("\nDocument relevance scores:") + +scores = + Enum.map(documents, fn doc -> + score = rerank_score.(query, doc, nil) + {doc, score} + end) + |> Enum.sort_by(&elem(&1, 1), :desc) + +Enum.with_index(scores, 1) +|> Enum.each(fn {{doc, score}, rank} -> + IO.puts(" #{rank}. [#{Float.round(score, 4)}] #{String.slice(doc, 0..60)}...") +end) + +# Example 2: Custom instruction +IO.puts("\n=== Example 2: Custom Task Instruction ===") + +instruction = "Given a coding question, find relevant code examples" +query = "How to calculate factorial in Elixir?" + +code_docs = [ + "def factorial(n), do: if n <= 1, do: 1, else: n * factorial(n - 1)", + "def fibonacci(n), do: if n <= 1, do: n, else: fibonacci(n - 1) + fibonacci(n - 2)", + "def sum_list(list), do: Enum.reduce(list, 0, &+/2)", + "Factorial is a mathematical function that multiplies a number by all positive integers less than it." +] + +IO.puts("Query: #{query}") +IO.puts("Instruction: #{instruction}") +IO.puts("\nCode snippet relevance:") + +code_docs +|> Enum.map(fn doc -> + score = rerank_score.(query, doc, instruction) + {doc, score} +end) +|> Enum.sort_by(&elem(&1, 1), :desc) +|> Enum.with_index(1) +|> Enum.each(fn {{doc, score}, rank} -> + IO.puts(" #{rank}. [#{Float.round(score, 4)}] #{String.slice(doc, 0..60)}...") +end) + +# Example 3: Reranking search results +IO.puts("\n=== Example 3: Reranking Initial Search Results ===") + +query = "best practices for concurrent programming" + +# Simulated initial retrieval results (could be from vector search) +search_results = [ + "Concurrent programming involves multiple computations executing simultaneously.", + "Elixir uses the Actor model for concurrency with lightweight processes.", + "Python has threading and multiprocessing modules for parallel execution.", + "The weather is nice today and perfect for a walk.", + "OTP behaviors like GenServer provide patterns for concurrent systems." +] + +IO.puts("Query: #{query}") +IO.puts("\nInitial results (unranked):") + +Enum.with_index(search_results, 1) +|> Enum.each(fn {doc, i} -> + IO.puts(" #{i}. #{String.slice(doc, 0..60)}...") +end) + +IO.puts("\nAfter reranking:") + +search_results +|> Enum.map(fn doc -> + score = rerank_score.(query, doc, nil) + {doc, score} +end) +|> Enum.sort_by(&elem(&1, 1), :desc) +# Top 3 results +|> Enum.take(3) +|> Enum.with_index(1) +|> Enum.each(fn {{doc, score}, rank} -> + IO.puts(" #{rank}. [#{Float.round(score, 4)}] #{String.slice(doc, 0..60)}...") +end) + +IO.puts("\n" <> String.duplicate("=", 70)) +IO.puts("✓ Qwen3-Reranker successfully reranked documents by relevance") +IO.puts(String.duplicate("=", 70) <> "\n") From fa592c365b0daf0bcab5cba1969172db01084455 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Sun, 5 Oct 2025 10:42:18 -0400 Subject: [PATCH 07/15] Organize Qwen3 examples into dedicated folder Move all Qwen3-related examples and documentation into examples/qwen3/ for better organization and discoverability. Changes: - Create examples/qwen3/ directory - Move qwen3.exs, qwen3_embedding.exs, qwen3_embedding_prompts.exs, qwen3_reranker.exs - Move QWEN3_IEX_GUIDE.md to examples/qwen3/ - Update examples/README.md to reference qwen3/ subdirectory All examples now accessible under examples/qwen3/ with consistent structure. 
--- examples/README.md | 76 +++++++++---------- .../qwen3/QWEN3_IEX_GUIDE.md | 0 examples/{ => qwen3}/qwen3.exs | 0 examples/{ => qwen3}/qwen3_embedding.exs | 0 .../{ => qwen3}/qwen3_embedding_prompts.exs | 0 examples/{ => qwen3}/qwen3_reranker.exs | 0 6 files changed, 35 insertions(+), 41 deletions(-) rename QWEN3_IEX_GUIDE.md => examples/qwen3/QWEN3_IEX_GUIDE.md (100%) rename examples/{ => qwen3}/qwen3.exs (100%) rename examples/{ => qwen3}/qwen3_embedding.exs (100%) rename examples/{ => qwen3}/qwen3_embedding_prompts.exs (100%) rename examples/{ => qwen3}/qwen3_reranker.exs (100%) diff --git a/examples/README.md b/examples/README.md index 758205a2..c874e913 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,62 +2,56 @@ This directory contains example scripts demonstrating how to use Bumblebee models. -## Qwen3 Text Generation +## Qwen3 Examples -### Basic Usage +See the `qwen3/` subdirectory for comprehensive Qwen3 model examples: +### Text Generation ```bash -elixir examples/qwen3_text_generation.exs +elixir examples/qwen3/qwen3.exs ``` -This example demonstrates: -- Loading Qwen3-4B-Instruct model -- Text completion -- Question answering -- Story generation -- Chat format (Instruct model) -- Code generation - -### Requirements - -- **Disk space**: ~8GB for model weights (downloaded once and cached) -- **Memory**: ~10GB RAM for inference -- **Backend**: EXLA (CPU or GPU) - -### Example Output - +### Text Embeddings +```bash +elixir examples/qwen3/qwen3_embedding.exs +elixir examples/qwen3/qwen3_embedding_prompts.exs ``` -=== Example 1: Text Completion === -The future of artificial intelligence is being shaped by the development -of more advanced models that can understand and generate human-like language... -=== Example 2: Question Answering === -What are the benefits of functional programming? The main benefits are -immutability, composability, and easier testing... +### Document Reranking +```bash +elixir examples/qwen3/qwen3_reranker.exs ``` -### Customization +### Features Demonstrated -Edit the script to: -- Change `max_new_tokens` for longer/shorter output -- Adjust `temperature` (0.0-1.0) for more deterministic/creative output -- Modify `top_k` and `top_p` for sampling behavior -- Use different prompts +**Text Generation** (`qwen3.exs`): +- Text completion +- Question answering +- Chat format +- Code generation -### Other Models +**Embeddings** (`qwen3_embedding.exs`, `qwen3_embedding_prompts.exs`): +- 1024-dimensional text embeddings +- Semantic similarity computation +- Instruction-aware prompts (recommended by Qwen team) +- Multilingual support +- Code search -To use different Qwen3 model sizes, change the model name: +**Reranking** (`qwen3_reranker.exs`): +- Query-document relevance scoring +- Custom task instructions +- Top-k result selection -```elixir -# Smaller (faster) -{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-0.6B"}) +### Requirements + +- **Text Generation**: ~8GB disk space, ~10GB RAM +- **Embeddings**: ~1.5GB disk space, ~4GB RAM (0.6B model) +- **Reranking**: ~1.5GB disk space, ~4GB RAM (0.6B model) +- **Backend**: EXLA (CPU or GPU) -# Balanced (recommended) -{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) +### Documentation -# Larger (better quality) -{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-8B"}) -``` +See `examples/qwen3/QWEN3_IEX_GUIDE.md` for interactive IEx usage examples. 
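+For a quick end-to-end check, a minimal embedding sketch (using the 0.6B
+model; adjust the repo name for other sizes):
+
+```elixir
+{:ok, model_info} =
+  Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, architecture: :for_embedding)
+
+{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"})
+
+serving =
+  Bumblebee.Text.text_embedding(model_info, tokenizer,
+    output_attribute: :embedding,
+    embedding_processor: :l2_norm
+  )
+
+Nx.Serving.run(serving, "The cat sat on the mat").embedding
+```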
## Phoenix Examples diff --git a/QWEN3_IEX_GUIDE.md b/examples/qwen3/QWEN3_IEX_GUIDE.md similarity index 100% rename from QWEN3_IEX_GUIDE.md rename to examples/qwen3/QWEN3_IEX_GUIDE.md diff --git a/examples/qwen3.exs b/examples/qwen3/qwen3.exs similarity index 100% rename from examples/qwen3.exs rename to examples/qwen3/qwen3.exs diff --git a/examples/qwen3_embedding.exs b/examples/qwen3/qwen3_embedding.exs similarity index 100% rename from examples/qwen3_embedding.exs rename to examples/qwen3/qwen3_embedding.exs diff --git a/examples/qwen3_embedding_prompts.exs b/examples/qwen3/qwen3_embedding_prompts.exs similarity index 100% rename from examples/qwen3_embedding_prompts.exs rename to examples/qwen3/qwen3_embedding_prompts.exs diff --git a/examples/qwen3_reranker.exs b/examples/qwen3/qwen3_reranker.exs similarity index 100% rename from examples/qwen3_reranker.exs rename to examples/qwen3/qwen3_reranker.exs From 8208efdfbd6b387cf7c16e9dae96c56891d22412 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Mon, 6 Oct 2025 17:00:31 -0400 Subject: [PATCH 08/15] Address PR review feedback for Qwen3 support - Remove .lexical/ from project gitignore (should be in global gitignore) - Add :qwen2 tokenizer type with correct Qwen3 special tokens - Refactor QK normalization to use generalized approach: - Add :query_norm and :key_norm options to Layers.Transformer - Apply normalization after head splitting, before rotary embedding - Update Qwen3 to use Layers.Transformer.blocks instead of custom implementation - Remove ~200 lines of custom decoder/attention code - Remove standalone examples directory per review feedback The generalized QK normalization approach makes the transformer layer more flexible and maintainable, allowing other models to use similar patterns. --- .gitignore | 3 - examples/qwen3/QWEN3_IEX_GUIDE.md | 206 -------------- examples/qwen3/qwen3.exs | 79 ------ examples/qwen3/qwen3_embedding.exs | 114 -------- examples/qwen3/qwen3_embedding_prompts.exs | 175 ------------ examples/qwen3/qwen3_reranker.exs | 171 ------------ lib/bumblebee/layers/transformer.ex | 55 +++- lib/bumblebee/text/pre_trained_tokenizer.ex | 6 + lib/bumblebee/text/qwen3.ex | 289 +++----------------- 9 files changed, 90 insertions(+), 1008 deletions(-) delete mode 100644 examples/qwen3/QWEN3_IEX_GUIDE.md delete mode 100644 examples/qwen3/qwen3.exs delete mode 100644 examples/qwen3/qwen3_embedding.exs delete mode 100644 examples/qwen3/qwen3_embedding_prompts.exs delete mode 100644 examples/qwen3/qwen3_reranker.exs diff --git a/.gitignore b/.gitignore index 255bbfaa..f0e1a379 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,3 @@ bumblebee-*.tar # Temporary files, for example, from tests. 
/tmp/ - -# Lexical LSP -/.lexical/ diff --git a/examples/qwen3/QWEN3_IEX_GUIDE.md b/examples/qwen3/QWEN3_IEX_GUIDE.md deleted file mode 100644 index 8e5f47db..00000000 --- a/examples/qwen3/QWEN3_IEX_GUIDE.md +++ /dev/null @@ -1,206 +0,0 @@ -# Qwen3 IEx Usage Guide - -## Text Generation (Qwen3-4B-Instruct) - -```elixir -# Start IEx -iex -S mix - -# Set backend -Nx.default_backend(EXLA.Backend) - -# Load model components -{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) -{:ok, t} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) -{:ok, c} = Bumblebee.load_generation_config({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) - -# Create serving -s = Bumblebee.Text.generation(m, t, c) - -# Generate text -Nx.Serving.run(s, "The future of AI is") - -# With chat format -prompt = "<|im_start|>system -You are a helpful assistant.<|im_end|> -<|im_start|>user -What is Elixir?<|im_end|> -<|im_start|>assistant -" -Nx.Serving.run(s, prompt) -``` - -## Text Embeddings (Qwen3-Embedding-0.6B) - -### Method 1: Using :for_embedding Architecture (Recommended) - -```elixir -# Start IEx -iex -S mix - -# Set backend -Nx.default_backend(EXLA.Backend) - -# Load embedding model with :for_embedding architecture -{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, - architecture: :for_embedding -) -{:ok, t} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) - -# Create serving -s = Bumblebee.Text.text_embedding(m, t, - output_attribute: :embedding, - embedding_processor: :l2_norm -) - -# Generate embeddings -e1 = Nx.Serving.run(s, "The cat sat on the mat") -e2 = Nx.Serving.run(s, "A feline rested on the rug") -e3 = Nx.Serving.run(s, "Python is a programming language") - -# Check dimension -Nx.shape(e1.embedding) # {1024} - -# Compute similarity -Nx.dot(e1.embedding, e2.embedding) |> Nx.to_number() # ~0.73 (similar) -Nx.dot(e1.embedding, e3.embedding) |> Nx.to_number() # ~0.34 (different) -``` - -### Method 2: Direct Model Access (Advanced) - -```elixir -# For more control over the pipeline -{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, - architecture: :for_embedding -) -{:ok, t} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) - -{_init, predict} = Axon.build(m.model) - -# Generate embedding -inputs = Bumblebee.apply_tokenizer(t, "test text") -output = predict.(m.params, inputs) -embedding = Bumblebee.Utils.Nx.normalize(output.embedding) -Nx.shape(embedding) # {1, 1024} -``` - -## Instruction-Aware Embeddings (Qwen Team Recommendation) - -```elixir -# Setup -Nx.default_backend(EXLA.Backend) -{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, - architecture: :for_embedding -) -{:ok, t} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) -s = Bumblebee.Text.text_embedding(m, t, - output_attribute: :embedding, - embedding_processor: :l2_norm -) - -# Without instruction -query = "What is the capital of France?" -q_plain = Nx.Serving.run(s, query) - -# With instruction (recommended by Qwen team) -query_prompted = "Instruct: Given a web search query, retrieve relevant passages that answer the query -Query: What is the capital of France?" -q_with_prompt = Nx.Serving.run(s, query_prompted) - -# Documents (no instruction needed) -doc = "Paris is the capital and largest city of France." 
-d = Nx.Serving.run(s, doc) - -# Compare -Nx.dot(q_plain.embedding, d.embedding) |> Nx.to_number() -Nx.dot(q_with_prompt.embedding, d.embedding) |> Nx.to_number() -``` - -## Custom Task Instructions - -```elixir -# Code search -code_query = "Instruct: Given a code search query, find relevant code snippets -Query: function to calculate factorial" - -code_doc = "def factorial(n), do: if n <= 1, do: 1, else: n * factorial(n - 1)" - -q = Nx.Serving.run(s, code_query) -d = Nx.Serving.run(s, code_doc) - -Nx.dot(q.embedding, d.embedding) |> Nx.to_number() # High similarity -``` - -## Semantic Search Example - -```elixir -# Index documents -documents = [ - "Paris is the capital of France", - "Berlin is the capital of Germany", - "Machine learning uses neural networks", - "The Eiffel Tower is in Paris" -] - -doc_embeddings = Enum.map(documents, fn doc -> - Nx.Serving.run(s, doc).embedding -end) - -# Search -query = "Instruct: Given a web search query, retrieve relevant passages -Query: What is the French capital?" -q_emb = Nx.Serving.run(s, query).embedding - -# Compute similarities -similarities = Enum.map(doc_embeddings, fn doc_emb -> - Nx.dot(q_emb, doc_emb) |> Nx.to_number() -end) - -# Show results ranked by similarity -Enum.zip(documents, similarities) -|> Enum.sort_by(&elem(&1, 1), :desc) -|> Enum.each(fn {doc, score} -> - IO.puts("#{Float.round(score, 3)}: #{doc}") -end) -``` - -## Batch Processing - -```elixir -# Process multiple texts at once -texts = [ - "First document", - "Second document", - "Third document" -] - -results = Nx.Serving.run(s, texts) - -embeddings = Enum.map(results, & &1.embedding) -``` - -## Model Variants - -```elixir -# Different sizes available -{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, architecture: :for_embedding) -{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-4B"}, architecture: :for_embedding) -{:ok, m} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-8B"}, architecture: :for_embedding) -``` - -## Common Similarity Metrics - -```elixir -# Cosine similarity (recommended for normalized embeddings) -cosine_sim = fn e1, e2 -> Nx.dot(e1, e2) |> Nx.to_number() end - -# Euclidean distance -euclidean = fn e1, e2 -> - Nx.subtract(e1, e2) |> Nx.pow(2) |> Nx.sum() |> Nx.sqrt() |> Nx.to_number() -end - -# Manhattan distance -manhattan = fn e1, e2 -> - Nx.subtract(e1, e2) |> Nx.abs() |> Nx.sum() |> Nx.to_number() -end -``` diff --git a/examples/qwen3/qwen3.exs b/examples/qwen3/qwen3.exs deleted file mode 100644 index 1c805b62..00000000 --- a/examples/qwen3/qwen3.exs +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env elixir - -# Qwen3-4B-Instruct Text Generation -# -# This example demonstrates using the Qwen3-4B-Instruct model for various -# text generation tasks including completion, chat, and code generation. 
-# -# Usage: -# elixir examples/qwen3.exs - -Mix.install([ - {:bumblebee, "~> 0.6.0"}, - {:exla, ">= 0.0.0"} -]) - -Application.put_env(:nx, :default_backend, EXLA.Backend) - -# Load model, tokenizer, and generation configuration -{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) -{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) -{:ok, generation_config} = Bumblebee.load_generation_config({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) - -# Configure generation parameters -generation_config = - Bumblebee.configure(generation_config, - max_new_tokens: 100, - strategy: %{type: :multinomial_sampling, top_k: 20, top_p: 0.8}, - temperature: 0.7 - ) - -# Create text generation serving -serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config) - -# Example 1: Text Completion -IO.puts("\n=== Text Completion ===") -result = Nx.Serving.run(serving, "The future of artificial intelligence") -IO.puts(result.results |> hd() |> Map.get(:text)) - -# Example 2: Question Answering with Chat Format -IO.puts("\n=== Question Answering ===") - -prompt = """ -<|im_start|>system -You are a helpful assistant.<|im_end|> -<|im_start|>user -What are the key features of the Elixir programming language?<|im_end|> -<|im_start|>assistant -""" - -result = Nx.Serving.run(serving, prompt) -IO.puts(result.results |> hd() |> Map.get(:text)) - -# Example 3: Code Generation -IO.puts("\n=== Code Generation ===") - -prompt = """ -<|im_start|>system -You are an expert Elixir programmer.<|im_end|> -<|im_start|>user -Write a function to calculate the nth Fibonacci number using recursion.<|im_end|> -<|im_start|>assistant -""" - -result = Nx.Serving.run(serving, prompt) -IO.puts(result.results |> hd() |> Map.get(:text)) - -# Example 4: Creative Writing -IO.puts("\n=== Creative Writing ===") - -prompt = """ -<|im_start|>system -You are a creative storyteller.<|im_end|> -<|im_start|>user -Write the opening paragraph of a science fiction story.<|im_end|> -<|im_start|>assistant -""" - -result = Nx.Serving.run(serving, prompt) -IO.puts(result.results |> hd() |> Map.get(:text)) diff --git a/examples/qwen3/qwen3_embedding.exs b/examples/qwen3/qwen3_embedding.exs deleted file mode 100644 index c87c23b4..00000000 --- a/examples/qwen3/qwen3_embedding.exs +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env elixir - -# Qwen3-Embedding Example -# -# This example demonstrates using Qwen3-Embedding-0.6B for generating -# text embeddings for semantic search and similarity tasks. 
-# -# Usage: elixir examples/qwen3_embedding.exs - -Mix.install([ - {:bumblebee, path: Path.expand("..", __DIR__)}, - {:exla, ">= 0.0.0"} -]) - -Application.put_env(:nx, :default_backend, EXLA.Backend) - -# Load embedding model with :for_embedding architecture -{:ok, model_info} = - Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, - architecture: :for_embedding - ) - -{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) - -# Create text embedding serving -# The :for_embedding architecture automatically pools the last token -serving = - Bumblebee.Text.text_embedding(model_info, tokenizer, - output_attribute: :embedding, - embedding_processor: :l2_norm - ) - -# Helper function -generate_embedding = fn text -> - result = Nx.Serving.run(serving, text) - result.embedding -end - -# Example 1: Simple text embeddings -IO.puts("\n=== Example 1: Generate Text Embeddings ===") - -texts = [ - "The quick brown fox jumps over the lazy dog", - "A fast auburn fox leaps above a sleepy canine", - "The weather is nice today" -] - -IO.puts("Generating embeddings for #{length(texts)} texts...") - -embeddings = Enum.map(texts, &generate_embedding.(&1)) - -IO.puts("✓ Generated embeddings") -IO.puts(" Embedding dimension: #{Nx.axis_size(hd(embeddings), 0)}") -IO.puts("") - -# Example 2: Compute similarity -IO.puts("=== Example 2: Semantic Similarity ===") -IO.puts("Text 1: \"#{Enum.at(texts, 0)}\"") -IO.puts("Text 2: \"#{Enum.at(texts, 1)}\"") -IO.puts("Text 3: \"#{Enum.at(texts, 2)}\"") -IO.puts("") - -# Compute cosine similarity -similarity_1_2 = - Nx.dot(Enum.at(embeddings, 0), Enum.at(embeddings, 1)) - |> Nx.to_number() - -similarity_1_3 = - Nx.dot(Enum.at(embeddings, 0), Enum.at(embeddings, 2)) - |> Nx.to_number() - -IO.puts("Similarity (Text 1 vs Text 2): #{Float.round(similarity_1_2, 4)}") -IO.puts("Similarity (Text 1 vs Text 3): #{Float.round(similarity_1_3, 4)}") -IO.puts("") -IO.puts("✓ Texts 1 and 2 are more similar (same meaning, different words)") - -# Example 3: Instruction-aware embeddings -IO.puts("\n=== Example 3: Instruction-Aware Embeddings ===") - -query = - "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: What is the capital of France?" - -document = "Paris is the capital and largest city of France." - -query_embedding = generate_embedding.(query) -doc_embedding = generate_embedding.(document) - -similarity = - Nx.dot(query_embedding, doc_embedding) - |> Nx.to_number() - -IO.puts("Query: What is the capital of France?") -IO.puts("Document: Paris is the capital and largest city of France.") -IO.puts("Similarity: #{Float.round(similarity, 4)}") -IO.puts("") - -# Example 4: Batch processing -IO.puts("=== Example 4: Batch Processing ===") - -batch_texts = [ - "Machine learning is a subset of artificial intelligence", - "Deep learning uses neural networks with multiple layers", - "Python is a popular programming language" -] - -IO.puts("Processing batch of #{length(batch_texts)} texts...") -batch_embeddings = Enum.map(batch_texts, &generate_embedding.(&1)) - -IO.puts("✓ Batch embeddings generated") -IO.puts(" Number of embeddings: #{length(batch_embeddings)}") -IO.puts(" Each embedding shape: #{inspect(Nx.shape(hd(batch_embeddings)))}") -IO.puts("") - -IO.puts("=== Qwen3-Embedding is working! 
===") diff --git a/examples/qwen3/qwen3_embedding_prompts.exs b/examples/qwen3/qwen3_embedding_prompts.exs deleted file mode 100644 index 49104499..00000000 --- a/examples/qwen3/qwen3_embedding_prompts.exs +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env elixir - -# Qwen3-Embedding with Instruction Prompts -# -# This example demonstrates the Qwen team's recommended approach for using -# instruction-aware prompts to improve retrieval performance by 1-5%. -# -# Usage: elixir examples/qwen3_embedding_prompts.exs - -Mix.install([ - {:bumblebee, path: Path.expand("..", __DIR__)}, - {:exla, ">= 0.0.0"} -]) - -Application.put_env(:nx, :default_backend, EXLA.Backend) - -# Load embedding model with :for_embedding architecture -{:ok, model_info} = - Bumblebee.load_model({:hf, "Qwen/Qwen3-Embedding-0.6B"}, - architecture: :for_embedding - ) - -{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Embedding-0.6B"}) - -# Create serving with L2 normalization -serving = - Bumblebee.Text.text_embedding(model_info, tokenizer, - output_attribute: :embedding, - embedding_processor: :l2_norm - ) - -# Embedding function -generate_embedding = fn text -> - result = Nx.Serving.run(serving, text) - result.embedding -end - -# Helper to compute similarity -similarity = fn e1, e2 -> - Nx.dot(e1, e2) |> Nx.to_number() |> Float.round(4) -end - -IO.puts("\n" <> String.duplicate("=", 70)) -IO.puts("Qwen3-Embedding: With vs Without Instruction Prompts") -IO.puts(String.duplicate("=", 70)) - -# Test data -query_text = "What is the capital of France?" -document1 = "Paris is the capital and largest city of France." -document2 = "London is the capital of the United Kingdom." -document3 = "Machine learning is a branch of artificial intelligence." - -IO.puts("\nQuery: #{query_text}") -IO.puts("Doc 1: #{document1}") -IO.puts("Doc 2: #{document2}") -IO.puts("Doc 3: #{document3}") - -# ============================================================================== -# Test 1: WITHOUT instruction prompts (baseline) -# ============================================================================== - -IO.puts("\n" <> String.duplicate("-", 70)) -IO.puts("TEST 1: Without Instruction Prompts (Baseline)") -IO.puts(String.duplicate("-", 70)) - -q_plain = generate_embedding.(query_text) -d1_plain = generate_embedding.(document1) -d2_plain = generate_embedding.(document2) -d3_plain = generate_embedding.(document3) - -IO.puts("\nSimilarity scores:") -IO.puts(" Query vs Doc1 (relevant): #{similarity.(q_plain, d1_plain)}") -IO.puts(" Query vs Doc2 (semi-relevant): #{similarity.(q_plain, d2_plain)}") -IO.puts(" Query vs Doc3 (irrelevant): #{similarity.(q_plain, d3_plain)}") - -# ============================================================================== -# Test 2: WITH instruction prompts (Qwen team's recommendation) -# ============================================================================== - -IO.puts("\n" <> String.duplicate("-", 70)) -IO.puts("TEST 2: With Instruction Prompts (Qwen Team Recommendation)") -IO.puts(String.duplicate("-", 70)) - -# Qwen team's recommended format for web search -query_with_prompt = - "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: #{query_text}" - -# Documents don't need prompts (as per config) -q_prompted = generate_embedding.(query_with_prompt) - -IO.puts("\nQuery prompt format:") -IO.puts(" Instruct: Given a web search query, retrieve relevant passages...") -IO.puts(" Query: #{query_text}") - -IO.puts("\nSimilarity scores:") -IO.puts(" Query vs Doc1 
(relevant): #{similarity.(q_prompted, d1_plain)}") -IO.puts(" Query vs Doc2 (semi-relevant): #{similarity.(q_prompted, d2_plain)}") -IO.puts(" Query vs Doc3 (irrelevant): #{similarity.(q_prompted, d3_plain)}") - -# ============================================================================== -# Test 3: Custom task instructions -# ============================================================================== - -IO.puts("\n" <> String.duplicate("-", 70)) -IO.puts("TEST 3: Custom Task Instructions") -IO.puts(String.duplicate("-", 70)) - -# Code search example -code_query = "function to calculate factorial" - -code_docs = [ - "def factorial(n), do: if n <= 1, do: 1, else: n * factorial(n - 1)", - "def fibonacci(n), do: if n <= 1, do: n, else: fibonacci(n - 1) + fibonacci(n - 2)", - "defmodule Calculator do; def add(a, b), do: a + b; end" -] - -code_query_prompt = - "Instruct: Given a code search query, find relevant code snippets\nQuery: #{code_query}" - -IO.puts("\nCode Search Task:") -IO.puts("Query: #{code_query}") - -q_code = generate_embedding.(code_query_prompt) -code_embeddings = Enum.map(code_docs, &generate_embedding.(&1)) - -Enum.zip(code_docs, code_embeddings) -|> Enum.with_index(1) -|> Enum.each(fn {{doc, emb}, idx} -> - sim = similarity.(q_code, emb) - IO.puts(" Code #{idx} similarity: #{sim}") - IO.puts(" #{String.slice(doc, 0..60)}...") -end) - -# ============================================================================== -# Test 4: Multilingual example -# ============================================================================== - -IO.puts("\n" <> String.duplicate("-", 70)) -IO.puts("TEST 4: Multilingual Embeddings") -IO.puts(String.duplicate("-", 70)) - -multilingual_texts = [ - "The cat is sleeping", - "El gato está durmiendo", - "Le chat dort", - "猫在睡觉", - "The dog is running" -] - -IO.puts("\nGenerating embeddings for 5 texts in different languages...") -multi_embeddings = Enum.map(multilingual_texts, &generate_embedding.(&1)) - -IO.puts("\nSemantic similarity (all about cat sleeping vs dog running):") - -Enum.take(multi_embeddings, 4) -|> Enum.with_index(1) -|> Enum.each(fn {emb, idx} -> - sim_to_english = similarity.(hd(multi_embeddings), emb) - sim_to_dog = similarity.(List.last(multi_embeddings), emb) - IO.puts(" Text #{idx}: same_meaning=#{sim_to_english}, different=#{sim_to_dog}") -end) - -# ============================================================================== -# Summary -# ============================================================================== - -IO.puts("\n" <> String.duplicate("=", 70)) -IO.puts("SUMMARY") -IO.puts(String.duplicate("=", 70)) -IO.puts("✓ Qwen3-Embedding supports instruction-aware prompts") -IO.puts("✓ Recommended format: 'Instruct: [task]\\nQuery: [query]'") -IO.puts("✓ Improves retrieval performance by 1-5%") -IO.puts("✓ Works for multilingual and code search tasks") -IO.puts("✓ Generates 1024-dimensional normalized vectors") -IO.puts(String.duplicate("=", 70) <> "\n") diff --git a/examples/qwen3/qwen3_reranker.exs b/examples/qwen3/qwen3_reranker.exs deleted file mode 100644 index 70594e95..00000000 --- a/examples/qwen3/qwen3_reranker.exs +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env elixir - -# Qwen3-Reranker Example -# -# This example demonstrates using Qwen3-Reranker-0.6B for reranking -# documents based on relevance to a query. Rerankers score query-document -# pairs to improve retrieval quality in RAG and search applications. 
-#
-# Usage: elixir examples/qwen3_reranker.exs
-
-Mix.install([
-  {:bumblebee, path: Path.expand("..", __DIR__)},
-  {:exla, ">= 0.0.0"}
-])
-
-Application.put_env(:nx, :default_backend, EXLA.Backend)
-
-# Load reranker model (uses same Qwen3ForCausalLM architecture)
-{:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Reranker-0.6B"})
-{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Reranker-0.6B"})
-
-# Build model
-{_init_fn, predict_fn} = Axon.build(model_info.model)
-
-# Get yes/no token IDs by tokenizing the words
-tokenizer_no_special = Bumblebee.configure(tokenizer, add_special_tokens: false)
-yes_token_result = Bumblebee.apply_tokenizer(tokenizer_no_special, "yes")
-no_token_result = Bumblebee.apply_tokenizer(tokenizer_no_special, "no")
-yes_token_id = Nx.to_flat_list(yes_token_result["input_ids"]) |> hd()
-no_token_id = Nx.to_flat_list(no_token_result["input_ids"]) |> hd()
-
-# Format query-document pair as recommended by Qwen team
-format_pair = fn instruction, query, document ->
-  instruction =
-    instruction || "Given a web search query, retrieve relevant passages that answer the query"
-
-  # Format with suffix as per vLLM example
-  suffix = "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"
-  "<Instruct>: #{instruction}\n<Query>: #{query}\n<Doc>: #{document}#{suffix}"
-end
-
-# Reranking function
-# Returns a relevance score between 0 and 1
-rerank_score = fn query, document, instruction ->
-  # Format the input
-  text = format_pair.(instruction, query, document)
-
-  # Tokenize
-  inputs = Bumblebee.apply_tokenizer(tokenizer, text)
-
-  # Get model output
-  output = predict_fn.(model_info.params, inputs)
-
-  # Extract logits for the last token
-  # Shape: {batch, seq, vocab}
-  {_batch, seq_len, _vocab} = Nx.shape(output.logits)
-  last_logits = output.logits[[0, seq_len - 1, ..]]
-
-  # Get logits for "yes" and "no" tokens
-  yes_logit = Nx.to_number(last_logits[yes_token_id])
-  no_logit = Nx.to_number(last_logits[no_token_id])
-
-  # Compute softmax probability for "yes"
-  # P(yes) = exp(yes) / (exp(yes) + exp(no))
-  exp_yes = :math.exp(yes_logit)
-  exp_no = :math.exp(no_logit)
-
-  relevance_score = exp_yes / (exp_yes + exp_no)
-  relevance_score
-end
-
-IO.puts("\n" <> String.duplicate("=", 70))
-IO.puts("Qwen3-Reranker Example")
-IO.puts(String.duplicate("=", 70))
-
-# Example 1: Basic reranking
-IO.puts("\n=== Example 1: Basic Query-Document Scoring ===")
-
-query = "What is the capital of France?"
-
-documents = [
-  "Paris is the capital and largest city of France.",
-  "London is the capital of the United Kingdom.",
-  "Machine learning is a subset of artificial intelligence.",
-  "The Eiffel Tower is located in Paris, France.",
-  "Berlin is the capital of Germany."
-]
-
-IO.puts("Query: #{query}")
-IO.puts("\nDocument relevance scores:")
-
-scores =
-  Enum.map(documents, fn doc ->
-    score = rerank_score.(query, doc, nil)
-    {doc, score}
-  end)
-  |> Enum.sort_by(&elem(&1, 1), :desc)
-
-Enum.with_index(scores, 1)
-|> Enum.each(fn {{doc, score}, rank} ->
-  IO.puts("  #{rank}. [#{Float.round(score, 4)}] #{String.slice(doc, 0..60)}...")
-end)
-
-# Example 2: Custom instruction
-IO.puts("\n=== Example 2: Custom Task Instruction ===")
-
-instruction = "Given a coding question, find relevant code examples"
-query = "How to calculate factorial in Elixir?"
- -code_docs = [ - "def factorial(n), do: if n <= 1, do: 1, else: n * factorial(n - 1)", - "def fibonacci(n), do: if n <= 1, do: n, else: fibonacci(n - 1) + fibonacci(n - 2)", - "def sum_list(list), do: Enum.reduce(list, 0, &+/2)", - "Factorial is a mathematical function that multiplies a number by all positive integers less than it." -] - -IO.puts("Query: #{query}") -IO.puts("Instruction: #{instruction}") -IO.puts("\nCode snippet relevance:") - -code_docs -|> Enum.map(fn doc -> - score = rerank_score.(query, doc, instruction) - {doc, score} -end) -|> Enum.sort_by(&elem(&1, 1), :desc) -|> Enum.with_index(1) -|> Enum.each(fn {{doc, score}, rank} -> - IO.puts(" #{rank}. [#{Float.round(score, 4)}] #{String.slice(doc, 0..60)}...") -end) - -# Example 3: Reranking search results -IO.puts("\n=== Example 3: Reranking Initial Search Results ===") - -query = "best practices for concurrent programming" - -# Simulated initial retrieval results (could be from vector search) -search_results = [ - "Concurrent programming involves multiple computations executing simultaneously.", - "Elixir uses the Actor model for concurrency with lightweight processes.", - "Python has threading and multiprocessing modules for parallel execution.", - "The weather is nice today and perfect for a walk.", - "OTP behaviors like GenServer provide patterns for concurrent systems." -] - -IO.puts("Query: #{query}") -IO.puts("\nInitial results (unranked):") - -Enum.with_index(search_results, 1) -|> Enum.each(fn {doc, i} -> - IO.puts(" #{i}. #{String.slice(doc, 0..60)}...") -end) - -IO.puts("\nAfter reranking:") - -search_results -|> Enum.map(fn doc -> - score = rerank_score.(query, doc, nil) - {doc, score} -end) -|> Enum.sort_by(&elem(&1, 1), :desc) -# Top 3 results -|> Enum.take(3) -|> Enum.with_index(1) -|> Enum.each(fn {{doc, score}, rank} -> - IO.puts(" #{rank}. [#{Float.round(score, 4)}] #{String.slice(doc, 0..60)}...") -end) - -IO.puts("\n" <> String.duplicate("=", 70)) -IO.puts("✓ Qwen3-Reranker successfully reranked documents by relevance") -IO.puts(String.duplicate("=", 70) <> "\n") diff --git a/lib/bumblebee/layers/transformer.ex b/lib/bumblebee/layers/transformer.ex index 6cf93cd6..a2332ef3 100644 --- a/lib/bumblebee/layers/transformer.ex +++ b/lib/bumblebee/layers/transformer.ex @@ -50,7 +50,9 @@ defmodule Bumblebee.Layers.Transformer do :block_type, :attention_window_size, :scale_attention_weights, - :rotary_embedding + :rotary_embedding, + :query_norm, + :key_norm ] opts = @@ -317,7 +319,9 @@ defmodule Bumblebee.Layers.Transformer do layer_norm: [], attention_window_size: nil, scale_attention_weights: true, - rotary_embedding: nil + rotary_embedding: nil, + query_norm: nil, + key_norm: nil ]) name = opts[:name] @@ -347,6 +351,8 @@ defmodule Bumblebee.Layers.Transformer do attention_window_size = opts[:attention_window_size] scale_attention_weights = opts[:scale_attention_weights] rotary_embedding = opts[:rotary_embedding] + query_norm = opts[:query_norm] + key_norm = opts[:key_norm] ffn_fun = case ffn do @@ -405,6 +411,8 @@ defmodule Bumblebee.Layers.Transformer do attention_window_size: attention_window_size, scale_attention_weights: scale_attention_weights, rotary_embedding: rotary_embedding, + query_norm: query_norm, + key_norm: key_norm, name: join(name, "self_attention") ) @@ -690,6 +698,14 @@ defmodule Bumblebee.Layers.Transformer do * `:max_positions` - the maximum number of distinct positions + * `:query_norm` - configuration for query normalization. 
If set, normalizes + the query projection before rotary embedding. Configured with the same + options as `:layer_norm` in the block function. Defaults to `nil` + + * `:key_norm` - configuration for key normalization. If set, normalizes + the key projection before rotary embedding. Configured with the same + options as `:layer_norm` in the block function. Defaults to `nil` + * `:name` - the prefix for layer names ## References @@ -721,7 +737,9 @@ defmodule Bumblebee.Layers.Transformer do key_use_bias: true, value_use_bias: true, output_use_bias: true, - rotary_embedding: nil + rotary_embedding: nil, + query_norm: nil, + key_norm: nil ]) attention_mask = opts[:attention_mask] @@ -739,6 +757,8 @@ defmodule Bumblebee.Layers.Transformer do scale_attention_weights = opts[:scale_attention_weights] dropout_rate = opts[:dropout_rate] rotary_embedding = opts[:rotary_embedding] + query_norm = opts[:query_norm] + key_norm = opts[:key_norm] query_use_bias = opts[:query_use_bias] key_use_bias = opts[:key_use_bias] @@ -778,6 +798,35 @@ defmodule Bumblebee.Layers.Transformer do ) |> Layers.split_heads(num_key_value_heads) + # Apply query and key normalization if configured (before rotary embedding) + query = + case query_norm do + opts when is_list(opts) -> + opts = Keyword.validate!(opts, epsilon: 1.0e-5) + # Normalize over the head dimension (channel_index: -1) + Layers.rms_norm(query, [epsilon: opts[:epsilon], channel_index: -1, name: join(name, "query_norm")]) + + fun when is_function(fun) -> + fun.(query, join(name, "query_norm")) + + nil -> + query + end + + key = + case key_norm do + opts when is_list(opts) -> + opts = Keyword.validate!(opts, epsilon: 1.0e-5) + # Normalize over the head dimension (channel_index: -1) + Layers.rms_norm(key, [epsilon: opts[:epsilon], channel_index: -1, name: join(name, "key_norm")]) + + fun when is_function(fun) -> + fun.(key, join(name, "key_norm")) + + nil -> + key + end + {query, key} = case rotary_embedding do opts when is_list(opts) -> diff --git a/lib/bumblebee/text/pre_trained_tokenizer.ex b/lib/bumblebee/text/pre_trained_tokenizer.ex index 59ab3468..faf57329 100644 --- a/lib/bumblebee/text/pre_trained_tokenizer.ex +++ b/lib/bumblebee/text/pre_trained_tokenizer.ex @@ -200,6 +200,12 @@ defmodule Bumblebee.Text.PreTrainedTokenizer do }, default_template_options: [language_token: "eng_Latn"] }, + qwen2: %{ + special_tokens: %{ + eos: "<|im_end|>", + pad: "<|endoftext|>" + } + }, roberta: %{ special_tokens: %{ bos: "", diff --git a/lib/bumblebee/text/qwen3.ex b/lib/bumblebee/text/qwen3.ex index 985ba852..f9d32d30 100644 --- a/lib/bumblebee/text/qwen3.ex +++ b/lib/bumblebee/text/qwen3.ex @@ -365,267 +365,42 @@ defmodule Bumblebee.Text.Qwen3 do ) do name = opts[:name] - # For Qwen3, we need custom attention with QK normalization - # We'll use a custom block implementation instead of Layers.Transformer.blocks - {attention_mask, cache} = Layers.Decoder.cached_attention_mask(attention_mask, cache) - offset = Layers.Decoder.get_cache_offset(cache) - - state = %{ - hidden_state: hidden_state, - hidden_states: Axon.container({hidden_state}), - attentions: Axon.container({}), - cache: cache - } - - outputs = - for idx <- 0..(spec.num_blocks - 1), reduce: state do - state -> - block_name = join(name, "blocks.#{idx}") - - block_cache = Layers.Decoder.get_block_cache(state.cache, idx) - - block_attention_head_mask = - Layers.if_present attention_head_mask do - Axon.nx(attention_head_mask, & &1[idx]) - else - Layers.none() - end - - block_output = - qwen3_decoder_block( - 
state.hidden_state, - position_ids, - attention_mask, - block_attention_head_mask, - block_cache, - offset, - spec, - name: block_name - ) - - cache = Layers.Decoder.put_block_cache(state.cache, idx, block_output.cache) - - %{ - hidden_state: block_output.hidden_state, - hidden_states: Layers.append(state.hidden_states, block_output.hidden_state), - attentions: Layers.append(state.attentions, block_output.attention_weights), - cache: cache - } - end - - outputs = - update_in(outputs.cache, &Layers.Decoder.update_cache_offset(&1, outputs.hidden_state)) - - %{ - hidden_state: outputs.hidden_state, - hidden_states: outputs.hidden_states, - attentions: outputs.attentions, - cache: outputs.cache - } - end - - defp qwen3_decoder_block( - hidden_state, - position_ids, - attention_mask, - attention_head_mask, - block_cache, - offset, - spec, - opts - ) do - name = opts[:name] - - # Extract self-attention cache from block cache - {self_attention_cache, _cross_attention_cache} = - Layers.Decoder.get_attention_caches(block_cache) - - # Pre-normalization - normalized_hidden_state = - Layers.rms_norm(hidden_state, - name: join(name, "self_attention_norm"), - epsilon: spec.layer_norm_epsilon - ) - - # Self-attention with QK normalization - attention_output = - qwen3_attention( - normalized_hidden_state, - position_ids, - attention_mask, - attention_head_mask, - self_attention_cache, - offset, - spec, - name: join(name, "self_attention") - ) - - # Residual connection - hidden_state = Axon.add(hidden_state, attention_output.hidden_state) - - # FFN pre-normalization - normalized_hidden_state = - Layers.rms_norm(hidden_state, - name: join(name, "output_norm"), - epsilon: spec.layer_norm_epsilon - ) - - # Feed-forward network - ffn_output = - gated_ffn(normalized_hidden_state, spec.intermediate_size, spec.hidden_size, - name: join(name, "ffn"), + # Build query and key normalization configuration for Qwen3 + query_norm = if spec.use_qk_norm, do: [epsilon: spec.layer_norm_epsilon], else: nil + key_norm = if spec.use_qk_norm, do: [epsilon: spec.layer_norm_epsilon], else: nil + + # Use the generalized Layers.Transformer.blocks with QK normalization + Layers.Transformer.blocks(hidden_state, + num_blocks: spec.num_blocks, + num_attention_heads: spec.num_attention_heads, + num_key_value_heads: spec.num_key_value_heads, + hidden_size: spec.hidden_size, + attention_head_size: spec.attention_head_size, + kernel_initializer: kernel_initializer(spec), + query_use_bias: false, + key_use_bias: false, + value_use_bias: false, + output_use_bias: false, + block_type: :norm_first, + attention_mask: attention_mask, + attention_head_mask: attention_head_mask, + cache: cache, + causal: true, + layer_norm: &Layers.rms_norm(&1, epsilon: spec.layer_norm_epsilon, name: &2), + ffn: &gated_ffn(&1, spec.intermediate_size, spec.hidden_size, + name: &2, activation: spec.activation - ) - - # Residual connection - hidden_state = Axon.add(hidden_state, ffn_output) - - # Build block cache with self-attention cache - updated_block_cache = - Layers.Decoder.put_attention_caches( - block_cache, - attention_output.cache, - Layers.none() - ) - - %{ - hidden_state: hidden_state, - attention_weights: attention_output.attention_weights, - cache: updated_block_cache - } - end - - defp qwen3_attention( - hidden_state, - position_ids, - attention_mask, - attention_head_mask, - cache, - offset, - spec, - opts - ) do - name = opts[:name] - - num_heads = spec.num_attention_heads - num_key_value_heads = spec.num_key_value_heads - attention_head_size = 
spec.attention_head_size - hidden_size = spec.hidden_size - - inner_size = num_heads * attention_head_size - inner_kv_size = num_key_value_heads * attention_head_size - - # Query, Key, Value projections - query = - hidden_state - |> Axon.dense(inner_size, - kernel_initializer: kernel_initializer(spec), - name: join(name, "query"), - use_bias: false - ) - |> Layers.split_heads(num_heads) - - key = - hidden_state - |> Axon.dense(inner_kv_size, - kernel_initializer: kernel_initializer(spec), - name: join(name, "key"), - use_bias: false - ) - |> Layers.split_heads(num_key_value_heads) - - value = - hidden_state - |> Axon.dense(inner_kv_size, - kernel_initializer: kernel_initializer(spec), - name: join(name, "value"), - use_bias: false - ) - |> Layers.split_heads(num_key_value_heads) - - # QK Normalization (Qwen3-specific) - normalize over head_dim - query = - if spec.use_qk_norm do - Layers.rms_norm(query, - name: join(name, "query_norm"), - epsilon: spec.layer_norm_epsilon, - channel_index: -1 - ) - else - query - end - - key = - if spec.use_qk_norm do - Layers.rms_norm(key, - name: join(name, "key_norm"), - epsilon: spec.layer_norm_epsilon, - channel_index: -1 - ) - else - key - end - - # Apply rotary embeddings - {query, key} = - Layers.rotary_embedding( - query, - key, - position_ids, - attention_mask, - attention_head_size, - name: join(name, "rotary_embedding"), + ), + rotary_embedding: [ + position_ids: position_ids, max_positions: spec.max_positions, base: spec.rotary_embedding_base, scaling_strategy: spec.rotary_embedding_scaling_strategy - ) - - # Repeat key-value for grouped query attention AFTER rotary embedding - num_key_value_groups = div(num_heads, num_key_value_heads) - key = repeat_states(key, num_key_value_groups) - value = repeat_states(value, num_key_value_groups) - - # Cache key and value - {key, value, cache} = - Layers.Decoder.cached_attention_key_values(key, value, cache, offset) - - # Compute attention - {attention_output, attention_weights} = - Layers.attention( - query, - key, - value, - attention_mask, - attention_head_mask, - Layers.none(), - offset, - scale: 1 / :math.sqrt(attention_head_size), - causal: true - ) - - # Merge heads and output projection - attention_output = - attention_output - |> Layers.flatten_trailing() - |> Axon.dense(hidden_size, - kernel_initializer: kernel_initializer(spec), - name: join(name, "output"), - use_bias: false - ) - - %{ - hidden_state: attention_output, - attention_weights: attention_weights, - cache: cache - } - end - - defp repeat_states(state, n) when n == 1, do: state - - defp repeat_states(state, n) do - # state shape: {batch, seq, num_kv_heads, head_size} - # Repeat along axis 2 (the heads axis) - same as Layers.Transformer - Layers.repeat_interleave(state, n, axis: 2) + ], + query_norm: query_norm, + key_norm: key_norm, + name: name + ) end defp gated_ffn(hidden_state, intermediate_size, output_size, opts) do From 1f24cc65a7bd6c2a94dc135705aa270e95060082 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Mon, 6 Oct 2025 18:24:15 -0400 Subject: [PATCH 09/15] Fix Qwen3 layer naming for Layers.Transformer.blocks Use 'decoder.blocks' as the name prefix when calling Layers.Transformer.blocks to match the expected params mapping pattern decoder.blocks.{n}.*. This aligns with how other models like BERT use the transformer blocks. 
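For example, with the "decoder.blocks" prefix, layer zero's attention layers
get names like "decoder.blocks.0.self_attention.query" and
"decoder.blocks.0.self_attention.query_norm", which line up with the
decoder.blocks.{n}.* pattern expected by the params mapping.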
--- lib/bumblebee/text/qwen3.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bumblebee/text/qwen3.ex b/lib/bumblebee/text/qwen3.ex index f9d32d30..b1ea7858 100644 --- a/lib/bumblebee/text/qwen3.ex +++ b/lib/bumblebee/text/qwen3.ex @@ -399,7 +399,7 @@ defmodule Bumblebee.Text.Qwen3 do ], query_norm: query_norm, key_norm: key_norm, - name: name + name: join(name, "blocks") ) end From cb181f375b72549e7b2b202ac3771e788e83e6e3 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Mon, 6 Oct 2025 18:27:02 -0400 Subject: [PATCH 10/15] Map qwen3 model type to :qwen2 tokenizer type Fix model_type_to_tokenizer_type mapping to use :qwen2 instead of :gpt2 for qwen3 models. This ensures Qwen3 models load with the correct tokenizer configuration including proper special tokens. --- lib/bumblebee.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex index 285637d1..a8ca88b0 100644 --- a/lib/bumblebee.ex +++ b/lib/bumblebee.ex @@ -257,7 +257,7 @@ defmodule Bumblebee do "mbart" => :mbart, "phi" => :code_gen, "phi3" => :llama, - "qwen3" => :gpt2, + "qwen3" => :qwen2, "roberta" => :roberta, "t5" => :t5, "whisper" => :whisper, From 165148852d4a65d23ed8b3fbba037f6cb5df181e Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Mon, 6 Oct 2025 18:31:06 -0400 Subject: [PATCH 11/15] Add comprehensive Qwen3 notebook with examples Create notebooks/qwen3.livemd demonstrating: - Text generation using Qwen3-4B-Instruct-2507 - Embeddings using Qwen3-Embedding-0.6B with similarity examples - Reranking using Qwen3-Reranker-0.6B with query-document scoring This replaces the deleted standalone examples with a consolidated, easy-to-follow notebook format as suggested in PR review. --- notebooks/qwen3.livemd | 223 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 notebooks/qwen3.livemd diff --git a/notebooks/qwen3.livemd b/notebooks/qwen3.livemd new file mode 100644 index 00000000..c2888636 --- /dev/null +++ b/notebooks/qwen3.livemd @@ -0,0 +1,223 @@ +# Qwen3 + +```elixir +Mix.install([ + {:bumblebee, "~> 0.6.0"}, + {:nx, "~> 0.10.0"}, + {:exla, "~> 0.10.0"}, + {:kino, "~> 0.14.0"} +]) + +Nx.global_default_backend({EXLA.Backend, client: :host}) +``` + +## Introduction + +In this notebook we explore the [Qwen3](https://qwenlm.github.io/blog/qwen3/) model family from Alibaba Cloud. Qwen3 is a series of large language models that includes: + +* **Text Generation** - Instruction-tuned models for conversational AI +* **Embeddings** - Dense vector representations for semantic search +* **Rerankers** - Models to rerank search results for better relevance + + + +## Text Generation + +Let's start with the Qwen3 instruction model for conversational text generation. 
+
+```elixir
+repo = {:hf, "Qwen/Qwen3-4B-Instruct-2507"}
+
+{:ok, model_info} = Bumblebee.load_model(repo, type: :bf16, backend: EXLA.Backend)
+{:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
+{:ok, generation_config} = Bumblebee.load_generation_config(repo)
+
+:ok
+```
+
+Configure the generation parameters and create a serving:
+
+```elixir
+generation_config =
+  Bumblebee.configure(generation_config,
+    max_new_tokens: 256,
+    strategy: %{type: :multinomial_sampling, top_p: 0.8, top_k: 20},
+    temperature: 0.7
+  )
+
+serving =
+  Bumblebee.Text.generation(model_info, tokenizer, generation_config,
+    compile: [batch_size: 1, sequence_length: 1024],
+    stream: true,
+    defn_options: [compiler: EXLA]
+  )
+
+# Should be supervised
+Kino.start_child({Nx.Serving, name: Qwen3, serving: serving})
+```
+
+Create an input field and test the model:
+
+```elixir
+user_input = Kino.Input.textarea("User prompt", default: "Explain quantum computing in simple terms")
+```
+
+```elixir
+user = Kino.Input.read(user_input)
+
+# Qwen3 uses the <|im_start|> and <|im_end|> chat template format
+prompt = """
+<|im_start|>system
+You are a helpful assistant.<|im_end|>
+<|im_start|>user
+#{user}<|im_end|>
+<|im_start|>assistant
+"""
+
+Nx.Serving.batched_run(Qwen3, prompt) |> Enum.each(&IO.write/1)
+```
+
+
+
+
+## Embeddings
+
+Qwen3 embedding models convert text into dense vector representations, useful for semantic search and similarity tasks.
+
+```elixir
+repo = {:hf, "Qwen/Qwen3-Embedding-0.6B"}
+
+{:ok, model_info} = Bumblebee.load_model(repo, type: :f32, backend: EXLA.Backend)
+{:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
+
+serving =
+  Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
+    output_attribute: :embedding,
+    output_pool: :last_token,
+    embedding_processor: :l2_norm,
+    compile: [batch_size: 2, sequence_length: 512],
+    defn_options: [compiler: EXLA]
+  )
+
+Kino.start_child({Nx.Serving, name: Qwen3Embedding, serving: serving})
+```
+
+Test the embedding model with some example texts:
+
+```elixir
+texts = [
+  "The quick brown fox jumps over the lazy dog",
+  "A fast auburn canine leaps above an idle hound",
+  "Python is a programming language"
+]
+
+# Get embeddings for all texts
+embeddings =
+  texts
+  |> Enum.map(fn text ->
+    %{embedding: embedding} = Nx.Serving.batched_run(Qwen3Embedding, text)
+    {text, embedding}
+  end)
+
+# Calculate cosine similarity between first two texts (similar meaning)
+[{text1, emb1}, {text2, emb2}, {text3, emb3}] = embeddings
+
+similarity_1_2 =
+  Nx.dot(emb1, emb2)
+  |> Nx.to_number()
+  |> then(&Float.round(&1, 4))
+
+similarity_1_3 =
+  Nx.dot(emb1, emb3)
+  |> Nx.to_number()
+  |> then(&Float.round(&1, 4))
+
+IO.puts("Text 1: #{text1}")
+IO.puts("Text 2: #{text2}")
+IO.puts("Similarity: #{similarity_1_2}\n")
+
+IO.puts("Text 1: #{text1}")
+IO.puts("Text 3: #{text3}")
+IO.puts("Similarity: #{similarity_1_3}")
+```
+
+As expected, texts with similar meanings (sentences 1 and 2) have higher cosine similarity than texts with different meanings (sentences 1 and 3).
+
+
+
+
+## Reranking
+
+Reranking models take a query and a list of candidate documents, then score how relevant each document is to the query. This is useful for improving search results.
+ +```elixir +repo = {:hf, "Qwen/Qwen3-Reranker-0.6B"} + +{:ok, model_info} = Bumblebee.load_model(repo, type: :f32, backend: EXLA.Backend) +{:ok, tokenizer} = Bumblebee.load_tokenizer(repo) + +serving = + Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer, + output_attribute: :embedding, + output_pool: :last_token, + compile: [batch_size: 4, sequence_length: 512], + defn_options: [compiler: EXLA] + ) + +Kino.start_child({Nx.Serving, name: Qwen3Reranker, serving: serving}) +``` + +Test the reranker with a query and multiple candidate documents: + +```elixir +query = "What is machine learning?" + +documents = [ + "Machine learning is a subset of artificial intelligence that enables computers to learn from data.", + "The weather today is sunny with a high of 75 degrees.", + "Deep learning uses neural networks with multiple layers to learn complex patterns.", + "My favorite color is blue and I enjoy long walks on the beach." +] + +# Create query-document pairs with instruction prefix +# Qwen3 reranker expects the format: "Instruct: {query}\nQuery: {text}" +pairs = + Enum.map(documents, fn doc -> + "Instruct: Given a query, retrieve relevant documents\nQuery: #{query}\n#{doc}" + end) + +# Get embeddings (which represent relevance scores) +results = + pairs + |> Enum.zip(documents) + |> Enum.map(fn {pair, doc} -> + %{embedding: embedding} = Nx.Serving.batched_run(Qwen3Reranker, pair) + # Take the first dimension as the relevance score + score = + embedding + |> Nx.to_flat_list() + |> List.first() + |> then(&Float.round(&1, 4)) + + {score, doc} + end) + |> Enum.sort_by(fn {score, _doc} -> score end, :desc) + +IO.puts("Query: #{query}\n") +IO.puts("Ranked documents by relevance:\n") + +results +|> Enum.with_index(1) +|> Enum.each(fn {{score, doc}, idx} -> + IO.puts("#{idx}. [Score: #{score}] #{doc}") +end) +``` + +The reranker correctly identifies that documents about machine learning and deep learning are most relevant to the query, while the unrelated documents receive lower scores. + +## Summary + +This notebook demonstrated three key capabilities of the Qwen3 model family: + +1. **Text Generation** - Conversational AI using instruction-tuned models +2. **Embeddings** - Creating semantic vector representations for similarity search +3. **Reranking** - Scoring and ranking documents by relevance to a query + +All three models work seamlessly with Bumblebee and can be used for various NLP applications. From c02c29551a694d86ccf5fd8bd31c2450dd71270b Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Mon, 6 Oct 2025 18:39:48 -0400 Subject: [PATCH 12/15] Add instruction format to embeddings example in Qwen3 notebook Update the embeddings section to use the proper instruction format: 'Instruct: Given a query, retrieve relevant documents\nQuery: {query}\n{text}' This ensures consistency with the reranker example and follows Qwen3 embedding best practices for better semantic search results. --- notebooks/qwen3.livemd | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/notebooks/qwen3.livemd b/notebooks/qwen3.livemd index c2888636..3a5fc8a3 100644 --- a/notebooks/qwen3.livemd +++ b/notebooks/qwen3.livemd @@ -100,21 +100,32 @@ serving = Kino.start_child({Nx.Serving, name: Qwen3Embedding, serving: serving}) ``` -Test the embedding model with some example texts: +Test the embedding model with some example texts. 
The Qwen3 embedding model uses an instruction format for better results: ```elixir +query = "animals" + texts = [ "The quick brown fox jumps over the lazy dog", "A fast auburn canine leaps above an idle hound", "Python is a programming language" ] -# Get embeddings for all texts -embeddings = +# Format texts with instruction prefix for Qwen3 embeddings +# Format: "Instruct: Given a query, retrieve relevant documents\nQuery: {query}\n{text}" +formatted_texts = texts |> Enum.map(fn text -> - %{embedding: embedding} = Nx.Serving.batched_run(Qwen3Embedding, text) - {text, embedding} + "Instruct: Given a query, retrieve relevant documents\nQuery: #{query}\n#{text}" + end) + +# Get embeddings for all texts +embeddings = + formatted_texts + |> Enum.zip(texts) + |> Enum.map(fn {formatted_text, original_text} -> + %{embedding: embedding} = Nx.Serving.batched_run(Qwen3Embedding, formatted_text) + {original_text, embedding} end) # Calculate cosine similarity between first two texts (similar meaning) From bd19c796fbc6976ee27ff5fae43972e52023cf14 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Mon, 6 Oct 2025 19:01:10 -0400 Subject: [PATCH 13/15] Add Qwen3 model tests with reference values Add comprehensive test suite for Qwen3 using tiny-random/qwen3: - Test :base architecture with QK normalization enabled - Test :for_causal_language_modeling with logits verification - Test :for_sequence_classification (shape only, random params) - Test :for_embedding architecture Reference values generated from tiny-random/qwen3 model predictions. All tests pass successfully (4 tests, 0 failures). --- test/bumblebee/text/qwen3_test.exs | 107 +++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 test/bumblebee/text/qwen3_test.exs diff --git a/test/bumblebee/text/qwen3_test.exs b/test/bumblebee/text/qwen3_test.exs new file mode 100644 index 00000000..44857869 --- /dev/null +++ b/test/bumblebee/text/qwen3_test.exs @@ -0,0 +1,107 @@ +defmodule Bumblebee.Text.Qwen3Test do + use ExUnit.Case, async: false + + import Bumblebee.TestHelpers + + @moduletag model_test_tags() + + test ":base" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "tiny-random/qwen3"}, architecture: :base) + + assert %Bumblebee.Text.Qwen3{architecture: :base} = spec + assert spec.use_qk_norm == true + + inputs = %{ + "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]), + "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) + } + + outputs = Axon.predict(model, params, inputs) + + assert Nx.shape(outputs.hidden_state) == {1, 10, 64} + + assert_all_close( + outputs.hidden_state[[.., 1..3, 1..3]], + Nx.tensor([ + [ + [0.0437, -0.0292, 0.6567], + [-0.0767, 0.0107, 0.2657], + [0.4693, -0.0452, 0.2521] + ] + ]), + atol: 1.0e-3 + ) + end + + test ":for_causal_language_modeling" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "tiny-random/qwen3"}) + + assert %Bumblebee.Text.Qwen3{architecture: :for_causal_language_modeling} = spec + assert spec.use_qk_norm == true + + inputs = %{ + "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]), + "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) + } + + outputs = Axon.predict(model, params, inputs) + + assert Nx.shape(outputs.logits) == {1, 10, 151936} + + assert_all_close( + outputs.logits[[.., 1..3, 1..3]], + Nx.tensor([ + [ + [2.5975, 3.9118, -0.7135], + [1.8620, 0.6854, 2.3352], + [0.9874, -4.0238, -0.1917] + ] + ]), + atol: 1.0e-3 + ) + end + 
+ test ":for_sequence_classification" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "tiny-random/qwen3"}, + architecture: :for_sequence_classification + ) + + assert %Bumblebee.Text.Qwen3{architecture: :for_sequence_classification} = spec + + inputs = %{ + "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]), + "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) + } + + outputs = Axon.predict(model, params, inputs) + + # Note: tiny-random model is missing sequence_classification_head parameters, + # so it uses random initialization. We only verify the shape is correct. + assert Nx.shape(outputs.logits) == {1, 2} + end + + test ":for_embedding" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "tiny-random/qwen3"}, architecture: :for_embedding) + + assert %Bumblebee.Text.Qwen3{architecture: :for_embedding} = spec + + inputs = %{ + "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]), + "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) + } + + outputs = Axon.predict(model, params, inputs) + + assert Nx.shape(outputs.embedding) == {1, 64} + + assert_all_close( + outputs.embedding[[.., 1..3]], + Nx.tensor([[0.2217, -0.0037, -0.1757]]), + atol: 1.0e-3 + ) + end +end From 8d787ee3d8b16544d946b4a817517e5894423695 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Thu, 9 Oct 2025 20:01:18 -0400 Subject: [PATCH 14/15] Fix Qwen3 embedding pooling to use attention mask instead of pad_token_id The Qwen3 :for_embedding architecture was incorrectly using pad_token_id to find the last non-padding token for pooling. This caused embeddings to differ from the reference Python implementation. Root cause: - Token 151643 serves as both the pad_token_id AND the EOS token - When tokenizing "hello!", the output is [14990, 0, 151643] with attention_mask [1, 1, 1], meaning all tokens should be attended - The EOS token at the end is part of the actual sequence, not padding - Only explicitly added padding tokens have attention_mask = 0 The fix changes the pooling logic to use attention_mask.sum(dim=-1) - 1 to find the last attended token, matching the official HuggingFace implementation's last_token_pool function. Debugging process: 1. Compared raw transformer hidden states between Python and Elixir 2. Found both were producing identical hidden states (norm ~102 for all tokens) 3. Discovered Python was pooling index 2 (last token) while Elixir pooled index 1 (last non-pad_token_id token) 4. Investigated tokenizer behavior: attention_mask was [1,1,1] not [1,1,0] 5. Confirmed with explicit padding that only added padding has mask = 0 6. Updated architecture to use attention_mask for pooling logic After fix, embeddings now match Python implementation within bf16 precision: - Python: [0.00039361, -0.02717206, -0.01105759, ...] - Elixir: [5.552e-4, -0.027919, -0.011104, ...] Also updated notebooks/qwen3.livemd to specify architecture: :for_embedding explicitly and remove unnecessary output_pool parameter. 
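For illustration, the fixed pooling reduces to the following index computation
(a minimal sketch in plain Nx; the mask below is hypothetical, mirroring the
"hello!" example above with one explicit pad token appended):

```elixir
# Hypothetical batch of one: ["hello", "!", EOS, PAD]. Only explicitly added
# padding gets attention_mask = 0; the trailing EOS is a real, attended token.
attention_mask = Nx.tensor([[1, 1, 1, 0]])

# Last attended position = (number of attended tokens) - 1.
indices =
  attention_mask
  |> Nx.sum(axes: [-1])
  |> Nx.subtract(1)
  |> Nx.as_type({:s, 64})

# indices == [2], so the EOS position is pooled. The previous pad_token_id
# logic would have excluded EOS (it equals pad_token_id) and pooled index 1.
```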
--- lib/bumblebee/text/qwen3.ex | 25 +++++++++++++++---------- notebooks/qwen3.livemd | 6 ++---- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/lib/bumblebee/text/qwen3.ex b/lib/bumblebee/text/qwen3.ex index b1ea7858..37921a37 100644 --- a/lib/bumblebee/text/qwen3.ex +++ b/lib/bumblebee/text/qwen3.ex @@ -261,21 +261,25 @@ defmodule Bumblebee.Text.Qwen3 do outputs = core(inputs, spec) - # Pool the last token (last non-padding token) for embeddings + # Pool the last token using attention mask + # For Qwen3 embeddings, we need to find the last attended token based on + # the attention mask, not the pad_token_id. The EOS token (which matches + # pad_token_id) is actually part of the sequence and should be attended. pooled_state = - Layers.if_present inputs["input_ids"] do + Layers.if_present inputs["attention_mask"] do Axon.layer( - fn hidden_state, input_ids, _opts -> + fn hidden_state, attention_mask, _opts -> + # Find the last token with attention_mask = 1 (last attended token) + # This matches the behavior of the reference implementation indices = - input_ids - |> Nx.not_equal(spec.pad_token_id) + attention_mask |> Nx.sum(axes: [-1]) |> Nx.subtract(1) |> Nx.as_type({:s, 64}) Bumblebee.Utils.Nx.batched_take(hidden_state, indices) end, - [outputs.hidden_state, inputs["input_ids"]] + [outputs.hidden_state, inputs["attention_mask"]] ) else Layers.take_token(outputs.hidden_state, axis: 1, index: -1) @@ -387,10 +391,11 @@ defmodule Bumblebee.Text.Qwen3 do cache: cache, causal: true, layer_norm: &Layers.rms_norm(&1, epsilon: spec.layer_norm_epsilon, name: &2), - ffn: &gated_ffn(&1, spec.intermediate_size, spec.hidden_size, - name: &2, - activation: spec.activation - ), + ffn: + &gated_ffn(&1, spec.intermediate_size, spec.hidden_size, + name: &2, + activation: spec.activation + ), rotary_embedding: [ position_ids: position_ids, max_positions: spec.max_positions, diff --git a/notebooks/qwen3.livemd b/notebooks/qwen3.livemd index 3a5fc8a3..6d8d6f6d 100644 --- a/notebooks/qwen3.livemd +++ b/notebooks/qwen3.livemd @@ -85,13 +85,12 @@ Qwen3 embedding models convert text into dense vector representations, useful fo ```elixir repo = {:hf, "Qwen/Qwen3-Embedding-0.6B"} -{:ok, model_info} = Bumblebee.load_model(repo, type: :f32, backend: EXLA.Backend) +{:ok, model_info} = Bumblebee.load_model(repo, type: :f32, backend: EXLA.Backend, architecture: :for_embedding) {:ok, tokenizer} = Bumblebee.load_tokenizer(repo) serving = Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer, output_attribute: :embedding, - output_pool: :last_token, embedding_processor: :l2_norm, compile: [batch_size: 2, sequence_length: 512], defn_options: [compiler: EXLA] @@ -161,13 +160,12 @@ Reranking models take a query and a list of candidate documents, then score how ```elixir repo = {:hf, "Qwen/Qwen3-Reranker-0.6B"} -{:ok, model_info} = Bumblebee.load_model(repo, type: :f32, backend: EXLA.Backend) +{:ok, model_info} = Bumblebee.load_model(repo, type: :f32, backend: EXLA.Backend, architecture: :for_embedding) {:ok, tokenizer} = Bumblebee.load_tokenizer(repo) serving = Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer, output_attribute: :embedding, - output_pool: :last_token, compile: [batch_size: 4, sequence_length: 512], defn_options: [compiler: EXLA] ) From a1923e1cc8c2e0fdf2b816aaaa06473383bdeda4 Mon Sep 17 00:00:00 2001 From: Niko Maroulis Date: Thu, 9 Oct 2025 20:11:34 -0400 Subject: [PATCH 15/15] Add :for_reranker architecture for Qwen3 Implements binary relevance classification 
(reranking) for Qwen3 models.

Changes:
- Add :for_reranker architecture to Qwen3 model
  - Extracts logits at last attended token position
  - Returns full vocab logits for binary classification
- Create new TextReranking serving module
  - Handles query-document pair formatting
  - Applies Qwen3 reranker prompt template
  - Computes relevance scores from yes/no token logits
  - Uses log_softmax for score normalization
- Update notebook with proper reranker usage
  - Changed from :for_embedding to :for_reranker architecture
  - Uses Bumblebee.Text.text_reranking/3 API
  - Simplified query-document pair handling

The reranker follows the official HuggingFace implementation:
1. Format: "<|im_start|>system\n...<|im_end|>\n<|im_start|>user\n<Instruct>: {task}\n<Query>: {query}\n<Document>: {doc}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
2. Extract logits at last attended token (using attention mask)
3. Get yes/no token logits
4. Apply log_softmax([no_logit, yes_logit])
5. Return exp(yes_log_prob) as relevance score

Tested with Qwen3-Reranker-0.6B, scores match Python reference (both ~1.0).
---
 lib/bumblebee/text.ex                |  42 +++++
 lib/bumblebee/text/qwen3.ex          |  46 ++++-
 lib/bumblebee/text/text_reranking.ex | 243 +++++++++++++++++++++++++++
 notebooks/qwen3.livemd               |  37 ++--
 4 files changed, 344 insertions(+), 24 deletions(-)
 create mode 100644 lib/bumblebee/text/text_reranking.ex

diff --git a/lib/bumblebee/text.ex b/lib/bumblebee/text.ex
index 770a2192..405a48ce 100644
--- a/lib/bumblebee/text.ex
+++ b/lib/bumblebee/text.ex
@@ -444,6 +444,48 @@ defmodule Bumblebee.Text do
   defdelegate text_embedding(model_info, tokenizer, opts \\ []),
     to: Bumblebee.Text.TextEmbedding
 
+  @type text_reranking_input :: {String.t(), String.t()} | [{String.t(), String.t()}]
+  @type text_reranking_output :: %{scores: text_reranking_score() | list(text_reranking_score())}
+  @type text_reranking_score :: %{score: number(), query: String.t(), document: String.t()}
+
+  @doc """
+  Builds a serving for text reranking.
+
+  The serving expects input in one of the following formats:
+
+    * `{query, document}` - a tuple with query and document text
+    * `[{query1, doc1}, {query2, doc2}, ...]` - a list of query-document pairs
+
+  ## Options
+
+  See `Bumblebee.Text.TextReranking.text_reranking/3` for available options.
+
+  ## Examples
+
+      {:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Reranker-0.6B"},
+        architecture: :for_reranker)
+      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Reranker-0.6B"})
+
+      serving = Bumblebee.Text.text_reranking(model_info, tokenizer)
+
+      query = "What is the capital of France?"
+      documents = [
+        "Paris is the capital of France.",
+        "Berlin is the capital of Germany."
+      ]
+
+      pairs = Enum.map(documents, &{query, &1})
+      Nx.Serving.run(serving, pairs)
+
+  """
+  @spec text_reranking(
+          Bumblebee.model_info(),
+          Bumblebee.Tokenizer.t(),
+          keyword()
+        ) :: Nx.Serving.t()
+  defdelegate text_reranking(model_info, tokenizer, opts \\ []),
+    to: Bumblebee.Text.TextReranking
+
   @type fill_mask_input :: String.t()
   @type fill_mask_output :: %{predictions: list(fill_mask_prediction())}
   @type fill_mask_prediction :: %{score: number(), token: String.t()}
diff --git a/lib/bumblebee/text/qwen3.ex b/lib/bumblebee/text/qwen3.ex
index 37921a37..8a2b72f6 100644
--- a/lib/bumblebee/text/qwen3.ex
+++ b/lib/bumblebee/text/qwen3.ex
@@ -100,6 +100,14 @@ defmodule Bumblebee.Text.Qwen3 do
      classification head. 
The head returns logits corresponding to possible classes + * `:for_embedding` - Qwen3 with pooling to produce a single embedding + vector per sequence. The head pools the last attended token (based on + attention mask) and returns it as an embedding + + * `:for_reranker` - Qwen3 configured for binary relevance classification + (reranking). Returns logits at the last attended token position for + computing relevance scores between query-document pairs + ## Inputs * `"input_ids"` - `{batch_size, sequence_length}` @@ -162,7 +170,8 @@ defmodule Bumblebee.Text.Qwen3 do :base, :for_causal_language_modeling, :for_sequence_classification, - :for_embedding + :for_embedding, + :for_reranker ] @impl true @@ -292,6 +301,41 @@ defmodule Bumblebee.Text.Qwen3 do }) end + def model(%__MODULE__{architecture: :for_reranker} = spec) do + inputs = inputs(spec) + + outputs = core(inputs, spec) + logits = language_modeling_head(outputs.hidden_state, spec, name: "language_modeling_head") + + # For reranker, we need to extract the logits at the last attended token position + # and return them for binary classification (relevant vs not relevant) + last_token_logits = + Layers.if_present inputs["attention_mask"] do + Axon.layer( + fn logits, attention_mask, _opts -> + # Find the last attended token position + indices = + attention_mask + |> Nx.sum(axes: [-1]) + |> Nx.subtract(1) + |> Nx.as_type({:s, 64}) + + Bumblebee.Utils.Nx.batched_take(logits, indices) + end, + [logits, inputs["attention_mask"]] + ) + else + Layers.take_token(logits, axis: 1, index: -1) + end + + Layers.output(%{ + logits: last_token_logits, + hidden_states: outputs.hidden_states, + attentions: outputs.attentions, + cache: outputs.cache + }) + end + defp inputs(spec) do shape = {nil, nil} hidden_shape = {nil, nil, spec.hidden_size} diff --git a/lib/bumblebee/text/text_reranking.ex b/lib/bumblebee/text/text_reranking.ex new file mode 100644 index 00000000..974028ff --- /dev/null +++ b/lib/bumblebee/text/text_reranking.ex @@ -0,0 +1,243 @@ +defmodule Bumblebee.Text.TextReranking do + @moduledoc false + + alias Bumblebee.Shared + + @doc """ + Creates a serving for text reranking. + + The serving expects input in one of the following formats: + + * `{query, document}` - a tuple with query and document text + * `[{query1, doc1}, {query2, doc2}, ...]` - a list of query-document pairs + + ## Options + + * `:yes_token` - the token ID corresponding to "yes" for relevance scoring. + If not provided, will be inferred from the tokenizer + + * `:no_token` - the token ID corresponding to "no" for relevance scoring. + If not provided, will be inferred from the tokenizer + + * `:instruction_prefix` - the instruction prefix to use. Defaults to the + Qwen3 reranker format + + * `:instruction_suffix` - the instruction suffix to use. Defaults to the + Qwen3 reranker format + + * `:task_description` - the task description to include in prompts. Defaults + to "Given a web search query, retrieve relevant passages that answer the query" + + * `:compile` - compiles all computations for predefined input shapes + during serving initialization. Should be a keyword list with the + following keys: + + * `:batch_size` - the maximum batch size of the input. Inputs + are optionally padded to always match this batch size + + * `:sequence_length` - the maximum input sequence length. 
Input + sequences are always padded/truncated to match that length + + It is advised to set this option in production and also configure + a defn compiler using `:defn_options` to maximally reduce inference + time + + * `:defn_options` - the options for JIT compilation. Defaults to `[]` + + * `:preallocate_params` - when `true`, explicitly allocates params + on the device configured in `:defn_options`. You may want to set + this option when using partitioned models on the GPU + + ## Examples + + {:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Reranker-0.6B"}, + architecture: :for_reranker) + {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Reranker-0.6B"}) + + serving = Bumblebee.Text.TextReranking.text_reranking(model_info, tokenizer) + + query = "What is the capital of France?" + documents = [ + "Paris is the capital of France.", + "Berlin is the capital of Germany.", + "The Eiffel Tower is in Paris." + ] + + pairs = Enum.map(documents, &{query, &1}) + Nx.Serving.run(serving, pairs) + #=> %{ + #=> scores: [ + #=> %{score: 0.95, query: "What is the capital of France?", document: "Paris is the capital of France."}, + #=> %{score: 0.15, query: "What is the capital of France?", document: "Berlin is the capital of Germany."}, + #=> %{score: 0.72, query: "What is the capital of France?", document: "The Eiffel Tower is in Paris."} + #=> ] + #=> } + """ + def text_reranking(model_info, tokenizer, opts \\ []) do + %{model: model, params: params, spec: spec} = model_info + Shared.validate_architecture!(spec, :for_reranker) + + # Get yes/no token IDs + yes_token = + opts[:yes_token] || + get_token_id(tokenizer, "yes") + + no_token = + opts[:no_token] || + get_token_id(tokenizer, "no") + + # Default Qwen3 reranker format + instruction_prefix = + opts[:instruction_prefix] || + "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. 
Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
+
+    instruction_suffix =
+      opts[:instruction_suffix] ||
+        "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+
+    task_description =
+      opts[:task_description] ||
+        "Given a web search query, retrieve relevant passages that answer the query"
+
+    opts =
+      Keyword.validate!(opts, [
+        :compile,
+        :yes_token,
+        :no_token,
+        :instruction_prefix,
+        :instruction_suffix,
+        :task_description,
+        defn_options: [],
+        preallocate_params: false
+      ])
+
+    preallocate_params = opts[:preallocate_params]
+    defn_options = opts[:defn_options]
+
+    compile =
+      if compile = opts[:compile] do
+        compile
+        |> Keyword.validate!([:batch_size, :sequence_length])
+        |> Shared.require_options!([:batch_size, :sequence_length])
+      end
+
+    batch_size = compile[:batch_size]
+    sequence_length = compile[:sequence_length]
+
+    tokenizer =
+      Bumblebee.configure(tokenizer,
+        length: sequence_length,
+        return_token_type_ids: false
+      )
+
+    {_init_fun, predict_fun} = Axon.build(model)
+
+    scores_fun = fn params, input ->
+      outputs = predict_fun.(params, input)
+      # outputs.logits has shape {batch_size, vocab_size}
+      # Extract logits for yes/no tokens
+      yes_logits = outputs.logits[[.., yes_token]]
+      no_logits = outputs.logits[[.., no_token]]
+
+      # Stack and apply log_softmax
+      stacked = Nx.stack([no_logits, yes_logits], axis: 1)
+      log_probs = Axon.Activations.log_softmax(stacked, axis: 1)
+
+      # Take exp of yes probability
+      scores = Nx.exp(log_probs[[.., 1]])
+      scores
+    end
+
+    batch_keys = Shared.sequence_batch_keys(sequence_length)
+
+    Nx.Serving.new(
+      fn batch_key, defn_options ->
+        params = Shared.maybe_preallocate(params, preallocate_params, defn_options)
+
+        scope = {:scores, batch_key}
+
+        scores_fun =
+          Shared.compile_or_jit(scores_fun, scope, defn_options, compile != nil, fn ->
+            {:sequence_length, sequence_length} = batch_key
+
+            inputs = %{
+              "input_ids" => Nx.template({batch_size, sequence_length}, :u32),
+              "attention_mask" => Nx.template({batch_size, sequence_length}, :u32)
+            }
+
+            [params, inputs]
+          end)
+
+        fn inputs ->
+          inputs = Shared.maybe_pad(inputs, batch_size)
+          scores_fun.(params, inputs) |> Shared.serving_post_computation()
+        end
+      end,
+      defn_options
+    )
+    |> Nx.Serving.batch_size(batch_size)
+    |> Nx.Serving.process_options(batch_keys: batch_keys)
+    |> Nx.Serving.client_preprocessing(fn input ->
+      {pairs, multi?} = validate_reranking_input!(input)
+
+      # Format each query-document pair with the instruction template
+      texts =
+        Enum.map(pairs, fn {query, document} ->
+          content = format_instruction(task_description, query, document)
+          "#{instruction_prefix}#{content}#{instruction_suffix}"
+        end)
+
+      inputs =
+        Nx.with_default_backend(Nx.BinaryBackend, fn ->
+          Bumblebee.apply_tokenizer(tokenizer, texts)
+        end)
+
+      batch_key = Shared.sequence_batch_key_for_inputs(inputs, sequence_length)
+      batch = [inputs] |> Nx.Batch.concatenate() |> Nx.Batch.key(batch_key)
+
+      {batch, {multi?, pairs}}
+    end)
+    |> Nx.Serving.client_postprocessing(fn {scores, _metadata}, {multi?, pairs} ->
+      results =
+        Enum.zip_with(Nx.to_list(scores), pairs, fn score, {query, document} ->
+          %{score: score, query: query, document: document}
+        end)
+
+      output = %{scores: results}
+      if multi?, do: output, else: %{scores: hd(results)}
+    end)
+  end
+
+  defp format_instruction(task, query, document) do
+    "<Instruct>: #{task}\n<Query>: #{query}\n<Document>: #{document}"
+  end
+
+  defp get_token_id(tokenizer, token) do
+    encoded = Bumblebee.apply_tokenizer(tokenizer, token)
+    Nx.to_flat_list(encoded["input_ids"]) |> hd()
+  end
+
+  defp validate_reranking_input!(input) do
+    case input do
+      {query, doc} when is_binary(query) and is_binary(doc) ->
+        {[{query, doc}], false}
+
+      list when is_list(list) ->
+        pairs =
+          Enum.map(list, fn
+            {query, doc} when is_binary(query) and is_binary(doc) ->
+              {query, doc}
+
+            other ->
+              raise ArgumentError,
+                    "expected a query-document tuple {query, doc} where both are strings, got: #{inspect(other)}"
+          end)
+
+        {pairs, true}
+
+      other ->
+        raise ArgumentError,
+              "expected a query-document tuple {query, doc} or a list of such tuples, got: #{inspect(other)}"
+    end
+  end
+end
diff --git a/notebooks/qwen3.livemd b/notebooks/qwen3.livemd
index 6d8d6f6d..50b82093 100644
--- a/notebooks/qwen3.livemd
+++ b/notebooks/qwen3.livemd
@@ -160,12 +160,13 @@ Reranking models take a query and a list of candidate documents, then score how
 
 ```elixir
 repo = {:hf, "Qwen/Qwen3-Reranker-0.6B"}
 
-{:ok, model_info} = Bumblebee.load_model(repo, type: :f32, backend: EXLA.Backend, architecture: :for_embedding)
+{:ok, model_info} =
+  Bumblebee.load_model(repo, type: :f32, backend: EXLA.Backend, architecture: :for_reranker)
+
 {:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
 
 serving =
-  Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
-    output_attribute: :embedding,
+  Bumblebee.Text.text_reranking(model_info, tokenizer,
     compile: [batch_size: 4, sequence_length: 512],
     defn_options: [compiler: EXLA]
   )
@@ -185,29 +186,19 @@ documents = [
   "My favorite color is blue and I enjoy long walks on the beach."
 ]
 
-# Create query-document pairs with instruction prefix
-# Qwen3 reranker expects the format: "Instruct: {query}\nQuery: {text}"
-pairs =
-  Enum.map(documents, fn doc ->
-    "Instruct: Given a query, retrieve relevant documents\nQuery: #{query}\n#{doc}"
-  end)
+# Create query-document pairs
+pairs = Enum.map(documents, fn doc -> {query, doc} end)
+
+# Get relevance scores
+%{scores: results} = Nx.Serving.batched_run(Qwen3Reranker, pairs)
 
-# Get embeddings (which represent relevance scores)
+# Sort by score descending
 results =
-  pairs
-  |> Enum.zip(documents)
-  |> Enum.map(fn {pair, doc} ->
-    %{embedding: embedding} = Nx.Serving.batched_run(Qwen3Reranker, pair)
-    # Take the first dimension as the relevance score
-    score =
-      embedding
-      |> Nx.to_flat_list()
-      |> List.first()
-      |> then(&Float.round(&1, 4))
-
-    {score, doc}
+  results
+  |> Enum.sort_by(& &1.score, :desc)
+  |> Enum.map(fn result ->
+    {Float.round(result.score, 4), result.document}
   end)
-  |> Enum.sort_by(fn {score, _doc} -> score end, :desc)
 
 IO.puts("Query: #{query}\n")
 IO.puts("Ranked documents by relevance:\n")