refactor: tokenization

sigridjineth · sigridjineth · commit 7d32afeed3f4 · 2025-08-12T16:56:14.000+09:00
diff --git a/backends/candle/src/models/flash_bert.rs b/backends/candle/src/models/flash_bert.rs
@@ -259,6 +259,7 @@ impl FlashBertModel {
                 };
                 (pool, None, splade)
             }
+            ModelType::ListwiseReranker => todo!(),
         };
 
         let (embeddings, encoder) = match (
@@ -326,6 +327,7 @@ impl FlashBertModel {
                 };
                 (pool, None, splade)
             }
+            ModelType::ListwiseReranker => todo!(),
         };
 
         let (embeddings, encoder) = match (
diff --git a/backends/candle/src/models/flash_distilbert.rs b/backends/candle/src/models/flash_distilbert.rs
@@ -200,6 +200,7 @@ impl FlashDistilBertModel {
                 candle::bail!("`classifier` model type is not supported for DistilBert")
             }
             ModelType::Embedding(pool) => pool,
+            ModelType::ListwiseReranker => todo!(),
         };
 
         let (embeddings, encoder) = match (
diff --git a/backends/candle/src/models/flash_gte.rs b/backends/candle/src/models/flash_gte.rs
@@ -191,6 +191,7 @@ impl FlashGTEModel {
                 (pool, Some(classifier))
             }
             ModelType::Embedding(pool) => (pool, None),
+            ModelType::ListwiseReranker => todo!(),
         };
 
         let (word_embeddings, token_type_embeddings, layers, embeddings_norm) =
diff --git a/backends/candle/src/models/flash_jina.rs b/backends/candle/src/models/flash_jina.rs
@@ -267,6 +267,7 @@ impl FlashJinaBertModel {
                 }
                 (pool, None)
             }
+            ModelType::ListwiseReranker => todo!(),
         };
 
         let (embeddings, encoder) = match (
diff --git a/backends/candle/src/models/flash_jina_code.rs b/backends/candle/src/models/flash_jina_code.rs
@@ -314,6 +314,7 @@ impl FlashJinaCodeBertModel {
                 }
                 pool
             }
+            ModelType::ListwiseReranker => todo!(),
         };
 
         let (embeddings, encoder) = match (
diff --git a/backends/candle/src/models/flash_mistral.rs b/backends/candle/src/models/flash_mistral.rs
@@ -251,6 +251,7 @@ impl FlashMistralModel {
                 candle::bail!("`classifier` model type is not supported for Mistral")
             }
             ModelType::Embedding(pool) => pool,
+            ModelType::ListwiseReranker => todo!(),
         };
 
         let embeddings = Embedding::new(
diff --git a/backends/candle/src/models/flash_modernbert.rs b/backends/candle/src/models/flash_modernbert.rs
@@ -274,6 +274,7 @@ impl FlashModernBertModel {
 
                 (pool, None)
             }
+            ModelType::ListwiseReranker => todo!(),
         };
 
         let embeddings = ModernBertEmbeddings::load(vb.pp("model.embeddings"), config)
diff --git a/backends/candle/src/models/flash_nomic.rs b/backends/candle/src/models/flash_nomic.rs
@@ -228,6 +228,7 @@ impl FlashNomicBertModel {
                 }
                 pool
             }
+            ModelType::ListwiseReranker => todo!(),
         };
 
         let embeddings = NomicBertEmbeddings::load(vb.clone(), config)?;
diff --git a/backends/candle/src/models/flash_qwen2.rs b/backends/candle/src/models/flash_qwen2.rs
@@ -259,6 +259,7 @@ impl FlashQwen2Model {
                 candle::bail!("`classifier` model type is not supported for Qwen2")
             }
             ModelType::Embedding(pool) => pool,
+            ModelType::ListwiseReranker => todo!(),
         };
 
         // Pushing the prefix for `model` is apparently only required if the model architecture is
diff --git a/backends/candle/src/models/flash_qwen3.rs b/backends/candle/src/models/flash_qwen3.rs
@@ -1,7 +1,7 @@
 use crate::flash_attn::flash_attn_varlen;
 use crate::layers::{get_cos_sin, get_inv_freqs, HiddenAct, Linear, RMSNorm};
 use crate::models::{Model, Qwen3Config};
-use candle::{DType, Device, IndexOp, Result, Tensor};
+use candle::{DType, Device, IndexOp, Result, Tensor, D};
 use candle_nn::{Embedding, Module, VarBuilder};
 use candle_rotary::apply_rotary_inplace;
 use text_embeddings_backend_core::{Batch, ModelType, Pool};
@@ -592,10 +592,13 @@ impl Model for FlashQwen3Model {
 
                 let h_last = Tensor::stack(&last_hidden_states, 0)?; // [bs, hidden_size]
 
-                let true_id = 9693u32;
-                let false_id = 2152u32;
+                // Correct token IDs for Qwen3 (verified from tokenizer)
+                let yes_id = 9454u32; // "yes" token ID
+                let no_id = 2901u32; // "no" token ID
 
-                let ids = Tensor::from_vec(vec![false_id, true_id], 2, &self.device)?;
+                tracing::debug!("Using Qwen3 token IDs - yes: {}, no: {}", yes_id, no_id);
+
+                let ids = Tensor::from_vec(vec![no_id, yes_id], 2, &self.device)?;
                 let w = self.lm_head_weight.index_select(&ids, 0)?; // [2, hidden_size]
                 let logits = h_last.matmul(&w.t()?)?; // [bs, 2] (no, yes)
                 let log_probs = candle_nn::ops::log_softmax(&logits, D::Minus1)?;
diff --git a/backends/candle/src/models/qwen3.rs b/backends/candle/src/models/qwen3.rs
@@ -711,137 +711,55 @@ impl Model for Qwen3Model {
     fn predict(&self, batch: Batch) -> Result<Tensor> {
         match &self.model_type {
             ModelType::ListwiseReranker => {
+                // Extract needed values before moving batch
                 let batch_size = batch.len();
                 let max_length = batch.max_length as usize;
-                let shape = (batch_size, max_length);
-
-                let (input_ids, position_ids, _input_lengths, attention_bias): (
-                    Tensor,
-                    Tensor,
-                    Vec<usize>,
-                    Option<Tensor>,
-                ) = if batch_size > 1 {
-                    let elems = batch_size * max_length;
-                    let mut input_ids = Vec::with_capacity(elems);
-                    let mut position_ids = Vec::with_capacity(elems);
-                    let mut attention_bias = Vec::with_capacity(elems);
-                    let mut masking = false;
-
-                    for i in 0..batch_size {
-                        let start = batch.cumulative_seq_lengths[i] as usize;
-                        let end = batch.cumulative_seq_lengths[i + 1] as usize;
-                        let seq_length = end - start;
-
-                        // Left padding for Qwen3-Embedding (pad at the beginning)
-                        let padding = max_length - seq_length;
-                        if padding > 0 {
-                            masking = true;
-                            for _ in 0..padding {
-                                input_ids.push(self.pad_token_id);
-                                position_ids.push(0);
-                                attention_bias.push(f32::NEG_INFINITY);
-                            }
-                        }
-
-                        // Then add the actual sequence
-                        for j in start..end {
-                            input_ids.push(batch.input_ids[j]);
-                            position_ids.push(batch.position_ids[j]);
-                            attention_bias.push(0.0);
-                        }
-                    }
-
-                    let input_ids = Tensor::from_vec(input_ids, shape, &self.device)?;
-                    let position_ids = Tensor::from_vec(position_ids, shape, &self.device)?;
 
-                    let attention_bias = if masking {
-                        let attention_bias = Tensor::from_vec(
-                            attention_bias,
-                            (batch_size, 1, 1, max_length),
-                            &self.device,
-                        )?
-                        .to_dtype(self.dtype)?;
-                        let attention_bias = attention_bias
-                            .broadcast_as((
-                                batch_size,
-                                self.num_attention_heads,
-                                max_length,
-                                max_length,
-                            ))?
-                            .contiguous()?;
-                        Some(attention_bias)
-                    } else {
-                        None
-                    };
+                // Use the existing forward method to get hidden states
+                let (_, raw_embeddings) = self.forward(batch)?;
 
-                    (input_ids, position_ids, Vec::<usize>::new(), attention_bias)
-                } else {
-                    let input_ids = Tensor::from_vec(
-                        batch.input_ids.clone(),
-                        (1, batch.input_ids.len()),
-                        &self.device,
-                    )?;
-                    let position_ids = Tensor::from_vec(
-                        batch.position_ids.clone(),
-                        (1, batch.position_ids.len()),
-                        &self.device,
-                    )?;
-
-                    let seq_len = batch.input_ids.len();
-                    let attention_bias = Tensor::zeros(
-                        (1, self.num_attention_heads, seq_len, seq_len),
-                        self.dtype,
-                        &self.device,
-                    )?;
-
-                    (
-                        input_ids,
-                        position_ids,
-                        vec![batch.input_ids.len()],
-                        Some(attention_bias),
-                    )
+                let hidden_states = match raw_embeddings {
+                    Some(embeddings) => embeddings,
+                    None => candle::bail!("No hidden states returned from forward pass"),
                 };
 
-                let attention_bias = if let Some(attn_bias) = attention_bias {
-                    Some(self.get_causal_attention_bias(attn_bias)?)
-                } else {
-                    None
-                };
+                // Project through LM head to get logits
+                let logits = hidden_states.matmul(&self.lm_head_weight.t()?)?;
 
-                let mut hidden_states = self.embeddings.forward(&input_ids)?;
+                // Correct token IDs for Qwen3 (verified from tokenizer)
+                let yes_id = 9454u32; // "yes" token ID
+                let no_id = 2901u32; // "no" token ID
 
-                let cos = self
-                    .rotary_cache
-                    .0
-                    .index_select(&position_ids.flatten_all()?, 0)?;
-                let sin = self
-                    .rotary_cache
-                    .1
-                    .index_select(&position_ids.flatten_all()?, 0)?;
+                tracing::debug!("Using Qwen3 token IDs - yes: {}, no: {}", yes_id, no_id);
 
-                let cos = cos.reshape((batch_size, 1, max_length, self.rotary_dim))?;
-                let sin = sin.reshape((batch_size, 1, max_length, self.rotary_dim))?;
+                // Extract logits for last position of each sequence
+                let mut scores_vec = Vec::with_capacity(batch_size);
 
-                for layer in &self.layers {
-                    hidden_states =
-                        layer.forward(&hidden_states, attention_bias.as_ref(), &cos, &sin)?;
-                }
+                for i in 0..batch_size {
+                    // For left-padded sequences, the last position contains the actual output
+                    let last_pos = max_length - 1;
 
-                let (outputs, _) = self.norm.forward(&hidden_states, None)?;
+                    // Get logits for the last position
+                    let last_logits = logits.i((i, last_pos, ..))?;
 
-                let last_idx = max_length - 1;
-                let h_last = outputs.i((.., last_idx, ..))?; // [bs, hidden_size]
+                    // Extract yes/no logits directly
+                    let yes_logit = last_logits.i(yes_id as usize)?;
+                    let no_logit = last_logits.i(no_id as usize)?;
 
-                let true_id = 9693u32; // "yes" token
-                let false_id = 2152u32; // "no" token
+                    // Stack [no, yes] and apply log_softmax
+                    let logit_pair = Tensor::stack(&[&no_logit, &yes_logit], 0)?;
+                    let log_probs =
+                        candle_nn::ops::log_softmax(&logit_pair.unsqueeze(0)?, D::Minus1)?;
 
-                tracing::debug!("Using Qwen3 token IDs - yes: {}, no: {}", true_id, false_id);
+                    // Extract yes probability (index 1) and exp
+                    let yes_log_prob = log_probs.i((0, 1))?;
+                    let score = yes_log_prob.exp()?.to_scalar::<f32>()?;
+
+                    scores_vec.push(score);
+                }
 
-                let ids = Tensor::from_vec(vec![false_id, true_id], 2, &self.device)?;
-                let w = self.lm_head_weight.index_select(&ids, 0)?; // [2, hidden_size]g
-                let logits = h_last.matmul(&w.t()?)?; // [bs, 2] (no, yes)
-                let log_probs = candle_nn::ops::log_softmax(&logits, D::Minus1)?;
-                let scores = log_probs.i((.., 1))?.exp()?; // P("yes") ∈ (0,1)
+                // Convert to tensor
+                let scores = Tensor::from_vec(scores_vec, batch_size, &self.device)?;
 
                 Ok(scores)
             }
diff --git a/backends/src/dtype.rs b/backends/src/dtype.rs
@@ -7,30 +7,20 @@ use clap::ValueEnum;
 #[cfg_attr(feature = "clap", derive(Clone, ValueEnum))]
 pub enum DType {
     // Float16 is not available on accelerate
-    #[cfg(any(
-        feature = "python",
-        all(feature = "candle", not(feature = "accelerate"))
-    ))]
+    #[cfg(all(feature = "candle", not(feature = "accelerate")))]
     Float16,
-    #[cfg(any(feature = "python", feature = "candle", feature = "ort"))]
+    #[cfg(any(feature = "candle", feature = "ort"))]
     Float32,
-    #[cfg(feature = "python")]
-    Bfloat16,
 }
 
 impl fmt::Display for DType {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
             // Float16 is not available on accelerate
-            #[cfg(any(
-                feature = "python",
-                all(feature = "candle", not(feature = "accelerate"))
-            ))]
+            #[cfg(all(feature = "candle", not(feature = "accelerate")))]
             DType::Float16 => write!(f, "float16"),
-            #[cfg(any(feature = "python", feature = "candle", feature = "ort"))]
+            #[cfg(any(feature = "candle", feature = "ort"))]
             DType::Float32 => write!(f, "float32"),
-            #[cfg(feature = "python")]
-            DType::Bfloat16 => write!(f, "bfloat16"),
         }
     }
 }
@@ -42,18 +32,9 @@ impl Default for DType {
         {
             DType::Float32
         }
-        #[cfg(not(any(
-            feature = "accelerate",
-            feature = "mkl",
-            feature = "ort",
-            feature = "python"
-        )))]
+        #[cfg(not(any(feature = "accelerate", feature = "mkl", feature = "ort")))]
         {
             DType::Float16
         }
-        #[cfg(feature = "python")]
-        {
-            DType::Bfloat16
-        }
     }
 }
diff --git a/backends/src/lib.rs b/backends/src/lib.rs
@@ -24,9 +24,6 @@ use text_embeddings_backend_candle::CandleBackend;
 #[cfg(feature = "ort")]
 use text_embeddings_backend_ort::OrtBackend;
 
-#[cfg(feature = "python")]
-use text_embeddings_backend_python::PythonBackend;
-
 fn powers_of_two(max_value: usize) -> Vec<usize> {
     let mut result = Vec::new();
     let mut power: usize = 1;
@@ -398,7 +395,7 @@ async fn init_backend(
     }
 
     if let Some(api_repo) = api_repo.as_ref() {
-        if cfg!(feature = "python") || cfg!(feature = "candle") {
+        if cfg!(feature = "candle") {
             let start = std::time::Instant::now();
             if download_safetensors(api_repo).await.is_err() {
                 tracing::warn!("safetensors weights not found. Using `pytorch_model.bin` instead. Model loading will be significantly slower.");
@@ -432,32 +429,6 @@ async fn init_backend(
         }
     }
 
-    if cfg!(feature = "python") {
-        #[cfg(feature = "python")]
-        {
-            let backend = std::thread::spawn(move || {
-                PythonBackend::new(
-                    model_path.to_str().unwrap().to_string(),
-                    dtype.to_string(),
-                    model_type,
-                    uds_path,
-                    otlp_endpoint,
-                    otlp_service_name,
-                )
-            })
-            .join()
-            .expect("Python Backend management thread failed");
-
-            match backend {
-                Ok(b) => return Ok(Box::new(b)),
-                Err(err) => {
-                    tracing::error!("Could not start Python backend: {err}");
-                    backend_start_failed = true;
-                }
-            }
-        }
-    }
-
     if backend_start_failed {
         Err(BackendError::Start(
             "Could not start a suitable backend".to_string(),
diff --git a/core/src/infer.rs b/core/src/infer.rs
@@ -499,7 +499,10 @@ impl Infer {
 
     #[instrument(skip(self))]
     pub fn is_classifier(&self) -> bool {
-        matches!(self.backend.model_type, ModelType::Classifier)
+        matches!(
+            self.backend.model_type,
+            ModelType::Classifier | ModelType::ListwiseReranker
+        )
     }
 
     #[instrument(skip(self))]
diff --git a/core/src/tokenization.rs b/core/src/tokenization.rs
diff --git a/router/Cargo.toml b/router/Cargo.toml
diff --git a/router/src/lib.rs b/router/src/lib.rs

Original file line number	Diff line number	Diff line change
`@@ -259,6 +259,7 @@ impl FlashBertModel {`
`259`	`259`	`};`
`260`	`260`	`(pool, None, splade)`
`261`	`261`	`}`
	`262`	`+ ModelType::ListwiseReranker => todo!(),`
`262`	`263`	`};`
`263`	`264`
`264`	`265`	`let (embeddings, encoder) = match (`
`@@ -326,6 +327,7 @@ impl FlashBertModel {`
`326`	`327`	`};`
`327`	`328`	`(pool, None, splade)`
`328`	`329`	`}`
	`330`	`+ ModelType::ListwiseReranker => todo!(),`
`329`	`331`	`};`
`330`	`332`
`331`	`333`	`let (embeddings, encoder) = match (`
Original file line number	Diff line number	Diff line change
`@@ -200,6 +200,7 @@ impl FlashDistilBertModel {`
`200`	`200`	candle::bail!("`classifier` model type is not supported for DistilBert")
`201`	`201`	`}`
`202`	`202`	`ModelType::Embedding(pool) => pool,`
	`203`	`+ ModelType::ListwiseReranker => todo!(),`
`203`	`204`	`};`
`204`	`205`
`205`	`206`	`let (embeddings, encoder) = match (`
Original file line number	Diff line number	Diff line change
`@@ -191,6 +191,7 @@ impl FlashGTEModel {`
`191`	`191`	`(pool, Some(classifier))`
`192`	`192`	`}`
`193`	`193`	`ModelType::Embedding(pool) => (pool, None),`
	`194`	`+ ModelType::ListwiseReranker => todo!(),`
`194`	`195`	`};`
`195`	`196`
`196`	`197`	`let (word_embeddings, token_type_embeddings, layers, embeddings_norm) =`
Original file line number	Diff line number	Diff line change
`@@ -267,6 +267,7 @@ impl FlashJinaBertModel {`
`267`	`267`	`}`
`268`	`268`	`(pool, None)`
`269`	`269`	`}`
	`270`	`+ ModelType::ListwiseReranker => todo!(),`
`270`	`271`	`};`
`271`	`272`
`272`	`273`	`let (embeddings, encoder) = match (`
Original file line number	Diff line number	Diff line change
`@@ -314,6 +314,7 @@ impl FlashJinaCodeBertModel {`
`314`	`314`	`}`
`315`	`315`	`pool`
`316`	`316`	`}`
	`317`	`+ ModelType::ListwiseReranker => todo!(),`
`317`	`318`	`};`
`318`	`319`
`319`	`320`	`let (embeddings, encoder) = match (`
Original file line number	Diff line number	Diff line change
`@@ -251,6 +251,7 @@ impl FlashMistralModel {`
`251`	`251`	candle::bail!("`classifier` model type is not supported for Mistral")
`252`	`252`	`}`
`253`	`253`	`ModelType::Embedding(pool) => pool,`
	`254`	`+ ModelType::ListwiseReranker => todo!(),`
`254`	`255`	`};`
`255`	`256`
`256`	`257`	`let embeddings = Embedding::new(`
Original file line number	Diff line number	Diff line change
`@@ -274,6 +274,7 @@ impl FlashModernBertModel {`
`274`	`274`
`275`	`275`	`(pool, None)`
`276`	`276`	`}`
	`277`	`+ ModelType::ListwiseReranker => todo!(),`
`277`	`278`	`};`
`278`	`279`
`279`	`280`	`let embeddings = ModernBertEmbeddings::load(vb.pp("model.embeddings"), config)`
Original file line number	Diff line number	Diff line change
`@@ -228,6 +228,7 @@ impl FlashNomicBertModel {`
`228`	`228`	`}`
`229`	`229`	`pool`
`230`	`230`	`}`
	`231`	`+ ModelType::ListwiseReranker => todo!(),`
`231`	`232`	`};`
`232`	`233`
`233`	`234`	`let embeddings = NomicBertEmbeddings::load(vb.clone(), config)?;`
Original file line number	Diff line number	Diff line change
`@@ -259,6 +259,7 @@ impl FlashQwen2Model {`
`259`	`259`	candle::bail!("`classifier` model type is not supported for Qwen2")
`260`	`260`	`}`
`261`	`261`	`ModelType::Embedding(pool) => pool,`
	`262`	`+ ModelType::ListwiseReranker => todo!(),`
`262`	`263`	`};`
`263`	`264`
`264`	`265`	// Pushing the prefix for `model` is apparently only required if the model architecture is