19 changes: 19 additions & 0 deletions README.md
@@ -516,6 +516,25 @@ llm_cache -- Cached LLM responses (query expansion, rerank scores)
| Variable | Default | Description |
|----------|---------|-------------|
| `XDG_CACHE_HOME` | `~/.cache` | Cache directory location |
| `QMD_EMBED_MODEL` | `embeddinggemma-300M` | Override embedding model (HF URI) |
| `QMD_EXPAND_CONTEXT_SIZE` | `2048` | Context size for query expansion |
| `QMD_RERANK_CONTEXT_SIZE` | `2048` | Context size for reranking (lower = less VRAM) |
| `QMD_EMBED_CONTEXT_SIZE` | auto | Context size for embedding (lower = less VRAM) |
| `QMD_MAX_PARALLELISM` | auto | Cap on parallel contexts (lower = less VRAM) |
| `QMD_EMBED_BATCH_SIZE` | `32` | Batch size for embedding loop |

### Low-VRAM GPU Configuration

On GPUs with ≤4GB VRAM (e.g. RTX 3050, GTX 960M), the default settings may cause
OOM errors. Set these environment variables to reduce VRAM usage:

```bash
export QMD_RERANK_CONTEXT_SIZE=1024
export QMD_EMBED_CONTEXT_SIZE=1024
export QMD_MAX_PARALLELISM=2
export QMD_EMBED_BATCH_SIZE=8
export QMD_EXPAND_CONTEXT_SIZE=1024
```
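
The same limits can also be set programmatically through the new `LlamaCppConfig` fields added in `src/llm.ts` (see the diff below). A minimal sketch; the import path and direct construction here are illustrative only, not part of this PR:

```ts
import { LlamaCpp } from "./src/llm";

// Programmatic equivalents of QMD_RERANK_CONTEXT_SIZE, QMD_EMBED_CONTEXT_SIZE,
// and QMD_EXPAND_CONTEXT_SIZE; omitted fields keep their defaults.
const llm = new LlamaCpp({
  rerankContextSize: 1024,
  embedContextSize: 1024,
  expandContextSize: 1024,
});
```

Note that `QMD_MAX_PARALLELISM` and `QMD_EMBED_BATCH_SIZE` are read only from the environment in this PR and have no config-field equivalent.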

## How It Works

104 changes: 94 additions & 10 deletions src/llm.ts
@@ -359,6 +359,18 @@ export type LlamaCppConfig = {
* Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
*/
expandContextSize?: number;
/**
* Context size for rerank contexts.
* Default: 2048. Can also be set via QMD_RERANK_CONTEXT_SIZE.
* Lower values (e.g. 1024) reduce VRAM usage on low-VRAM GPUs.
*/
rerankContextSize?: number;
/**
* Context size for embedding contexts.
* Default: auto (model decides). Can also be set via QMD_EMBED_CONTEXT_SIZE.
* Lower values (e.g. 1024) reduce VRAM usage on low-VRAM GPUs.
*/
embedContextSize?: number;
/**
* Inactivity timeout in ms before unloading contexts (default: 5 minutes, 0 to disable).
*
@@ -382,6 +394,7 @@ export type LlamaCppConfig = {
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
const DEFAULT_RERANK_CONTEXT_SIZE = 2048;

function resolveExpandContextSize(configValue?: number): number {
if (configValue !== undefined) {
@@ -404,6 +417,62 @@ function resolveExpandContextSize(configValue?: number): number {
return parsed;
}

function resolveRerankContextSize(configValue?: number): number {
if (configValue !== undefined) {
if (!Number.isInteger(configValue) || configValue <= 0) {
throw new Error(`Invalid rerankContextSize: ${configValue}. Must be a positive integer.`);
}
return configValue;
}

const envValue = process.env.QMD_RERANK_CONTEXT_SIZE?.trim();
if (!envValue) return DEFAULT_RERANK_CONTEXT_SIZE;

const parsed = Number.parseInt(envValue, 10);
if (!Number.isInteger(parsed) || parsed <= 0) {
process.stderr.write(
`QMD Warning: invalid QMD_RERANK_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_RERANK_CONTEXT_SIZE}.\n`
);
return DEFAULT_RERANK_CONTEXT_SIZE;
}
return parsed;
}

function resolveEmbedContextSize(configValue?: number): number | undefined {
if (configValue !== undefined) {
if (!Number.isInteger(configValue) || configValue <= 0) {
throw new Error(`Invalid embedContextSize: ${configValue}. Must be a positive integer.`);
}
return configValue;
}

const envValue = process.env.QMD_EMBED_CONTEXT_SIZE?.trim();
if (!envValue) return undefined; // auto (let node-llama-cpp decide)

const parsed = Number.parseInt(envValue, 10);
if (!Number.isInteger(parsed) || parsed <= 0) {
process.stderr.write(
`QMD Warning: invalid QMD_EMBED_CONTEXT_SIZE="${envValue}", using default (auto).\n`
);
return undefined;
}
return parsed;
}

function resolveMaxParallelism(): number {
const envValue = process.env.QMD_MAX_PARALLELISM?.trim();
if (!envValue) return 0; // 0 means no override

const parsed = Number.parseInt(envValue, 10);
if (!Number.isInteger(parsed) || parsed <= 0) {
process.stderr.write(
`QMD Warning: invalid QMD_MAX_PARALLELISM="${envValue}", ignoring.\n`
);
return 0;
}
return parsed;
}

export class LlamaCpp implements LLM {
private llama: Llama | null = null;
private embedModel: LlamaModel | null = null;
@@ -417,6 +486,8 @@ export class LlamaCpp implements LLM {
private rerankModelUri: string;
private modelCacheDir: string;
private expandContextSize: number;
private rerankContextSize: number;
private embedContextSize: number | undefined;

// Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
private embedModelLoadPromise: Promise<LlamaModel> | null = null;
@@ -438,6 +509,8 @@ export class LlamaCpp implements LLM {
this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
this.rerankContextSize = resolveRerankContextSize(config.rerankContextSize);
this.embedContextSize = resolveEmbedContextSize(config.embedContextSize);
this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
}
@@ -606,25 +679,35 @@ export class LlamaCpp implements LLM {
* CPU: constrained by cores. Splitting threads across contexts enables
* true parallelism (each context runs on its own cores). Use at most
* half the math cores, with at least 4 threads per context.
*
* QMD_MAX_PARALLELISM env var can cap the result (useful for low-VRAM GPUs).
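* Example: with 3 GB of free VRAM and perContextMB = 512, the GPU path gives
* max(1, min(8, floor((3072 * 0.25) / 512))) = 1 context; QMD_MAX_PARALLELISM can
* only lower the computed value, never raise it.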
*/
private async computeParallelism(perContextMB: number): Promise<number> {
const maxParallelism = resolveMaxParallelism();
const llama = await this.ensureLlama();
let computed: number;

if (llama.gpu) {
try {
const vram = await llama.getVramState();
const freeMB = vram.free / (1024 * 1024);
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
return Math.max(1, Math.min(8, maxByVram));
computed = Math.max(1, Math.min(8, maxByVram));
} catch {
return 2;
computed = 2;
}
} else {
// CPU: split cores across contexts. At least 4 threads per context.
const cores = llama.cpuMathCores || 4;
const maxContexts = Math.floor(cores / 4);
computed = Math.max(1, Math.min(4, maxContexts));
}

// CPU: split cores across contexts. At least 4 threads per context.
const cores = llama.cpuMathCores || 4;
const maxContexts = Math.floor(cores / 4);
return Math.max(1, Math.min(4, maxContexts));
// Allow env var override (useful for low-VRAM GPUs)
if (maxParallelism > 0) {
return Math.min(computed, maxParallelism);
}
return computed;
}

/**
@@ -662,6 +745,7 @@ export class LlamaCpp implements LLM {
for (let i = 0; i < n; i++) {
try {
this.embedContexts.push(await model.createEmbeddingContext({
...(this.embedContextSize !== undefined ? { contextSize: this.embedContextSize } : {}),
...(threads > 0 ? { threads } : {}),
}));
} catch {
@@ -758,7 +842,7 @@ export class LlamaCpp implements LLM {
// Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
// Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
// Use 2048 for safety margin. Still 17× less than auto (40960).
private static readonly RERANK_CONTEXT_SIZE = 2048;
// Configurable via QMD_RERANK_CONTEXT_SIZE env var for low-VRAM GPUs.
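// Note: with a smaller context (e.g. 1024), long chunks are truncated to fit the
// budget (see maxDocTokens below), trading some rerank fidelity for lower VRAM use.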
private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
if (this.rerankContexts.length === 0) {
const model = await this.ensureRerankModel();
Expand All @@ -768,7 +852,7 @@ export class LlamaCpp implements LLM {
for (let i = 0; i < n; i++) {
try {
this.rerankContexts.push(await model.createRankingContext({
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
contextSize: this.rerankContextSize,
flashAttention: true,
...(threads > 0 ? { threads } : {}),
} as any));
Expand All @@ -777,7 +861,7 @@ export class LlamaCpp implements LLM {
// Flash attention might not be supported — retry without it
try {
this.rerankContexts.push(await model.createRankingContext({
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
contextSize: this.rerankContextSize,
...(threads > 0 ? { threads } : {}),
}));
} catch {
@@ -1076,7 +1160,7 @@ export class LlamaCpp implements LLM {
// Truncate documents that would exceed the rerank context size.
// Budget = contextSize - template overhead - query tokens
const queryTokens = model.tokenize(query).length;
const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
const maxDocTokens = this.rerankContextSize - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
const truncationCache = new Map<string, string>();

const truncatedDocs = documents.map((doc) => {
6 changes: 4 additions & 2 deletions src/qmd.ts
@@ -1638,8 +1638,10 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
const startTime = Date.now();

// Batch embedding for better throughput
// Process in batches of 32 to balance memory usage and efficiency
const BATCH_SIZE = 32;
// Process in batches to balance memory usage and efficiency
// Configurable via QMD_EMBED_BATCH_SIZE env var (default: 32, lower for low-VRAM GPUs)
const envBatchSize = Number.parseInt(process.env.QMD_EMBED_BATCH_SIZE?.trim() ?? "", 10);
const BATCH_SIZE = Number.isInteger(envBatchSize) && envBatchSize > 0 ? envBatchSize : 32;

for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) {
const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length);