From dd5330bf0d6fa27e5ea10006542069a546fe9a9b Mon Sep 17 00:00:00 2001
From: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
Date: Wed, 19 Nov 2025 18:09:16 +0800
Subject: [PATCH] add doc to llm_runtime.py

Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
---
 examples/llm-api/llm_runtime.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/examples/llm-api/llm_runtime.py b/examples/llm-api/llm_runtime.py
index 40c6af6a9b0..a7735460fa1 100644
--- a/examples/llm-api/llm_runtime.py
+++ b/examples/llm-api/llm_runtime.py
@@ -48,15 +48,29 @@ def example_cuda_graph_config():
 
 
 def example_kv_cache_config():
+    """
+    Example demonstrating KV cache configuration for memory management and performance.
+
+    KV cache configuration helps with:
+    - Controlling GPU memory allocation for the key-value cache
+    - Enabling block reuse to optimize memory usage for shared prefixes
+    - Balancing memory usage between model weights and cache storage
+
+    Please refer to the API reference for more details.
+    """
+
     print("\n=== KV Cache Configuration Example ===")
     print("\n1. KV Cache Configuration:")
-    llm_advanced = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-                       max_batch_size=8,
-                       max_seq_len=1024,
-                       kv_cache_config=KvCacheConfig(
-                           free_gpu_memory_fraction=0.5,
-                           enable_block_reuse=True))
+    llm_advanced = LLM(
+        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        max_batch_size=8,
+        max_seq_len=1024,
+        kv_cache_config=KvCacheConfig(
+            # free_gpu_memory_fraction: the fraction of free GPU memory to allocate to the KV cache
+            free_gpu_memory_fraction=0.5,
+            # enable_block_reuse: reuse cached KV blocks across requests that share a prefix
+            enable_block_reuse=True))
 
     prompts = [
         "Hello, my name is",