From dd5330bf0d6fa27e5ea10006542069a546fe9a9b Mon Sep 17 00:00:00 2001
From: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
Date: Wed, 19 Nov 2025 18:09:16 +0800
Subject: [PATCH] add doc to llm_runtime.py

Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
---
 examples/llm-api/llm_runtime.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/examples/llm-api/llm_runtime.py b/examples/llm-api/llm_runtime.py
index 40c6af6a9b0..a7735460fa1 100644
--- a/examples/llm-api/llm_runtime.py
+++ b/examples/llm-api/llm_runtime.py
@@ -48,15 +48,29 @@ def example_cuda_graph_config():
 
 
 def example_kv_cache_config():
+    """
+    Example demonstrating KV cache configuration for memory management and performance.
+
+    KV cache configuration helps with:
+    - Controlling GPU memory allocation for the key-value cache
+    - Enabling block reuse to optimize memory usage for shared prefixes
+    - Balancing memory usage between model weights and cache storage
+
+    Please refer to the API reference for more details.
+    """
+
     print("\n=== KV Cache Configuration Example ===")
     print("\n1. KV Cache Configuration:")
-    llm_advanced = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-                       max_batch_size=8,
-                       max_seq_len=1024,
-                       kv_cache_config=KvCacheConfig(
-                           free_gpu_memory_fraction=0.5,
-                           enable_block_reuse=True))
+    llm_advanced = LLM(
+        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        max_batch_size=8,
+        max_seq_len=1024,
+        kv_cache_config=KvCacheConfig(
+            # free_gpu_memory_fraction: the fraction of free GPU memory to allocate to the KV cache
+            free_gpu_memory_fraction=0.5,
+            # enable_block_reuse: reuse cached KV blocks across requests that share a prefix
+            enable_block_reuse=True))
 
     prompts = [
         "Hello, my name is",