7 changes: 0 additions & 7 deletions custom_ops/gpu_ops/cpp_extensions.cc
@@ -1142,13 +1142,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
*/
m.def("recover_decode_task", &RecoverDecodeTask, "recover decode task for scheduler v1 function");

/**
* extract_text_token_output.cu
* extract_text_token_output
*/
m.def("extract_text_token_output", &ExtractTextTokenOutput,
"extract_text_token_output function");

m.def("group_swiglu_with_masked", &GroupSwigluWithMasked,
"group_swiglu_with_masked function");

101 changes: 0 additions & 101 deletions custom_ops/gpu_ops/extract_text_token_output.cu

This file was deleted.

5 changes: 3 additions & 2 deletions custom_ops/setup_ops.py
@@ -290,7 +290,6 @@ def find_end_files(directory, end_str):
"gpu_ops/cpp_extensions.cc",
"gpu_ops/share_external_data.cu",
"gpu_ops/per_token_quant_fp8.cu",
"gpu_ops/extract_text_token_output.cu",
"gpu_ops/update_split_fuse_input.cu",
"gpu_ops/text_image_index_out.cu",
"gpu_ops/text_image_gather_scatter.cu",
@@ -536,6 +535,9 @@ def find_end_files(directory, end_str):
"gpu_ops/token_penalty_multi_scores.cu",
"gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
"gpu_ops/sample_kernels/top_k_renorm_probs.cu",
"gpu_ops/text_image_index_out.cu",
"gpu_ops/text_image_gather_scatter.cu",
"gpu_ops/set_data_ipc.cu",
"iluvatar_ops/moe_dispatch.cu",
"iluvatar_ops/moe_reduce.cu",
"iluvatar_ops/paged_attn.cu",
@@ -594,7 +596,6 @@ def find_end_files(directory, end_str):
"gpu_ops/read_data_ipc.cu",
"gpu_ops/dequant_int8.cu",
"gpu_ops/share_external_data.cu",
"gpu_ops/extract_text_token_output.cu",
"gpu_ops/moe/tritonmoe_preprocess.cu",
"gpu_ops/moe/moe_topk_select.cu",
"gpu_ops/recover_decode_task.cu",
145 changes: 145 additions & 0 deletions docs/get_started/installation/iluvatar_gpu.md
@@ -409,3 +409,148 @@ Accuracy: 0.962
Invaild: 0.000
Latency: 17332.728 s
```

# Run the ERNIE-4.5-VL-28B-A3B-Paddle model on an Iluvatar machine

## Machine Preparation
The ERNIE-4.5-VL-28B-A3B-Paddle model runs with tensor parallelism `TP=2`, so you need to prepare a machine with the following configuration:

| CPU | Memory | Card | Hard Disk |
| :---: | :---: | :---: | :---: |
| x86 | 1 TB | 2 x BI150 | 1 TB |
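
A quick way to confirm that both BI150 cards are visible is sketched below. This is a minimal check assuming the Iluvatar driver ships the `ixsmi` utility (the Iluvatar counterpart of `nvidia-smi`); if your driver package provides a different tool, use that instead.

```bash
# Assumption: the Iluvatar CoreX driver provides the `ixsmi` tool.
# Both BI150 cards should be listed before you continue.
ixsmi
```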

## Image Preparation
Pull the Docker image

```bash
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
```
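
You can confirm that the image was pulled successfully, for example:

```bash
# List local images and filter for the Iluvatar Paddle image
docker images | grep paddle-ixuca
```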

## Container Preparation
### Start Container

```bash
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker exec -it paddle_infer bash
```

`/home/paddle` contains the model files, `*.whl` packages, and scripts.
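
A typical layout might look like the sketch below; the directory names are illustrative, and only the model directory is required by the demo scripts later in this document.

```bash
ls /home/paddle
# ERNIE-4.5-VL-28B-A3B-Paddle/     <- model weights and config used by the demo
# *.whl                            <- optional locally built wheel packages
# run_demo_vl.sh run_demo_vl.py    <- demo scripts from this document
```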

### Install PaddlePaddle

```bash
pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
```
For the latest PaddlePaddle version for Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/).
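
To verify the installation, you can run a quick check such as the one below. This is a minimal sketch assuming the Iluvatar plugin registers the `iluvatar_gpu` custom device type with Paddle.

```bash
# Print the Paddle version and the registered custom device types;
# the output is expected to include "iluvatar_gpu".
python3 -c "import paddle; print(paddle.__version__); print(paddle.device.get_all_custom_device_type())"
```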

### Install FastDeploy
```bash
pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.aliyun.com/pypi/simple/
```
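
You can confirm the package is visible to pip, for example:

```bash
# Show the installed FastDeploy package and its version
pip3 show fastdeploy-iluvatar-gpu
```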

## Prepare the inference demo script

The scripts are listed below:

`run_demo_vl.sh`:

```bash
#!/bin/bash
# Use the Iluvatar custom-device (XCCL) backend for Paddle
export PADDLE_XCCL_BACKEND=iluvatar_gpu
# Message queue id used by FastDeploy for inter-process communication
export INFERENCE_MSG_QUEUE_ID=232132
# Preload the Iluvatar CoreX CUDA-compatible driver library
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
# Select the rejection sampling implementation
export FD_SAMPLING_CLASS=rejection
# Enable verbose FastDeploy debug logging
export FD_DEBUG=1
python3 run_demo_vl.py
```

`run_demo_vl.py`:

```python
import io
import requests
from PIL import Image

from fastdeploy.entrypoints.llm import LLM
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer


PATH = "/home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle"
tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
            # Prompt (Chinese): "Which era does the artifact in the image belong to?"
            {"type": "text", "text": "图中的文物属于哪个年代"}
        ]
    }
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
images, videos = [], []
# Download the referenced images/videos so they can be passed as multimodal data
for message in messages:
    content = message["content"]
    if not isinstance(content, list):
        continue
    for part in content:
        if part["type"] == "image_url":
            url = part["image_url"]["url"]
            image_bytes = requests.get(url).content
            img = Image.open(io.BytesIO(image_bytes))
            images.append(img)
        elif part["type"] == "video_url":
            url = part["video_url"]["url"]
            video_bytes = requests.get(url).content
            videos.append({
                "video": video_bytes,
                "max_frames": 30
            })

sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
llm = LLM(model=PATH, tensor_parallel_size=2, max_model_len=32768, block_size=16, quantization="wint8", limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
outputs = llm.generate(prompts={
    "prompt": prompt,
    "multimodal_data": {
        "image": images,
        "video": videos
    }
}, sampling_params=sampling_params)
# Print the generated results
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    reasoning_text = output.outputs.reasoning_content
    print(f"generated_text={generated_text}")
```
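
Before running the demo, you may need to mark the shell script as executable:

```bash
chmod +x run_demo_vl.sh
```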

## Run the demo

```bash
./run_demo_vl.sh
```

The following logs will be printed:

```
[2025-09-23 10:13:10,844] [ INFO] - Using download source: huggingface
[2025-09-23 10:13:10,844] [ INFO] - loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/preprocessor_config.json
[2025-09-23 10:13:10,845] [ INFO] - Using download source: huggingface
[2025-09-23 10:13:10,845] [ INFO] - Loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/generation_config.json
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or
unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
warnings.warn(
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset
`top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. warnings.warn(
INFO 2025-09-23 10:13:11,969 3880245 engine.py[line:136] Waiting worker processes ready...
Loading Weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00, 1.41s/it]
Loading Layers: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00, 6.65it/s]
INFO 2025-09-23 10:15:53,672 3880245 engine.py[line:173] Worker processes are launched with 181.2426426410675 seconds.
prompts: 100%|███████████████████████████████████| 1/1 [01:52<00:00, 112.74s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
generated_text=
图中的文物是**北齐释迦牟尼佛像**,属于**北齐(公元550年-577年)**的文物。

这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
```