2 changes: 1 addition & 1 deletion backends/llamacpp/src/main.rs
@@ -119,7 +119,7 @@ struct Args {
#[clap(default_value = "3000", long, short, env)]
port: u16,

#[clap(default_value = "9000", long, short, env)]
#[clap(default_value = "9000", long, env)]
prometheus_port: u16,

/// Enable JSON output format.
38 changes: 28 additions & 10 deletions backends/trtllm/csrc/backend.cpp
@@ -26,7 +26,7 @@ namespace huggingface::tgi::backends::trtllm {
}


tle::ExecutorConfig backend_workspace_t::executor_config() const {
tle::ExecutorConfig backend_workspace_t::executor_config(const std::vector<std::string>& encoded_vocab, std::string_view tokenizer_str) const {
// Retrieve the compute capabilities to enable some options at runtime
const auto compute_capabilities = hardware::cuda::compute_capabilities_t();

@@ -40,32 +40,50 @@ namespace huggingface::tgi::backends::trtllm {
executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
executor_config.setGuidedDecodingConfig(tle::GuidedDecodingConfig(
tle::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR,
encoded_vocab,
std::string(tokenizer_str),
generation_config().eos_token_ids
));
return executor_config;
}

backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
: workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

size_t backend_t::num_tokens_ready() const noexcept {
return executor_.getNumResponsesReady();
}
backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str)
: workspace(engines_folder, executor_worker_path),
executor_(executor_factory_initializer(workspace, encoded_vocab, tokenizer_str)) {}

std::expected<request_id_t, backend_error_t>
backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t g_params,
const sampling_params_t s_params) noexcept {
SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params);
return executor_.enqueueRequest(tle::Request{
tle::Request req {
{token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
static_cast<tle::SizeType32>(g_params.max_new_tokens),
true,
(tle::SamplingConfig) s_params,
tle::OutputConfig{ /* returnLogProbs= */ true},
tle::OutputConfig{
/* returnLogProbs= */ true,
false,
false,
false,
false,
/* returnPerfMetrics=*/ true,
},
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt,
workspace.generation_config().stop_words
});
};

if (g_params.guide_type.has_value()) {
req.setGuidedDecodingParams(tle::GuidedDecodingParams(
g_params.guide_type.value(),
g_params.guide
));
}
return executor_.enqueueRequest(req);
}

std::vector<tle::Response> backend_t::pull_tokens() noexcept {
43 changes: 26 additions & 17 deletions backends/trtllm/csrc/backend.hpp
@@ -25,6 +25,8 @@ namespace huggingface::tgi::backends::trtllm {
*/
struct generation_params_t {
uint32_t max_new_tokens;
std::optional<tle::GuidedDecodingParams::GuideType> guide_type;
std::string guide;
};

/**
@@ -66,17 +68,31 @@ namespace huggingface::tgi::backends::trtllm {
float_t top_p;
float_t temperature;
std::list<std::vector<int32_t>> stop_words;
std::vector<int32_t> eos_token_ids;

constexpr explicit generation_config_t(const json &config) :
top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0), eos_token_ids{} {
if (!config.contains("/eos_token_id"_json_pointer)) {
return;
}
if (config["/eos_token_id"_json_pointer].is_array()) {
SPDLOG_DEBUG("generation config eos_token_id is array");
const auto &eos_token_id = config["/eos_token_id"_json_pointer];
std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
stop_words.emplace_back(1, token_id.template get<int32_t>());
const auto token = token_id.template get<int32_t>();
stop_words.emplace_back(1, token);
eos_token_ids.emplace_back(token);
});
}

SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
if (config["/eos_token_id"_json_pointer].is_number()) {
SPDLOG_DEBUG("generation config eos_token_id is number");
const auto token = config["/eos_token_id"_json_pointer].get<int32_t>();
stop_words.emplace_back(1, token);
eos_token_ids.emplace_back(token);
}

SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
}
};
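
Note: with the change above, `generation_config_t` accepts both shapes of `eos_token_id` found in `generation_config.json` (an array of ids or a single number) and mirrors them into both `stop_words` and the new `eos_token_ids` member. A minimal sketch of the two shapes, not part of the change; the include path and the use of asserts are assumptions:

```cpp
// Illustrative only: exercise the revised constructor with both config shapes.
#include <cassert>
#include <vector>
#include <nlohmann/json.hpp>
#include "backend.hpp"  // assumed include path for generation_config_t

using json = nlohmann::json;
using huggingface::tgi::backends::trtllm::generation_config_t;

int main() {
    // Array form: every id becomes a single-token stop word and an eos token id.
    const auto cfg_array = json::parse(R"({"eos_token_id": [2, 32000], "top_p": 0.95})");
    const generation_config_t from_array{cfg_array};
    assert(from_array.eos_token_ids.size() == 2);
    assert(from_array.stop_words.size() == 2);

    // Scalar form (newly handled): a single number is treated the same way.
    const auto cfg_scalar = json::parse(R"({"eos_token_id": 2})");
    const generation_config_t from_scalar{cfg_scalar};
    assert(from_scalar.eos_token_ids == std::vector<int32_t>{2});
    assert(from_scalar.stop_words.size() == 1);
    return 0;
}
```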

@@ -134,7 +150,7 @@ namespace huggingface::tgi::backends::trtllm {
* to initialize `tensorrt_llm::executor::Executor`
* @return `tensorrt_llm::executor::ExecutorConfig` instance
*/
[[nodiscard]] tle::ExecutorConfig executor_config() const;
[[nodiscard]] tle::ExecutorConfig executor_config(const std::vector<std::string>& encoded_vocab, std::string_view tokenizer_str) const;
};

/**
@@ -158,10 +174,10 @@ namespace huggingface::tgi::backends::trtllm {
tle::Executor executor_;

public:
backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);
backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str);

backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
: backend_t(engines_folder, executor_worker_path) {};
backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str)
: backend_t(engines_folder, executor_worker_path, encoded_vocab, tokenizer_str) {};

/**
* Submit a new request to the executor
@@ -175,13 +191,6 @@ namespace huggingface::tgi::backends::trtllm {
submit(std::span<const token_id_t> token_ids, generation_params_t generation_params,
sampling_params_t sampling_params) noexcept;

/**
* Query the number of tokens available across all in-flight generations
* @return
*/
[[nodiscard("Pulling out the number of tokens")]]
size_t num_tokens_ready() const noexcept;

/**
* Pull out newly generated tokens from the executor
* @return
@@ -199,9 +208,9 @@ namespace huggingface::tgi::backends::trtllm {
/**
* Create a TensorRT-LLM executor from a workspace
*/
const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
const auto executor_factory_initializer = [](const backend_workspace_t &workspace, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str) -> tle::Executor {
return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY,
workspace.executor_config()};
workspace.executor_config(encoded_vocab, tokenizer_str)};
};
}

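Note: to make the new surface concrete, here is a hypothetical caller-side sketch of a JSON-schema-guided submission through `backend_t`. It is not part of the diff: the engine paths, vocabulary/tokenizer inputs, prompt tokens, and sampling values are placeholders, and the aggregate field orders follow the initializations visible in `ffi.hpp` below.

```cpp
// Hypothetical usage sketch: drive the new backend_t constructor and the
// guided-decoding fields added to generation_params_t.
#include <filesystem>
#include <string>
#include <vector>
#include "backend.hpp"  // assumed include path

using namespace huggingface::tgi::backends::trtllm;

void submit_guided(const std::vector<std::string> &encoded_vocab,  // produced elsewhere (assumed)
                   const std::string &tokenizer_json) {            // serialized tokenizer (assumed)
    std::filesystem::path engines = "/data/engines";                  // placeholder
    std::filesystem::path worker  = "/usr/local/bin/executorWorker";  // placeholder

    // New constructor: the vocabulary and serialized tokenizer feed the
    // xgrammar guided-decoding configuration of the executor.
    backend_t backend{engines, worker, encoded_vocab, tokenizer_json};

    const std::vector<token_id_t> prompt{1, 15043, 3186};  // placeholder token ids

    // guide_type / guide are the new generation_params_t fields; leaving the
    // optional empty keeps the previous, unguided behaviour.
    const generation_params_t g_params{
        /* max_new_tokens = */ 128,
        /* guide_type     = */ tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA,
        /* guide          = */ R"({"type": "object"})",
    };
    const sampling_params_t s_params{
        /* top_k = */ 50, /* top_p = */ 0.9f, /* repetition_penalty = */ 1.0f,
        /* frequency_penalty = */ 0.0f, /* temperature = */ 1.0f, /* seed = */ 42,
    };

    if (const auto maybe_id = backend.submit(prompt, g_params, s_params); maybe_id.has_value()) {
        // Generated tokens are later retrieved through backend.pull_tokens().
    }
}
```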
121 changes: 91 additions & 30 deletions backends/trtllm/csrc/ffi.hpp
@@ -1,7 +1,10 @@
#ifndef TGI_BACKEND_TRTLLM_FFI
#define TGI_BACKEND_TRTLLM_FFI

#include <chrono>
#include <exception>
#include <memory>
#include <optional>
#include <thread>

#include <nvml.h>
@@ -17,7 +20,7 @@ namespace rust::behavior {
template<typename Try, typename Fail>
static void trycatch(Try &&func, Fail &&fail) noexcept try {
func();
} catch (tensorrt_llm::common::TllmException &e) {
} catch (const std::exception &e) {
fail(e.what());
}
}
@@ -42,22 +45,46 @@ namespace huggingface::tgi::backends::trtllm {
return finish_reason_t::kEND_ID;
case tle::FinishReason::kLENGTH:
return finish_reason_t::kLENGTH;
case tle::FinishReason::kTIMED_OUT:
return finish_reason_t::kTIMED_OUT;
case tle::FinishReason::kCANCELLED:
return finish_reason_t::kCANCELLED;
default:
std::unreachable();
}
}

static auto as_generation_step = [](const tle::Response &r) {
static auto as_generation_step = [](const tle::Response &r, const std::chrono::time_point<std::chrono::steady_clock> created) {
const auto reqId = r.getRequestId();
if (!r.hasError()) [[likely]] {
const auto result = r.getResult();
const auto logits = result.logProbs.value()[0];
std::optional<uint32_t> token_id = std::nullopt;
if (!result.outputTokenIds.empty() && !result.outputTokenIds[0].empty()) {
token_id = static_cast<uint32_t>(result.outputTokenIds[0][0]);
}

std::optional<float> log_prob = std::nullopt;
if (result.logProbs && !result.logProbs->empty() && !result.logProbs.value()[0].empty()) {
log_prob = result.logProbs.value()[0].back();
}

std::optional<int64_t> first_scheduled_time_ns = std::nullopt;
if (result.requestPerfMetrics) {
const auto &t = result.requestPerfMetrics->timingMetrics;
const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t.firstScheduledTime - created).count();
first_scheduled_time_ns = static_cast<int64_t>(ns);
}

return generation_step_t{
reqId,
static_cast<uint32_t>(result.outputTokenIds[0][0]),
logits.back(),
token_id.value_or(0),
log_prob.value_or(0.0),
first_scheduled_time_ns.value_or(0),
result.isFinal,
as_finish_reason_t(result.finishReasons[0]),
token_id.has_value(),
log_prob.has_value(),
first_scheduled_time_ns.has_value(),
false,
std::string()
};
@@ -66,8 +93,12 @@ namespace huggingface::tgi::backends::trtllm {
reqId,
0,
0.0,
0,
true,
finish_reason_t::kNOT_FINISHED,
false,
false,
false,
true,
std::move(r.getErrorMsg())
};
@@ -77,13 +108,18 @@ namespace huggingface::tgi::backends::trtllm {

class tensorrt_llm_backend_t {
private:
backend_t inner_;
mutable backend_t inner_;

// m_created_time is a reference point to convert time from c++ time_point
// to rust Instant.
std::chrono::time_point<std::chrono::steady_clock> m_created_time;

public:
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
: inner_(engine_folder, executor_worker_path) {}

size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); }
public:
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path, const std::chrono::time_point<std::chrono::steady_clock>& created_time, const std::vector<std::string>& encoded_vocab, std::string_view tokenizer_str)
: inner_(engine_folder, executor_worker_path, encoded_vocab, tokenizer_str),
m_created_time {created_time}
{}

request_id_t submit(
rust::Slice<const uint32_t> tokens,
@@ -93,16 +129,31 @@ namespace huggingface::tgi::backends::trtllm {
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed
) {
uint64_t seed,
grammar_type_t grammar_type,
rust::Str grammar_value
) const {
// This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));

// Submit the request to the executor and get back a potential request_id used to track request status
const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());

std::optional<tle::GuidedDecodingParams::GuideType> guide_type = std::nullopt;
switch (grammar_type) {
case grammar_type_t::kJSON:
guide_type = tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA;
break;
case grammar_type_t::kREGEX:
guide_type = tle::GuidedDecodingParams::GuideType::kREGEX;
break;
default:
break;
}

const auto maybe_request_id = inner_.submit(
signed_tokens,
{max_new_tokens},
{max_new_tokens, guide_type, std::string(grammar_value)},
{top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
);

@@ -115,28 +166,26 @@ namespace huggingface::tgi::backends::trtllm {
}
}

std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
if (num_tokens_ready() > 0) [[likely]] {
const auto responses = inner_.pull_tokens();
std::unique_ptr<std::vector<generation_step_t>> pull_tokens() const noexcept {
const auto responses = inner_.pull_tokens();

SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());

// Transform tle::Response to generation_step_t
auto f = [this](const tle::Response &r){
return as_generation_step(r, m_created_time);
};
auto steps = std::make_unique<std::vector<generation_step_t>>();
// Transform tle::Response to generation_step_t
#ifdef __cpp_lib_ranges_to_container
auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to<std::vector>();
*steps = responses | std::views::transform(f) | std::ranges::to<std::vector>();
#else
auto steps = std::vector<generation_step_t>();
steps.reserve(responses.size());
std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step);
steps->reserve(responses.size());
std::transform(responses.begin(), responses.end(), std::back_inserter(steps), f);
#endif
return std::make_unique<std::vector<generation_step_t>>(steps);

} else {
return std::make_unique<std::vector<generation_step_t>>();
}
return steps;
}

void cancel(request_id_t request_id) noexcept {
void cancel(request_id_t request_id) const noexcept {
SPDLOG_DEBUG("[FFI] cancelling request {:d}", request_id);
inner_.cancel(request_id);
}
@@ -178,13 +227,25 @@ namespace huggingface::tgi::backends::trtllm {
}

std::unique_ptr<tensorrt_llm_backend_t>
create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path, const rust::Str tokenizer_str, const rust::Vec<rust::String> encoded_vocab) {
const auto created_time = std::chrono::steady_clock::now();
std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);

std::vector<std::string> encoded_vocab_std{};
encoded_vocab_std.reserve(encoded_vocab.size());

for (const auto& v : encoded_vocab) {
encoded_vocab_std.push_back(std::string(v));
}

return std::make_unique<tensorrt_llm_backend_t>(
std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()),
std::filesystem::path::format::auto_format),
std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()),
std::filesystem::path::format::auto_format)
std::filesystem::path::format::auto_format),
created_time,
encoded_vocab_std,
std::string_view(tokenizer_str)
);
}
}
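Note: the FFI layer never ships an absolute timestamp; `firstScheduledTime` is converted to a signed nanosecond offset from `m_created_time`, and the Rust side is assumed to hold the matching `Instant` captured at backend creation. A small sketch of that convention (the helper name is illustrative):

```cpp
// Sketch of the time-bridging convention used in pull_tokens(): both sides agree
// on one reference point and only a nanosecond offset crosses the FFI boundary.
#include <chrono>
#include <cstdint>

using time_point_t = std::chrono::time_point<std::chrono::steady_clock>;

int64_t offset_ns_since(time_point_t event, time_point_t reference) {
    return std::chrono::duration_cast<std::chrono::nanoseconds>(event - reference).count();
}

// In as_generation_step the offset is effectively
//   offset_ns_since(result.requestPerfMetrics->timingMetrics.firstScheduledTime, m_created_time)
// and a consumer holding the reference point can rebuild an absolute time as
// reference + offset (assuming the event happened after the reference).
```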
4 changes: 4 additions & 0 deletions backends/trtllm/src/errors.rs
@@ -19,4 +19,8 @@ pub enum TensorRtLlmBackendError {
WebServer(#[from] server::WebServerError),
#[error("Tokio runtime failed to start: {0}")]
Tokio(#[from] std::io::Error),
#[error("config.json doesn't exist in engine folder {0}")]
ConfigNotFound(PathBuf),
#[error("generation_config.json doesn't exist in engine folder {0}")]
GenerationConfigNotFound(PathBuf),
}