2 changes: 1 addition & 1 deletion backends/llamacpp/src/main.rs
@@ -119,7 +119,7 @@ struct Args {
#[clap(default_value = "3000", long, short, env)]
port: u16,

#[clap(default_value = "9000", long, short, env)]
#[clap(default_value = "9000", long, env)]
prometheus_port: u16,

/// Enable JSON output format.
38 changes: 28 additions & 10 deletions backends/trtllm/csrc/backend.cpp
@@ -26,7 +26,7 @@ namespace huggingface::tgi::backends::trtllm {
}


tle::ExecutorConfig backend_workspace_t::executor_config() const {
tle::ExecutorConfig backend_workspace_t::executor_config(const std::vector<std::string>& encoded_vocab, std::string_view tokenizer_str) const {
// Retrieve the compute capabilities to enable some options at runtime
const auto compute_capabilities = hardware::cuda::compute_capabilities_t();

@@ -40,32 +40,50 @@ namespace huggingface::tgi::backends::trtllm {
executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
executor_config.setGuidedDecodingConfig(tle::GuidedDecodingConfig(
tle::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR,
encoded_vocab,
std::string(tokenizer_str),
generation_config().eos_token_ids
));
return executor_config;
}

backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
: workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

size_t backend_t::num_tokens_ready() const noexcept {
return executor_.getNumResponsesReady();
}
backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str)
: workspace(engines_folder, executor_worker_path),
executor_(executor_factory_initializer(workspace, encoded_vocab, tokenizer_str)) {}

std::expected<request_id_t, backend_error_t>
backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t g_params,
const sampling_params_t s_params) noexcept {
SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params);
return executor_.enqueueRequest(tle::Request{
tle::Request req {
{token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
static_cast<tle::SizeType32>(g_params.max_new_tokens),
true,
(tle::SamplingConfig) s_params,
tle::OutputConfig{ /* returnLogProbs= */ true},
tle::OutputConfig{
/* returnLogProbs= */ true,
false,
false,
false,
false,
/* returnPerfMetrics=*/ true,
},
std::nullopt,
std::nullopt,
std::nullopt,
std::nullopt,
workspace.generation_config().stop_words
});
};

if (g_params.guide_type.has_value()) {
req.setGuidedDecodingParams(tle::GuidedDecodingParams(
g_params.guide_type.value(),
g_params.guide
));
}
return executor_.enqueueRequest(req);
}

std::vector<tle::Response> backend_t::pull_tokens() noexcept {
43 changes: 26 additions & 17 deletions backends/trtllm/csrc/backend.hpp
@@ -25,6 +25,8 @@ namespace huggingface::tgi::backends::trtllm {
*/
struct generation_params_t {
uint32_t max_new_tokens;
std::optional<tle::GuidedDecodingParams::GuideType> guide_type;
std::string guide;
};

/**
@@ -66,17 +68,31 @@ namespace huggingface::tgi::backends::trtllm {
float_t top_p;
float_t temperature;
std::list<std::vector<int32_t>> stop_words;
std::vector<int32_t> eos_token_ids;

constexpr explicit generation_config_t(const json &config) :
top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0), eos_token_ids{} {
if (!config.contains("/eos_token_id"_json_pointer)) {
return;
}
if (config["/eos_token_id"_json_pointer].is_array()) {
SPDLOG_DEBUG("generation config eos_token_id is array");
const auto &eos_token_id = config["/eos_token_id"_json_pointer];
std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
stop_words.emplace_back(1, token_id.template get<int32_t>());
const auto token = token_id.template get<int32_t>();
stop_words.emplace_back(1, token);
eos_token_ids.emplace_back(token);
});
}

SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
if (config["/eos_token_id"_json_pointer].is_number()) {
SPDLOG_DEBUG("generation config eos_token_id is number");
const auto token = config["/eos_token_id"_json_pointer].get<int32_t>();
stop_words.emplace_back(1, token);
eos_token_ids.emplace_back(token);
}

SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
}
};
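
Note: with the change above, `generation_config_t` accepts both shapes of `eos_token_id` found in `generation_config.json` (an array of ids or a single number) and mirrors them into both `stop_words` and the new `eos_token_ids` member. A minimal sketch of the two shapes, not part of the change; the include path and the use of asserts are assumptions:

```cpp
// Illustrative only: exercise the revised constructor with both config shapes.
#include <cassert>
#include <vector>
#include <nlohmann/json.hpp>
#include "backend.hpp"  // assumed include path for generation_config_t

using json = nlohmann::json;
using huggingface::tgi::backends::trtllm::generation_config_t;

int main() {
    // Array form: every id becomes a single-token stop word and an eos token id.
    const auto cfg_array = json::parse(R"({"eos_token_id": [2, 32000], "top_p": 0.95})");
    const generation_config_t from_array{cfg_array};
    assert(from_array.eos_token_ids.size() == 2);
    assert(from_array.stop_words.size() == 2);

    // Scalar form (newly handled): a single number is treated the same way.
    const auto cfg_scalar = json::parse(R"({"eos_token_id": 2})");
    const generation_config_t from_scalar{cfg_scalar};
    assert(from_scalar.eos_token_ids == std::vector<int32_t>{2});
    assert(from_scalar.stop_words.size() == 1);
    return 0;
}
```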

@@ -134,7 +150,7 @@ namespace huggingface::tgi::backends::trtllm {
* to initialize `tensorrt_llm::executor::Executor`
* @return `tensorrt_llm::executor::ExecutorConfig` instance
*/
[[nodiscard]] tle::ExecutorConfig executor_config() const;
[[nodiscard]] tle::ExecutorConfig executor_config(const std::vector<std::string>& encoded_vocab, std::string_view tokenizer_str) const;
};

/**
@@ -158,10 +174,10 @@ namespace huggingface::tgi::backends::trtllm {
tle::Executor executor_;

public:
backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);
backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str);

backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
: backend_t(engines_folder, executor_worker_path) {};
backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str)
: backend_t(engines_folder, executor_worker_path, encoded_vocab, tokenizer_str) {};

/**
* Submit a new request to the executor
@@ -175,13 +191,6 @@ namespace huggingface::tgi::backends::trtllm {
submit(std::span<const token_id_t> token_ids, generation_params_t generation_params,
sampling_params_t sampling_params) noexcept;

/**
* Query the number of tokens available across all in-flight generations
* @return
*/
[[nodiscard("Pulling out the number of tokens")]]
size_t num_tokens_ready() const noexcept;

/**
* Pull out newly generated tokens from the executor
* @return
@@ -199,9 +208,9 @@ namespace huggingface::tgi::backends::trtllm {
/**
* Create a TensorRT-LLM executor from a workspace
*/
const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
const auto executor_factory_initializer = [](const backend_workspace_t &workspace, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str) -> tle::Executor {
return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY,
workspace.executor_config()};
workspace.executor_config(encoded_vocab, tokenizer_str)};
};
}

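Note: to make the new surface concrete, here is a hypothetical caller-side sketch of a JSON-schema-guided submission through `backend_t`. It is not part of the diff: the engine paths, vocabulary/tokenizer inputs, prompt tokens, and sampling values are placeholders, and the aggregate field orders follow the initializations visible in `ffi.hpp` below.

```cpp
// Hypothetical usage sketch: drive the new backend_t constructor and the
// guided-decoding fields added to generation_params_t.
#include <filesystem>
#include <string>
#include <vector>
#include "backend.hpp"  // assumed include path

using namespace huggingface::tgi::backends::trtllm;

void submit_guided(const std::vector<std::string> &encoded_vocab,  // produced elsewhere (assumed)
                   const std::string &tokenizer_json) {            // serialized tokenizer (assumed)
    std::filesystem::path engines = "/data/engines";                  // placeholder
    std::filesystem::path worker  = "/usr/local/bin/executorWorker";  // placeholder

    // New constructor: the vocabulary and serialized tokenizer feed the
    // xgrammar guided-decoding configuration of the executor.
    backend_t backend{engines, worker, encoded_vocab, tokenizer_json};

    const std::vector<token_id_t> prompt{1, 15043, 3186};  // placeholder token ids

    // guide_type / guide are the new generation_params_t fields; leaving the
    // optional empty keeps the previous, unguided behaviour.
    const generation_params_t g_params{
        /* max_new_tokens = */ 128,
        /* guide_type     = */ tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA,
        /* guide          = */ R"({"type": "object"})",
    };
    const sampling_params_t s_params{
        /* top_k = */ 50, /* top_p = */ 0.9f, /* repetition_penalty = */ 1.0f,
        /* frequency_penalty = */ 0.0f, /* temperature = */ 1.0f, /* seed = */ 42,
    };

    if (const auto maybe_id = backend.submit(prompt, g_params, s_params); maybe_id.has_value()) {
        // Generated tokens are later retrieved through backend.pull_tokens().
    }
}
```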
121 changes: 91 additions & 30 deletions backends/trtllm/csrc/ffi.hpp
@@ -1,7 +1,10 @@
#ifndef TGI_BACKEND_TRTLLM_FFI
#define TGI_BACKEND_TRTLLM_FFI

#include <chrono>
#include <exception>
#include <memory>
#include <optional>
#include <thread>

#include <nvml.h>
@@ -17,7 +20,7 @@ namespace rust::behavior {
template<typename Try, typename Fail>
static void trycatch(Try &&func, Fail &&fail) noexcept try {
func();
} catch (tensorrt_llm::common::TllmException &e) {
} catch (const std::exception &e) {
fail(e.what());
}
}
@@ -42,22 +45,46 @@ namespace huggingface::tgi::backends::trtllm {
return finish_reason_t::kEND_ID;
case tle::FinishReason::kLENGTH:
return finish_reason_t::kLENGTH;
case tle::FinishReason::kTIMED_OUT:
return finish_reason_t::kTIMED_OUT;
case tle::FinishReason::kCANCELLED:
return finish_reason_t::kCANCELLED;
default:
std::unreachable();
}
}

static auto as_generation_step = [](const tle::Response &r) {
static auto as_generation_step = [](const tle::Response &r, const std::chrono::time_point<std::chrono::steady_clock> created) {
const auto reqId = r.getRequestId();
if (!r.hasError()) [[likely]] {
const auto result = r.getResult();
const auto logits = result.logProbs.value()[0];
std::optional<uint32_t> token_id = std::nullopt;
if (!result.outputTokenIds.empty() && !result.outputTokenIds[0].empty()) {
token_id = static_cast<uint32_t>(result.outputTokenIds[0][0]);
}

std::optional<float> log_prob = std::nullopt;
if (result.logProbs && !result.logProbs->empty() && !result.logProbs.value()[0].empty()) {
log_prob = result.logProbs.value()[0].back();
}

std::optional<int64_t> first_scheduled_time_ns = std::nullopt;
if (result.requestPerfMetrics) {
const auto &t = result.requestPerfMetrics->timingMetrics;
const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t.firstScheduledTime - created).count();
first_scheduled_time_ns = static_cast<int64_t>(ns);
}

return generation_step_t{
reqId,
static_cast<uint32_t>(result.outputTokenIds[0][0]),
logits.back(),
token_id.value_or(0),
log_prob.value_or(0.0),
first_scheduled_time_ns.value_or(0),
result.isFinal,
as_finish_reason_t(result.finishReasons[0]),
token_id.has_value(),
log_prob.has_value(),
first_scheduled_time_ns.has_value(),
false,
std::string()
};
@@ -66,8 +93,12 @@ namespace huggingface::tgi::backends::trtllm {
reqId,
0,
0.0,
0,
true,
finish_reason_t::kNOT_FINISHED,
false,
false,
false,
true,
std::move(r.getErrorMsg())
};
@@ -77,13 +108,18 @@ namespace huggingface::tgi::backends::trtllm {

class tensorrt_llm_backend_t {
private:
backend_t inner_;
mutable backend_t inner_;

// m_created_time is a reference point to convert time from c++ time_point
// to rust Instant.
std::chrono::time_point<std::chrono::steady_clock> m_created_time;

public:
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
: inner_(engine_folder, executor_worker_path) {}

size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); }
public:
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path, const std::chrono::time_point<std::chrono::steady_clock>& created_time, const std::vector<std::string>& encoded_vocab, std::string_view tokenizer_str)
: inner_(engine_folder, executor_worker_path, encoded_vocab, tokenizer_str),
m_created_time {created_time}
{}

request_id_t submit(
rust::Slice<const uint32_t> tokens,
@@ -93,16 +129,31 @@ namespace huggingface::tgi::backends::trtllm {
float_t temperature,
float_t repetition_penalty,
float_t frequency_penalty,
uint64_t seed
) {
uint64_t seed,
grammar_type_t grammar_type,
rust::Str grammar_value
) const {
// This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));

// Submit the request to the executor and get back a potential request_id used to track request status
const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());

std::optional<tle::GuidedDecodingParams::GuideType> guide_type = std::nullopt;
switch (grammar_type) {
case grammar_type_t::kJSON:
guide_type = tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA;
break;
case grammar_type_t::kREGEX:
guide_type = tle::GuidedDecodingParams::GuideType::kREGEX;
break;
default:
break;
}

const auto maybe_request_id = inner_.submit(
signed_tokens,
{max_new_tokens},
{max_new_tokens, guide_type, std::string(grammar_value)},
{top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
);

@@ -115,28 +166,26 @@ namespace huggingface::tgi::backends::trtllm {
}
}

std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
if (num_tokens_ready() > 0) [[likely]] {
const auto responses = inner_.pull_tokens();
std::unique_ptr<std::vector<generation_step_t>> pull_tokens() const noexcept {
const auto responses = inner_.pull_tokens();

SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());

// Transform tle::Response to generation_step_t
auto f = [this](const tle::Response &r){
return as_generation_step(r, m_created_time);
};
auto steps = std::make_unique<std::vector<generation_step_t>>();
// Transform tle::Response to generation_step_t
#ifdef __cpp_lib_ranges_to_container
auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to<std::vector>();
*steps = responses | std::views::transform(f) | std::ranges::to<std::vector>();
#else
auto steps = std::vector<generation_step_t>();
steps.reserve(responses.size());
std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step);
steps->reserve(responses.size());
std::transform(responses.begin(), responses.end(), std::back_inserter(steps), f);
#endif
return std::make_unique<std::vector<generation_step_t>>(steps);

} else {
return std::make_unique<std::vector<generation_step_t>>();
}
return steps;
}

void cancel(request_id_t request_id) noexcept {
void cancel(request_id_t request_id) const noexcept {
SPDLOG_DEBUG("[FFI] cancelling request {:d}", request_id);
inner_.cancel(request_id);
}
@@ -178,13 +227,25 @@ namespace huggingface::tgi::backends::trtllm {
}

std::unique_ptr<tensorrt_llm_backend_t>
create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path, const rust::Str tokenizer_str, const rust::Vec<rust::String> encoded_vocab) {
const auto created_time = std::chrono::steady_clock::now();
std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);

std::vector<std::string> encoded_vocab_std{};
encoded_vocab_std.reserve(encoded_vocab.size());

for (const auto& v : encoded_vocab) {
encoded_vocab_std.push_back(std::string(v));
}

return std::make_unique<tensorrt_llm_backend_t>(
std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()),
std::filesystem::path::format::auto_format),
std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()),
std::filesystem::path::format::auto_format)
std::filesystem::path::format::auto_format),
created_time,
encoded_vocab_std,
std::string_view(tokenizer_str)
);
}
}
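Note: the FFI layer never ships an absolute timestamp; `firstScheduledTime` is converted to a signed nanosecond offset from `m_created_time`, and the Rust side is assumed to hold the matching `Instant` captured at backend creation. A small sketch of that convention (the helper name is illustrative):

```cpp
// Sketch of the time-bridging convention used in pull_tokens(): both sides agree
// on one reference point and only a nanosecond offset crosses the FFI boundary.
#include <chrono>
#include <cstdint>

using time_point_t = std::chrono::time_point<std::chrono::steady_clock>;

int64_t offset_ns_since(time_point_t event, time_point_t reference) {
    return std::chrono::duration_cast<std::chrono::nanoseconds>(event - reference).count();
}

// In as_generation_step the offset is effectively
//   offset_ns_since(result.requestPerfMetrics->timingMetrics.firstScheduledTime, m_created_time)
// and a consumer holding the reference point can rebuild an absolute time as
// reference + offset (assuming the event happened after the reference).
```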
4 changes: 4 additions & 0 deletions backends/trtllm/src/errors.rs
@@ -19,4 +19,8 @@ pub enum TensorRtLlmBackendError {
WebServer(#[from] server::WebServerError),
#[error("Tokio runtime failed to start: {0}")]
Tokio(#[from] std::io::Error),
#[error("config.json doesn't exist in engine folder {0}")]
ConfigNotFound(PathBuf),
#[error("generation_config.json doesn't exist in engine folder {0}")]
GenerationConfigNotFound(PathBuf),
}