#include "backend.h"
#include "hardware.h"

- void huggingface::tgi::backends::InitializeBackend() {
+
+ void huggingface::tgi::backends::InitializeLogging() {
+ #ifdef NDEBUG
    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
        std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
@@ -20,11 +22,18 @@ void huggingface::tgi::backends::InitializeBackend() {
        else
            spdlog::set_level(spdlog::level::info);
    }
+ #else
+     spdlog::set_level(spdlog::level::debug);
+ #endif
+ }

+ void huggingface::tgi::backends::InitializeBackend() {
    SPDLOG_INFO("Initializing Backend...");
    nvmlInit_v2();
    initTrtLlmPlugins();

+     InitializeLogging();
+
    SPDLOG_INFO("Backend Executor Version: {}", tle::version());
    const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
    if (numGpus.has_value()) {
@@ -34,6 +43,23 @@ void huggingface::tgi::backends::InitializeBackend() {
    }
}

+ [[nodiscard]]
+ tle::ParallelConfig
+ huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
+     auto mode = tle::CommunicationMode::kLEADER;
+     std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
+
+     if (worldSize > 1) {
+         SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
+         mode = tle::CommunicationMode::kORCHESTRATOR;
+         orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
+     } else {
+         SPDLOG_INFO("Detected single engine deployment, using leader mode");
+     }
+
+     return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
+ }
+
[[nodiscard]]
tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
    tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
@@ -42,29 +68,13 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
    const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();

    // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
-     if (config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1) {
-         SPDLOG_INFO("Detected single engine deployment, using leader mode");
-         execConfig.setParallelConfig(tle::ParallelConfig(
-                 tle::CommunicationType::kMPI,
-                 tle::CommunicationMode::kLEADER,
-                 std::nullopt,
-                 std::nullopt,
-                 std::nullopt
-         ));
-     } else { // Multiple engines -> using orchestrator mode (MPI involved)
-         SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
-         execConfig.setParallelConfig(tle::ParallelConfig(
-                 tle::CommunicationType::kMPI,
-                 tle::CommunicationMode::kORCHESTRATOR,
-                 std::nullopt,
-                 std::nullopt,
-                 tle::OrchestratorConfig(true, workerPath, nullptr, true)
-         ));
-     }
+     const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
+     execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath));

    // Define some configuration variables
    execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
-     execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere());
+     execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere());
+     execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
    return execConfig;
}

@@ -93,28 +103,66 @@ tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
    );
}

+ std::optional<std::list<std::vector<huggingface::tgi::backends::TokenId>>>
+ huggingface::tgi::backends::GetStopWordsFromConfig(
+         const std::filesystem::path &generationConfigPath) noexcept {
+     if (exists(generationConfigPath)) {
+         const auto generationConfig = json::parse(std::ifstream(generationConfigPath));
+         if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) {
+             SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size());
+             std::list<std::vector<huggingface::tgi::backends::TokenId>> stopWords(eosTokenIds.size());
+
+             const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type {
+                 return {tokenIdObj.template get<tle::TokenIdType>()};
+             };
+
+             std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token);
+             return stopWords;
+         } else {
+             SPDLOG_INFO("Invalid EOS tokens entry found (not an array)");
+         }
+     } else {
+         SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist");
+     }
+
+     return std::nullopt;
+ }
+
huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
        const std::filesystem::path &enginesFolder,
        const std::filesystem::path &executorWorker
) :
        config(json::parse(std::ifstream(enginesFolder / "config.json"))),
        executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY,
                 GetExecutorConfig(config, executorWorker.string())) {
-     SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref<const std::string &>());
+
+     SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get<std::string_view>());
+
+     // Ensure we have enough GPUs on the system
+     const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
+     const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0);
+     if (numGpus < worldSize) {
+         SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize);
+         // todo: raise exception to catch on rust side
+     }

    // Cache variables
    maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<uint32_t>();
+
+     // Attempt to discover stopWords from the generation_config.json
+     const auto generationConfigPath = enginesFolder / "generation_config.json";
+     stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list<std::vector<TokenId>>());
}

[[nodiscard("Returned number of requests needs to be consumed")]]
size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
+ #ifdef NDEBUG
+     return executor.getNumResponsesReady();
+ #else
    const auto numResponses = executor.getNumResponsesReady();
-
- #ifndef NDEBUG
-     if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
- #endif
-
+     if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
    return numResponses;
+ #endif
}

[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
@@ -124,8 +172,8 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        const int32_t topK,
        const float_t topP,
        const float_t temperature,
-         const float_t repetition_penalty,
-         const float_t frequency_penalty,
+         const float_t repetitionPenalty,
+         const float_t frequencyPenalty,
        const uint64_t seed
) {
    const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast<uint32_t>(maxNumTokens - tokens.size()));
@@ -135,14 +183,19 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        const auto &lastIteration = iterations.front();

        SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests);
-         SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
+         SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
        SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
    }
#endif

-     const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
-     const auto maxNewTokensChecked_ = static_cast<tle::SizeType32>(maxNewTokensChecked);
-     return executor.enqueueRequest(tle::Request{tokens, maxNewTokensChecked_, true, sampling, OUTPUT_CONFIG});
+     const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
+
+     // Build the request
+     auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG};
+     request.setStopWords(stopWords);
+
+     // Submit to the executor for batching
+     return executor.enqueueRequest(request);
}

std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() {
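
For reference, the stop-words logic added above maps each "eos_token_id" entry of generation_config.json to its own single-token stop sequence. Below is a minimal standalone sketch of that mapping, not part of this commit: it assumes nlohmann/json is available, stands in int32_t for TokenId, and the helper name StopWordsFromEosIds is hypothetical.

#include <cstdint>
#include <iostream>
#include <list>
#include <vector>

#include <nlohmann/json.hpp>

using TokenId = int32_t;  // stand-in for huggingface::tgi::backends::TokenId

// Turn each entry of "eos_token_id" into a single-token stop sequence,
// mirroring the spirit of GetStopWordsFromConfig in the diff above.
std::list<std::vector<TokenId>> StopWordsFromEosIds(const nlohmann::json &generationConfig) {
    std::list<std::vector<TokenId>> stopWords;
    if (const auto it = generationConfig.find("eos_token_id"); it != generationConfig.end() && it->is_array()) {
        for (const auto &tokenId : *it)
            stopWords.push_back({tokenId.get<TokenId>()});
    }
    return stopWords;  // empty when the key is missing or not an array
}

int main() {
    const auto config = nlohmann::json::parse(R"({"eos_token_id": [128000, 128008, 128009]})");
    for (const auto &word : StopWordsFromEosIds(config))
        std::cout << word.front() << '\n';  // each EOS id becomes one stop word
}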