
Commit 66ef1df

Authored by kaiyux and Lokiiiiii

Update TensorRT-LLM (NVIDIA#1492)

* Update TensorRT-LLM

---------

Co-authored-by: Loki <[email protected]>

1 parent 71d8d4d commit 66ef1df

File tree

319 files changed: +21366 -37304 lines


.gitignore (+2)

@@ -32,6 +32,8 @@ cpp/.ccache/
 tensorrt_llm/libs
 tensorrt_llm/bindings.pyi
 tensorrt_llm/bindings/*.pyi
+*docs/cpp_docs*
+*docs/source/_cpp_gen*

 # Testing
 .coverage.*

3rdparty/cutlass

Submodule cutlass updated 548 files

README.md (+15 -445)

Large diffs are not rendered by default.

benchmarks/cpp/README.md (+3 -7)

@@ -225,23 +225,19 @@ python examples/llama/convert_checkpoint.py --model_dir ${MODEL_CHECKPOINT} \
                              --output_dir ${CONVERTED_CHECKPOINT} \
                              --dtype ${DTYPE} \
                              --tp_size ${TP} \
-                             --pp_size 1 \
-                             --lora_target_modules attn_qkv \
-                             --max_lora_rank ${MAX_LORA_RANK}
+                             --pp_size 1

 ${HOME}/.local/bin/trtllm-build \
     --checkpoint_dir ${CONVERTED_CHECKPOINT} \
     --output_dir ${LORA_ENGINE} \
     --max_batch_size ${MAX_BATCH} \
     --max_input_len $MAX_LEN \
     --max_output_len $MAX_LEN \
-    --gpt_attention_plugin float16 \
-    --paged_kv_cache enable \
-    --remove_input_padding enable \
     --gemm_plugin float16 \
     --lora_plugin float16 \
     --use_paged_context_fmha enable \
-    --use_custom_all_reduce disable
+    --lora_target_modules attn_qkv \
+    --max_lora_rank ${MAX_LORA_RANK}

 NUM_LORAS=(8 16 24 32 64 128 256)
 NUM_REQUESTS=1024

benchmarks/cpp/gptSessionBenchmark.cpp (+8)

@@ -14,6 +14,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*****************************************************************************
+ *
+ * GptSession is going to be deprecated soon.
+ * Please do not add new functionality in this file!
+ *
+ *****************************************************************************/
+
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/mpiUtils.h"
 #include "tensorrt_llm/plugins/api/tllmPlugin.h"

benchmarks/python/allowed_configs.py (+33)

@@ -1127,6 +1127,39 @@ class ModelConfig:
                                          max_output_len=200,
                                          builder_opt=None,
                                      )),
+    "qwen1.5_7b_chat":
+    ModelConfig(name="qwen1.5_7b_chat",
+                family="qwen2",
+                benchmark_type="gpt",
+                build_config=BuildConfig(num_layers=32,
+                                         num_heads=32,
+                                         hidden_size=4096,
+                                         vocab_size=151936,
+                                         hidden_act='silu',
+                                         n_positions=8192,
+                                         inter_size=11008,
+                                         max_batch_size=128,
+                                         max_input_len=512,
+                                         max_output_len=200,
+                                         builder_opt=None,
+                                         bias=False)),
+    "qwen1.5_14b_chat":
+    ModelConfig(name="qwen1.5_14b_chat",
+                family="qwen2",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=40,
+                    num_heads=40,
+                    hidden_size=5120,
+                    vocab_size=152064,
+                    hidden_act='silu',
+                    n_positions=8192,
+                    inter_size=13696,
+                    max_batch_size=64,
+                    max_input_len=512,
+                    max_output_len=200,
+                    builder_opt=None,
+                )),
     "mamba_2.8b":
     ModelConfig(name="mamba_2.8b",
                 family="mamba",

benchmarks/python/build.py (+46)

@@ -232,6 +232,7 @@ def build_gpt(args):
         builder_config_extra_kwargs['mamba_expand'] = build_config[
             'mamba_expand']
         builder_config_extra_kwargs['max_beam_width'] = max_beam_width
+        builder_config_extra_kwargs['layer_types'] = ['recurrent']
     builder_config = builder.create_builder_config(
         name=args.model,
         precision=args.dtype,
@@ -715,6 +716,51 @@ def build_gpt(args):
             build_config["moe_num_experts"],
             'moe_top_k':
             build_config["moe_top_k"],
+            'qwen_type':
+            'qwen',
+        }
+        config = PretrainedConfig.from_dict(config)
+        tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
+    elif family == "qwen2":
+        config = {
+            'architecture':
+            'QWenForCausalLM',
+            'dtype':
+            args.dtype,
+            'num_hidden_layers':
+            build_config['num_layers'],
+            'num_attention_heads':
+            build_config['num_heads'],
+            'num_key_value_heads':
+            build_config['num_heads'] if build_config['num_kv_heads'] is None
+            else build_config['num_kv_heads'],
+            'hidden_size':
+            build_config['hidden_size'],
+            'intermediate_size':
+            build_config['inter_size'],
+            'vocab_size':
+            build_config['vocab_size'],
+            'position_embedding_type':
+            'rope_gpt_neox',
+            'max_position_embeddings':
+            build_config['n_positions'],
+            'hidden_act':
+            build_config['hidden_act'],
+            'quantization': {
+                'group_size': 128,
+                'quant_algo': quant_algo,
+                'kv_cache_quant_algo': kv_cache_quant_algo
+            },
+            'mapping': {
+                'world_size': world_size,
+                'tp_size': world_size
+            },
+            'moe_num_experts':
+            build_config["moe_num_experts"],
+            'moe_top_k':
+            build_config["moe_top_k"],
+            'qwen_type':
+            'qwen2',
         }
         config = PretrainedConfig.from_dict(config)
         tensorrt_llm_model = tensorrt_llm.models.QWenForCausalLM(config)
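
Viewed on its own, the functional core of the new qwen2 branch is the grouped-query-attention fallback: num_key_value_heads falls back to num_heads when num_kv_heads is unset. Below is a standalone sketch of that dict assembly using plain dicts and no TensorRT-LLM imports; quant_algo, kv_cache_quant_algo, and world_size are stand-ins for variables defined earlier in build_gpt, and the function name is hypothetical.

    def make_qwen2_config(build_config: dict, dtype: str, world_size: int = 1,
                          quant_algo=None, kv_cache_quant_algo=None) -> dict:
        # Sketch of the dict handed to PretrainedConfig.from_dict() for family == "qwen2".
        num_kv_heads = (build_config['num_heads']
                        if build_config.get('num_kv_heads') is None
                        else build_config['num_kv_heads'])
        return {
            'architecture': 'QWenForCausalLM',
            'dtype': dtype,
            'num_hidden_layers': build_config['num_layers'],
            'num_attention_heads': build_config['num_heads'],
            'num_key_value_heads': num_kv_heads,
            'hidden_size': build_config['hidden_size'],
            'intermediate_size': build_config['inter_size'],
            'vocab_size': build_config['vocab_size'],
            'position_embedding_type': 'rope_gpt_neox',
            'max_position_embeddings': build_config['n_positions'],
            'hidden_act': build_config['hidden_act'],
            'quantization': {
                'group_size': 128,
                'quant_algo': quant_algo,
                'kv_cache_quant_algo': kv_cache_quant_algo,
            },
            'mapping': {'world_size': world_size, 'tp_size': world_size},
            'moe_num_experts': build_config.get('moe_num_experts', 0),
            'moe_top_k': build_config.get('moe_top_k', 0),
            'qwen_type': 'qwen2',
        }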

cpp/include/tensorrt_llm/batch_manager/GptManager.h (+8 -4)

@@ -21,7 +21,7 @@
 #include "tensorrt_llm/batch_manager/llmRequest.h"
 #include "tensorrt_llm/batch_manager/schedulerPolicy.h"
 #include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
-#include "tensorrt_llm/runtime/gptModelConfig.h"
+#include "tensorrt_llm/runtime/modelConfig.h"
 #include "tensorrt_llm/runtime/worldConfig.h"

 #include <atomic>
@@ -79,17 +79,21 @@ class GptManager
     virtual ~GptManager();

 protected:
+    /* Synchronizes the decoder */
+    virtual BatchManagerErrorCode_t forwardSync();
+
     /* Invokes one step of backend
        Updates state of all requests */
-    virtual BatchManagerErrorCode_t step(RequestList& activeRequests, std::set<uint64_t>& activeRequestsIds);
+    virtual BatchManagerErrorCode_t forwardAsync(
+        RequestList& activeRequests, std::unordered_set<uint64_t>& activeRequestsIds);

 private:
     [[nodiscard]] SizeType getMaxInputLen() const;
     [[nodiscard]] SizeType getMaxSequenceLen() const;
     [[nodiscard]] SizeType getMaxNumSequences() const;

     void validateLlmRequest(
-        LlmRequest& newReq, runtime::GptModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
+        LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
     static std::shared_ptr<LlmRequest> fillLlmRequest(std::shared_ptr<InferenceRequest> newReq);
     static std::shared_ptr<std::vector<TokenIdType>> getReqInputTokens(std::shared_ptr<InferenceRequest> newReq);
     static SizeType getMaxNewTokens(std::shared_ptr<InferenceRequest> newReq);
@@ -108,7 +112,7 @@ class GptManager
     // List of live requests
     RequestList mActiveRequests;
     // IDs of live requests
-    std::set<uint64_t> mActiveRequestsIds;
+    std::unordered_set<uint64_t> mActiveRequestsIds;
     // Boolean that controls if prompt should be included in output tokens for non-streaming
     bool mExcludeInputInOutput;

cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h (+2)

@@ -63,6 +63,8 @@ class KvCacheConfig
             && hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
     }

+    friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);
+
     std::optional<SizeType> maxTokens;
     std::optional<SizeType> maxAttentionWindow;
     std::optional<SizeType> sinkTokenLength;
