@@ -495,26 +495,40 @@
 tb_qwen_chat_7b = dict(type=TurboMindModel,
                        abbr='qwen-7b-chat-turbomind',
                        path='Qwen/Qwen-7B-Chat',
-                       engine_config=tb_engine_config_template_max_bs_16,
+                       engine_config=tb_engine_config_template_max_bs_128,
                        gen_config=qwen_gen_config_template,
                        max_out_len=MAX_NEW_TOKENS,
                        max_seq_len=MAX_SESSION_LEN,
-                       batch_size=16,
-                       concurrency=16,
+                       batch_size=128,
+                       concurrency=128,
                        meta_template=qwen_meta_template,
                        run_cfg=run_cfg_tp1_template,
                        end_str='<|im_end|>')
 
+tb_qwen_chat_7b_w4a16 = dict(
+    type=TurboMindModel,
+    abbr='qwen-7b-chat-4bits-turbomind',
+    path='Qwen/Qwen-7B-Chat-inner-4bits',
+    engine_config=tb_awq_engine_config_template_max_bs_128,
+    gen_config=qwen_gen_config_template,
+    max_out_len=MAX_NEW_TOKENS,
+    max_seq_len=MAX_SESSION_LEN,
+    batch_size=128,
+    concurrency=128,
+    meta_template=qwen_meta_template,
+    run_cfg=run_cfg_tp1_template,
+    end_str='<|im_end|>')
+
 # config for qwen-chat-7b pytorch
 pt_qwen_chat_7b = dict(type=LmdeployPytorchModel,
                        abbr='qwen-7b-chat-pytorch',
                        path='Qwen/Qwen-7B-Chat',
-                       engine_config=pt_engine_config_template_max_bs_16,
+                       engine_config=pt_engine_config_template_max_bs_64,
                        gen_config=qwen_gen_config_template,
                        max_out_len=MAX_NEW_TOKENS,
                        max_seq_len=MAX_SESSION_LEN,
-                       batch_size=16,
-                       concurrency=16,
+                       batch_size=64,
+                       concurrency=64,
                        meta_template=qwen_meta_template,
                        run_cfg=run_cfg_tp1_template,
                        end_str='<|im_end|>')
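Both qwen entries reference templates defined earlier in the file, outside the diff context. A minimal sketch of what they presumably look like, assuming the plain-dict engine configs that lmdeploy's TurboMind and PyTorch engines accept and OpenCompass's round-based meta-template format; every value below is an assumption, only the names come from the patch:

```python
# Sketch only: the real definitions sit above this hunk in the same file.
# Constants and field values are assumptions; only the template names and
# the *_max_bs_* suffixes come from the patch.
MAX_SESSION_LEN = 2048   # assumed value
MAX_NEW_TOKENS = 1024    # assumed value

tb_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN,
                                            max_batch_size=128)
# 'awq' tells TurboMind to load the 4-bit AWQ weights of the
# *-inner-4bits checkpoints.
tb_awq_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN,
                                                model_format='awq',
                                                max_batch_size=128)
pt_engine_config_template_max_bs_64 = dict(session_len=MAX_SESSION_LEN,
                                           max_batch_size=64)

# Near-greedy generation settings shared by the qwen entries.
qwen_gen_config_template = dict(top_k=1,
                                top_p=0.8,
                                temperature=1.0,
                                max_new_tokens=MAX_NEW_TOKENS)

# Qwen speaks ChatML, consistent with end_str='<|im_end|>' above.
qwen_meta_template = dict(round=[
    dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
    dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>',
         generate=True),
])
```

The intent of the patch is visible in the suffixes: `batch_size`/`concurrency` on the model dicts move from 16 to 128 (64 for the PyTorch engine), and the swapped-in templates raise the engine-side `max_batch_size` in lockstep, since the two limits have to agree for the larger batches to take effect.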
@@ -552,6 +566,21 @@
                          run_cfg=run_cfg_tp1_template,
                          end_str='[INST]')
 
+# config for llama2-chat-7b-w4a16 turbomind
+tb_llama2_chat_7b_w4a16 = dict(
+    type=TurboMindModel,
+    abbr='llama-2-7b-chat-4bits-turbomind',
+    path='meta-llama/Llama-2-7b-chat-hf-inner-4bits',
+    engine_config=tb_awq_engine_config_template_max_bs_128,
+    gen_config=gen_config_template,
+    max_out_len=MAX_NEW_TOKENS,
+    max_seq_len=MAX_SESSION_LEN,
+    batch_size=128,
+    concurrency=128,
+    meta_template=llama2_meta_template,
+    run_cfg=run_cfg_tp1_template,
+    end_str='[INST]')
+
 # config for llama2-chat-7b pytorch
 pt_llama2_chat_7b = dict(type=LmdeployPytorchModel,
                          abbr='llama-2-7b-chat-pytorch',
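The llama2 entries likewise reuse `gen_config_template` and `llama2_meta_template` from earlier in the file. A sketch under the same assumptions (all field values illustrative, not taken from the patch):

```python
# Sketch only: real definitions live earlier in the file.
gen_config_template = dict(top_k=1,
                           top_p=0.8,
                           temperature=1.0,
                           max_new_tokens=1024)  # assumed MAX_NEW_TOKENS

# Llama-2 chat wraps each user turn in [INST] ... [/INST]; generation is
# cut off once the model starts a new turn, hence end_str='[INST]' above.
llama2_meta_template = dict(round=[
    dict(role='HUMAN', begin='[INST] ', end=' [/INST] '),
    dict(role='BOT', end='</s>', generate=True),
])
```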
@@ -854,3 +883,41 @@
                          meta_template=llama3_meta_template,
                          run_cfg=run_cfg_tp1_template,
                          end_str='[INST]')
+
+# config for code llama
+tb_codellama_7b_chat = dict(type=TurboMindModel,
+                            abbr='codellama-7b-chat-turbomind',
+                            path='codellama/CodeLlama-7b-Instruct-hf',
+                            engine_config=tb_engine_config_template_max_bs_128,
+                            gen_config=gen_config_template,
+                            max_out_len=MAX_NEW_TOKENS,
+                            max_seq_len=MAX_SESSION_LEN,
+                            batch_size=128,
+                            concurrency=128,
+                            run_cfg=dict(num_gpus=1, num_procs=1),
+                            end_str='</s>')
+
+tb_codellama_7b_chat_w4a16 = dict(
+    type=TurboMindModel,
+    abbr='codellama-7b-chat-4bits-turbomind',
+    path='codellama/CodeLlama-7b-Instruct-hf-inner-4bits',
+    engine_config=tb_awq_engine_config_template_max_bs_128,
+    gen_config=gen_config_template,
+    max_out_len=MAX_NEW_TOKENS,
+    max_seq_len=MAX_SESSION_LEN,
+    batch_size=128,
+    concurrency=128,
+    run_cfg=dict(num_gpus=1, num_procs=1),
+    end_str='</s>')
+
+pt_codellama_7b_chat = dict(type=LmdeployPytorchModel,
+                            abbr='codellama-7b-chat-pytorch',
+                            path='codellama/CodeLlama-7b-Instruct-hf',
+                            engine_config=pt_engine_config_template_max_bs_128,
+                            gen_config=gen_config_template,
+                            max_out_len=MAX_NEW_TOKENS,
+                            max_seq_len=MAX_SESSION_LEN,
+                            batch_size=128,
+                            concurrency=128,
+                            run_cfg=dict(num_gpus=1, num_procs=1),
+                            end_str='</s>')
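Nothing in this file runs by itself: each dict is a model definition that a benchmark entry point picks up. A hypothetical top-level config pulling the new codellama variants into a run, using mmengine's `read_base` mechanism the way OpenCompass-style configs do; the module path `.eval_opencompass_config` is an assumption for illustration:

```python
# Hypothetical wiring: only the model dict names come from this patch.
from mmengine.config import read_base

with read_base():
    # assumed module path; adjust to wherever this config file lives
    from .eval_opencompass_config import (pt_codellama_7b_chat,
                                          tb_codellama_7b_chat,
                                          tb_codellama_7b_chat_w4a16)

# The runner evaluates every entry in `models`; listing the fp16 and the
# w4a16 variant of the same checkpoint benchmarks them side by side.
models = [tb_codellama_7b_chat, tb_codellama_7b_chat_w4a16,
          pt_codellama_7b_chat]
```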