@@ -481,39 +481,39 @@ <h2>Launch A Server<a class="headerlink" href="#Launch-A-Server" title="Link to
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-03-14 07:04:11] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, revision=None, host='0.0.0.0', port=36081, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, stream_interval=1, stream_output=False, random_seed=341608108, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=5, speculative_eagle_topk=4, speculative_num_draft_tokens=8, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, enable_flashinfer_mla=False, flashinfer_mla_disable_ragged=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False)
- [2025-03-14 07:04:17] Downcasting torch.float32 to torch.float16.
- [2025-03-14 07:04:30 TP0] Downcasting torch.float32 to torch.float16.
- [2025-03-14 07:04:30 TP0] Overlap scheduler is disabled for embedding models.
- [2025-03-14 07:04:30 TP0] Downcasting torch.float32 to torch.float16.
- [2025-03-14 07:04:30 TP0] Init torch distributed begin.
- [2025-03-14 07:04:31 TP0] Init torch distributed ends. mem usage=0.00 GB
- [2025-03-14 07:04:31 TP0] Load weight begin. avail mem=59.32 GB
- [2025-03-14 07:04:31 TP0] The following error message 'operation scheduled before its operands' can be ignored.
- [2025-03-14 07:04:35 TP0] Using model weights format ['*.safetensors']
+ [2025-03-14 07:44:15] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, revision=None, host='0.0.0.0', port=37732, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, stream_interval=1, stream_output=False, random_seed=125932383, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=5, speculative_eagle_topk=4, speculative_num_draft_tokens=8, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, enable_flashinfer_mla=False, flashinfer_mla_disable_ragged=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False)
+ [2025-03-14 07:44:20] Downcasting torch.float32 to torch.float16.
+ [2025-03-14 07:44:34 TP0] Downcasting torch.float32 to torch.float16.
+ [2025-03-14 07:44:34 TP0] Overlap scheduler is disabled for embedding models.
+ [2025-03-14 07:44:34 TP0] Downcasting torch.float32 to torch.float16.
+ [2025-03-14 07:44:34 TP0] Init torch distributed begin.
+ [2025-03-14 07:44:34 TP0] Init torch distributed ends. mem usage=0.00 GB
+ [2025-03-14 07:44:34 TP0] Load weight begin. avail mem=63.19 GB
+ [2025-03-14 07:44:34 TP0] The following error message 'operation scheduled before its operands' can be ignored.
+ [2025-03-14 07:44:36 TP0] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00<?, ?it/s]
- Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:01<00:10, 1.71s/it]
- Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:03<00:09, 1.84s/it]
- Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:05<00:07, 1.87s/it]
- Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:07<00:05, 1.87s/it]
- Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:09<00:03, 1.94s/it]
- Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:10<00:01, 1.62s/it]
- Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:11<00:00, 1.58s/it]
- Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:11<00:00, 1.71s/it]
-
- [2025-03-14 07:04:48 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=47.33 GB, mem usage=11.99 GB.
- [2025-03-14 07:04:48 TP0] KV Cache is allocated. #tokens: 20480, K size: 0.55 GB, V size: 0.55 GB
- [2025-03-14 07:04:48 TP0] Memory pool end. avail mem=45.96 GB
- [2025-03-14 07:04:48 TP0] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
- [2025-03-14 07:04:48] INFO: Started server process [2222846]
- [2025-03-14 07:04:48] INFO: Waiting for application startup.
- [2025-03-14 07:04:48] INFO: Application startup complete.
- [2025-03-14 07:04:48] INFO: Uvicorn running on http://0.0.0.0:36081 (Press CTRL+C to quit)
- [2025-03-14 07:04:49] INFO: 127.0.0.1:34548 - "GET /v1/models HTTP/1.1" 200 OK
- [2025-03-14 07:04:49] INFO: 127.0.0.1:34554 - "GET /get_model_info HTTP/1.1" 200 OK
- [2025-03-14 07:04:49 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-14 07:04:51] INFO: 127.0.0.1:34556 - "POST /encode HTTP/1.1" 200 OK
- [2025-03-14 07:04:51] The server is fired up and ready to roll!
+ Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:01<00:08, 1.39s/it]
+ Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:03<00:08, 1.69s/it]
+ Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:04<00:05, 1.35s/it]
+ Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:06<00:05, 1.73s/it]
+ Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:08<00:03, 1.78s/it]
+ Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:10<00:01, 1.86s/it]
+ Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:12<00:00, 1.91s/it]
+ Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:12<00:00, 1.78s/it]
+
+ [2025-03-14 07:44:48 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=28.27 GB, mem usage=34.93 GB.
+ [2025-03-14 07:44:48 TP0] KV Cache is allocated. #tokens: 20480, K size: 0.55 GB, V size: 0.55 GB
+ [2025-03-14 07:44:48 TP0] Memory pool end. avail mem=26.90 GB
+ [2025-03-14 07:44:49 TP0] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
+ [2025-03-14 07:44:49] INFO: Started server process [817661]
+ [2025-03-14 07:44:49] INFO: Waiting for application startup.
+ [2025-03-14 07:44:49] INFO: Application startup complete.
+ [2025-03-14 07:44:49] INFO: Uvicorn running on http://0.0.0.0:37732 (Press CTRL+C to quit)
+ [2025-03-14 07:44:49] INFO: 127.0.0.1:53222 - "GET /v1/models HTTP/1.1" 200 OK
+ [2025-03-14 07:44:50] INFO: 127.0.0.1:53232 - "GET /get_model_info HTTP/1.1" 200 OK
+ [2025-03-14 07:44:50 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-14 07:44:51] INFO: 127.0.0.1:53240 - "POST /encode HTTP/1.1" 200 OK
+ [2025-03-14 07:44:51] The server is fired up and ready to roll!
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
@@ -549,8 +549,8 @@ <h2>Using cURL<a class="headerlink" href="#Using-cURL" title="Link to this headi
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-03-14 07:04:54 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-14 07:04:54] INFO: 127.0.0.1:36858 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-03-14 07:44:54 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-14 07:44:54] INFO: 127.0.0.1:53250 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
@@ -586,8 +586,8 @@ <h2>Using Python Requests<a class="headerlink" href="#Using-Python-Requests" tit
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-03-14 07:04:54 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-14 07:04:54] INFO: 127.0.0.1:36864 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-03-14 07:44:55 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-14 07:44:55] INFO: 127.0.0.1:53258 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
@@ -623,8 +623,8 @@ <h2>Using OpenAI Python Client<a class="headerlink" href="#Using-OpenAI-Python-C
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-03-14 07:04:54 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-14 07:04:54] INFO: 127.0.0.1:36868 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-03-14 07:44:55 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-14 07:44:55] INFO: 127.0.0.1:53272 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
@@ -666,8 +666,8 @@ <h2>Using Input IDs<a class="headerlink" href="#Using-Input-IDs" title="Link to
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-03-14 07:04:59 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-14 07:04:59] INFO: 127.0.0.1:36870 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-03-14 07:45:00 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-14 07:45:00] INFO: 127.0.0.1:35672 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">