@@ -481,41 +481,44 @@ <h2>Launch A Server<a class="headerlink" href="#Launch-A-Server" title="Link to
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
+ INFO 03-22 07:52:22 __init__.py:190] Automatically detected platform cuda.
The following error message 'operation scheduled before its operands' can be ignored.
- [2025-03-22 02:14:52] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, completion_template=None, is_embedding=True, revision=None, host='0.0.0.0', port=36642, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, stream_interval=1, stream_output=False, random_seed=112918802, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='xgrammar', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=5, speculative_eagle_topk=4, speculative_num_draft_tokens=8, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_deepep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, enable_flashinfer_mla=False, enable_flashmla=False, flashinfer_mla_disable_ragged=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_bootstrap_port=8998)
- [2025-03-22 02:14:57] Downcasting torch.float32 to torch.float16.
+ [2025-03-22 07:52:26] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, completion_template=None, is_embedding=True, revision=None, host='0.0.0.0', port=39753, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, stream_interval=1, stream_output=False, random_seed=286557904, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method='round_robin', ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='xgrammar', speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=5, speculative_eagle_topk=4, speculative_num_draft_tokens=8, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_deepep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, cuda_graph_bs=None, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, hicache_ratio=2.0, enable_flashinfer_mla=False, enable_flashmla=False, flashinfer_mla_disable_ragged=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_bootstrap_port=8998)
+ [2025-03-22 07:52:31] Downcasting torch.float32 to torch.float16.
+ INFO 03-22 07:52:38 __init__.py:190] Automatically detected platform cuda.
+ INFO 03-22 07:52:38 __init__.py:190] Automatically detected platform cuda.
The following error message 'operation scheduled before its operands' can be ignored.
The following error message 'operation scheduled before its operands' can be ignored.
- [2025-03-22 02:15:13 TP0] Downcasting torch.float32 to torch.float16.
- [2025-03-22 02:15:13 TP0] Overlap scheduler is disabled for embedding models.
- [2025-03-22 02:15:13 TP0] Downcasting torch.float32 to torch.float16.
- [2025-03-22 02:15:13 TP0] Init torch distributed begin.
- [2025-03-22 02:15:13 TP0] Init torch distributed ends. mem usage=0.00 GB
- [2025-03-22 02:15:13 TP0] Load weight begin. avail mem=62.69 GB
- [2025-03-22 02:15:14 TP0] Using model weights format ['*.safetensors']
+ [2025-03-22 07:52:48 TP0] Downcasting torch.float32 to torch.float16.
+ [2025-03-22 07:52:49 TP0] Overlap scheduler is disabled for embedding models.
+ [2025-03-22 07:52:49 TP0] Downcasting torch.float32 to torch.float16.
+ [2025-03-22 07:52:49 TP0] Init torch distributed begin.
+ [2025-03-22 07:52:49 TP0] Init torch distributed ends. mem usage=0.00 GB
+ [2025-03-22 07:52:49 TP0] Load weight begin. avail mem=59.71 GB
+ [2025-03-22 07:52:50 TP0] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00<?, ?it/s]
- Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:01<00:10, 1.70s/it]
- Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:03<00:09, 1.80s/it]
- Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:05<00:07, 1.78s/it]
- Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:07<00:05, 1.78s/it]
- Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:08<00:03, 1.77s/it]
- Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:09<00:01, 1.46s/it]
- Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:11<00:00, 1.42s/it]
- Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:11<00:00, 1.58s/it]
-
- [2025-03-22 02:15:25 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=28.31 GB, mem usage=34.38 GB.
- [2025-03-22 02:15:25 TP0] KV Cache is allocated. #tokens: 20480, K size: 0.55 GB, V size: 0.55 GB
- [2025-03-22 02:15:25 TP0] Memory pool end. avail mem=26.94 GB
- [2025-03-22 02:15:26 TP0] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
- [2025-03-22 02:15:26] INFO: Started server process [1585814]
- [2025-03-22 02:15:26] INFO: Waiting for application startup.
- [2025-03-22 02:15:26] INFO: Application startup complete.
- [2025-03-22 02:15:26] INFO: Uvicorn running on http://0.0.0.0:36642 (Press CTRL+C to quit)
- [2025-03-22 02:15:27] INFO: 127.0.0.1:33756 - "GET /v1/models HTTP/1.1" 200 OK
- [2025-03-22 02:15:27] INFO: 127.0.0.1:33762 - "GET /get_model_info HTTP/1.1" 200 OK
- [2025-03-22 02:15:27 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-22 02:15:29] INFO: 127.0.0.1:33774 - "POST /encode HTTP/1.1" 200 OK
- [2025-03-22 02:15:29] The server is fired up and ready to roll!
+ Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:00<00:04, 1.24it/s]
+ Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:02<00:07, 1.41s/it]
+ Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:04<00:06, 1.64s/it]
+ Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:06<00:04, 1.57s/it]
+ Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:07<00:03, 1.69s/it]
+ Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:09<00:01, 1.78s/it]
+ Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:11<00:00, 1.86s/it]
+ Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:11<00:00, 1.70s/it]
+
+ [2025-03-22 07:53:02 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=47.49 GB, mem usage=12.22 GB.
+ [2025-03-22 07:53:02 TP0] KV Cache is allocated. #tokens: 20480, K size: 0.55 GB, V size: 0.55 GB
+ [2025-03-22 07:53:02 TP0] Memory pool end. avail mem=46.12 GB
+ [2025-03-22 07:53:02 TP0] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
+ [2025-03-22 07:53:03] INFO: Started server process [977038]
+ [2025-03-22 07:53:03] INFO: Waiting for application startup.
+ [2025-03-22 07:53:03] INFO: Application startup complete.
+ [2025-03-22 07:53:03] INFO: Uvicorn running on http://0.0.0.0:39753 (Press CTRL+C to quit)
+ [2025-03-22 07:53:03] INFO: 127.0.0.1:43086 - "GET /v1/models HTTP/1.1" 200 OK
+ [2025-03-22 07:53:04] INFO: 127.0.0.1:43096 - "GET /get_model_info HTTP/1.1" 200 OK
+ [2025-03-22 07:53:04 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-22 07:53:05] INFO: 127.0.0.1:43104 - "POST /encode HTTP/1.1" 200 OK
+ [2025-03-22 07:53:05] The server is fired up and ready to roll!
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
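For context, the notebook cell that produces the log above is not part of this hunk. A minimal sketch of how the embedding server is launched, assuming sglang's documented notebook helpers (launch_server_cmd, wait_for_server) and the model path shown in the ServerArgs line; the port is assigned dynamically per run:

    # A sketch, not the exact cell from this notebook.
    from sglang.utils import launch_server_cmd, wait_for_server  # assumed helpers from the sglang docs

    embedding_process, port = launch_server_cmd(
        "python3 -m sglang.launch_server "
        "--model-path Alibaba-NLP/gte-Qwen2-7B-instruct "
        "--host 0.0.0.0 --is-embedding"
    )
    wait_for_server(f"http://localhost:{port}")  # poll until the server answers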
@@ -551,8 +554,8 @@ <h2>Using cURL<a class="headerlink" href="#Using-cURL" title="Link to this headi
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-03-22 02:15:32 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-22 02:15:32] INFO: 127.0.0.1:43494 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-03-22 07:53:08 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-22 07:53:08] INFO: 127.0.0.1:57040 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
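The request behind this output hits the OpenAI-compatible /v1/embeddings route via cURL. A sketch, driving curl from Python as the sglang docs do; the port is the one from this run's logs, and the input text is an assumption chosen to match the 4-token prefill above:

    import json
    import subprocess

    port = 39753  # assumed: the port from this run's logs; assigned dynamically in the notebook
    text = "Once upon a time"  # assumed example input
    curl_cmd = (
        f"curl -s http://localhost:{port}/v1/embeddings "
        '-H "Content-Type: application/json" '
        f'-d \'{{"model": "Alibaba-NLP/gte-Qwen2-7B-instruct", "input": "{text}"}}\''
    )
    result = json.loads(subprocess.check_output(curl_cmd, shell=True))
    text_embedding = result["data"][0]["embedding"]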
@@ -588,8 +591,8 @@ <h2>Using Python Requests<a class="headerlink" href="#Using-Python-Requests" tit
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-03-22 02:15:32 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-22 02:15:32] INFO: 127.0.0.1:43496 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-03-22 07:53:08 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-22 07:53:08] INFO: 127.0.0.1:57052 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
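The same call via Python's requests library; a sketch under the same assumptions (port and input text). The "#cached-token: 3" in the log reflects the radix cache reusing the prefix from the earlier request:

    import requests

    port = 39753  # assumed: the port from this run's logs
    text = "Once upon a time"  # assumed example input
    response = requests.post(
        f"http://localhost:{port}/v1/embeddings",
        json={"model": "Alibaba-NLP/gte-Qwen2-7B-instruct", "input": text},
    )
    text_embedding = response.json()["data"][0]["embedding"]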
@@ -625,8 +628,8 @@ <h2>Using OpenAI Python Client<a class="headerlink" href="#Using-OpenAI-Python-C
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-03-22 02:15:32 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-22 02:15:32] INFO: 127.0.0.1:43504 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-03-22 07:53:08 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-22 07:53:08] INFO: 127.0.0.1:57060 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
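The same request through the OpenAI Python client, which works because the route is OpenAI-compatible. A sketch; the api_key value is a placeholder, since ServerArgs shows the server was started without an API key:

    import openai

    port = 39753  # assumed: the port from this run's logs
    client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
    response = client.embeddings.create(
        model="Alibaba-NLP/gte-Qwen2-7B-instruct",
        input="Once upon a time",  # assumed example input
    )
    embedding = response.data[0].embedding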
@@ -668,8 +671,8 @@ <h2>Using Input IDs<a class="headerlink" href="#Using-Input-IDs" title="Link to
</div>
<div class="output_area docutils container">
<div class="highlight"><pre>
- [2025-03-22 02:15:37 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
- [2025-03-22 02:15:37] INFO: 127.0.0.1:43512 - "POST /v1/embeddings HTTP/1.1" 200 OK
+ [2025-03-22 07:53:14 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
+ [2025-03-22 07:53:14] INFO: 127.0.0.1:57072 - "POST /v1/embeddings HTTP/1.1" 200 OK
</pre></div></div>
</div>
<div class="nboutput nblast docutils container">
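The embeddings route also accepts pre-tokenized input: pass a list of token IDs as input instead of a string. A sketch under the same assumptions, tokenizing with the model's own tokenizer:

    import openai
    from transformers import AutoTokenizer

    port = 39753  # assumed: the port from this run's logs
    tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-Qwen2-7B-instruct")
    input_ids = tokenizer.encode("Once upon a time")  # assumed example input

    client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
    response = client.embeddings.create(
        model="Alibaba-NLP/gte-Qwen2-7B-instruct",
        input=input_ids,  # token IDs instead of raw text
    )
    input_ids_embedding = response.data[0].embedding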