
Commit 7300c93

Update 2025-03-14 07:53:13

1 parent 3c2efe4

34 files changed: +4666 -4504 lines

Large diffs are not rendered by default for the following files:

_sources/backend/function_calling.ipynb        +112 -144
_sources/backend/native_api.ipynb              +243 -218
_sources/backend/offline_engine_api.ipynb      +428 -420
_sources/backend/openai_api_completions.ipynb  +193 -170
_sources/backend/openai_api_embeddings.ipynb   +69 -63
_sources/backend/openai_api_vision.ipynb       +98 -86
_sources/backend/send_request.ipynb            +81 -116
_sources/backend/separate_reasoning.ipynb      +111 -125
_sources/backend/speculative_decoding.ipynb    +183 -150
_sources/backend/structured_outputs.ipynb      +122 -122
_sources/frontend/frontend.ipynb               +245 -209
backend/function_calling.html                  +52 -67
backend/function_calling.ipynb                 +112 -144
backend/native_api.html                        +141 -140
backend/native_api.ipynb                       +243 -218
backend/offline_engine_api.html                +48 -32
backend/offline_engine_api.ipynb               +428 -420
backend/openai_api_completions.html            +120 -115
backend/openai_api_completions.ipynb           +193 -170

backend/openai_api_embeddings.html             +40 -40
@@ -481,39 +481,39 @@ <h2>Launch A Server<a class="headerlink" href="#Launch-A-Server" title="Link to
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-03-14 07:04:11] server_args=ServerArgs(model_path=&#39;Alibaba-NLP/gte-Qwen2-7B-instruct&#39;, tokenizer_path=&#39;Alibaba-NLP/gte-Qwen2-7B-instruct&#39;, tokenizer_mode=&#39;auto&#39;, skip_tokenizer_init=False, load_format=&#39;auto&#39;, trust_remote_code=False, dtype=&#39;auto&#39;, kv_cache_dtype=&#39;auto&#39;, quantization=None, quantization_param_path=None, context_length=None, device=&#39;cuda&#39;, served_model_name=&#39;Alibaba-NLP/gte-Qwen2-7B-instruct&#39;, chat_template=None, is_embedding=True, revision=None, host=&#39;0.0.0.0&#39;, port=36081, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy=&#39;fcfs&#39;, schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, stream_interval=1, stream_output=False, random_seed=341608108, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level=&#39;info&#39;, log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path=&#39;sglang_storage&#39;, enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method=&#39;round_robin&#39;, ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args=&#39;{}&#39;, lora_paths=None, max_loras_per_batch=8, lora_backend=&#39;triton&#39;, attention_backend=&#39;flashinfer&#39;, sampling_backend=&#39;flashinfer&#39;, grammar_backend=&#39;outlines&#39;, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=5, speculative_eagle_topk=4, speculative_num_draft_tokens=8, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type=&#39;qk&#39;, ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, cuda_graph_bs=None, torchao_config=&#39;&#39;, enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, enable_flashinfer_mla=False, flashinfer_mla_disable_ragged=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False)
-[2025-03-14 07:04:17] Downcasting torch.float32 to torch.float16.
-[2025-03-14 07:04:30 TP0] Downcasting torch.float32 to torch.float16.
-[2025-03-14 07:04:30 TP0] Overlap scheduler is disabled for embedding models.
-[2025-03-14 07:04:30 TP0] Downcasting torch.float32 to torch.float16.
-[2025-03-14 07:04:30 TP0] Init torch distributed begin.
-[2025-03-14 07:04:31 TP0] Init torch distributed ends. mem usage=0.00 GB
-[2025-03-14 07:04:31 TP0] Load weight begin. avail mem=59.32 GB
-[2025-03-14 07:04:31 TP0] The following error message &#39;operation scheduled before its operands&#39; can be ignored.
-[2025-03-14 07:04:35 TP0] Using model weights format [&#39;*.safetensors&#39;]
+[2025-03-14 07:44:15] server_args=ServerArgs(model_path=&#39;Alibaba-NLP/gte-Qwen2-7B-instruct&#39;, tokenizer_path=&#39;Alibaba-NLP/gte-Qwen2-7B-instruct&#39;, tokenizer_mode=&#39;auto&#39;, skip_tokenizer_init=False, load_format=&#39;auto&#39;, trust_remote_code=False, dtype=&#39;auto&#39;, kv_cache_dtype=&#39;auto&#39;, quantization=None, quantization_param_path=None, context_length=None, device=&#39;cuda&#39;, served_model_name=&#39;Alibaba-NLP/gte-Qwen2-7B-instruct&#39;, chat_template=None, is_embedding=True, revision=None, host=&#39;0.0.0.0&#39;, port=37732, mem_fraction_static=0.88, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy=&#39;fcfs&#39;, schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, stream_interval=1, stream_output=False, random_seed=125932383, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level=&#39;info&#39;, log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_path=&#39;sglang_storage&#39;, enable_cache_report=False, reasoning_parser=None, dp_size=1, load_balance_method=&#39;round_robin&#39;, ep_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args=&#39;{}&#39;, lora_paths=None, max_loras_per_batch=8, lora_backend=&#39;triton&#39;, attention_backend=&#39;flashinfer&#39;, sampling_backend=&#39;flashinfer&#39;, grammar_backend=&#39;outlines&#39;, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=5, speculative_eagle_topk=4, speculative_num_draft_tokens=8, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type=&#39;qk&#39;, ds_sparse_decode_threshold=4096, disable_radix_cache=False, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_nccl_nvls=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_ep_moe=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, cuda_graph_bs=None, torchao_config=&#39;&#39;, enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, tool_call_parser=None, enable_hierarchical_cache=False, enable_flashinfer_mla=False, flashinfer_mla_disable_ragged=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False)
+[2025-03-14 07:44:20] Downcasting torch.float32 to torch.float16.
+[2025-03-14 07:44:34 TP0] Downcasting torch.float32 to torch.float16.
+[2025-03-14 07:44:34 TP0] Overlap scheduler is disabled for embedding models.
+[2025-03-14 07:44:34 TP0] Downcasting torch.float32 to torch.float16.
+[2025-03-14 07:44:34 TP0] Init torch distributed begin.
+[2025-03-14 07:44:34 TP0] Init torch distributed ends. mem usage=0.00 GB
+[2025-03-14 07:44:34 TP0] Load weight begin. avail mem=63.19 GB
+[2025-03-14 07:44:34 TP0] The following error message &#39;operation scheduled before its operands&#39; can be ignored.
+[2025-03-14 07:44:36 TP0] Using model weights format [&#39;*.safetensors&#39;]
 Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00&lt;?, ?it/s]
-Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:01&lt;00:10, 1.71s/it]
-Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:03&lt;00:09, 1.84s/it]
-Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:05&lt;00:07, 1.87s/it]
-Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:07&lt;00:05, 1.87s/it]
-Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:09&lt;00:03, 1.94s/it]
-Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:10&lt;00:01, 1.62s/it]
-Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:11&lt;00:00, 1.58s/it]
-Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:11&lt;00:00, 1.71s/it]
-
-[2025-03-14 07:04:48 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=47.33 GB, mem usage=11.99 GB.
-[2025-03-14 07:04:48 TP0] KV Cache is allocated. #tokens: 20480, K size: 0.55 GB, V size: 0.55 GB
-[2025-03-14 07:04:48 TP0] Memory pool end. avail mem=45.96 GB
-[2025-03-14 07:04:48 TP0] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
-[2025-03-14 07:04:48] INFO: Started server process [2222846]
-[2025-03-14 07:04:48] INFO: Waiting for application startup.
-[2025-03-14 07:04:48] INFO: Application startup complete.
-[2025-03-14 07:04:48] INFO: Uvicorn running on http://0.0.0.0:36081 (Press CTRL+C to quit)
-[2025-03-14 07:04:49] INFO: 127.0.0.1:34548 - &#34;GET /v1/models HTTP/1.1&#34; 200 OK
-[2025-03-14 07:04:49] INFO: 127.0.0.1:34554 - &#34;GET /get_model_info HTTP/1.1&#34; 200 OK
-[2025-03-14 07:04:49 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
-[2025-03-14 07:04:51] INFO: 127.0.0.1:34556 - &#34;POST /encode HTTP/1.1&#34; 200 OK
-[2025-03-14 07:04:51] The server is fired up and ready to roll!
+Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:01&lt;00:08, 1.39s/it]
+Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:03&lt;00:08, 1.69s/it]
+Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:04&lt;00:05, 1.35s/it]
+Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:06&lt;00:05, 1.73s/it]
+Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:08&lt;00:03, 1.78s/it]
+Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:10&lt;00:01, 1.86s/it]
+Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:12&lt;00:00, 1.91s/it]
+Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:12&lt;00:00, 1.78s/it]
+
+[2025-03-14 07:44:48 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=28.27 GB, mem usage=34.93 GB.
+[2025-03-14 07:44:48 TP0] KV Cache is allocated. #tokens: 20480, K size: 0.55 GB, V size: 0.55 GB
+[2025-03-14 07:44:48 TP0] Memory pool end. avail mem=26.90 GB
+[2025-03-14 07:44:49 TP0] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072
+[2025-03-14 07:44:49] INFO: Started server process [817661]
+[2025-03-14 07:44:49] INFO: Waiting for application startup.
+[2025-03-14 07:44:49] INFO: Application startup complete.
+[2025-03-14 07:44:49] INFO: Uvicorn running on http://0.0.0.0:37732 (Press CTRL+C to quit)
+[2025-03-14 07:44:49] INFO: 127.0.0.1:53222 - &#34;GET /v1/models HTTP/1.1&#34; 200 OK
+[2025-03-14 07:44:50] INFO: 127.0.0.1:53232 - &#34;GET /get_model_info HTTP/1.1&#34; 200 OK
+[2025-03-14 07:44:50 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
+[2025-03-14 07:44:51] INFO: 127.0.0.1:53240 - &#34;POST /encode HTTP/1.1&#34; 200 OK
+[2025-03-14 07:44:51] The server is fired up and ready to roll!
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
@@ -549,8 +549,8 @@ <h2>Using cURL<a class="headerlink" href="#Using-cURL" title="Link to this headi
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-03-14 07:04:54 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
-[2025-03-14 07:04:54] INFO: 127.0.0.1:36858 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
+[2025-03-14 07:44:54 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
+[2025-03-14 07:44:54] INFO: 127.0.0.1:53250 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
@@ -586,8 +586,8 @@ <h2>Using Python Requests<a class="headerlink" href="#Using-Python-Requests" tit
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-03-14 07:04:54 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
-[2025-03-14 07:04:54] INFO: 127.0.0.1:36864 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
+[2025-03-14 07:44:55 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
+[2025-03-14 07:44:55] INFO: 127.0.0.1:53258 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
@@ -623,8 +623,8 @@ <h2>Using OpenAI Python Client<a class="headerlink" href="#Using-OpenAI-Python-C
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-03-14 07:04:54 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
-[2025-03-14 07:04:54] INFO: 127.0.0.1:36868 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
+[2025-03-14 07:44:55 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
+[2025-03-14 07:44:55] INFO: 127.0.0.1:53272 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
@@ -666,8 +666,8 @@ <h2>Using Input IDs<a class="headerlink" href="#Using-Input-IDs" title="Link to
 </div>
 <div class="output_area docutils container">
 <div class="highlight"><pre>
-[2025-03-14 07:04:59 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
-[2025-03-14 07:04:59] INFO: 127.0.0.1:36870 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
+[2025-03-14 07:45:00 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0,
+[2025-03-14 07:45:00] INFO: 127.0.0.1:35672 - &#34;POST /v1/embeddings HTTP/1.1&#34; 200 OK
 </pre></div></div>
 </div>
 <div class="nboutput nblast docutils container">
