Commit 992e5c3

Merge similar examples in offline_inference into single basic example (#12737)
1 parent b69692a · commit 992e5c3

29 files changed: +394 -437 lines
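
At a high level, this commit folds the separate offline_inference scripts (basic.py, cli.py, cpu_offload.py, chat.py, classification.py, embedding.py, scoring.py) into a single examples/offline_inference/basic/ directory whose scripts take the model and engine options on the command line, which is why every CI invocation below gains a --model flag. As a rough, non-authoritative sketch of that pattern (plain argparse with only --model and -tp wired up, both chosen here for illustration rather than copied from the new basic/generate.py):

# Hypothetical sketch of a CLI-driven offline generation example.
# The real basic/generate.py may expose the full vLLM engine argument set;
# here only --model and --tensor-parallel-size are wired up for illustration.
import argparse

from vllm import LLM, SamplingParams


def main() -> None:
    parser = argparse.ArgumentParser(description="Minimal offline generation example")
    parser.add_argument("--model", default="facebook/opt-125m")
    parser.add_argument("-tp", "--tensor-parallel-size", type=int, default=1)
    args = parser.parse_args()

    prompts = [
        "Hello, my name is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Build the engine from the CLI arguments and run a small batch.
    llm = LLM(model=args.model, tensor_parallel_size=args.tensor_parallel_size)
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")


if __name__ == "__main__":
    main()

A script shaped like this is invoked the same way as the CI commands below, e.g. with --model facebook/opt-125m and optionally -tp 2.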

.buildkite/run-cpu-test.sh

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ function cpu_tests() {
   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
-    python3 examples/offline_inference/basic.py"
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "

.buildkite/run-gh200-test.sh

Lines changed: 1 addition & 1 deletion
@@ -24,5 +24,5 @@ remove_docker_container

 # Run the image and test offline inference
 docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-  python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
+  python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '

.buildkite/run-hpu-test.sh

Lines changed: 1 addition & 1 deletion
@@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 EXITCODE=$?

.buildkite/run-openvino-test.sh

Lines changed: 1 addition & 1 deletion
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m

.buildkite/run-xpu-test.sh

Lines changed: 2 additions & 2 deletions
@@ -14,6 +14,6 @@ remove_docker_container

 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-  python3 examples/offline_inference/basic.py
-  python3 examples/offline_inference/cli.py -tp 2
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '

.buildkite/test-pipeline.yaml

Lines changed: 6 additions & 6 deletions
@@ -215,18 +215,18 @@ steps:
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
-  - python3 offline_inference/basic.py
-  - python3 offline_inference/cpu_offload.py
-  - python3 offline_inference/chat.py
+  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+  - python3 offline_inference/basic/chat.py
   - python3 offline_inference/prefix_caching.py
   - python3 offline_inference/llm_engine_example.py
   - python3 offline_inference/vision_language.py
   - python3 offline_inference/vision_language_multi_image.py
   - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder.py
-  - python3 offline_inference/classification.py
-  - python3 offline_inference/embedding.py
-  - python3 offline_inference/scoring.py
+  - python3 offline_inference/basic/classify.py
+  - python3 offline_inference/basic/embed.py
+  - python3 offline_inference/basic/score.py
   - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Prefix Caching Test # 9min

docs/source/generate_examples.py

Lines changed: 2 additions & 2 deletions
@@ -147,7 +147,7 @@ def generate(self) -> str:
             return content

         content += "## Example materials\n\n"
-        for file in self.other_files:
+        for file in sorted(self.other_files):
             include = "include" if file.suffix == ".md" else "literalinclude"
             content += f":::{{admonition}} {file.relative_to(self.path)}\n"
             content += ":class: dropdown\n\n"
@@ -194,7 +194,7 @@ def generate_examples():
         path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
         title="Offline Inference",
         description=
-        "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501
+        "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches. We recommend starting with <project:basic.md>.", # noqa: E501
         caption="Examples",
     ),
 }

docs/source/getting_started/installation/cpu/index.md

Lines changed: 2 additions & 2 deletions
@@ -170,7 +170,7 @@ vLLM CPU backend supports the following vLLM features:
 sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 find / -name *libtcmalloc* # find the dynamic link library path
 export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-python examples/offline_inference/basic.py # run vLLM
+python examples/offline_inference/basic/basic.py # run vLLM
 ```

 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -207,7 +207,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ

 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference/basic.py
+$ python examples/offline_inference/basic/basic.py
 ```

 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.

docs/source/getting_started/quickstart.md

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in

 ## Offline Batched Inference

-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic/basic.py>

 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:

docs/source/models/generative_models.md

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```

-A code example can be found here: <gh-file:examples/offline_inference/basic.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>

 ### `LLM.beam_search`

@@ -103,7 +103,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```

-A code example can be found here: <gh-file:examples/offline_inference/chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>

 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
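
Since both generative_models.md hunks now point readers at the basic/ examples, a short reminder of the LLM.chat() call that basic/chat.py is built around may help; the model and conversation below are illustrative stand-ins, not taken from the example itself.

# Minimal offline chat sketch (illustrative; not the contents of basic/chat.py).
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # assumed model, for illustration only
sampling_params = SamplingParams(temperature=0.7, max_tokens=128)

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Explain offline batched inference in one sentence."},
]

# llm.chat() applies the model's chat template before generating;
# a template can be passed explicitly if the model does not bundle one.
outputs = llm.chat(conversation, sampling_params)
for output in outputs:
    print(output.outputs[0].text)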
