runpod · KAJdev · Jun 4, 2026 · May 27, 2026
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -25,7 +25,7 @@ If adding a new example, which category does it belong to?
 ## Checklist
 
 ### Functionality
-- [ ] Example runs successfully with `flash run`
+- [ ] Example runs successfully with `flash dev`
 - [ ] All endpoints return correct responses
 - [ ] Tested locally
 - [ ] Error handling implemented

diff --git a/01_getting_started/01_hello_world/README.md b/01_getting_started/01_hello_world/README.md
@@ -21,14 +21,14 @@ Or create a `.env` file with `RUNPOD_API_KEY=your_api_key_here`.
 ### 3. Run Locally
 
 ```bash
-uv run flash run
+uv run flash dev
 ```
 
 Server starts at **http://localhost:8888**
 
 ### 4. Test the API
 
-Visit **http://localhost:8888/docs** for interactive API documentation. QB endpoints are auto-generated by `flash run` based on your `@Endpoint` functions.
+Visit **http://localhost:8888/docs** for interactive API documentation. QB endpoints are auto-generated by `flash dev` based on your `@Endpoint` functions.
 
 ```bash
 curl -X POST http://localhost:8888/gpu_worker/runsync \
@@ -133,14 +133,9 @@ The worker uses PyTorch to detect and report GPU information:
 
 ## Development
 
-### Test Worker Locally
-```bash
-python gpu_worker.py
-```
-
 ### Run the Application
 ```bash
-flash run
+flash dev
 ```
 
 ## Next Steps

diff --git a/01_getting_started/01_hello_world/gpu_worker.py b/01_getting_started/01_hello_world/gpu_worker.py
@@ -1,6 +1,5 @@
 # gpu serverless worker -- detects available GPU hardware.
-# run with: flash run
-# test directly: python gpu_worker.py
+# run with: flash dev
 from runpod_flash import Endpoint, GpuType
 
 

diff --git a/01_getting_started/02_cpu_worker/README.md b/01_getting_started/02_cpu_worker/README.md
@@ -21,14 +21,14 @@ Or create a `.env` file with `RUNPOD_API_KEY=your_api_key_here`.
 ### 3. Run Locally
 
 ```bash
-uv run flash run
+uv run flash dev
 ```
 
 Server starts at **http://localhost:8888**
 
 ### 4. Test the API
 
-Visit **http://localhost:8888/docs** for interactive API documentation. QB endpoints are auto-generated by `flash run` based on your `@Endpoint` functions.
+Visit **http://localhost:8888/docs** for interactive API documentation. QB endpoints are auto-generated by `flash dev` based on your `@Endpoint` functions.
 
 ```bash
 curl -X POST http://localhost:8888/cpu_worker/runsync \
@@ -135,14 +135,9 @@ The CPU worker scales to zero when idle:
 
 ## Development
 
-### Test Worker Locally
-```bash
-python cpu_worker.py
-```
-
 ### Run the Application
 ```bash
-flash run
+flash dev
 ```
 
 ## When to Use CPU Workers

diff --git a/01_getting_started/02_cpu_worker/cpu_worker.py b/01_getting_started/02_cpu_worker/cpu_worker.py
@@ -1,6 +1,5 @@
 # cpu serverless worker -- lightweight processing without GPU.
-# run with: flash run
-# test directly: python cpu_worker.py
+# run with: flash dev
 from runpod_flash import CpuInstanceType, Endpoint
 
 

diff --git a/01_getting_started/03_mixed_workers/README.md b/01_getting_started/03_mixed_workers/README.md
@@ -44,7 +44,7 @@ Response
 
 ```bash
 cd 01_getting_started/03_mixed_workers
-flash run
+flash dev
 ```
 
 ### Alternative: Standalone Setup
@@ -60,7 +60,7 @@ uv run flash login
 # Or create .env file with RUNPOD_API_KEY=your_api_key_here
 
 # Run
-uv run flash run
+uv run flash dev
 ```
 
 Server starts at http://localhost:8888

diff --git a/01_getting_started/03_mixed_workers/cpu_worker.py b/01_getting_started/03_mixed_workers/cpu_worker.py
@@ -1,7 +1,6 @@
 # cpu workers for text preprocessing and postprocessing.
 # part of the mixed CPU/GPU pipeline example.
-# run with: flash run
-# test directly: python cpu_worker.py
+# run with: flash dev
 from runpod_flash import CpuInstanceType, Endpoint
 
 

diff --git a/01_getting_started/03_mixed_workers/gpu_worker.py b/01_getting_started/03_mixed_workers/gpu_worker.py
@@ -1,7 +1,6 @@
 # gpu worker for ML inference (sentiment classification).
 # part of the mixed CPU/GPU pipeline example.
-# run with: flash run
-# test directly: python gpu_worker.py
+# run with: flash dev
 from runpod_flash import Endpoint, GpuGroup
 
 

diff --git a/01_getting_started/03_mixed_workers/pipeline.py b/01_getting_started/03_mixed_workers/pipeline.py
@@ -1,6 +1,6 @@
 # classification pipeline: CPU preprocess -> GPU inference -> CPU postprocess.
 # demonstrates cross-worker orchestration via a load-balanced endpoint.
-# run with: flash run
+# run with: flash dev
 from runpod_flash import Endpoint
 
 pipeline = Endpoint(name="01_03_classify_pipeline", cpu="cpu3c-1-2", workers=(1, 3))

diff --git a/01_getting_started/04_dependencies/README.md b/01_getting_started/04_dependencies/README.md
@@ -29,7 +29,7 @@ Learn how to manage Python packages and system dependencies in Flash workers.
 
 ```bash
 cd 01_getting_started/04_dependencies
-flash run
+flash dev
 ```
 
 Server starts at http://localhost:8888
@@ -47,7 +47,7 @@ uv run flash login
 # Or create .env file with RUNPOD_API_KEY=your_api_key_here
 
 # Run
-uv run flash run
+uv run flash dev
 ```
 
 ## GPU vs CPU Packaging
@@ -273,9 +273,8 @@ async def fetch_data(url: str):
 ### 3. Test Dependency Compatibility
 
 ```bash
-# Test locally first
-python gpu_worker.py
-python cpu_worker.py
+# Test locally
+flash dev
 ```
 
 ### 4. Document Dependencies

diff --git a/01_getting_started/04_dependencies/cpu_worker.py b/01_getting_started/04_dependencies/cpu_worker.py
@@ -1,6 +1,5 @@
 # cpu workers demonstrating data science and zero-dependency patterns.
-# run with: flash run
-# test directly: python cpu_worker.py
+# run with: flash dev
 from runpod_flash import CpuInstanceType, Endpoint
 
 

diff --git a/01_getting_started/04_dependencies/gpu_worker.py b/01_getting_started/04_dependencies/gpu_worker.py
@@ -1,6 +1,5 @@
 # gpu workers demonstrating Python and system dependency management.
-# run with: flash run
-# test directly: python gpu_worker.py
+# run with: flash dev
 from runpod_flash import Endpoint, GpuGroup
 
 

diff --git a/01_getting_started/04_dependencies/mixed_worker.py b/01_getting_started/04_dependencies/mixed_worker.py
@@ -3,8 +3,7 @@
 #   - GPU images (runpod/pytorch:*) have numpy pre-installed
 #   - CPU images (python-slim) install numpy from the build artifact
 #
-# run with: flash run
-# test directly: python mixed_worker.py
+# run with: flash dev
 from runpod_flash import CpuInstanceType, Endpoint, GpuType
 
 

diff --git a/02_ml_inference/01_text_to_speech/README.md b/02_ml_inference/01_text_to_speech/README.md
@@ -33,14 +33,14 @@ Or create a `.env` file with `RUNPOD_API_KEY=your_api_key_here`.
 ### Run
 
 ```bash
-uv run flash run
+uv run flash dev
 ```
 
 First run provisions the endpoint (~1 min). Server starts at http://localhost:8888
 
 ### Test the Endpoint
 
-Visit http://localhost:8888/docs for interactive API documentation. QB endpoints are auto-generated by `flash run` based on your `@Endpoint` functions.
+Visit http://localhost:8888/docs for interactive API documentation. QB endpoints are auto-generated by `flash dev` based on your `@Endpoint` functions.
 
 **Generate speech (JSON with base64 audio):**
 ```bash
@@ -136,7 +136,7 @@ flash deploy send production
 
 ## Common Issues
 
-- **Cold start delay**: First request after idle takes 20-30s to load the model. Use `flash run --auto-provision` during development.
+- **Cold start delay**: First request after idle takes 20-30s to load the model. Use `flash dev --auto-provision` during development.
 - **Out of memory**: The model requires 24GB+ VRAM. Ensure `GpuGroup.ADA_24` or higher is configured.
 - **Invalid speaker/language**: Use `get_voices` to check valid options.
 

diff --git a/02_ml_inference/01_text_to_speech/gpu_worker.py b/02_ml_inference/01_text_to_speech/gpu_worker.py
@@ -1,6 +1,5 @@
 # Qwen3-TTS text-to-speech GPU worker.
-# run with: flash run
-# test directly: python gpu_worker.py
+# run with: flash dev
 from runpod_flash import Endpoint, GpuGroup
 
 

diff --git a/03_advanced_workers/05_load_balancer/README.md b/03_advanced_workers/05_load_balancer/README.md
@@ -40,14 +40,14 @@ Or create a `.env` file with `RUNPOD_API_KEY=your_api_key_here`.
 ### 3. Run Locally (from repository root)
 
 ```bash
-uv run flash run
+uv run flash dev
 ```
 
 Visit **http://localhost:8888/docs** for interactive API documentation (unified app with all examples).
 
 ### 4. Test Endpoints (via unified app)
 
-When using `flash run` from the repository root, routes are prefixed with the example name:
+When using `flash dev` from the repository root, routes are prefixed with the example name:
 
 **GPU Service (Compute)**:
 ```bash
@@ -256,14 +256,10 @@ Response:
 }
 ```
 
-## Testing Workers Locally
+## Testing Locally
 
 ```bash
-# Test GPU worker
-python gpu_lb.py
-
-# Test CPU worker
-python cpu_lb.py
+flash dev
 ```
 
 ## Deployment

diff --git a/03_advanced_workers/05_load_balancer/cpu_lb.py b/03_advanced_workers/05_load_balancer/cpu_lb.py
@@ -1,6 +1,5 @@
 # cpu load-balanced endpoints with custom HTTP routes.
-# run with: flash run
-# test directly: python cpu_lb.py
+# run with: flash dev
 from runpod_flash import Endpoint
 
 api = Endpoint(

diff --git a/03_advanced_workers/05_load_balancer/gpu_lb.py b/03_advanced_workers/05_load_balancer/gpu_lb.py
@@ -1,6 +1,5 @@
 # gpu load-balanced endpoints with custom HTTP routes.
-# run with: flash run
-# test directly: python gpu_lb.py
+# run with: flash dev
 from runpod_flash import Endpoint, GpuType
 
 api = Endpoint(

diff --git a/04_scaling_performance/01_autoscaling/README.md b/04_scaling_performance/01_autoscaling/README.md
@@ -8,7 +8,7 @@ Configure Flash worker autoscaling for different workload patterns. This example
 
 ```bash
 cd 04_scaling_performance/01_autoscaling
-flash run
+flash dev
 ```
 
 Server starts at http://localhost:8888 -- visit http://localhost:8888/docs for interactive API docs.

diff --git a/04_scaling_performance/01_autoscaling/cpu_worker.py b/04_scaling_performance/01_autoscaling/cpu_worker.py
@@ -1,6 +1,5 @@
 # cpu autoscaling strategies -- scale-to-zero and burst-ready.
-# run with: flash run
-# test directly: python cpu_worker.py
+# run with: flash dev
 from runpod_flash import CpuInstanceType, Endpoint
 
 

diff --git a/04_scaling_performance/01_autoscaling/gpu_worker.py b/04_scaling_performance/01_autoscaling/gpu_worker.py
@@ -1,6 +1,5 @@
 # gpu autoscaling strategies -- scale-to-zero, always-on, high-throughput.
-# run with: flash run
-# test directly: python gpu_worker.py
+# run with: flash dev
 from runpod_flash import Endpoint, GpuType, ServerlessScalerType
 
 

diff --git a/04_scaling_performance/02_datacenters/README.md b/04_scaling_performance/02_datacenters/README.md
@@ -10,7 +10,7 @@ By default, endpoints deploy across all available data centers. The `datacenter`
 
 ```bash
 pip install -r requirements.txt
-flash run
+flash dev
 ```
 
 ## What You'll Learn

diff --git a/04_scaling_performance/02_datacenters/cpu_worker.py b/04_scaling_performance/02_datacenters/cpu_worker.py
@@ -1,7 +1,7 @@
 # cpu worker pinned to a cpu-supported datacenter.
 # cpu endpoints are only available in a subset of datacenters
 # (see CPU_DATACENTERS). selecting an unsupported DC raises an error.
-# run with: flash run
+# run with: flash dev
 from runpod_flash import Endpoint, DataCenter
 
 api = Endpoint(

diff --git a/04_scaling_performance/02_datacenters/gpu_worker.py b/04_scaling_performance/02_datacenters/gpu_worker.py
@@ -1,5 +1,5 @@
 # gpu workers pinned to specific datacenters.
-# run with: flash run
+# run with: flash dev
 from runpod_flash import Endpoint, GpuGroup, DataCenter
 
 

diff --git a/05_data_workflows/01_network_volumes/README.md b/05_data_workflows/01_network_volumes/README.md
@@ -25,7 +25,7 @@ Or create a `.env` file with `RUNPOD_API_KEY=your_api_key_here`.
 ### 3. Run Locally
 
 ```bash
-uv run flash run
+uv run flash dev
 ```
 
 Server starts at `http://localhost:8888`

diff --git a/05_data_workflows/01_network_volumes/cpu_worker.py b/05_data_workflows/01_network_volumes/cpu_worker.py
@@ -1,6 +1,5 @@
 # cpu worker with network volume for listing and serving generated images.
-# run with: flash run
-# test directly: python cpu_worker.py
+# run with: flash dev
 from runpod_flash import Endpoint, DataCenter, NetworkVolume
 
 # same volume as gpu_worker.py -- must match name and datacenter

diff --git a/05_data_workflows/01_network_volumes/gpu_worker.py b/05_data_workflows/01_network_volumes/gpu_worker.py
@@ -1,6 +1,5 @@
 # gpu worker with network volume for Stable Diffusion image generation.
-# run with: flash run
-# test directly: python gpu_worker.py
+# run with: flash dev
 import logging
 
 from runpod_flash import Endpoint, GpuType, DataCenter, NetworkVolume

diff --git a/06_real_world/README.md b/06_real_world/README.md
@@ -118,7 +118,7 @@ All real-world examples include:
 ### Development
 ```bash
 cd example_name
-flash run
+flash dev
 ```
 
 ### Production