huggingface · Wauplin · Jul 17, 2025 · Jul 17, 2025
@@ -49,6 +49,7 @@ interface TemplateParams {
 	fullUrl?: string;
 	inputs?: object;
 	providerInputs?: object;
+	autoInputs?: object;
 	model?: ModelDataMinimal;
 	provider?: InferenceProviderOrPolicy;
 	providerModelId?: string;
@@ -202,12 +203,28 @@ const snippetGenerator = (templateName: string, inputPreparationFn?: InputPrepar
 			}
 		}
 
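+		// Inputs for the "auto" route are strictly the same as "inputs", except that the model includes the provider (i.e. "<model id>:<provider>").
+		// Otherwise (provider is "auto", or a custom endpointUrl / directRequest is used), use the providerInputs as is.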
+		const autoInputs =
+			provider !== "auto" && !opts?.endpointUrl && !opts?.directRequest
+				? {
+						...inputs,
+						model: `${model.id}:${provider}`,
+				  }
+				: providerInputs;
+
 		/// Prepare template injection data
 		const params: TemplateParams = {
 			accessToken: accessTokenOrPlaceholder,
 			authorizationHeader: (request.info.headers as Record<string, string>)?.Authorization,
-			baseUrl: removeSuffix(request.url, "/chat/completions"),
-			fullUrl: request.url,
+			baseUrl:
+				task === "conversational" && !opts?.endpointUrl && !opts?.directRequest
+					? HF_ROUTER_AUTO_ENDPOINT
+					: removeSuffix(request.url, "/chat/completions"),
+			fullUrl:
+				task === "conversational" && !opts?.endpointUrl && !opts?.directRequest
+					? HF_ROUTER_AUTO_ENDPOINT + "/chat/completions"
+					: request.url,
 			inputs: {
 				asObj: inputs,
 				asCurlString: formatBody(inputs, "curl"),
@@ -222,9 +239,21 @@ const snippetGenerator = (templateName: string, inputPreparationFn?: InputPrepar
 				asPythonString: formatBody(providerInputs, "python"),
 				asTsString: formatBody(providerInputs, "ts"),
 			},
+			autoInputs: {
+				asObj: autoInputs,
+				asCurlString: formatBody(autoInputs, "curl"),
+				asJsonString: formatBody(autoInputs, "json"),
+				asPythonString: formatBody(autoInputs, "python"),
+				asTsString: formatBody(autoInputs, "ts"),
+			},
 			model,
 			provider,
-			providerModelId: providerModelId ?? model.id,
+			providerModelId:
+				task === "conversational" && !opts?.endpointUrl && !opts?.directRequest
+					? provider !== "auto"
+						? `${model.id}:${provider}` // e.g. "moonshotai/Kimi-K2-Instruct:groq"
+						: model.id
+					: providerModelId ?? model.id,
 			billTo: opts?.billTo,
 			endpointUrl: opts?.endpointUrl,
 		};

@@ -3,7 +3,7 @@ def query(payload):
     return response.json()
 
 response = query({
-{{ providerInputs.asJsonString }}
+{{ autoInputs.asJsonString }}
 })
 
 print(response["choices"][0]["message"])
@@ -8,7 +8,7 @@ def query(payload):
         yield json.loads(line.decode("utf-8").lstrip("data:").rstrip("/n"))
 
 chunks = query({
-{{ providerInputs.asJsonString }},
+{{ autoInputs.asJsonString }},
     "stream": True,
 })
 

@@ -5,6 +5,6 @@ curl {{ fullUrl }} \
     -H 'X-HF-Bill-To: {{ billTo }}' \
 {% endif %}
     -d '{
-{{ providerInputs.asCurlString }},
+{{ autoInputs.asCurlString }},
         "stream": false
     }'
@@ -5,6 +5,6 @@ curl {{ fullUrl }} \
     -H 'X-HF-Bill-To: {{ billTo }}' \
 {% endif %}
     -d '{
-{{ providerInputs.asCurlString }},
+{{ autoInputs.asCurlString }},
         "stream": true
     }'
diff --git a/packages/tasks-gen/snippets-fixtures/bill-to-param/js/openai/0.hf-inference.js b/packages/tasks-gen/snippets-fixtures/bill-to-param/js/openai/0.hf-inference.js
@@ -1,15 +1,15 @@
 import { OpenAI } from "openai";
 
 const client = new OpenAI({
-	baseURL: "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
+	baseURL: "https://router.huggingface.co/v1",
 	apiKey: process.env.HF_TOKEN,
 	defaultHeaders: {
 		"X-HF-Bill-To": "huggingface" 
 	}
 });
 
 const chatCompletion = await client.chat.completions.create({
-	model: "meta-llama/Llama-3.1-8B-Instruct",
+	model: "meta-llama/Llama-3.1-8B-Instruct:hf-inference",
     messages: [
         {
             role: "user",

diff --git a/packages/tasks-gen/snippets-fixtures/bill-to-param/python/openai/0.hf-inference.py b/packages/tasks-gen/snippets-fixtures/bill-to-param/python/openai/0.hf-inference.py
@@ -2,15 +2,15 @@
 from openai import OpenAI
 
 client = OpenAI(
-    base_url="https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
+    base_url="https://router.huggingface.co/v1",
     api_key=os.environ["HF_TOKEN"],
     default_headers={
         "X-HF-Bill-To": "huggingface"
     }
 )
 
 completion = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="meta-llama/Llama-3.1-8B-Instruct:hf-inference",
     messages=[
         {
             "role": "user",

diff --git a/packages/tasks-gen/snippets-fixtures/bill-to-param/python/requests/0.hf-inference.py b/packages/tasks-gen/snippets-fixtures/bill-to-param/python/requests/0.hf-inference.py
@@ -1,7 +1,7 @@
 import os
 import requests
 
-API_URL = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1/chat/completions"
+API_URL = "https://router.huggingface.co/v1/chat/completions"
 headers = {
     "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
     "X-HF-Bill-To": "huggingface"
@@ -18,7 +18,7 @@ def query(payload):
             "content": "What is the capital of France?"
         }
     ],
-    "model": "meta-llama/Llama-3.1-8B-Instruct"
+    "model": "meta-llama/Llama-3.1-8B-Instruct:hf-inference"
 })
 
 print(response["choices"][0]["message"])
diff --git a/packages/tasks-gen/snippets-fixtures/bill-to-param/sh/curl/0.hf-inference.sh b/packages/tasks-gen/snippets-fixtures/bill-to-param/sh/curl/0.hf-inference.sh
@@ -1,4 +1,4 @@
-curl https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1/chat/completions \
+curl https://router.huggingface.co/v1/chat/completions \
     -H "Authorization: Bearer $HF_TOKEN" \
     -H 'Content-Type: application/json' \
     -H 'X-HF-Bill-To: huggingface' \
@@ -9,6 +9,6 @@ curl https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-I
                 "content": "What is the capital of France?"
             }
         ],
-        "model": "meta-llama/Llama-3.1-8B-Instruct",
+        "model": "meta-llama/Llama-3.1-8B-Instruct:hf-inference",
         "stream": false
     }'
diff --git a/...ges/tasks-gen/snippets-fixtures/conversational-llm-non-stream/js/openai/0.hf-inference.js b/...ges/tasks-gen/snippets-fixtures/conversational-llm-non-stream/js/openai/0.hf-inference.js
@@ -1,12 +1,12 @@
 import { OpenAI } from "openai";
 
 const client = new OpenAI({
-	baseURL: "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
+	baseURL: "https://router.huggingface.co/v1",
 	apiKey: process.env.HF_TOKEN,
 });
 
 const chatCompletion = await client.chat.completions.create({
-	model: "meta-llama/Llama-3.1-8B-Instruct",
+	model: "meta-llama/Llama-3.1-8B-Instruct:hf-inference",
     messages: [
         {
             role: "user",

diff --git a/packages/tasks-gen/snippets-fixtures/conversational-llm-non-stream/js/openai/0.together.js b/packages/tasks-gen/snippets-fixtures/conversational-llm-non-stream/js/openai/0.together.js
@@ -1,12 +1,12 @@
 import { OpenAI } from "openai";
 
 const client = new OpenAI({
-	baseURL: "https://router.huggingface.co/together/v1",
+	baseURL: "https://router.huggingface.co/v1",
 	apiKey: process.env.HF_TOKEN,
 });
 
 const chatCompletion = await client.chat.completions.create({
-	model: "<together alias for meta-llama/Llama-3.1-8B-Instruct>",
+	model: "meta-llama/Llama-3.1-8B-Instruct:together",
     messages: [
         {
             role: "user",

diff --git a/...tasks-gen/snippets-fixtures/conversational-llm-non-stream/python/openai/0.hf-inference.py b/...tasks-gen/snippets-fixtures/conversational-llm-non-stream/python/openai/0.hf-inference.py
@@ -2,12 +2,12 @@
 from openai import OpenAI
 
 client = OpenAI(
-    base_url="https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
+    base_url="https://router.huggingface.co/v1",
     api_key=os.environ["HF_TOKEN"],
 )
 
 completion = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="meta-llama/Llama-3.1-8B-Instruct:hf-inference",
     messages=[
         {
             "role": "user",

diff --git a/...ges/tasks-gen/snippets-fixtures/conversational-llm-non-stream/python/openai/0.together.py b/...ges/tasks-gen/snippets-fixtures/conversational-llm-non-stream/python/openai/0.together.py
@@ -2,12 +2,12 @@
 from openai import OpenAI
 
 client = OpenAI(
-    base_url="https://router.huggingface.co/together/v1",
+    base_url="https://router.huggingface.co/v1",
     api_key=os.environ["HF_TOKEN"],
 )
 
 completion = client.chat.completions.create(
-    model="<together alias for meta-llama/Llama-3.1-8B-Instruct>",
+    model="meta-llama/Llama-3.1-8B-Instruct:together",
     messages=[
         {
             "role": "user",

diff --git a/...sks-gen/snippets-fixtures/conversational-llm-non-stream/python/requests/0.hf-inference.py b/...sks-gen/snippets-fixtures/conversational-llm-non-stream/python/requests/0.hf-inference.py
@@ -1,7 +1,7 @@
 import os
 import requests
 
-API_URL = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1/chat/completions"
+API_URL = "https://router.huggingface.co/v1/chat/completions"
 headers = {
     "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
 }
@@ -17,7 +17,7 @@ def query(payload):
             "content": "What is the capital of France?"
         }
     ],
-    "model": "meta-llama/Llama-3.1-8B-Instruct"
+    "model": "meta-llama/Llama-3.1-8B-Instruct:hf-inference"
 })
 
 print(response["choices"][0]["message"])
diff --git a/...s/tasks-gen/snippets-fixtures/conversational-llm-non-stream/python/requests/0.together.py b/...s/tasks-gen/snippets-fixtures/conversational-llm-non-stream/python/requests/0.together.py
@@ -1,7 +1,7 @@
 import os
 import requests
 
-API_URL = "https://router.huggingface.co/together/v1/chat/completions"
+API_URL = "https://router.huggingface.co/v1/chat/completions"
 headers = {
     "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
 }
@@ -17,7 +17,7 @@ def query(payload):
             "content": "What is the capital of France?"
         }
     ],
-    "model": "<together alias for meta-llama/Llama-3.1-8B-Instruct>"
+    "model": "meta-llama/Llama-3.1-8B-Instruct:together"
 })
 
 print(response["choices"][0]["message"])
diff --git a/packages/tasks-gen/snippets-fixtures/conversational-llm-non-stream/sh/curl/0.hf-inference.sh b/packages/tasks-gen/snippets-fixtures/conversational-llm-non-stream/sh/curl/0.hf-inference.sh
@@ -1,4 +1,4 @@
-curl https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1/chat/completions \
+curl https://router.huggingface.co/v1/chat/completions \
     -H "Authorization: Bearer $HF_TOKEN" \
     -H 'Content-Type: application/json' \
     -d '{
@@ -8,6 +8,6 @@ curl https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-I
                 "content": "What is the capital of France?"
             }
         ],
-        "model": "meta-llama/Llama-3.1-8B-Instruct",
+        "model": "meta-llama/Llama-3.1-8B-Instruct:hf-inference",
         "stream": false
     }'
diff --git a/packages/tasks-gen/snippets-fixtures/conversational-llm-non-stream/sh/curl/0.together.sh b/packages/tasks-gen/snippets-fixtures/conversational-llm-non-stream/sh/curl/0.together.sh
@@ -1,4 +1,4 @@
-curl https://router.huggingface.co/together/v1/chat/completions \
+curl https://router.huggingface.co/v1/chat/completions \
     -H "Authorization: Bearer $HF_TOKEN" \
     -H 'Content-Type: application/json' \
     -d '{
@@ -8,6 +8,6 @@ curl https://router.huggingface.co/together/v1/chat/completions \
                 "content": "What is the capital of France?"
             }
         ],
-        "model": "<together alias for meta-llama/Llama-3.1-8B-Instruct>",
+        "model": "meta-llama/Llama-3.1-8B-Instruct:together",
         "stream": false
     }'
diff --git a/packages/tasks-gen/snippets-fixtures/conversational-llm-stream/js/openai/0.hf-inference.js b/packages/tasks-gen/snippets-fixtures/conversational-llm-stream/js/openai/0.hf-inference.js
@@ -1,12 +1,12 @@
 import { OpenAI } from "openai";
 
 const client = new OpenAI({
-	baseURL: "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
+	baseURL: "https://router.huggingface.co/v1",
 	apiKey: process.env.HF_TOKEN,
 });
 
 const stream = await client.chat.completions.create({
-    model: "meta-llama/Llama-3.1-8B-Instruct",
+    model: "meta-llama/Llama-3.1-8B-Instruct:hf-inference",
     messages: [
         {
             role: "user",

diff --git a/packages/tasks-gen/snippets-fixtures/conversational-llm-stream/js/openai/0.together.js b/packages/tasks-gen/snippets-fixtures/conversational-llm-stream/js/openai/0.together.js
@@ -1,12 +1,12 @@
 import { OpenAI } from "openai";
 
 const client = new OpenAI({
-	baseURL: "https://router.huggingface.co/together/v1",
+	baseURL: "https://router.huggingface.co/v1",
 	apiKey: process.env.HF_TOKEN,
 });
 
 const stream = await client.chat.completions.create({
-    model: "<together alias for meta-llama/Llama-3.1-8B-Instruct>",
+    model: "meta-llama/Llama-3.1-8B-Instruct:together",
     messages: [
         {
             role: "user",

diff --git a/...ges/tasks-gen/snippets-fixtures/conversational-llm-stream/python/openai/0.hf-inference.py b/...ges/tasks-gen/snippets-fixtures/conversational-llm-stream/python/openai/0.hf-inference.py
@@ -2,12 +2,12 @@
 from openai import OpenAI
 
 client = OpenAI(
-    base_url="https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
+    base_url="https://router.huggingface.co/v1",
     api_key=os.environ["HF_TOKEN"],
 )
 
 stream = client.chat.completions.create(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model="meta-llama/Llama-3.1-8B-Instruct:hf-inference",
     messages=[
         {
             "role": "user",

diff --git a/packages/tasks-gen/snippets-fixtures/conversational-llm-stream/python/openai/0.together.py b/packages/tasks-gen/snippets-fixtures/conversational-llm-stream/python/openai/0.together.py
@@ -2,12 +2,12 @@
 from openai import OpenAI
 
 client = OpenAI(
-    base_url="https://router.huggingface.co/together/v1",
+    base_url="https://router.huggingface.co/v1",
     api_key=os.environ["HF_TOKEN"],
 )
 
 stream = client.chat.completions.create(
-    model="<together alias for meta-llama/Llama-3.1-8B-Instruct>",
+    model="meta-llama/Llama-3.1-8B-Instruct:together",
     messages=[
         {
             "role": "user",

diff --git a/...s/tasks-gen/snippets-fixtures/conversational-llm-stream/python/requests/0.hf-inference.py b/...s/tasks-gen/snippets-fixtures/conversational-llm-stream/python/requests/0.hf-inference.py
@@ -2,7 +2,7 @@
 import json
 import requests
 
-API_URL = "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1/chat/completions"
+API_URL = "https://router.huggingface.co/v1/chat/completions"
 headers = {
     "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
 }
@@ -23,7 +23,7 @@ def query(payload):
             "content": "What is the capital of France?"
         }
     ],
-    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "model": "meta-llama/Llama-3.1-8B-Instruct:hf-inference",
     "stream": True,
 })
 

diff --git a/packages/tasks-gen/snippets-fixtures/conversational-llm-stream/python/requests/0.together.py b/packages/tasks-gen/snippets-fixtures/conversational-llm-stream/python/requests/0.together.py
@@ -2,7 +2,7 @@
 import json
 import requests
 
-API_URL = "https://router.huggingface.co/together/v1/chat/completions"
+API_URL = "https://router.huggingface.co/v1/chat/completions"
 headers = {
     "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
 }
@@ -23,7 +23,7 @@ def query(payload):
             "content": "What is the capital of France?"
         }
     ],
-    "model": "<together alias for meta-llama/Llama-3.1-8B-Instruct>",
+    "model": "meta-llama/Llama-3.1-8B-Instruct:together",
     "stream": True,
 })