ThinkInAIXYZ · zerob13 · May 16, 2026 · May 15, 2026
diff --git a/docs/features/unified-tts-provider/plan.md b/docs/features/unified-tts-provider/plan.md
@@ -0,0 +1,37 @@
+# Plan
+
+## Approach
+Treat TTS as a first-class model capability and follow the `ImageGeneration` routing strategy:
+- Extend shared model/type schema to include `tts`.
+- Add runtime TTS routing ahead of default chat generation.
+- Dispatch by model pattern:
+  - Pattern A: `/v1/audio/speech`
+  - Pattern B: `/v1/chat/completions` with `audio` output
+- Normalize the returned audio into a data URL, cache it through the existing device cache, then emit `image_data` with an audio MIME type.
+
+## Affected Areas
+- Shared types/contracts:
+  - `src/shared/model.ts`
+  - `src/shared/types/model-db.ts`
+  - `src/shared/types/presenters/legacy.presenters.d.ts`
+  - `src/shared/contracts/common.ts`
+  - `src/shared/contracts/domainSchemas.ts`
+  - `src/shared/ttsSettings.ts` (new)
+- Main runtime/provider:
+  - `src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts`
+  - `src/main/presenter/llmProviderPresenter/providers/aiSdkProvider.ts`
+- Model DB:
+  - `resources/model-db/providers.json`
+- Renderer model type detection:
+  - `src/renderer/src/composables/useModelTypeDetection.ts`
+
+## Compatibility
+- Existing chat and image generation paths remain unchanged.
+- Existing renderer audio playback remains unchanged because it already handles `image_data` events with an `audio/*` MIME type.
+
+## Verification Strategy
+Run:
+- `pnpm run typecheck`
+- `pnpm run format`
+- `pnpm run i18n`
+- `pnpm run lint`
diff --git a/docs/features/unified-tts-provider/spec.md b/docs/features/unified-tts-provider/spec.md
@@ -0,0 +1,34 @@
+# Unified TTS Provider (Model-Level)
+
+## User Need
+Users want TTS integrated as a model capability (`ModelType.TTS`) instead of per-provider custom integration, so any OpenAI-compatible provider can work if its model metadata marks TTS support.
+
+## Goal
+Enable model-level TTS routing in DeepChat similar to image generation routing, including:
+- Standard OpenAI `/v1/audio/speech` TTS models
+- Chat-completions-audio TTS models that return base64 audio
+
+## Acceptance Criteria
+1. `ModelType.TTS` is available in shared model contracts and model-db schema.
+2. Runtime can route TTS models by model capability metadata and endpoint hints.
+3. Runtime supports both TTS patterns and emits `image_data` events with `audio/*` MIME type for existing renderer playback.
+4. Model DB can represent TTS model type for built-in provider entries.
+5. Frontend model type detection exposes TTS model state for UI behavior alignment.
+6. Validation commands pass:
+   - `pnpm run typecheck`
+   - `pnpm run format`
+   - `pnpm run i18n`
+   - `pnpm run lint`
+
+## Constraints
+- Reuse existing audio rendering path via `image_data`; avoid introducing new stream event types.
+- Keep provider integration generic for OpenAI-compatible providers.
+- Do not introduce dedicated UI for TTS settings in this scope.
+
+## Non-Goals
+- New TTS player UI.
+- Voice catalog fetching UX.
+- VoiceAI provider refactor.
+
+## Open Questions
+- None for current scope.
diff --git a/docs/features/unified-tts-provider/tasks.md b/docs/features/unified-tts-provider/tasks.md
@@ -0,0 +1,22 @@
+# Tasks
+
+## Shared Types + Runtime
+- [x] Add `ModelType.TTS` and `ApiEndpointType.AudioSpeech` in shared model enums.
+- [x] Extend model-db schema and parser for `tts` type.
+- [x] Add `src/shared/ttsSettings.ts` helpers for pattern detection and format normalization.
+- [x] Extend presenter model config contracts with optional `tts` settings.
+- [x] Add TTS route in runtime supporting pattern A and pattern B.
+- [x] Inject `shouldUseTts` capability check from AI SDK provider.
+
+## Model DB
+- [x] Mark relevant `aihubmix` models as `type: "tts"` in provider model list.
+- [x] Check whether a built-in `xiaomimimo` provider entry exists; none does, so built-in DB coverage is skipped.
+
+## Renderer
+- [x] Extend `useModelTypeDetection` to include `tts` and expose `isTtsModel`.
+
+## Validation
+- [x] Run `pnpm run typecheck`.
+- [x] Run `pnpm run format`.
+- [x] Run `pnpm run i18n`.
+- [x] Run `pnpm run lint`.
diff --git a/resources/model-db/providers.json b/resources/model-db/providers.json
@@ -181526,6 +181526,160 @@
           },
           "type": "chat"
         },
+        {
+          "id": "tts-1",
+          "name": "tts-1",
+          "display_name": "tts-1",
+          "modalities": {
+            "input": [
+              "text"
+            ],
+            "output": [
+              "audio"
+            ]
+          },
+          "limit": {
+            "context": 8192,
+            "output": 8192
+          },
+          "temperature": false,
+          "tool_call": false,
+          "reasoning": {
+            "supported": false
+          },
+          "attachment": false,
+          "open_weights": false,
+          "cost": {
+            "input": 15,
+            "output": 15
+          },
+          "type": "tts"
+        },
+        {
+          "id": "tts-1-hd",
+          "name": "tts-1-hd",
+          "display_name": "tts-1-hd",
+          "modalities": {
+            "input": [
+              "text"
+            ],
+            "output": [
+              "audio"
+            ]
+          },
+          "limit": {
+            "context": 8192,
+            "output": 8192
+          },
+          "temperature": false,
+          "tool_call": false,
+          "reasoning": {
+            "supported": false
+          },
+          "attachment": false,
+          "open_weights": false,
+          "cost": {
+            "input": 30,
+            "output": 30
+          },
+          "type": "tts"
+        },
+        {
+          "id": "gpt-4o-mini-tts",
+          "name": "gpt-4o-mini-tts",
+          "display_name": "gpt-4o-mini-tts",
+          "modalities": {
+            "input": [
+              "text"
+            ],
+            "output": [
+              "audio"
+            ]
+          },
+          "temperature": false,
+          "tool_call": false,
+          "reasoning": {
+            "supported": false
+          },
+          "attachment": false,
+          "cost": {
+            "input": 0.48,
+            "output": 0.96
+          },
+          "type": "tts"
+        },
+        {
+          "id": "gemini-2.5-flash-preview-tts",
+          "name": "gemini-2.5-flash-preview-tts",
+          "display_name": "gemini-2.5-flash-preview-tts",
+          "modalities": {
+            "input": [
+              "text"
+            ],
+            "output": [
+              "audio"
+            ]
+          },
+          "limit": {
+            "context": 8192,
+            "output": 8192
+          },
+          "temperature": false,
+          "tool_call": false,
+          "reasoning": {
+            "supported": false
+          },
+          "attachment": false,
+          "open_weights": false,
+          "knowledge": "2025-01",
+          "release_date": "2025-05-01",
+          "last_updated": "2025-05-01",
+          "cost": {
+            "input": 0.5,
+            "output": 0.5,
+            "cache_read": 0
+          },
+          "type": "tts"
+        },
+        {
+          "id": "gemini-2.5-pro-preview-tts",
+          "name": "gemini-2.5-pro-preview-tts",
+          "display_name": "gemini-2.5-pro-preview-tts",
+          "modalities": {
+            "input": [
+              "text"
+            ],
+            "output": [
+              "audio"
+            ]
+          },
+          "limit": {
+            "context": 8192,
+            "output": 8192
+          },
+          "temperature": false,
+          "tool_call": false,
+          "reasoning": {
+            "supported": true,
+            "default": true
+          },
+          "extra_capabilities": {
+            "reasoning": {
+              "supported": true
+            }
+          },
+          "attachment": false,
+          "open_weights": false,
+          "knowledge": "2025-01",
+          "release_date": "2025-05-01",
+          "last_updated": "2025-05-01",
+          "cost": {
+            "input": 0.5,
+            "output": 0.5,
+            "cache_read": 0
+          },
+          "type": "tts"
+        },
         {
           "id": "doubao-seed-2-0-pro",
           "name": "doubao-seed-2-0-pro",

diff --git a/src/main/presenter/configPresenter/index.ts b/src/main/presenter/configPresenter/index.ts
@@ -984,6 +984,8 @@ export class ConfigPresenter implements IConfigPresenter {
         return ModelType.Rerank
       case 'imageGeneration':
         return ModelType.ImageGeneration
+      case 'tts':
+        return ModelType.TTS
       case 'chat':
       default:
         return ModelType.Chat

diff --git a/src/main/presenter/configPresenter/modelConfig.ts b/src/main/presenter/configPresenter/modelConfig.ts
@@ -121,6 +121,8 @@ export class ModelConfigHelper {
           return ModelType.Rerank
         case 'imageGeneration':
           return ModelType.ImageGeneration
+        case 'tts':
+          return ModelType.TTS
         default:
           // Invalid type, fall through to default
           break
@@ -176,7 +178,11 @@ export class ModelConfigHelper {
       reasoning: Boolean(reasoningEnabled),
       type: modelType,
       apiEndpoint:
-        modelType === ModelType.ImageGeneration ? ApiEndpointType.Image : ApiEndpointType.Chat,
+        modelType === ModelType.ImageGeneration
+          ? ApiEndpointType.Image
+          : modelType === ModelType.TTS
+            ? ApiEndpointType.AudioSpeech
+            : ApiEndpointType.Chat,
       thinkingBudget,
       forceInterleavedThinkingCompat,
       reasoningEffort,