Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 269 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,275 @@ print(output_text)
```


## Ming SDK

Ming SDK provides a simple and easy-to-use Python API for quickly integrating the multimodal capabilities of Ming-flash-omni 2.0.

### SDK Features

- **Unified API Interface**: Supports text generation, speech synthesis, image generation/editing, and more
- **Streaming Output Support**: Supports streaming generation for text and speech, suitable for real-time interaction scenarios
- **Flexible Device Configuration**: Supports multi-GPU deployment and memory optimization
- **Complete Usage Statistics**: Provides detailed statistics on token usage, audio duration, etc.

### SDK Installation

#### Install VLLM

```bash
pip install vllm-0.8.5.post3.dev90+gffc0d5a3f.ant-cp310-cp310-linux_x86_64.whl
```

#### Install Ming SDK
#### Option 1: Build from Source

```bash
# 1. Clone the repository
git clone https://github.com/inclusionAI/Ming.git
cd Ming

# 2. Install dependencies
pip install -r requirements.txt

python ming_sdk/setup.py bdist_wheel

pip3 install dist/ming_sdk-1.0.0-py3-none-any.whl
```

### SDK Usage Examples

#### Initialize SDK

```python
from ming_sdk import Ming

# Configuration parameters
model_path = "your model path" # Model path
device = "0,1,2,3" # GPU devices, supports multi-GPU parallelism
gpu_memory_utilization = {"moe": 0.8, "talker": 0.17} # GPU memory utilization
device_map = {"talker": ["cuda:0"]} # Module device mapping

# Initialize Ming instance
ming = Ming(
model_path=model_path,
device=device,
gpu_memory_utilization=gpu_memory_utilization,
device_map=device_map,
speaker="DB30", # TTS speaker ID
with_async=True,
use_talker=True
)
```

#### Text Generation

```python
# Non-streaming text generation
text, usage = ming.generate(text="介绍一下杭州")
print(f"text:{text}")
print(f"usage:{usage}")
assert text is not None


# Streaming text generation
all_text = ""
request_id = ""
for text, request_id, usage in ming.generate_stream(
text="介绍一下杭州", max_new_tokens=128
):
all_text += text
print(f"request_id:{request_id},text={all_text},usage={usage}")
assert text is not None
print(f"\nFull text: {all_text}")
```
#### Speech QA
```python
# Speech QA
output_audio_path = "test.wav"
waveform, gen_text, usage = ming.generate(
text="介绍一下杭州", output_type="speech", max_new_tokens=128
)
sr = 44100
torchaudio.save(output_audio_path, waveform, sr)
assert os.path.exists(output_audio_path)
print(f"gen_text:{gen_text},usage={usage}")


# Streaming speech QA
all_wavs = []
all_text = ""
request_id = ""
output_audio_path = "test_stream.wav"
for data_type, data_content in ming.generate_stream(
text="介绍一下杭州", output_type="speech", max_new_tokens=128
):
if data_type == "text_data":
text, usage = data_content
elif data_type == "text_audio_data":
tts_speech, text, meta_info, session_id, usage = data_content
all_text += text
all_wavs.append(tts_speech)
waveform = torch.cat(all_wavs, dim=-1)
sr = 44100
torchaudio.save(output_audio_path, waveform, sr)
print(
f"request_id:{request_id},audio:{output_audio_path},text={all_text},usage={usage}"
)
assert os.path.exists(output_audio_path)


# Streaming speech QA with interruption.
# Pass a known msg_request_id so the stream can actually be interrupted;
# generate_interrupt() needs the same ID that the stream was started with.
all_wavs = []
all_text = ""
request_id = "speech-qa-interrupt-001"
output_audio_path = "test_stream.wav"
for data_type, data_content in ming.generate_stream(
    text="介绍一下杭州", output_type="speech", max_new_tokens=128,
    msg_request_id=request_id
):
if data_type == "text_data":
text, usage = data_content
elif data_type == "text_audio_data":
tts_speech, text, meta_info, session_id, usage = data_content
all_text += text
all_wavs.append(tts_speech)
if len(all_text) > 20:
ming.generate_interrupt(request_id)
waveform = torch.cat(all_wavs, dim=-1)
sr = 44100
torchaudio.save(output_audio_path, waveform, sr)
print(f"request_id:{request_id},audio:{output_audio_path},text={all_text}")
assert os.path.exists(output_audio_path)

```
Comment on lines +350 to +372
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The example for streaming speech QA with interruption has a bug. The request_id variable is initialized as an empty string and is never updated within the loop. However, it's passed to ming.generate_interrupt(request_id). This will not work as intended.

To fix this, you should pass a unique msg_request_id to the generate_stream call and use that same ID for the interruption, as shown in the 'Request Interruption' example later in the README. The session_id returned by the stream should also be used correctly if it's intended for this purpose.


#### ASR Task
```python
# ASR
asr_result, usage = ming.generate(
text="Please recognize the language of this speech and transcribe it. Format: oral.",
audio="https://example.com/audio.wav",
)
print(f"asr_result:{asr_result},usage={usage}")
assert asr_result is not None
```

#### Text-to-Speech (TTS)

```python
import torchaudio

# Non-streaming TTS
waveform, usage = ming.generate(
text="我爱北京故宫",
output_type="speech"
)
torchaudio.save("output_tts.wav", waveform, 44100)

# Streaming TTS
all_wavs = []
all_text = ""
for data_type, data_content in ming.generate_stream(
text="我爱北京故宫",
output_type="speech"
):
if data_type == "text_audio_data":
tts_speech, sentence, meta_info, session_id, usage = data_content
all_text += sentence
all_wavs.append(tts_speech)

# Save audio
waveform = torch.cat(all_wavs, dim=-1)
torchaudio.save("output_tts_stream.wav", waveform, 44100)
```

#### Speech-to-Speech

```python
# Non-streaming speech-to-speech
waveform, gen_text, usage = ming.generate(
audio="https://example.com/audio.wav",
output_type="speech",
max_new_tokens=128
)
torchaudio.save("output_speech.wav", waveform, 44100)

# Streaming speech-to-speech
all_wavs = []
all_text = ""
for data_type, data_content in ming.generate_stream(
audio="https://example.com/audio.wav",
output_type="speech",
max_new_tokens=128
):
if data_type == "text_data":
text, usage = data_content
elif data_type == "text_audio_data":
tts_speech, text, meta_info, session_id, usage = data_content
all_text += text
all_wavs.append(tts_speech)

waveform = torch.cat(all_wavs, dim=-1)
torchaudio.save("output_speech_stream.wav", waveform, 44100)  # keep the 44.1 kHz rate used by all other speech examples
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

There's an inconsistency in the sample rates used for saving audio files across different examples. Here, 16000 is used, while other examples (e.g., non-streaming TTS on line 395, non-streaming speech-to-speech on line 423) use 44100. This can be confusing for users. It would be best to use a consistent sample rate that matches the model's native output, and perhaps mention it as a configurable parameter.

```


#### Video Understanding

```python
# Video QA
text, usage = ming.generate(
text="详细描述一下这段视频",
video="test.mp4",
output_type="text"
)
print(f"Video description: {text}")
```

#### Request Interruption

```python
# You can interrupt the request during streaming generation
msg_request_id = "your-request-id"
for data_type, data_content in ming.generate_stream(
text="介绍一下杭州",
output_type="speech",
msg_request_id=msg_request_id
):
# Interrupt when condition is met
if some_condition:
ming.generate_interrupt(msg_request_id)
break
```

### Parameter Reference

#### Ming Initialization Parameters

| Parameter | Type | Default | Description |
|------|------|--------|------|
| `model_path` | str | Required | Model weights path, must contain config.json and am.mvn |
| `sys_prompt` | str | "" | System prompt, prepended to all conversations |
| `device` | str | "0" | GPU device IDs, comma-separated for multi-GPU, e.g., "0,1,2,3" |
| `gpu_memory_utilization` | dict | {"moe": 0.6, "talker": 0.1} | GPU memory utilization for each module |
| `device_map` | dict | {"talker": ["cuda:0"], "image": "cuda:0"} | Mapping from modules to GPUs |
| `speaker` | str | "DB30" | TTS speaker ID |
| `quantization` | str \| None | None | Quantization configuration |
| `use_talker` | bool | True | Whether to load TTS module |

#### generate Method Parameters

| Parameter | Type | Default | Description |
|------|------|--------|------|
| `text` | str \| None | None | Input text |
| `audio` | str \| bytes \| List | None | Audio input (file path/binary/list) |
| `video` | str \| bytes \| List | None | Video input (file path/binary/list) |
| `image` | str \| bytes \| List | None | Image input (file path/binary/PIL Image/list) |
| `history` | list | [] | Conversation history |
| `output_type` | str | "text" | Output type: text/speech/image/tts |
| `max_new_tokens` | int | 512 | Maximum number of tokens to generate |

### Complete Examples

For more complete examples, please refer to [ming_sdk/ming_test.py](ming_sdk/ming_test.py).


## Citation

If you find our work helpful, feel free to give us a cite.
Expand Down
24 changes: 12 additions & 12 deletions bailingmm_utils_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,22 +286,21 @@ def v1_smart_nframes(
int: the number of frames for video used for model inputs.
"""
assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"

min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)

max_frames = max(
1,
floor_by_factor(
ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR
),
)
if "nframes" in ele:
nframes = min(total_frames, round_by_factor(ele["nframes"], FRAME_FACTOR), max_frames)
else:
fps = ele.get("max_video_fps", FPS)
nframes = total_frames / video_fps * fps
nframes = max(1, total_frames / video_fps * fps)
if nframes > total_frames:
logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]")
nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
nframes = floor_by_factor(nframes, FRAME_FACTOR)
if not (FRAME_FACTOR <= nframes <= total_frames):
raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
return nframes
nframes = min(min(nframes, max_frames), total_frames)
return int(nframes)
Comment on lines +289 to +303
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The refactoring of v1_smart_nframes has removed the validation that ensured the number of frames (nframes) is at least FRAME_FACTOR (which is 2). The new logic only guarantees nframes >= 1. This could introduce a regression if downstream code expects at least 2 frames. Additionally, the ValueError that was previously raised for out-of-bounds nframes has been removed, which might hide potential configuration issues from the user.



def v1_sample_video(video_fps, total_frames, ele: dict) -> List[int]:
Expand Down Expand Up @@ -367,8 +366,9 @@ def v1_fetch_video(
return_metadata: bool = False,
) -> torch.Tensor | list[Image.Image]:
if isinstance(ele["video"], str):
video, smp_fps = load_video(ele["video"], sampler=v2_sample_video)

video, smp_fps = load_video(
ele["video"], sampler=partial(v1_sample_video, ele=ele)
)
if "resized_height" in ele and "resized_width" in ele:
resized_height, resized_width = smart_resize(
ele["resized_height"],
Expand Down
1 change: 1 addition & 0 deletions configuration_bailing_moe_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def __init__(
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.use_qkv_bias = use_qkv_bias
self.use_qk_norm = use_qk_norm
self.use_bias = use_bias
self.norm_head = norm_head
self.rms_norm_eps = rms_norm_eps
Expand Down
41 changes: 41 additions & 0 deletions configuration_bailingmm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# coding=utf-8
# Copyright 2024 ANT Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers import PretrainedConfig
from configuration_bailing_moe import BailingMoeConfig
from audio_tokenizer.configuration_audio_vae import AudioVAEconfig


class BailingMMConfig(PretrainedConfig):
    """Top-level HuggingFace configuration for the BailingMM multimodal model.

    Bundles the sub-configurations of the individual modules:

    Args:
        llm_config: Language-model configuration. May be passed either as a
            ``BailingMoeConfig`` instance or as a plain dict (e.g. when loaded
            from ``config.json``), in which case it is rehydrated — unless
            ``model_type == 'dense'``, where the value is stored as-is.
        audio_tokenizer_config: Audio VAE tokenizer configuration; a dict is
            likewise converted into an ``AudioVAEconfig``.
        ditar_config: DiTAR module configuration, kept as a raw dict.
        aggregator_config: Aggregator module configuration, kept as a raw dict.
        model_type: Variant selector (e.g. ``'dense'``).
            NOTE(review): the ``None`` default overwrites the class-level
            ``model_type = "bailingmm"`` on instances — confirm this is intended.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "bailingmm"

    def __init__(
        self,
        llm_config: BailingMoeConfig = None,
        audio_tokenizer_config: AudioVAEconfig = None,
        ditar_config: dict = None,
        aggregator_config: dict = None,
        model_type: str = None,
        **kwargs
    ):
        self.model_type = model_type

        # Dense variant stores the LLM config untouched; every other variant
        # rehydrates a dict payload into a full BailingMoeConfig object.
        if self.model_type == 'dense':
            self.llm_config = llm_config
        elif isinstance(llm_config, dict):
            self.llm_config = BailingMoeConfig(**llm_config)
        else:
            self.llm_config = llm_config

        if isinstance(audio_tokenizer_config, dict):
            self.audio_tokenizer_config = AudioVAEconfig(**audio_tokenizer_config)
        else:
            self.audio_tokenizer_config = audio_tokenizer_config

        self.ditar_config = ditar_config
        self.aggregator_config = aggregator_config
        super().__init__(**kwargs)
Loading