Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 269 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,275 @@ print(output_text)
```


## Ming SDK

Ming SDK provides a simple and easy-to-use Python API for quickly integrating the multimodal capabilities of Ming-flash-omni 2.0.

### SDK Features

- **Unified API Interface**: Supports text generation, speech synthesis, image generation/editing, and more
- **Streaming Output Support**: Supports streaming generation for text and speech, suitable for real-time interaction scenarios
- **Flexible Device Configuration**: Supports multi-GPU deployment and memory optimization
- **Complete Usage Statistics**: Provides detailed statistics on token usage, audio duration, etc.

### SDK Installation

#### Install VLLM

```bash
pip install vllm-0.8.5.post3.dev90+gffc0d5a3f.ant-cp310-cp310-linux_x86_64.whl
```

#### Install Ming SDK
#### Option 1: Build from Source

```bash
# 1. Clone the repository
git clone https://github.com/inclusionAI/Ming.git
cd Ming

# 2. Install dependencies
pip install -r requirements.txt

python ming_sdk/setup.py bdist_wheel

pip3 install dist/ming_sdk-1.0.0-py3-none-any.whl
```

### SDK Usage Examples

#### Initialize SDK

```python
from ming_sdk import Ming

# Configuration parameters
model_path = "your model path" # Model path
device = "0,1,2,3" # GPU devices, supports multi-GPU parallelism
gpu_memory_utilization = {"moe": 0.8, "talker": 0.17} # GPU memory utilization
device_map = {"talker": ["cuda:0"]} # Module device mapping

# Initialize Ming instance
ming = Ming(
model_path=model_path,
device=device,
gpu_memory_utilization=gpu_memory_utilization,
device_map=device_map,
speaker="DB30", # TTS speaker ID
with_async=True,
use_talker=True
)
```

#### Text Generation

```python
# Non-streaming text generation
text, usage = ming.generate(text="介绍一下杭州")
print(f"text:{text}")
print(f"usage:{usage}")
assert text is not None


# Streaming text generation
all_text = ""
request_id = ""
for text, request_id, usage in ming.generate_stream(
text="介绍一下杭州", max_new_tokens=128
):
all_text += text
print(f"request_id:{request_id},text={all_text},usage={usage}")
assert text is not None
print(f"\nFull text: {all_text}")
```
#### Speech QA
```python
# Speech QA
output_audio_path = "test.wav"
waveform, gen_text, usage = ming.generate(
text="介绍一下杭州", output_type="speech", max_new_tokens=128
)
sr = 44100
torchaudio.save(output_audio_path, waveform, sr)
assert os.path.exists(output_audio_path)
print(f"gen_text:{gen_text},usage={usage}")


# Streaming speech QA
all_wavs = []
all_text = ""
request_id = ""
output_audio_path = "test_stream.wav"
for data_type, data_content in ming.generate_stream(
text="介绍一下杭州", output_type="speech", max_new_tokens=128
):
if data_type == "text_data":
text, usage = data_content
elif data_type == "text_audio_data":
tts_speech, text, meta_info, session_id, usage = data_content
all_text += text
all_wavs.append(tts_speech)
waveform = torch.cat(all_wavs, dim=-1)
sr = 44100
torchaudio.save(output_audio_path, waveform, sr)
print(
f"request_id:{request_id},audio:{output_audio_path},text={all_text},usage={usage}"
)
assert os.path.exists(output_audio_path)


# Streaming speech QA with interruption.
# Pass a known msg_request_id so the stream can actually be interrupted;
# generate_interrupt() needs the same ID that the stream was started with.
all_wavs = []
all_text = ""
request_id = "speech-qa-interrupt-001"
output_audio_path = "test_stream.wav"
for data_type, data_content in ming.generate_stream(
    text="介绍一下杭州", output_type="speech", max_new_tokens=128,
    msg_request_id=request_id
):
if data_type == "text_data":
text, usage = data_content
elif data_type == "text_audio_data":
tts_speech, text, meta_info, session_id, usage = data_content
all_text += text
all_wavs.append(tts_speech)
if len(all_text) > 20:
ming.generate_interrupt(request_id)
waveform = torch.cat(all_wavs, dim=-1)
sr = 44100
torchaudio.save(output_audio_path, waveform, sr)
print(f"request_id:{request_id},audio:{output_audio_path},text={all_text}")
assert os.path.exists(output_audio_path)

```
Comment on lines +350 to +372
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The example for streaming speech QA with interruption has a bug. The request_id variable is initialized as an empty string and is never updated within the loop. However, it's passed to ming.generate_interrupt(request_id). This will not work as intended.

To fix this, you should pass a unique msg_request_id to the generate_stream call and use that same ID for the interruption, as shown in the 'Request Interruption' example later in the README. The session_id returned by the stream should also be used correctly if it's intended for this purpose.


#### ASR Task
```python
# ASR
asr_result, usage = ming.generate(
text="Please recognize the language of this speech and transcribe it. Format: oral.",
audio="https://example.com/audio.wav",
)
print(f"asr_result:{asr_result},usage={usage}")
assert asr_result is not None
```

#### Text-to-Speech (TTS)

```python
import torchaudio

# Non-streaming TTS
waveform, usage = ming.generate(
text="我爱北京故宫",
output_type="speech"
)
torchaudio.save("output_tts.wav", waveform, 44100)

# Streaming TTS
all_wavs = []
all_text = ""
for data_type, data_content in ming.generate_stream(
text="我爱北京故宫",
output_type="speech"
):
if data_type == "text_audio_data":
tts_speech, sentence, meta_info, session_id, usage = data_content
all_text += sentence
all_wavs.append(tts_speech)

# Save audio
waveform = torch.cat(all_wavs, dim=-1)
torchaudio.save("output_tts_stream.wav", waveform, 44100)
```

#### Speech-to-Speech

```python
# Non-streaming speech-to-speech
waveform, gen_text, usage = ming.generate(
audio="https://example.com/audio.wav",
output_type="speech",
max_new_tokens=128
)
torchaudio.save("output_speech.wav", waveform, 44100)

# Streaming speech-to-speech
all_wavs = []
all_text = ""
for data_type, data_content in ming.generate_stream(
audio="https://example.com/audio.wav",
output_type="speech",
max_new_tokens=128
):
if data_type == "text_data":
text, usage = data_content
elif data_type == "text_audio_data":
tts_speech, text, meta_info, session_id, usage = data_content
all_text += text
all_wavs.append(tts_speech)

waveform = torch.cat(all_wavs, dim=-1)
torchaudio.save("output_speech_stream.wav", waveform, 44100)  # keep the 44.1 kHz rate used by all other speech examples
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

There's an inconsistency in the sample rates used for saving audio files across different examples. Here, 16000 is used, while other examples (e.g., non-streaming TTS on line 395, non-streaming speech-to-speech on line 423) use 44100. This can be confusing for users. It would be best to use a consistent sample rate that matches the model's native output, and perhaps mention it as a configurable parameter.

```


#### Video Understanding

```python
# Video QA
text, usage = ming.generate(
text="详细描述一下这段视频",
video="test.mp4",
output_type="text"
)
print(f"Video description: {text}")
```

#### Request Interruption

```python
# You can interrupt the request during streaming generation
msg_request_id = "your-request-id"
for data_type, data_content in ming.generate_stream(
text="介绍一下杭州",
output_type="speech",
msg_request_id=msg_request_id
):
# Interrupt when condition is met
if some_condition:
ming.generate_interrupt(msg_request_id)
break
```

### Parameter Reference

#### Ming Initialization Parameters

| Parameter | Type | Default | Description |
|------|------|--------|------|
| `model_path` | str | Required | Model weights path, must contain config.json and am.mvn |
| `sys_prompt` | str | "" | System prompt, prepended to all conversations |
| `device` | str | "0" | GPU device IDs, comma-separated for multi-GPU, e.g., "0,1,2,3" |
| `gpu_memory_utilization` | dict | {"moe": 0.6, "talker": 0.1} | GPU memory utilization for each module |
| `device_map` | dict | {"talker": ["cuda:0"], "image": "cuda:0"} | Mapping from modules to GPUs |
| `speaker` | str | "DB30" | TTS speaker ID |
| `quantization` | str \| None | None | Quantization configuration |
| `use_talker` | bool | True | Whether to load TTS module |

#### generate Method Parameters

| Parameter | Type | Default | Description |
|------|------|--------|------|
| `text` | str \| None | None | Input text |
| `audio` | str \| bytes \| List | None | Audio input (file path/binary/list) |
| `video` | str \| bytes \| List | None | Video input (file path/binary/list) |
| `image` | str \| bytes \| List | None | Image input (file path/binary/PIL Image/list) |
| `history` | list | [] | Conversation history |
| `output_type` | str | "text" | Output type: text/speech/image/tts |
| `max_new_tokens` | int | 512 | Maximum number of tokens to generate |

### Complete Examples

For more complete examples, please refer to [ming_sdk/ming_test.py](ming_sdk/ming_test.py).


## Citation

If you find our work helpful, feel free to give us a cite.
Expand Down
24 changes: 12 additions & 12 deletions bailingmm_utils_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,22 +286,21 @@ def v1_smart_nframes(
int: the number of frames for video used for model inputs.
"""
assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"

min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)

max_frames = max(
1,
floor_by_factor(
ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR
),
)
if "nframes" in ele:
nframes = min(total_frames, round_by_factor(ele["nframes"], FRAME_FACTOR), max_frames)
else:
fps = ele.get("max_video_fps", FPS)
nframes = total_frames / video_fps * fps
nframes = max(1, total_frames / video_fps * fps)
if nframes > total_frames:
logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]")
nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
nframes = floor_by_factor(nframes, FRAME_FACTOR)
if not (FRAME_FACTOR <= nframes <= total_frames):
raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
return nframes
nframes = min(min(nframes, max_frames), total_frames)
return int(nframes)
Comment on lines +289 to +303
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The refactoring of v1_smart_nframes has removed the validation that ensured the number of frames (nframes) is at least FRAME_FACTOR (which is 2). The new logic only guarantees nframes >= 1. This could introduce a regression if downstream code expects at least 2 frames. Additionally, the ValueError that was previously raised for out-of-bounds nframes has been removed, which might hide potential configuration issues from the user.



def v1_sample_video(video_fps, total_frames, ele: dict) -> List[int]:
Expand Down Expand Up @@ -367,8 +366,9 @@ def v1_fetch_video(
return_metadata: bool = False,
) -> torch.Tensor | list[Image.Image]:
if isinstance(ele["video"], str):
video, smp_fps = load_video(ele["video"], sampler=v2_sample_video)

video, smp_fps = load_video(
ele["video"], sampler=partial(v1_sample_video, ele=ele)
)
if "resized_height" in ele and "resized_width" in ele:
resized_height, resized_width = smart_resize(
ele["resized_height"],
Expand Down
1 change: 1 addition & 0 deletions configuration_bailing_moe_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def __init__(
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.use_qkv_bias = use_qkv_bias
self.use_qk_norm = use_qk_norm
self.use_bias = use_bias
self.norm_head = norm_head
self.rms_norm_eps = rms_norm_eps
Expand Down
41 changes: 41 additions & 0 deletions configuration_bailingmm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# coding=utf-8
# Copyright 2024 ANT Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers import PretrainedConfig
from configuration_bailing_moe import BailingMoeConfig
from audio_tokenizer.configuration_audio_vae import AudioVAEconfig


class BailingMMConfig(PretrainedConfig):
    """Top-level HuggingFace configuration for the BailingMM multimodal model.

    Bundles the sub-configurations of the individual modules:

    Args:
        llm_config: Language-model configuration. May be passed either as a
            ``BailingMoeConfig`` instance or as a plain dict (e.g. when loaded
            from ``config.json``), in which case it is rehydrated — unless
            ``model_type == 'dense'``, where the value is stored as-is.
        audio_tokenizer_config: Audio VAE tokenizer configuration; a dict is
            likewise converted into an ``AudioVAEconfig``.
        ditar_config: DiTAR module configuration, kept as a raw dict.
        aggregator_config: Aggregator module configuration, kept as a raw dict.
        model_type: Variant selector (e.g. ``'dense'``).
            NOTE(review): the ``None`` default overwrites the class-level
            ``model_type = "bailingmm"`` on instances — confirm this is intended.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "bailingmm"

    def __init__(
        self,
        llm_config: BailingMoeConfig = None,
        audio_tokenizer_config: AudioVAEconfig = None,
        ditar_config: dict = None,
        aggregator_config: dict = None,
        model_type: str = None,
        **kwargs
    ):
        self.model_type = model_type

        # Dense variant stores the LLM config untouched; every other variant
        # rehydrates a dict payload into a full BailingMoeConfig object.
        if self.model_type == 'dense':
            self.llm_config = llm_config
        elif isinstance(llm_config, dict):
            self.llm_config = BailingMoeConfig(**llm_config)
        else:
            self.llm_config = llm_config

        if isinstance(audio_tokenizer_config, dict):
            self.audio_tokenizer_config = AudioVAEconfig(**audio_tokenizer_config)
        else:
            self.audio_tokenizer_config = audio_tokenizer_config

        self.ditar_config = ditar_config
        self.aggregator_config = aggregator_config
        super().__init__(**kwargs)
Loading