
Commit 5977b58

Add read_audio() and ASR test (#71)
* Add read_audio() and ASR test
* Fix
* Update audio type conversion
1 parent cf3468c commit 5977b58

File tree

3 files changed (+77, -1)
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+import type { PyodideInterface } from "pyodide";
+import { beforeEach, describe, it, expect } from "vitest";
+import { setupPyodideForTest } from "./utils";
+
+import { downloadFile } from "./utils";
+
+describe("read_audio() and ASR", () => {
+  let pyodide: PyodideInterface;
+
+  beforeEach(async () => {
+    pyodide = await setupPyodideForTest(["scipy"]);
+  });
+
+  it("can read an audio file from a local file", async () => {
+    await downloadFile(pyodide, "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav", "/tmp/jfk.wav")
+
+    await pyodide.runPythonAsync(`
+from transformers_js_py import import_transformers_js, read_audio
+import numpy as np
+
+transformers = await import_transformers_js()
+pipeline = transformers.pipeline
+pipe = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en')
+
+audio = read_audio("/tmp/jfk.wav", 16000)
+result = await pipe(audio)
+text = result["text"]
+`);
+    const text = await pyodide.globals.get("text");
+    expect(text).toEqual(" And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.");
+  });
+});
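For reference, the Python that the test feeds to runPythonAsync can also be run on its own in a Pyodide session where scipy is loaded and transformers.js is reachable. A minimal standalone sketch of the same flow (it assumes /tmp/jfk.wav was already downloaded, as the test does):

    from transformers_js_py import import_transformers_js, read_audio

    # Load transformers.js and build a Whisper (tiny, English) ASR pipeline.
    transformers = await import_transformers_js()
    pipeline = transformers.pipeline
    pipe = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en')

    # Decode the WAV file into a float32 array at Whisper's expected 16 kHz rate.
    audio = read_audio("/tmp/jfk.wav", 16000)
    result = await pipe(audio)
    print(result["text"])

Top-level await works here because Pyodide executes the snippet as asynchronous code via runPythonAsync.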

transformers_js_py/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
+from .audio import read_audio
 from .proxies import import_transformers_js
 from .url import as_url
 
-__all__ = ["as_url", "import_transformers_js"]
+__all__ = ["as_url", "read_audio", "import_transformers_js"]

transformers_js_py/audio.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+try:
+    import numpy as np
+except ImportError:
+    np = None  # type: ignore
+
+
+def read_audio(filename, sampling_rate: int) -> "np.ndarray":
+    # Refs:
+    # * https://github.com/xenova/transformers.js/blob/2.15.1/src/utils/audio.js#L42-L77
+    # * https://huggingface.co/docs/transformers.js/guides/node-audio-processing
+
+    try:
+        import numpy as np
+        import scipy  # type: ignore
+    except ImportError:
+        raise ImportError(
+            "You need to have `numpy` and `scipy` installed to use this feature."
+        )
+
+    original_sample_rate, samples = scipy.io.wavfile.read(filename, mmap=False)
+
+    # Ensure samples are float32
+    # Ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.read.html  # noqa: E501
+    if samples.dtype == np.int16:
+        samples = samples.astype(np.float32) / 32768.0
+    elif samples.dtype == np.int32:
+        samples = samples.astype(np.float32) / 2147483648.0
+    elif samples.dtype == np.uint8:
+        samples = (samples.astype(np.float32) - 128.0) / 128.0
+
+    if original_sample_rate != sampling_rate:
+        samples = scipy.signal.resample(
+            samples, int(len(samples) * sampling_rate / original_sample_rate)
+        )
+
+    if samples.ndim > 1 and samples.shape[1] > 1:
+        SCALING_FACTOR = np.sqrt(2)
+        # Merge channels (into first channel to save memory)
+        left = samples[:, 0]
+        right = samples[:, 1]
+        samples = SCALING_FACTOR * (left + right) / 2
+
+    return samples
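As a quick illustration of the conversion path above (not part of the commit): a synthetic int16 WAV written with scipy should come back from read_audio() as float32 samples roughly in [-1, 1], and requesting a different rate should trigger resampling. A minimal sketch, assuming numpy and scipy are installed and /tmp is writable (the tone and file name are made up for the example):

    import numpy as np
    from scipy.io import wavfile

    from transformers_js_py import read_audio

    # One second of a 440 Hz tone at 16 kHz, stored as int16 (a common WAV dtype).
    rate = 16000
    t = np.linspace(0, 1, rate, endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    wavfile.write("/tmp/tone.wav", rate, tone)

    # Same rate requested: only the int16 -> float32 normalization applies.
    samples = read_audio("/tmp/tone.wav", 16000)
    assert samples.dtype == np.float32
    assert np.abs(samples).max() <= 1.0

    # Lower rate requested: the signal is also resampled (16000 -> 8000 samples here).
    resampled = read_audio("/tmp/tone.wav", 8000)
    assert len(resampled) == 8000

Since read_audio() only depends on numpy and scipy, this check runs in a plain Python environment as well as in Pyodide.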

0 commit comments
