
Commit 5977b58

Add read_audio() and ASR test (#71)
* Add read_audio() and ASR test
* Fix
* Update audio type conversion
1 parent cf3468c commit 5977b58

File tree

3 files changed (+77, -1)
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+import type { PyodideInterface } from "pyodide";
+import { beforeEach, describe, it, expect } from "vitest";
+import { setupPyodideForTest } from "./utils";
+
+import { downloadFile } from "./utils";
+
+describe("read_audio() and ASR", () => {
+  let pyodide: PyodideInterface;
+
+  beforeEach(async () => {
+    pyodide = await setupPyodideForTest(["scipy"]);
+  });
+
+  it("can read an audio file from a local file", async () => {
+    await downloadFile(pyodide, "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav", "/tmp/jfk.wav")
+
+    await pyodide.runPythonAsync(`
+from transformers_js_py import import_transformers_js, read_audio
+import numpy as np
+
+transformers = await import_transformers_js()
+pipeline = transformers.pipeline
+pipe = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en')
+
+audio = read_audio("/tmp/jfk.wav", 16000)
+result = await pipe(audio)
+text = result["text"]
+`);
+    const text = await pyodide.globals.get("text");
+    expect(text).toEqual(" And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.");
+  });
+});
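For reference, the Python that the test feeds to runPythonAsync can also be run on its own in a Pyodide session where scipy is loaded and transformers.js is reachable. A minimal standalone sketch of the same flow (it assumes /tmp/jfk.wav was already downloaded, as the test does):

    from transformers_js_py import import_transformers_js, read_audio

    # Load transformers.js and build a Whisper (tiny, English) ASR pipeline.
    transformers = await import_transformers_js()
    pipeline = transformers.pipeline
    pipe = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en')

    # Decode the WAV file into a float32 array at Whisper's expected 16 kHz rate.
    audio = read_audio("/tmp/jfk.wav", 16000)
    result = await pipe(audio)
    print(result["text"])

Top-level await works here because Pyodide executes the snippet as asynchronous code via runPythonAsync.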

transformers_js_py/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
+from .audio import read_audio
 from .proxies import import_transformers_js
 from .url import as_url
 
-__all__ = ["as_url", "import_transformers_js"]
+__all__ = ["as_url", "read_audio", "import_transformers_js"]

transformers_js_py/audio.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+try:
+    import numpy as np
+except ImportError:
+    np = None  # type: ignore
+
+
+def read_audio(filename, sampling_rate: int) -> "np.ndarray":
+    # Refs:
+    # * https://github.com/xenova/transformers.js/blob/2.15.1/src/utils/audio.js#L42-L77
+    # * https://huggingface.co/docs/transformers.js/guides/node-audio-processing
+
+    try:
+        import numpy as np
+        import scipy  # type: ignore
+    except ImportError:
+        raise ImportError(
+            "You need to have `numpy` and `scipy` installed to use this feature."
+        )
+
+    original_sample_rate, samples = scipy.io.wavfile.read(filename, mmap=False)
+
+    # Ensure samples are float32
+    # Ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.read.html  # noqa: E501
+    if samples.dtype == np.int16:
+        samples = samples.astype(np.float32) / 32768.0
+    elif samples.dtype == np.int32:
+        samples = samples.astype(np.float32) / 2147483648.0
+    elif samples.dtype == np.uint8:
+        samples = (samples.astype(np.float32) - 128.0) / 128.0
+
+    if original_sample_rate != sampling_rate:
+        samples = scipy.signal.resample(
+            samples, int(len(samples) * sampling_rate / original_sample_rate)
+        )
+
+    if samples.ndim > 1 and samples.shape[1] > 1:
+        SCALING_FACTOR = np.sqrt(2)
+        # Merge channels (into first channel to save memory)
+        left = samples[:, 0]
+        right = samples[:, 1]
+        samples = SCALING_FACTOR * (left + right) / 2
+
+    return samples
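As a quick illustration of the conversion path above (not part of the commit): a synthetic int16 WAV written with scipy should come back from read_audio() as float32 samples roughly in [-1, 1], and requesting a different rate should trigger resampling. A minimal sketch, assuming numpy and scipy are installed and /tmp is writable (the tone and file name are made up for the example):

    import numpy as np
    from scipy.io import wavfile

    from transformers_js_py import read_audio

    # One second of a 440 Hz tone at 16 kHz, stored as int16 (a common WAV dtype).
    rate = 16000
    t = np.linspace(0, 1, rate, endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    wavfile.write("/tmp/tone.wav", rate, tone)

    # Same rate requested: only the int16 -> float32 normalization applies.
    samples = read_audio("/tmp/tone.wav", 16000)
    assert samples.dtype == np.float32
    assert np.abs(samples).max() <= 1.0

    # Lower rate requested: the signal is also resampled (16000 -> 8000 samples here).
    resampled = read_audio("/tmp/tone.wav", 8000)
    assert len(resampled) == 8000

Since read_audio() only depends on numpy and scipy, this check runs in a plain Python environment as well as in Pyodide.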

0 commit comments
