Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 36 additions & 13 deletions src/translation/audio_translation.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,66 @@
#!/usr/bin/env python3
"""
AudioTranslation class is the main entry point for the audio related processing.
High-level audio translation for Mimasa.

This class currently performs vocal/music separation using the configured
audio separator and then returns the path to the isolated vocals file. It
provides a hook where future implementations can perform speech-to-text,
machine translation and text-to-speech synthesis. The original code in
``src/translation/audio_translation.py`` did not return a value from the
``translate`` coroutine, which prevented the pipeline from obtaining the
location of the translated audio. This refactored version fixes that and
stores the translated audio path on the instance as ``translated_audio_file``.
"""

from typing import Optional, Tuple

from src.audioseparator import utils as separator_utils
from src.common.libraries import Audio, Config, Logger


class AudioTranslation:
    """Class for translating audio in a file.

    Translation currently consists only of vocal/music separation: the
    configured audio separator is run on the input audio and the path of the
    isolated vocals is reported as the "translated" audio.
    """

    def __init__(self, audio: Audio, output_language: str, input_language: str = "unknown"):
        """
        Initialize an AudioTranslation object with an audio object.

        :param audio: a valid audio object; its filename is read via ``get_filename()``
        :param output_language: target language, stored on the instance
        :param input_language: source language, stored on the instance
        """
        self.audio = audio
        self.output_language = output_language
        self.input_language = input_language
        self.audio_file = audio.get_filename()
        self.audio_separator = None
        # Set by translate() on success; stays None until then or on failure.
        self.translated_audio_file: Optional[str] = None

        self.logger = Logger(self.__class__.__name__)
        self.logger.add_file_handler("audio_translation.log")

        self._initialize()

    def _initialize(self) -> None:
        """Look up the audio separator selected by ``Config.AUDIO_SEPARATOR``."""
        self.logger.debug("Initializing audio translation unit...")
        self.audio_separator = separator_utils.get_audio_separator(Config.AUDIO_SEPARATOR)
        self.logger.debug("Audio translation unit is initialized successfully")

    async def translate(self) -> Optional[str]:
        """
        Asynchronous coroutine for performing the audio translation.

        Returns the path to the translated audio (currently the isolated
        vocals) and also stores it in ``self.translated_audio_file``. Any
        exception raised during separation is logged and ``None`` is returned
        instead, so downstream consumers should check for ``None`` and fall
        back to the extracted audio if necessary.
        """
        self.logger.debug(f"Starting audio translation for file: {self.audio_file}")
        try:
            # Separate vocals and music; use the vocals as the translated audio.
            # NOTE(review): assumes the separator returns (music, vocals) in
            # that order - confirm against the separator implementation.
            _, vocals_path = await self.audio_separator.separate_vocals_and_music(
                audio=self.audio,
                destination=f"{Config.TRANSLATION_OUTPUT_PATH}",
            )
            self.logger.info(f"Audio translation completed successfully for file: {self.audio_file}")
            # In a future implementation you would perform speech-to-text,
            # translation and text-to-speech here. For now return the vocals.
            self.translated_audio_file = vocals_path
            return self.translated_audio_file
        except Exception as e:
            # Deliberate best-effort: report the failure and let the caller fall back.
            self.logger.error(f"Error during audio translation for file: {self.audio_file}: {e}")
            return None
174 changes: 124 additions & 50 deletions src/translation/translation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,27 @@
#!/usr/bin/env python3
"""
Translation class is the main entry point for the application, and it creates two
separate asynchronous tasks for the audio and video translations.
Extended translation pipeline for Mimasa.

This implementation orchestrates the audio and video translation components and
then combines the translated audio with the processed video. The original
repository left the combination step unimplemented and did not return the
path of the translated audio from the audio translation coroutine. This
module adds those missing pieces so that the pipeline can produce a final
video file with the updated audio track.

Note: because network-bound translation services (e.g. Google Translate,
Google Speech-to-Text, gTTS) are not available in this environment, the
current AudioTranslation implementation simply separates vocals from the
input audio and returns the vocals path as the "translated" audio. The
combine step then muxes this audio back into the video. Users can replace
the translation logic in `src/translation/audio_translation.py` with their
preferred implementation if they have access to external APIs.
"""

import asyncio
import os
import subprocess
from typing import Optional

from src.common.libraries import Audio, Config, Logger, Video
from src.translation.audio_extractor import AudioExtractor
Expand All @@ -13,54 +31,64 @@


class Translation:
"""Class for extracting audio from a video file and performing audio and video translations in parallel"""
"""Main class for orchestrating audio and video translation.

def __init__(self, video: Video, output_language: str, input_language: str = "Unknown"):
"""
Initialize a Translation object with a video file
This class extracts audio from a video, runs the audio and video
translation tasks concurrently and finally combines the translated audio
with the processed video into a single output file. It exposes a
synchronous wrapper via :meth:`translate_sync` for convenience.
"""

:param video: a valid video object
"""
def __init__(self, video: Video, output_language: str, input_language: str = "Unknown"):
    """Initialize a Translation object with a video file.

    :param video: a valid video object; its filename is read via ``get_filename()``
    :param output_language: target language, stored on the instance
    :param input_language: source language, stored on the instance and used
        as the language of the extracted audio
    """
    self.video = video
    self.output_language = output_language
    self.input_language = input_language

    # Paths for intermediate and final artifacts
    self.video_file = video.get_filename()
    # Strip only the final extension so that dotted names such as
    # "clip.v2.mp4" keep their full stem and do not collide with "clip.v1.mp4".
    video_stem = os.path.splitext(utils.get_filename_from_path(self.video_file))[0]
    self.extracted_audio_file = f"{Config.TRANSLATION_OUTPUT_PATH}/extracted_audio_{video_stem}.wav"
    self.extracted_audio = Audio(file_path=self.extracted_audio_file, language=self.input_language)

    # Populated by translate() once the final video has been produced.
    self.output_video_filename: Optional[str] = None

    self.logger = Logger(self.__class__.__name__)
    self.logger.add_file_handler("translation.log")

    self._initialize()

def _initialize(self) -> None:
    """Create the audio extractor and the audio/video translation units."""
    self.logger.debug("Initializing translation unit...")
    # Initialise the audio extractor and translation units
    self.audio_extractor = AudioExtractor(video=self.video)
    # Build the audio translation unit exactly once; its constructor does
    # its own setup, so a second construction would repeat that work.
    self.audio_translation = AudioTranslation(
        audio=self.extracted_audio,
        output_language=self.output_language,
        input_language=self.input_language,
    )
    self.video_translation = VideoTranslation(self.video)
    self.logger.debug("Translation unit initialized successfully")

def get_output_video(self) -> Optional[str]:
    """Return the path to the combined output video once translation is complete.

    The value is whatever is currently stored in
    ``self.output_video_filename``.
    """
    return self.output_video_filename

async def translate_audio(self) -> Optional[str]:
    """Asynchronous coroutine for performing the audio translation.

    Awaits ``self.audio_translation.translate()`` once and returns its
    result unchanged (the translated audio path, or ``None``). Any exception
    raised by the underlying call is logged and re-raised.
    """
    self.logger.debug("Starting audio translation")
    try:
        result = await self.audio_translation.translate()
        self.logger.info("Audio translation completed successfully")
        return result
    except Exception as e:
        self.logger.error(f"Error during audio translation: {e}")
        raise

async def translate_video(self):
"""Asynchronous coroutine for performing the video translation"""
async def translate_video(self) -> Video:
"""Asynchronous coroutine for performing the video translation.

Returns a :class:`Video` instance pointing at the processed video file.
"""
self.logger.debug("Starting video translation")
try:
result = await self.video_translation.translate()
Expand All @@ -70,52 +98,98 @@ async def translate_video(self):
self.logger.error(f"Error during video translation: {e}")
raise

async def translate(self):
"""Asynchronous coroutine for performing both audio and video translations in parallel"""
def _combine_audio_video(self, video_path: str, audio_path: Optional[str]) -> str:
    """Combine the provided audio with the video into a final output file.

    Uses ffmpeg for muxing; if ffmpeg is unavailable or fails the original
    video is copied to the output location. The output file is stored
    under ``Config.TRANSLATION_OUTPUT_PATH`` and its name is
    ``translated_<video basename>``.

    :param video_path: path of the video whose video stream is kept
    :param audio_path: path of the audio to mux in; ``None`` or a missing
        file causes the video to be copied unchanged
    :return: path of the output file
    :raises Exception: re-raised when the plain copy for the no-audio case
        fails; a failing fallback copy after an ffmpeg error also propagates
    """
    import shutil  # local import: only this method's copy fallbacks need it

    basename = os.path.basename(video_path)
    name, ext = os.path.splitext(basename)
    output_dir = str(Config.TRANSLATION_OUTPUT_PATH)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"translated_{name}{ext}")

    # If no translated audio was produced, simply copy the video and return
    if audio_path is None or not os.path.exists(audio_path):
        self.logger.warning("No translated audio provided; copying original video to output.")
        try:
            shutil.copy(video_path, output_path)
        except Exception as e:
            self.logger.error(f"Failed to copy video to output path: {e}")
            raise
        return output_path

    # Try to merge audio and video with ffmpeg. The explicit -map options
    # are essential: without them ffmpeg picks the "best" audio stream
    # across both inputs, which can be the video's original track rather
    # than the translated audio.
    command = [
        "ffmpeg",
        "-y",
        "-i",
        video_path,
        "-i",
        audio_path,
        "-map",
        "0:v:0",
        "-map",
        "1:a:0",
        "-c:v",
        "copy",
        "-c:a",
        "aac",
        output_path,
    ]
    self.logger.debug(f"Combining audio and video with command: {' '.join(command)}")
    try:
        # Capture stderr (instead of discarding it) so failures can be diagnosed.
        subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
        self.logger.info(f"Combined audio and video saved to {output_path}")
    except (subprocess.SubprocessError, OSError) as e:
        # CalledProcessError carries ffmpeg's stderr; a missing binary (OSError) does not.
        diagnostics = getattr(e, "stderr", None)
        if diagnostics:
            self.logger.debug(f"ffmpeg stderr: {diagnostics.decode(errors='replace').strip()}")
        self.logger.error(f"Error combining audio and video: {e}; falling back to copying video.")
        shutil.copy(video_path, output_path)
    return output_path

async def translate(self) -> None:
    """Perform both audio and video translations and combine the results.

    Steps: run ``utils.setup()``, extract the audio track synchronously,
    run the audio and video translation coroutines concurrently, then mux
    the resulting audio into the processed video. The final path is stored
    in ``self.output_video_filename``. Any exception is logged and
    re-raised; ``utils.teardown()`` always runs.
    """
    self.logger.debug("Starting parallel audio and video translations")
    try:
        utils.setup()

        # Extract audio from the video synchronously
        self.audio_extractor.extract(output_file=self.extracted_audio_file)

        # Kick off audio and video translation tasks concurrently. Each
        # coroutine object can be awaited only once, so gather exactly once.
        audio_path, video_obj = await asyncio.gather(
            self.translate_audio(),
            self.translate_video(),
        )

        # Determine which audio to use; fall back to extracted audio if necessary
        final_audio_path = audio_path or self.extracted_audio_file
        final_video_path = video_obj.get_filename() if isinstance(video_obj, Video) else None

        # Combine audio and video
        if final_video_path:
            self.output_video_filename = self._combine_audio_video(final_video_path, final_audio_path)
        else:
            # If video translation failed to return a Video object, just copy the input video
            self.logger.warning("Video translation did not return a Video object; copying original video.")
            import shutil

            output_dir = str(Config.TRANSLATION_OUTPUT_PATH)
            os.makedirs(output_dir, exist_ok=True)
            dest_path = os.path.join(output_dir, os.path.basename(self.video_file))
            # shutil.copy raises SameFileError when source and destination
            # are the same file, e.g. the input already lives in the output dir.
            if os.path.abspath(dest_path) != os.path.abspath(self.video_file):
                shutil.copy(self.video_file, dest_path)
            self.output_video_filename = dest_path

        self.logger.info("Parallel audio and video translations completed successfully")
    except Exception as e:
        self.logger.critical(f"Error during parallel audio and video translations: {e}")
        raise
    finally:
        utils.teardown()

def translate_sync(self):
"""Asynchronous coroutine for performing both audio and video translations in parallel"""
def translate_sync(self) -> None:
"""Synchronous wrapper around the asynchronous translate coroutine."""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(self.translate())
Expand Down
Loading