developers-cosmos · RitheeshBaradwaj · Jul 26, 2025 · Jul 26, 2025
diff --git a/src/translation/audio_translation.py b/src/translation/audio_translation.py
@@ -1,43 +1,66 @@
 #!/usr/bin/env python3
 """
-AudioTranslation class is the main entry point for the audio related processing.
+High-level audio translation for Mimasa.
+
+This class currently performs vocal/music separation using the configured
+audio separator and then returns the path to the isolated vocals file.  It
+provides a hook where future implementations can perform speech-to-text,
+machine translation and text-to-speech synthesis.  The original code in
+``src/translation/audio_translation.py`` did not return a value from the
+``translate`` coroutine, which prevented the pipeline from obtaining the
+location of the translated audio.  This refactored version fixes that and
+stores the translated audio path on the instance as ``translated_audio_file``.
 """
+
+from typing import Optional, Tuple
+
 from src.audioseparator import utils as separator_utils
 from src.common.libraries import Audio, Config, Logger
 
 
 class AudioTranslation:
-    """Class for translating audio in a file"""
+    """Separate vocals from an audio file and expose them as the translated audio."""
 
     def __init__(self, audio: Audio, output_language: str, input_language: str = "unknown"):
-        """
-        Initialize an AudioTranslation object with an audio object
-
-        :param audio: a valid audio object
-        """
         self.audio = audio
         self.output_language = output_language
         self.input_language = input_language
         self.audio_file = audio.get_filename()
         self.audio_separator = None
+        self.translated_audio_file: Optional[str] = None  # set by translate() on success
 
         self.logger = Logger(self.__class__.__name__)
         self.logger.add_file_handler("audio_translation.log")
 
         self._initialize()
 
-    def _initialize(self):
+    def _initialize(self) -> None:
         self.logger.debug("Initializing audio translation unit...")
         self.audio_separator = separator_utils.get_audio_separator(Config.AUDIO_SEPARATOR)
         self.logger.debug("Audio translation unit is initialized successfully")
 
-    async def translate(self):
-        """Asynchronous coroutine for performing the audio translation"""
+    async def translate(self) -> Optional[str]:
+        """
+        Run vocal/music separation and return the path to the isolated vocals.
+
+        The path is also stored on ``self.translated_audio_file``.  Any exception
+        is logged and swallowed and ``None`` is returned instead, so callers
+        should check for ``None`` and fall back to the extracted audio.
+        """
         self.logger.debug(f"Starting audio translation for file: {self.audio_file}")
         try:
-            await self.audio_separator.separate_vocals_and_music(
-                audio=self.audio, destination=f"{Config.TRANSLATION_OUTPUT_PATH}"
+            # Vocals stand in for the translated audio; assumes (music, vocals) return order - TODO confirm
+            music_path, vocals_path = await self.audio_separator.separate_vocals_and_music(
+                audio=self.audio,
+                destination=f"{Config.TRANSLATION_OUTPUT_PATH}"
+            )
+            self.logger.info(
+                f"Audio translation completed successfully for file: {self.audio_file}."
             )
-            self.logger.info(f"Audio translation completed successfully for file: {self.audio_file}")
+            # In a future implementation you would perform speech-to-text,
+            # translation and text-to-speech here.  For now return the vocals.
+            self.translated_audio_file = vocals_path
+            return self.translated_audio_file
         except Exception as e:
             self.logger.error(f"Error during audio translation for file: {self.audio_file}: {e}")
+            return None
diff --git a/src/translation/translation.py b/src/translation/translation.py
@@ -1,9 +1,27 @@
 #!/usr/bin/env python3
 """
-Translation class is the main entry point for the application, and it creates two
-separate asynchronous tasks for the audio and video translations.
+Extended translation pipeline for Mimasa.
+
+This implementation orchestrates the audio and video translation components and
+then combines the translated audio with the processed video.  The original
+repository left the combination step unimplemented and did not return the
+path of the translated audio from the audio translation coroutine.  This
+module adds those missing pieces so that the pipeline can produce a final
+video file with the updated audio track.
+
+Note: because network-bound translation services (e.g. Google Translate,
+Google Speech-to-Text, gTTS) are not available in this environment, the
+current AudioTranslation implementation simply separates vocals from the
+input audio and returns the vocals path as the "translated" audio.  The
+combine step then muxes this audio back into the video.  Users can replace
+the translation logic in `src/translation/audio_translation.py` with their
+preferred implementation if they have access to external APIs.
 """
+
 import asyncio
+import os
+import subprocess
+from typing import Optional
 
 from src.common.libraries import Audio, Config, Logger, Video
 from src.translation.audio_extractor import AudioExtractor
@@ -13,54 +31,64 @@
 
 
 class Translation:
-    """Class for extracting audio from a video file and performing audio and video translations in parallel"""
+    """Main class for orchestrating audio and video translation.
 
-    def __init__(self, video: Video, output_language: str, input_language: str = "Unknown"):
-        """
-        Initialize a Translation object with a video file
+    This class extracts audio from a video, runs the audio and video
+    translation tasks concurrently and finally combines the translated audio
+    with the processed video into a single output file.  It exposes a
+    synchronous wrapper via :meth:`translate_sync` for convenience.
+    """
 
-        :param video: a valid video object
-        """
+    def __init__(self, video: Video, output_language: str, input_language: str = "Unknown"):
+        """Set up paths and units; the audio name keeps the video's full stem (only the last extension is dropped)."""
         self.video = video
         self.output_language = output_language
         self.input_language = input_language
 
         self.video_file = video.get_filename()
-        self.extracted_audio_file = f"{Config.TRANSLATION_OUTPUT_PATH}/extracted_audio_{utils.get_filename_from_path(self.video_file).split('.')[0]}.wav"
+        self.extracted_audio_file = f"{Config.TRANSLATION_OUTPUT_PATH}/extracted_audio_{os.path.splitext(utils.get_filename_from_path(self.video_file))[0]}.wav"
         self.extracted_audio = Audio(file_path=self.extracted_audio_file, language=self.input_language)
 
-        self.output_video_filename = None
+        self.output_video_filename: Optional[str] = None
 
         self.logger = Logger(self.__class__.__name__)
         self.logger.add_file_handler("translation.log")
 
         self._initialize()
 
-    def _initialize(self):
+    def _initialize(self) -> None:
+        """Create the audio extractor and the audio/video translation units."""
         self.logger.debug("Initializing translation unit...")
         self.audio_extractor = AudioExtractor(video=self.video)
-        self.audio_translation = AudioTranslation(
-            audio=self.extracted_audio, output_language=self.output_language, input_language=self.input_language
-        )
+        self.audio_translation = AudioTranslation(audio=self.extracted_audio, output_language=self.output_language, input_language=self.input_language)
         self.video_translation = VideoTranslation(self.video)
         self.logger.debug("Translation unit initialized successfully")
 
-    def get_output_video(self):
+    def get_output_video(self) -> Optional[str]:
+        """Return the combined output video path, or ``None`` if ``translate`` has not set it yet."""
         return self.output_video_filename
 
-    async def translate_audio(self):
-        """Asynchronous coroutine for performing the audio translation"""
+    async def translate_audio(self) -> Optional[str]:
+        """Await the audio translation unit and return the translated audio path.
+
+        A ``None`` result (the unit logs and swallows its own failures) is still
+        logged as success here; :meth:`translate` then falls back to the extracted
+        audio.  Exceptions raised by the unit are logged and re-raised.
+        """
         self.logger.debug("Starting audio translation")
         try:
-            await self.audio_translation.translate()
+            result = await self.audio_translation.translate()
             self.logger.info("Audio translation completed successfully")
-            return "success"
+            return result
         except Exception as e:
             self.logger.error(f"Error during audio translation: {e}")
             raise
 
-    async def translate_video(self):
-        """Asynchronous coroutine for performing the video translation"""
+    async def translate_video(self) -> Video:
+        """Asynchronous coroutine for performing the video translation.
+
+        Returns a :class:`Video` instance pointing at the processed video file.
+        """
         self.logger.debug("Starting video translation")
         try:
             result = await self.video_translation.translate()
@@ -70,52 +98,98 @@ async def translate_video(self):
             self.logger.error(f"Error during video translation: {e}")
             raise
 
-    async def translate(self):
-        """Asynchronous coroutine for performing both audio and video translations in parallel"""
+    def _combine_audio_video(self, video_path: str, audio_path: Optional[str]) -> str:
+        """Mux ``audio_path`` into ``video_path`` and return the output file path.
+
+        The result is written to ``Config.TRANSLATION_OUTPUT_PATH`` as
+        ``translated_<video name><ext>``.  The video stream is copied and the audio
+        is re-encoded to AAC.  If ``audio_path`` is ``None``/missing, or ffmpeg is
+        unavailable or fails, the video is copied to the output path unchanged.
+
+        :raises OSError: if the fallback copy itself fails.
+        """
+        import shutil  # local import: not part of the module-level imports
+
+        name, ext = os.path.splitext(os.path.basename(video_path))
+        output_dir = str(Config.TRANSLATION_OUTPUT_PATH)
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, f"translated_{name}{ext}")
+
+        if audio_path is None or not os.path.exists(audio_path):
+            self.logger.warning("No translated audio provided; copying original video to output.")
+            try:
+                shutil.copy(video_path, output_path)
+            except Exception as e:
+                self.logger.error(f"Failed to copy video to output path: {e}")
+                raise
+            return output_path
+
+        # Map streams explicitly: without -map, ffmpeg's automatic selection may
+        # keep the video's own audio track instead of the translated one.
+        command = [
+            "ffmpeg", "-y", "-loglevel", "error",
+            "-i", video_path,
+            "-i", audio_path,
+            "-map", "0:v:0",
+            "-map", "1:a:0",
+            "-c:v", "copy",
+            "-c:a", "aac",
+            output_path,
+        ]
+        self.logger.debug(f"Combining audio and video with command: {' '.join(command)}")
+        try:
+            subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
+            self.logger.info(f"Combined audio and video saved to {output_path}")
+        except Exception as e:
+            # Only CalledProcessError carries ffmpeg's stderr; fall back to str(e) otherwise.
+            detail = (getattr(e, "stderr", None) or b"").decode(errors="replace").strip() or str(e)
+            self.logger.error(f"Error combining audio and video: {detail}; falling back to copying video.")
+            shutil.copy(video_path, output_path)
+        return output_path
+
+    async def translate(self) -> None:
+        """Extract audio, run audio/video translation concurrently and store the combined video path."""
         self.logger.debug("Starting parallel audio and video translations")
         try:
             utils.setup()
+
+            # Extract audio from the video synchronously
             self.audio_extractor.extract(output_file=self.extracted_audio_file)
 
+            # Kick off audio and video translation tasks concurrently
             translation_tasks = [
                 self.translate_audio(),
                 self.translate_video(),
             ]
-            audio_task, video_task = await asyncio.gather(*translation_tasks)
-
-            output_video_filename = video_task.get_filename()
-            self.output_video_filename = output_video_filename
-
-            # with ThreadPoolExecutor() as executor:
-            #     audio_future = executor.submit(self.translate_audio)
-            #     video_future = executor.submit(self.translate_video)
-            #     audio_future.result()
-            #     self.output_video_filename = video_future.result()
-
-            # translation_tasks = [
-            #     asyncio.ensure_future(self.translate_audio()),
-            #     asyncio.ensure_future(self.translate_video()),
-            # ]
-            # await asyncio.gather(*translation_tasks)
-
-            # output_video_filename = translation_tasks[1].result()
-            # self.output_video_filename = output_video_filename
-
-            # audio_task.result()
-            # done, pending = await asyncio.wait(translation_tasks)
-            # for task in done:
-            #     if task == translation_tasks[1]:
-            #         output_video_filename = task.result()
-            #         self.output_video_filename = output_video_filename
+            audio_path, video_obj = await asyncio.gather(*translation_tasks)
+
+            # Determine which audio to use; fall back to extracted audio if necessary
+            final_audio_path = audio_path or self.extracted_audio_file
+            final_video_path = video_obj.get_filename() if isinstance(video_obj, Video) else None
+
+            # Mux the chosen audio into the processed video and remember the result path
+            if final_video_path:
+                combined_path = self._combine_audio_video(final_video_path, final_audio_path)
+                self.output_video_filename = combined_path
+            else:
+                # If video translation failed to return a Video object, just copy the input video
+                self.logger.warning("Video translation did not return a Video object; copying original video.")
+                import shutil
+                output_dir = str(Config.TRANSLATION_OUTPUT_PATH)
+                os.makedirs(output_dir, exist_ok=True)
+                dest_path = os.path.join(output_dir, os.path.basename(self.video_file))
+                shutil.copy(self.video_file, dest_path)
+                self.output_video_filename = dest_path
+
             self.logger.info("Parallel audio and video translations completed successfully")
         except Exception as e:
             self.logger.critical(f"Error during parallel audio and video translations: {e}")
             raise
         finally:
             utils.teardown()
 
-    def translate_sync(self):
-        """Asynchronous coroutine for performing both audio and video translations in parallel"""
+    def translate_sync(self) -> None:
+        """Blocking wrapper: run :meth:`translate` to completion on a new event loop (never closed here)."""
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
         loop.run_until_complete(self.translate())