🚀 Maaany things

This commit is contained in:
2024-02-15 14:11:16 +01:00
parent 57bcf0af8e
commit 5be7937ffa
7 changed files with 272 additions and 50 deletions

View File

@@ -1,8 +1,16 @@
import moviepy.editor as mp
import whisper_timestamped as wt
from typing import TypedDict
from torch.cuda import is_available
from abc import ABC, abstractmethod
# BaseEngine is defined in the parent package (see the relative import below).
from ..BaseEngine import BaseEngine
# A single transcribed word with its timing window, as emitted by
# time_with_whisper (functional TypedDict form — same keys and types).
Word = TypedDict("Word", {"start": str, "end": str, "text": str})
class BaseTTSEngine(BaseEngine):
@@ -10,7 +18,53 @@ class BaseTTSEngine(BaseEngine):
def synthesize(self, text: str, path: str) -> str:
    """
    Synthesize *text* into an audio file and save it at *path*.

    Abstract: concrete engines must implement this.

    Args:
        text (str): The text to turn into speech.
        path (str): Destination file path for the generated audio.

    Returns:
        str: presumably the path of the written file — verify against
        concrete implementations (the declared return type is ``str``).
    """
    pass
def time_with_whisper(self, path: str) -> list[Word]:
    """
    Transcribe the audio file at *path* with a whisper-timestamped model
    and return word-level timing information.

    Args:
        path (str): Path to the audio file to transcribe.

    Returns:
        list[Word]: One entry per transcribed word, carrying "start",
        "end" and "text" keys. The "confidence" value reported by
        whisper is dropped — not needed for the current use case.

    Example:
        ```json
        [
            {"start": "0.00", "end": "0.50", "text": "Hello"},
            {"start": "0.50", "end": "1.00", "text": "world"}
        ]
        ```
    """
    device = "cuda" if is_available() else "cpu"
    audio = wt.load_audio(path)
    model = wt.load_model("tiny", device=device)
    transcription = wt.transcribe(model=model, audio=audio)
    # BUG FIX: wt.transcribe returns a dict; iterating it directly yields
    # its string keys, so chunk["words"] would raise TypeError. The word
    # chunks live under the "segments" key.
    words: list[Word] = []
    for segment in transcription["segments"]:
        for word in segment["words"]:
            # Drop the confidence score without relying on its presence.
            word.pop("confidence", None)
            words.append(word)
    return words
def force_duration(self, duration: float, path: str):
"""
Forces the audio clip at the given path to have the specified duration.
Args:
duration (float): The desired duration in seconds.
path (str): The path to the audio clip file.
Returns:
None
"""
audio_clip = mp.AudioFileClip(path)
if audio_clip.duration > duration:

View File

@@ -1,9 +1,9 @@
import gradio as gr
# import TTS
import TTS
import os
# import torch
import torch
from .BaseTTSEngine import BaseTTSEngine
@@ -102,15 +102,25 @@ class CoquiTTSEngine(BaseTTSEngine):
os.environ["COQUI_TOS_AGREED"] = "1"
# self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
# device = "cuda" if torch.cuda.is_available() else "cpu"
# self.tts.to(device)
self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
device = "cuda" if torch.cuda.is_available() else "cpu"
self.tts.to(device)
def synthesize(self, text: str, path: str) -> str:
    """
    Post-process the audio clip at *path* and return its path.

    If ``self.to_force_duration`` is set, the clip's duration is clamped
    via ``force_duration``; otherwise the file is left untouched.

    Args:
        text (str): The text associated with the clip (unused here).
        path (str): Path of the audio file to post-process.

    Returns:
        str: The same *path* that was passed in.
    """
    if not self.to_force_duration:
        return path
    self.force_duration(float(self.duration), path)
    return path
def synthesize(self, text: str, path: str):
    """
    Synthesize *text* to speech, write it to *path*, and return
    word-level timings of the generated audio.

    Args:
        text (str): The text to synthesize into speech.
        path (str): The file path to save the synthesized speech.

    Returns:
        list[Word]: Per-word start/end timings, as produced by
        ``time_with_whisper`` on the generated file.
    """
    # NOTE(review): the Coqui TTS API names this parameter ``language``,
    # not ``lang`` — confirm against the installed TTS version.
    self.tts.tts_to_file(text=text, file_path=path, lang=self.language, speaker=self.voice)
    if self.to_force_duration:
        # Clamp the generated clip to self.duration seconds (see force_duration).
        self.force_duration(float(self.duration), path)
    return self.time_with_whisper(path)
@classmethod
def get_options(cls) -> list:
@@ -129,12 +139,11 @@ class CoquiTTSEngine(BaseTTSEngine):
),
]
duration_checkbox = gr.Checkbox(value=False)
duration = gr.Number(label="Duration", value=57, step=1, minimum=10, visible=False)
duration_checkbox = gr.Checkbox(label="Force duration", info="Force the duration of the generated audio to be at most the specified value", value=False)
duration = gr.Number(label="Duration [s]", value=57, step=1, minimum=10, visible=False)
duration_switch = lambda x: gr.update(visible=x)
duration_checkbox.change(duration_switch, inputs=[duration_checkbox], outputs=[duration])
duration_checkbox_group = gr.CheckboxGroup([duration_checkbox], label="Force duration")
options.append(duration_checkbox_group)
options.append(duration_checkbox)
options.append(duration)
return options