🐛 fix(GenerationContext.py): fix import statements and add support for captioning engine

feat(GenerationContext.py): add support for a captioning engine in the GenerationContext class

- Import `moviepy.editor` as `mp` and `gradio` as `gr` for readability.
- `GenerationContext` now takes a `captioningengine` parameter and stores it as an attribute.
- `setup_dir` now creates the output directory; `get_file_path` returns file paths relative to it.
- `process` stores the result of `ttsengine.synthesize` in a new `timed_script` attribute, uses the captioning engine to generate captions (stored in `captions`), then renders the final video with `moviepy` and saves it as "final.mp4" in the output directory.
2024-02-17 18:47:30 +01:00
parent eedbc99121
commit e3229518d4
12 changed files with 261 additions and 34 deletions
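The commit message above outlines the new `process` pipeline. A minimal runnable sketch with stub engines — all class and method names below are assumptions for illustration, not the project's actual API:

```python
import os

class StubTTSEngine:
    """Stand-in for the TTS engine; synthesize now returns a word-timed script."""
    def synthesize(self, text: str, path: str) -> list[dict]:
        # One Word-style dict per word, with placeholder timings
        return [{"start": i * 0.5, "end": (i + 1) * 0.5, "text": w}
                for i, w in enumerate(text.split())]

class StubCaptioningEngine:
    """Stand-in for the new captioning engine."""
    def get_captions(self, timed_script: list[dict]) -> list[dict]:
        return timed_script

class GenerationContext:
    def __init__(self, ttsengine, captioningengine, output_dir="output"):
        self.ttsengine = ttsengine
        self.captioningengine = captioningengine  # new in this commit
        self.output_dir = output_dir

    def setup_dir(self):
        # setup_dir now creates the output directory
        os.makedirs(self.output_dir, exist_ok=True)

    def get_file_path(self, name: str) -> str:
        # get_file_path resolves names against the output directory
        return os.path.join(self.output_dir, name)

    def process(self, script: str) -> str:
        self.setup_dir()
        audio_path = self.get_file_path("audio.wav")
        # synthesize returns the timed script, kept as an attribute
        self.timed_script = self.ttsengine.synthesize(script, audio_path)
        # the captioning engine turns it into captions
        self.captions = self.captioningengine.get_captions(self.timed_script)
        # the final moviepy render would happen here
        return self.get_file_path("final.mp4")
```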


@@ -16,8 +16,22 @@ class Word(TypedDict):
 class BaseTTSEngine(BaseEngine):
     @abstractmethod
-    def synthesize(self, text: str, path: str) -> str:
+    def synthesize(self, text: str, path: str) -> list[Word]:
         pass
+
+    def remove_punctuation(self, text: str) -> str:
+        return text.translate(str.maketrans("", "", ".,!?;:"))
+
+    def fix_captions(self, script: str, captions: list[Word]) -> list[Word]:
+        script_words = script.split(" ")
+        new_captions = []
+        for i, word in enumerate(script_words):
+            original_word = self.remove_punctuation(word.lower())
+            stt_word = self.remove_punctuation(captions[i]["text"].lower())
+            if stt_word in original_word:
+                captions[i]["text"] = word
+                new_captions.append(captions[i])
+            # TODO: handle the case where the STT output has an extra word
+        return new_captions

     def time_with_whisper(self, path: str) -> list[Word]:
         """
@@ -46,7 +60,7 @@ class BaseTTSEngine(BaseEngine):
         """
         device = "cuda" if is_available() else "cpu"
         audio = wt.load_audio(path)
-        model = wt.load_model("tiny", device=device)
+        model = wt.load_model("small", device=device)
         result = wt.transcribe(model=model, audio=audio)
         results = [word for chunk in result["segments"] for word in chunk["words"]]
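The alignment step added above can be exercised on its own. A standalone sketch of the same matching logic, with `Word` entries as plain dicts:

```python
def remove_punctuation(text: str) -> str:
    # Strip common punctuation so script and STT words can be compared
    return text.translate(str.maketrans("", "", ".,!?;:"))

def fix_captions(script: str, captions: list[dict]) -> list[dict]:
    # Keep each caption's STT timing, but restore the script's original
    # spelling and punctuation for the displayed text
    new_captions = []
    for word, cap in zip(script.split(" "), captions):
        original = remove_punctuation(word.lower())
        stt = remove_punctuation(cap["text"].lower())
        if stt in original:
            new_captions.append({**cap, "text": word})
    return new_captions
```

The substring test (`stt in original`) tolerates minor STT truncation of a word while still anchoring the caption to the script's exact spelling.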


@@ -5,8 +5,9 @@ import os
 import torch

-from .BaseTTSEngine import BaseTTSEngine
+from .BaseTTSEngine import BaseTTSEngine, Word
+from ...utils.prompting import get_prompt

 class CoquiTTSEngine(BaseTTSEngine):
     voices = [
@@ -122,8 +123,10 @@ class CoquiTTSEngine(BaseTTSEngine):
         )
         if self.to_force_duration:
             self.force_duration(float(self.duration), path)
+        return self.time_with_whisper(path)
+
     @classmethod
     def get_options(cls) -> list:
         options = [
@@ -131,7 +134,7 @@ class CoquiTTSEngine(BaseTTSEngine):
                 label="Voice",
                 choices=cls.voices,
                 max_choices=1,
-                value=cls.voices[0],
+                value="Damien Black",
             ),
             gr.Dropdown(
                 label="Language",
@@ -145,6 +148,7 @@ class CoquiTTSEngine(BaseTTSEngine):
                 label="Force duration",
                 info="Force the duration of the generated audio to be at most the specified value",
                 value=False,
+                show_label=True,
             )
             duration = gr.Number(
                 label="Duration [s]", value=57, step=1, minimum=10, visible=False
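The "Force duration" option caps the generated audio at the given length. Assuming `force_duration` time-stretches the clip (the diff does not show its body), the playback speed factor it would need can be computed like this — a hypothetical helper, not part of the commit:

```python
def speedup_factor(actual_s: float, max_s: float) -> float:
    """Factor to speed playback up so the audio fits within max_s seconds.

    Returns 1.0 when the clip already fits; audio is never slowed down,
    matching the "at most the specified value" wording of the option.
    """
    if max_s <= 0:
        raise ValueError("max duration must be positive")
    return max(1.0, actual_s / max_s)
```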


@@ -0,0 +1,32 @@
system: |-
You will receive from the user a textual script and its captions. Since the captions have been generated through STT, they might contain some errors. Your task is to fix these transcription errors and return the corrected captions, keeping the timestamped format.
Please return valid json output, with no extra characters or comments, nor any codeblocks.
The errors / corrections you should do are:
- Fix any spelling errors
- Fix any grammar errors
- If a punctuation mark is not the same as in the script, change it to match the script. Punctuation marks must still be present; they do not count against the one-word-per-"text"-field rule.
- Turn any number or symbol that is spelled out into its numerical or symbolic representation (ex. "one" -> "1", "percent" -> "%", "dollar" -> "$", etc.)
- Add capitalization at the beginning of each SENTENCE if missing (not at each "text" tag; only when multiple tags together form a sentence), but do not create or infer sentences. Capitalize only sentences that already exist and are missing capitalization.
- You are NOT allowed to change the timestamps at any cost, nor to reorganize the captions in any way. Your sole role is to fix transcription errors. Nothing else.
- You should not add new words. If a sentence in the original script feels wrong, keep it as is; where needed, make the captions match the script, even if the script itself does not feel correct.
The response format should be a json object as follows:
{
"captions": [
{
"start": 0,
"end": 1000.000,
"text": "This is the first caption."
},
{
"start": 1000.000,
"end": 2000.023,
"text": "This is the second caption."
},
etc...]
}
chat: |-
{script}
{captions}
Remember that each "text" field should contain ONLY ONE WORD and should be changed ONLY IF NEEDED; otherwise copy it as is, with no changes to the text or the timestamps! The "text" field should NEVER be a full sentence. The transcript is meant to be precise at the word level, so you should not reorder or replace words, or it will be pointless.
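Since the model is instructed to return strict JSON and to leave timestamps untouched, the caller can enforce those invariants when parsing the reply. A sketch of such a check — the function name and error handling are assumptions, not part of the commit:

```python
import json

def validate_fixed_captions(response_text: str, original: list[dict]) -> list[dict]:
    """Parse the model's JSON reply and enforce the prompt's invariants:
    same number of captions, unchanged timestamps, one word per "text" field."""
    data = json.loads(response_text)
    fixed = data["captions"]
    if len(fixed) != len(original):
        raise ValueError("caption count changed")
    for old, new in zip(original, fixed):
        if old["start"] != new["start"] or old["end"] != new["end"]:
            raise ValueError("timestamps were modified")
        if len(new["text"].split()) != 1:
            raise ValueError("a caption contains more than one word")
    return fixed
```

On a `ValueError` the caller could retry the request or fall back to the uncorrected captions, since the timings are still valid.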