From 57bcf0af8eced9baa136d7c09dba417f199dbfb2 Mon Sep 17 00:00:00 2001
From: Paillat <me@paillat.dev>
Date: Thu, 15 Feb 2024 12:27:13 +0100
Subject: [PATCH] fix(GenerationContext.py): replace llmengine with
 powerfulllmengine and simplellmengine;
 feat(GenerationContext.py): add setup_dir method to create a directory for
 output files with a timestamp; feat(GenerationContext.py): call setup_dir
 method before generating script and synthesizing audio to ensure output
 directory exists; feat(prompts/fix_captions.yaml): add a new prompt file to
 provide instructions for fixing captions; fix(BaseTTSEngine.py): add
 force_duration method to adjust audio clip duration if it exceeds a specified
 duration; feat(CoquiTTSEngine.py): add options for forcing duration and
 specifying duration in the UI; feat(utils/prompting.py): add get_prompt
 function to load prompt files from a specified location; fix(gradio_ui.py):
 set equal_height=True for engine_rows to ensure consistent height for engine
 options

---
 src/chore/GenerationContext.py          | 24 +++++++++++++++++++-----
 src/chore/prompts/fix_captions.yaml     |  8 ++++++++
 src/engines/TTSEngine/BaseTTSEngine.py  | 15 ++++++++++++++-
 src/engines/TTSEngine/CoquiTTSEngine.py | 18 ++++++++++++++++--
 src/utils/__init__.py                   |  1 +
 src/utils/prompting.py                  | 14 ++++++++++++++
 ui/gradio_ui.py                         |  2 +-
 7 files changed, 73 insertions(+), 9 deletions(-)
 create mode 100644 src/chore/prompts/fix_captions.yaml
 create mode 100644 src/utils/__init__.py
 create mode 100644 src/utils/prompting.py

diff --git a/src/chore/GenerationContext.py b/src/chore/GenerationContext.py
index bef49d5..dcd30cd 100644
--- a/src/chore/GenerationContext.py
+++ b/src/chore/GenerationContext.py
@@ -1,17 +1,31 @@
 import moviepy
+import time
+import os
 
 from .. import engines
+from ..utils.prompting import get_prompt
+
 class GenerationContext:
 
-    def __init__(self, llmengine: engines.LLMEngine.BaseLLMEngine, scriptengine: engines.ScriptEngine.BaseScriptEngine, ttsengine: engines.TTSEngine.BaseTTSEngine) -> None:
-        self.llmengine = llmengine
-        self.llmengine.ctx = self
+    def __init__(self, powerfulllmengine: engines.LLMEngine.BaseLLMEngine, simplellmengine: engines.LLMEngine.BaseLLMEngine, scriptengine: engines.ScriptEngine.BaseScriptEngine, ttsengine: engines.TTSEngine.BaseTTSEngine) -> None:
+        self.powerfulllmengine = powerfulllmengine
+        self.powerfulllmengine.ctx = self
+        
+        self.simplellmengine = simplellmengine
+        self.simplellmengine.ctx = self
 
         self.scriptengine = scriptengine
         self.scriptengine.ctx = self
 
         self.ttsengine = ttsengine
         self.ttsengine.ctx = self
-    
+    def setup_dir(self):
+        self.dir = f"output/{time.time()}"
+        os.makedirs(self.dir)
+
     def process(self):
-        timed_script = self.scriptengine.generate()
\ No newline at end of file
+        self.setup_dir()
+
+        script = self.scriptengine.generate()
+
+        timed_script = self.ttsengine.synthesize(script, self.dir)
\ No newline at end of file
diff --git a/src/chore/prompts/fix_captions.yaml b/src/chore/prompts/fix_captions.yaml
new file mode 100644
index 0000000..e9d0e56
--- /dev/null
+++ b/src/chore/prompts/fix_captions.yaml
@@ -0,0 +1,8 @@
+system: |-
+  You will receive from the user a textual script and its captions. Since the captions have been generated through STT, they might contain some errors. Your task is to fix these transcription errors and return the corrected captions, keeping the timestamped format.
+  Please return valid json output, with no extra characters or comments, nor any codeblocks.
+
+chat: |-
+  {script}
+
+  {captions}
\ No newline at end of file
diff --git a/src/engines/TTSEngine/BaseTTSEngine.py b/src/engines/TTSEngine/BaseTTSEngine.py
index 51ad549..709d933 100644
--- a/src/engines/TTSEngine/BaseTTSEngine.py
+++ b/src/engines/TTSEngine/BaseTTSEngine.py
@@ -1,10 +1,23 @@
+import moviepy.editor as mp
 from abc import ABC, abstractmethod
+# BaseEngine is imported from the parent engines package (one level above TTSEngine)
 from ..BaseEngine import BaseEngine
 
 
 class BaseTTSEngine(BaseEngine):
-    pass
 
     @abstractmethod
     def synthesize(self, text: str, path: str) -> str:
         pass
+    
+    def force_duration(self, duration: float, path: str):
+        audio_clip = mp.AudioFileClip(path)
+        
+        if audio_clip.duration > duration:
+            speed_factor = audio_clip.duration / duration
+            
+            new_audio = audio_clip.fx(mp.vfx.speedx, speed_factor, final_duration=duration)
+            
+            new_audio.write_audiofile(path, codec='libmp3lame')
+            
+        audio_clip.close()
\ No newline at end of file
diff --git a/src/engines/TTSEngine/CoquiTTSEngine.py b/src/engines/TTSEngine/CoquiTTSEngine.py
index 0661594..030d7da 100644
--- a/src/engines/TTSEngine/CoquiTTSEngine.py
+++ b/src/engines/TTSEngine/CoquiTTSEngine.py
@@ -90,13 +90,15 @@ class CoquiTTSEngine(BaseTTSEngine):
         "ko",  # Korean
         "hi",  # Hindi
     ]
-    num_options = 2
+    num_options = 4
 
     def __init__(self, options: list):
         super().__init__()
 
         self.voice = options[0][0]
         self.language = options[1][0]
+        self.to_force_duration = options[2][0]
+        self.duration = options[3]
 
         os.environ["COQUI_TOS_AGREED"] = "1"
 
@@ -106,11 +108,13 @@ class CoquiTTSEngine(BaseTTSEngine):
 
     def synthesize(self, text: str, path: str) -> str:
         #      self.tts.tts_to_file(text=text, file_path=path, lang=self.language, speaker=self.voice)
+        if self.to_force_duration:
+            self.force_duration(float(self.duration), path)
         return path
 
     @classmethod
     def get_options(cls) -> list:
-        return [
+        options = [
             gr.Dropdown(
                 label="Voice",
                 choices=cls.voices,
@@ -124,3 +128,13 @@ class CoquiTTSEngine(BaseTTSEngine):
                 value=cls.languages[0],
             ),
         ]
+    
+        duration_checkbox = gr.Checkbox(value=False)
+        duration = gr.Number(label="Duration", value=57, step=1, minimum=10, visible=False)
+        duration_switch = lambda x: gr.update(visible=x)
+        duration_checkbox.change(duration_switch, inputs=[duration_checkbox], outputs=[duration])
+        duration_checkbox_group = gr.CheckboxGroup([duration_checkbox], label="Force duration")
+
+        options.append(duration_checkbox_group)
+        options.append(duration)
+        return options
\ No newline at end of file
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000..8ca957b
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1 @@
+from . import prompting
\ No newline at end of file
diff --git a/src/utils/prompting.py b/src/utils/prompting.py
new file mode 100644
index 0000000..90facc6
--- /dev/null
+++ b/src/utils/prompting.py
@@ -0,0 +1,14 @@
+import yaml
+import os
+from typing import TypedDict
+
+class Prompt(TypedDict):
+    system: str
+    chat: str
+
+def get_prompt(name, *, location = "src/chore/prompts") -> Prompt:
+    path = os.path.join(os.getcwd(), location, f"{name}.yaml")
+    if not os.path.exists(path):
+        raise FileNotFoundError(f"Prompt file {path} does not exist.")
+    with open(path, "r") as file:
+        return yaml.safe_load(file)
\ No newline at end of file
diff --git a/ui/gradio_ui.py b/ui/gradio_ui.py
index b7f6129..a0319d0 100644
--- a/ui/gradio_ui.py
+++ b/ui/gradio_ui.py
@@ -52,7 +52,7 @@ class GenerateUI:
                             inputs.append(engine_dropdown)
                             engine_rows = []
                             for i, engine in enumerate(engines):
-                                with gr.Row(visible=(i == 0)) as engine_row:
+                                with gr.Row(equal_height=True, visible=(i == 0)) as engine_row:
                                     engine_rows.append(engine_row)
                                     options = engine.get_options()
                                     inputs.extend(options)