src/engines/TTSEngine/BaseTTSEngine.py

import moviepy.editor as mp
import whisper_timestamped as wt

from typing import TypedDict
from torch.cuda import is_available
from abc import ABC, abstractmethod

from ..BaseEngine import BaseEngine

class Word(TypedDict):
    """A single transcribed word together with its position in the audio.

    The timing fields hold floating-point seconds, which is how
    whisper_timestamped reports word boundaries.
    """

    # Offset from the beginning of the audio at which the word starts, in seconds.
    start: float
    # Offset from the beginning of the audio at which the word ends, in seconds.
    end: float
    # The transcribed word itself.
    text: str

class BaseTTSEngine(BaseEngine):
    """Base class for text-to-speech engines.

    Subclasses provide :meth:`synthesize`. The remaining methods are shared
    helpers that operate on an audio file on disk: word-level timing via
    whisper_timestamped and shortening a clip to a maximum duration.
    """

    @abstractmethod
    def synthesize(self, text: str, path: str) -> str:
        """
        Synthesizes speech for the given text. Must be implemented by subclasses.

        Args:
            text (str): The text to turn into speech.
            path (str): The path of the audio file to produce.

        Returns:
            str: Defined by the concrete engine.
                NOTE(review): presumably the path of the written file - confirm
                against the implementations.
        """
        pass

    def time_with_whisper(self, path: str) -> list[Word]:
        """
        Transcribes the audio file at the given path with the "tiny" Whisper
        model and returns the word-level timings.

        The model runs on CUDA when available, otherwise on the CPU.

        Args:
            path (str): The path to the audio file.

        Returns:
            list[Word]: One entry per transcribed word, in order, with the
            "confidence" field removed.
            Example:
            ```json
            [
                {
                    "start": 0.0,
                    "end": 0.5,
                    "text": "Hello"
                },
                {
                    "start": 0.5,
                    "end": 1.0,
                    "text": "world"
                }
            ]
            ```
        """
        device = "cuda" if is_available() else "cpu"
        audio = wt.load_audio(path)
        model = wt.load_model("tiny", device=device)

        transcription = wt.transcribe(model=model, audio=audio)
        # transcribe() returns a dict; the word timings are nested under each
        # entry of "segments". Iterating the dict itself would only yield its
        # string keys.
        words = [
            word
            for segment in transcription["segments"]
            for word in segment["words"]
        ]
        for word in words:
            # Not needed for the current use case; pop() tolerates its absence.
            word.pop("confidence", None)
        return words

    def force_duration(self, duration: float, path: str):
        """
        Speeds up the audio clip at the given path so that it lasts at most
        the specified duration. Clips that are already short enough are left
        untouched.

        The shortened audio is first written to a temporary file next to
        ``path`` and only moved over the original once the source clip has
        been closed, so the file is never read and overwritten at the same
        time.

        Args:
            duration (float): The maximum duration in seconds. Must be positive.
            path (str): The path to the audio clip file. It is overwritten in
                place (encoded with libmp3lame) when the clip is too long.

        Returns:
            None

        Raises:
            ValueError: If ``duration`` is not strictly positive.
        """
        # Local import so this block does not rely on a module-level `import os`.
        import os

        if duration <= 0:
            raise ValueError(f"duration must be positive, got {duration!r}")

        # Keep the original extension last so the output container is inferred
        # exactly as it would be for `path`.
        root, ext = os.path.splitext(path)
        tmp_path = f"{root}.tmp{ext}"

        audio_clip = mp.AudioFileClip(path)
        try:
            if audio_clip.duration <= duration:
                return

            speed_factor = audio_clip.duration / duration
            new_audio = audio_clip.fx(
                mp.vfx.speedx, speed_factor, final_duration=duration
            )
            try:
                new_audio.write_audiofile(tmp_path, codec="libmp3lame")
            except Exception:
                # Do not leave a partial file behind if encoding fails.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
                raise
        finally:
            # Always release the underlying reader, even on failure.
            audio_clip.close()

        # The source is closed now, so it is safe to swap in the new file.
        os.replace(tmp_path, path)
fix(GenerationContext.py): fix typo in variable name powerfulllmengine to powerfulllmengine for better readability feat(GenerationContext.py): add setup_dir method to create a directory for output files with a timestamp feat(GenerationContext.py): call setup_dir method before generating script and synthesizing audio to ensure output directory exists feat(prompts/fix_captions.yaml): add a new prompt file to provide instructions for fixing captions fix(BaseTTSEngine.py): add force_duration method to adjust audio clip duration if it exceeds a specified duration feat(CoquiTTSEngine.py): add options for forcing duration and specifying duration in the UI feat(utils/prompting.py): add get_prompt function to load prompt files from a specified location fix(gradio_ui.py): set equal_height=True for engine_rows to ensure consistent height for engine options 2024-02-15 12:27:13 +01:00			`import moviepy.editor as mp`
:rocket: Maaany things 2024-02-15 14:11:16 +01:00			`import whisper_timestamped as wt`

			`from typing import TypedDict`
			`from torch.cuda import is_available`
Some stuff 2024-02-13 14:15:27 +01:00			`from abc import ABC, abstractmethod`
:rocket: Maaany things 2024-02-15 14:11:16 +01:00
Some stuff 2024-02-13 14:15:27 +01:00			`from ..BaseEngine import BaseEngine`

:rocket: Maaany things 2024-02-15 14:11:16 +01:00			`class Word(TypedDict):`
			`start: str`
			`end: str`
			`text: str`
Some stuff 2024-02-13 14:15:27 +01:00
			`class BaseTTSEngine(BaseEngine):`

			`@abstractmethod`
			`def synthesize(self, text: str, path: str) -> str:`
:rocket: 2024-02-14 17:49:51 +01:00			`pass`
fix(GenerationContext.py): fix typo in variable name powerfulllmengine to powerfulllmengine for better readability feat(GenerationContext.py): add setup_dir method to create a directory for output files with a timestamp feat(GenerationContext.py): call setup_dir method before generating script and synthesizing audio to ensure output directory exists feat(prompts/fix_captions.yaml): add a new prompt file to provide instructions for fixing captions fix(BaseTTSEngine.py): add force_duration method to adjust audio clip duration if it exceeds a specified duration feat(CoquiTTSEngine.py): add options for forcing duration and specifying duration in the UI feat(utils/prompting.py): add get_prompt function to load prompt files from a specified location fix(gradio_ui.py): set equal_height=True for engine_rows to ensure consistent height for engine options 2024-02-15 12:27:13 +01:00
:rocket: Maaany things 2024-02-15 14:11:16 +01:00			`def time_with_whisper(self, path: str) -> list[Word]:`
			`"""`
			`Transcribes the audio file at the given path using a pre-trained model and returns a list of words.`

			`Args:`
			`path (str): The path to the audio file.`

			`Returns:`
			`list[Word]: A list of Word objects representing the transcribed words.`
			`Example:`
			```json
			`[`
			`{`
			`"start": "0.00",`
			`"end": "0.50",`
			`"text": "Hello"`
			`},`
			`{`
			`"start": "0.50",`
			`"end": "1.00",`
			`"text": "world"`
			`}`
			`]`
			```
			`"""`
			`device = "cuda" if is_available() else "cpu"`
			`audio = wt.load_audio(path)`
			`model = wt.load_model("tiny", device=device)`

			`result = wt.transcribe(model=model, audio=audio)`
			`results = [word for chunk in result for word in chunk["words"]]`
			`for result in results:`
			`# Not needed for the current use case`
			`del result["confidence"]`
			`return results`

fix(GenerationContext.py): fix typo in variable name powerfulllmengine to powerfulllmengine for better readability feat(GenerationContext.py): add setup_dir method to create a directory for output files with a timestamp feat(GenerationContext.py): call setup_dir method before generating script and synthesizing audio to ensure output directory exists feat(prompts/fix_captions.yaml): add a new prompt file to provide instructions for fixing captions fix(BaseTTSEngine.py): add force_duration method to adjust audio clip duration if it exceeds a specified duration feat(CoquiTTSEngine.py): add options for forcing duration and specifying duration in the UI feat(utils/prompting.py): add get_prompt function to load prompt files from a specified location fix(gradio_ui.py): set equal_height=True for engine_rows to ensure consistent height for engine options 2024-02-15 12:27:13 +01:00			`def force_duration(self, duration: float, path: str):`
:rocket: Maaany things 2024-02-15 14:11:16 +01:00			`"""`
			`Forces the audio clip at the given path to have the specified duration.`

			`Args:`
			`duration (float): The desired duration in seconds.`
			`path (str): The path to the audio clip file.`

			`Returns:`
			`None`
			`"""`
fix(GenerationContext.py): fix typo in variable name powerfulllmengine to powerfulllmengine for better readability feat(GenerationContext.py): add setup_dir method to create a directory for output files with a timestamp feat(GenerationContext.py): call setup_dir method before generating script and synthesizing audio to ensure output directory exists feat(prompts/fix_captions.yaml): add a new prompt file to provide instructions for fixing captions fix(BaseTTSEngine.py): add force_duration method to adjust audio clip duration if it exceeds a specified duration feat(CoquiTTSEngine.py): add options for forcing duration and specifying duration in the UI feat(utils/prompting.py): add get_prompt function to load prompt files from a specified location fix(gradio_ui.py): set equal_height=True for engine_rows to ensure consistent height for engine options 2024-02-15 12:27:13 +01:00			`audio_clip = mp.AudioFileClip(path)`

			`if audio_clip.duration > duration:`
			`speed_factor = audio_clip.duration / duration`

			`new_audio = audio_clip.fx(mp.vfx.speedx, speed_factor, final_duration=duration)`

			`new_audio.write_audiofile(path, codec='libmp3lame')`

			`audio_clip.close()`