🚀 Maaany things

This commit is contained in:
2024-02-15 14:11:16 +01:00
parent 57bcf0af8e
commit 5be7937ffa
7 changed files with 272 additions and 50 deletions

View File

@@ -1,8 +1,16 @@
import moviepy.editor as mp
import whisper_timestamped as wt
from typing import TypedDict
from torch.cuda import is_available
from abc import ABC, abstractmethod
# BaseEngine is defined in the parent package (see the relative import below).
from ..BaseEngine import BaseEngine
# A single transcribed word with its timing window, as emitted by
# time_with_whisper (functional TypedDict form — same keys and types).
Word = TypedDict("Word", {"start": str, "end": str, "text": str})
class BaseTTSEngine(BaseEngine):
@@ -10,7 +18,53 @@ class BaseTTSEngine(BaseEngine):
def synthesize(self, text: str, path: str) -> str:
    """
    Synthesize *text* into an audio file and save it at *path*.

    Abstract: concrete engines must implement this.

    Args:
        text (str): The text to turn into speech.
        path (str): Destination file path for the generated audio.

    Returns:
        str: presumably the path of the written file — verify against
        concrete implementations (the declared return type is ``str``).
    """
    pass
def time_with_whisper(self, path: str) -> list[Word]:
    """
    Transcribe the audio file at *path* with a whisper-timestamped model
    and return word-level timing information.

    Args:
        path (str): Path to the audio file to transcribe.

    Returns:
        list[Word]: One entry per transcribed word, carrying "start",
        "end" and "text" keys. The "confidence" value reported by
        whisper is dropped — not needed for the current use case.

    Example:
        ```json
        [
            {"start": "0.00", "end": "0.50", "text": "Hello"},
            {"start": "0.50", "end": "1.00", "text": "world"}
        ]
        ```
    """
    device = "cuda" if is_available() else "cpu"
    audio = wt.load_audio(path)
    model = wt.load_model("tiny", device=device)
    transcription = wt.transcribe(model=model, audio=audio)
    # BUG FIX: wt.transcribe returns a dict; iterating it directly yields
    # its string keys, so chunk["words"] would raise TypeError. The word
    # chunks live under the "segments" key.
    words: list[Word] = []
    for segment in transcription["segments"]:
        for word in segment["words"]:
            # Drop the confidence score without relying on its presence.
            word.pop("confidence", None)
            words.append(word)
    return words
def force_duration(self, duration: float, path: str):
"""
Forces the audio clip at the given path to have the specified duration.
Args:
duration (float): The desired duration in seconds.
path (str): The path to the audio clip file.
Returns:
None
"""
audio_clip = mp.AudioFileClip(path)
if audio_clip.duration > duration:

View File

@@ -1,9 +1,9 @@
import gradio as gr
# import TTS
import TTS
import os
# import torch
import torch
from .BaseTTSEngine import BaseTTSEngine
@@ -102,15 +102,25 @@ class CoquiTTSEngine(BaseTTSEngine):
os.environ["COQUI_TOS_AGREED"] = "1"
# self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
# device = "cuda" if torch.cuda.is_available() else "cpu"
# self.tts.to(device)
self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
device = "cuda" if torch.cuda.is_available() else "cpu"
self.tts.to(device)
def synthesize(self, text: str, path: str) -> str:
    """
    Post-process the audio clip at *path* and return its path.

    If ``self.to_force_duration`` is set, the clip's duration is clamped
    via ``force_duration``; otherwise the file is left untouched.

    Args:
        text (str): The text associated with the clip (unused here).
        path (str): Path of the audio file to post-process.

    Returns:
        str: The same *path* that was passed in.
    """
    if not self.to_force_duration:
        return path
    self.force_duration(float(self.duration), path)
    return path
def synthesize(self, text: str, path: str):
    """
    Synthesize *text* to speech, write it to *path*, and return
    word-level timings of the generated audio.

    Args:
        text (str): The text to synthesize into speech.
        path (str): The file path to save the synthesized speech.

    Returns:
        list[Word]: Per-word start/end timings, as produced by
        ``time_with_whisper`` on the generated file.
    """
    # NOTE(review): the Coqui TTS API names this parameter ``language``,
    # not ``lang`` — confirm against the installed TTS version.
    self.tts.tts_to_file(text=text, file_path=path, lang=self.language, speaker=self.voice)
    if self.to_force_duration:
        # Clamp the generated clip to self.duration seconds (see force_duration).
        self.force_duration(float(self.duration), path)
    return self.time_with_whisper(path)
@classmethod
def get_options(cls) -> list:
@@ -129,12 +139,11 @@ class CoquiTTSEngine(BaseTTSEngine):
),
]
duration_checkbox = gr.Checkbox(value=False)
duration = gr.Number(label="Duration", value=57, step=1, minimum=10, visible=False)
duration_checkbox = gr.Checkbox(label="Force duration", info="Force the duration of the generated audio to be at most the specified value", value=False)
duration = gr.Number(label="Duration [s]", value=57, step=1, minimum=10, visible=False)
duration_switch = lambda x: gr.update(visible=x)
duration_checkbox.change(duration_switch, inputs=[duration_checkbox], outputs=[duration])
duration_checkbox_group = gr.CheckboxGroup([duration_checkbox], label="Force duration")
options.append(duration_checkbox_group)
options.append(duration_checkbox)
options.append(duration)
return options