mirror of
https://github.com/Paillat-dev/viralfactory.git
synced 2026-01-02 17:24:54 +00:00
🚀 Maaany things
This commit is contained in:
@@ -1,8 +1,16 @@
|
||||
import moviepy.editor as mp
|
||||
import whisper_timestamped as wt
|
||||
|
||||
from typing import TypedDict
|
||||
from torch.cuda import is_available
|
||||
from abc import ABC, abstractmethod
|
||||
# Assuming BaseEngine is defined elsewhere in your project
|
||||
|
||||
from ..BaseEngine import BaseEngine
|
||||
|
||||
class Word(TypedDict):
    """A single transcribed word together with its position in the audio.

    Mirrors the per-word dictionaries produced by ``whisper_timestamped``
    once the ``confidence`` entry has been removed.
    """

    # Timestamps are numeric (seconds from the start of the audio), not strings.
    start: float
    end: float
    # The transcribed word itself.
    text: str
|
||||
|
||||
class BaseTTSEngine(BaseEngine):
|
||||
|
||||
@@ -10,7 +18,53 @@ class BaseTTSEngine(BaseEngine):
|
||||
def synthesize(self, text: str, path: str) -> str:
    """Synthesize ``text`` into an audio file written at ``path``.

    This base version performs no work and returns ``None``; concrete
    engines (e.g. ``CoquiTTSEngine``) provide the real implementation.

    Args:
        text (str): The text to turn into speech.
        path (str): The file path the audio should be written to.
    """
|
||||
|
||||
def time_with_whisper(self, path: str) -> list[Word]:
    """
    Transcribe the audio file at ``path`` and return its words with timings.

    The audio is transcribed with the ``tiny`` whisper model, on CUDA when
    it is available and on the CPU otherwise. The words of every segment
    are flattened into a single list, in transcription order.

    Args:
        path (str): The path to the audio file.

    Returns:
        list[Word]: One entry per transcribed word, carrying its ``start``
        and ``end`` timestamps and its ``text``.

    Example:
        ```json
        [
            {
                "start": 0.0,
                "end": 0.5,
                "text": "Hello"
            },
            {
                "start": 0.5,
                "end": 1.0,
                "text": "world"
            }
        ]
        ```
    """
    device = "cuda" if is_available() else "cpu"
    audio = wt.load_audio(path)
    model = wt.load_model("tiny", device=device)

    transcription = wt.transcribe(model=model, audio=audio)

    # transcribe() returns a dict; the word-level entries live inside each
    # item of its "segments" list. Iterating the dict itself would only
    # yield its string keys and fail on the ["words"] lookup.
    words = [
        word
        for segment in transcription["segments"]
        for word in segment.get("words", [])
    ]
    for word in words:
        # Not needed for the current use case; pop() tolerates entries
        # that carry no confidence score.
        word.pop("confidence", None)
    return words
|
||||
|
||||
def force_duration(self, duration: float, path: str):
|
||||
"""
|
||||
Forces the audio clip at the given path to have the specified duration.
|
||||
|
||||
Args:
|
||||
duration (float): The desired duration in seconds.
|
||||
path (str): The path to the audio clip file.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
audio_clip = mp.AudioFileClip(path)
|
||||
|
||||
if audio_clip.duration > duration:
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import gradio as gr
|
||||
|
||||
# import TTS
|
||||
import TTS
|
||||
import os
|
||||
|
||||
# import torch
|
||||
import torch
|
||||
|
||||
from .BaseTTSEngine import BaseTTSEngine
|
||||
|
||||
@@ -102,15 +102,25 @@ class CoquiTTSEngine(BaseTTSEngine):
|
||||
|
||||
os.environ["COQUI_TOS_AGREED"] = "1"
|
||||
|
||||
# self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
|
||||
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
# self.tts.to(device)
|
||||
self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
self.tts.to(device)
|
||||
|
||||
def synthesize(self, text: str, path: str) -> list:
    """
    Synthesize ``text`` into speech, save it to ``path`` and time its words.

    Args:
        text (str): The text to synthesize into speech.
        path (str): The file path to save the synthesized speech.

    Returns:
        list: The result of ``self.time_with_whisper(path)`` for the
        generated file, i.e. the transcribed words with their timings.
    """
    # Coqui's tts_to_file() names this parameter `language`; an unknown
    # `lang` keyword leaves the language unset, which a multilingual model
    # such as XTTS v2 rejects.
    self.tts.tts_to_file(
        text=text,
        file_path=path,
        language=self.language,
        speaker=self.voice,
    )
    if self.to_force_duration:
        self.force_duration(float(self.duration), path)
    # Time the words only after any duration adjustment, so the timings
    # describe the file as it is finally left on disk.
    return self.time_with_whisper(path)
|
||||
|
||||
@classmethod
|
||||
def get_options(cls) -> list:
|
||||
@@ -129,12 +139,11 @@ class CoquiTTSEngine(BaseTTSEngine):
|
||||
),
|
||||
]
|
||||
|
||||
duration_checkbox = gr.Checkbox(value=False)
|
||||
duration = gr.Number(label="Duration", value=57, step=1, minimum=10, visible=False)
|
||||
duration_checkbox = gr.Checkbox(label="Force duration", info="Force the duration of the generated audio to be at most the specified value", value=False)
|
||||
duration = gr.Number(label="Duration [s]", value=57, step=1, minimum=10, visible=False)
|
||||
duration_switch = lambda x: gr.update(visible=x)
|
||||
duration_checkbox.change(duration_switch, inputs=[duration_checkbox], outputs=[duration])
|
||||
duration_checkbox_group = gr.CheckboxGroup([duration_checkbox], label="Force duration")
|
||||
|
||||
options.append(duration_checkbox_group)
|
||||
options.append(duration_checkbox)
|
||||
options.append(duration)
|
||||
return options
|
||||
Reference in New Issue
Block a user