2024-02-15 12:27:13 +01:00
|
|
|
import moviepy.editor as mp
|
2024-02-15 14:11:16 +01:00
|
|
|
import whisper_timestamped as wt
|
|
|
|
|
|
|
|
|
|
from typing import TypedDict
|
|
|
|
|
from torch.cuda import is_available
|
2024-02-13 14:15:27 +01:00
|
|
|
from abc import ABC, abstractmethod
|
2024-02-15 14:11:16 +01:00
|
|
|
|
2024-02-13 14:15:27 +01:00
|
|
|
from ..BaseEngine import BaseEngine
|
|
|
|
|
|
2024-02-15 17:54:13 +01:00
|
|
|
|
2024-02-15 14:11:16 +01:00
|
|
|
class Word(TypedDict):
    """One transcribed word together with its timing information.

    Keys:
        start: Position in the audio at which the word begins.
        end: Position in the audio at which the word ends.
        text: The word itself.
    """

    # NOTE(review): the timings are declared as ``str``, yet
    # ``time_with_whisper`` hands back the transcription library's word
    # dicts without converting them -- confirm the real value type.
    start: str
    end: str
    text: str
|
2024-02-13 14:15:27 +01:00
|
|
|
|
|
|
|
|
|
2024-02-15 17:54:13 +01:00
|
|
|
class BaseTTSEngine(BaseEngine):
|
2024-02-13 14:15:27 +01:00
|
|
|
@abstractmethod
def synthesize(self, text: str, path: str) -> None:
    """Abstract hook that concrete TTS engines must implement.

    The base implementation does nothing and returns ``None``.

    Args:
        text (str): Presumably the text to turn into speech -- the exact
            contract is defined by the concrete engines (TODO confirm).
        path (str): Presumably the location of the resulting audio file
            (TODO confirm against the concrete engines).

    Returns:
        None
    """
|
2024-02-20 14:47:54 +01:00
|
|
|
|
2024-02-17 18:47:30 +01:00
|
|
|
def remove_punctuation(self, text: str) -> str:
    """Return ``text`` with the characters ``. , ! ? ; :`` removed.

    Every other character (letters, digits, whitespace, apostrophes,
    dashes, ...) is kept as-is.

    Args:
        text (str): The text to clean.

    Returns:
        str: ``text`` without the punctuation characters listed above.
    """
    stripped_characters = ".,!?;:"
    # A translation table whose third argument lists characters to delete.
    deletion_table = str.maketrans("", "", stripped_characters)
    return text.translate(deletion_table)
|
|
|
|
|
|
|
|
|
|
def fix_captions(self, script: str, captions: list[Word]) -> list[Word]:
|
|
|
|
|
script = script.split(" ")
|
|
|
|
|
new_captions = []
|
|
|
|
|
for i, word in enumerate(script):
|
|
|
|
|
original_word = self.remove_punctuation(word.lower())
|
|
|
|
|
stt_word = self.remove_punctuation(word.lower())
|
|
|
|
|
if stt_word in original_word:
|
|
|
|
|
captions[i]["text"] = word
|
|
|
|
|
new_captions.append(captions[i])
|
2024-02-20 14:47:54 +01:00
|
|
|
# elif there is a word more in the stt than in the original, we
|
2024-02-15 17:54:13 +01:00
|
|
|
|
2024-02-15 14:11:16 +01:00
|
|
|
def time_with_whisper(self, path: str) -> list[Word]:
    """
    Transcribes the audio file at the given path and returns its words.

    The audio is transcribed with the "small" whisper_timestamped model, on
    CUDA when it is available and on the CPU otherwise. The words of all
    segments are flattened into a single list, and the per-word
    ``confidence`` entry is dropped because it is not needed for the
    current use case.

    Args:
        path (str): The path to the audio file.

    Returns:
        list[Word]: One dict per transcribed word, in spoken order.

    Example:
        ```json
        [
            {
                "start": 0.0,
                "end": 0.5,
                "text": "Hello"
            },
            {
                "start": 0.5,
                "end": 1.0,
                "text": "world"
            }
        ]
        ```

    NOTE(review): whisper_timestamped reports ``start``/``end`` as numbers
    (seconds) and they are passed through unconverted, while ``Word``
    declares them as ``str`` -- confirm which one is intended.
    """
    device = "cuda" if is_available() else "cpu"
    audio = wt.load_audio(path)
    model = wt.load_model("small", device=device)

    transcription = wt.transcribe(model=model, audio=audio)

    words = []
    for segment in transcription["segments"]:
        # A segment without word-level entries simply contributes nothing.
        for word in segment.get("words", []):
            # Copy every field except the confidence score; filtering (as
            # opposed to ``del``) also copes with words that carry none.
            words.append(
                {key: value for key, value in word.items() if key != "confidence"}
            )
    return words
|
2024-02-15 14:11:16 +01:00
|
|
|
|
2024-02-15 12:27:13 +01:00
|
|
|
def force_duration(self, duration: float, path: str) -> None:
    """
    Forces the audio clip at the given path to last at most ``duration``.

    A clip that is longer than ``duration`` is sped up so that it lasts
    exactly ``duration`` seconds and the file at ``path`` is replaced by
    the result (encoded with libmp3lame). A clip that is already short
    enough is left untouched.

    Args:
        duration (float): The desired duration in seconds. Must be positive.
        path (str): The path to the audio clip file.

    Returns:
        None

    Raises:
        ValueError: If ``duration`` is not positive.
    """
    import os

    if duration <= 0:
        # A zero duration would divide by zero below and a negative one
        # would produce a meaningless negative speed factor.
        raise ValueError(f"duration must be positive, got {duration!r}")

    # The result is rendered next to the original (keeping the extension so
    # the container format is still inferred correctly) and swapped in only
    # after the reader has been closed: overwriting a file that is still
    # being decoded can truncate the input mid-read.
    root, extension = os.path.splitext(path)
    temp_path = f"{root}.tmp{extension}"

    written = False
    audio_clip = mp.AudioFileClip(path)
    try:
        if audio_clip.duration <= duration:
            return

        speed_factor = audio_clip.duration / duration
        new_audio = audio_clip.fx(
            mp.vfx.speedx, speed_factor, final_duration=duration
        )
        new_audio.write_audiofile(temp_path, codec="libmp3lame")
        written = True
    finally:
        # Always release the reader, even when processing fails.
        audio_clip.close()
        # Do not leave a partial render behind after a failure.
        if not written and os.path.exists(temp_path):
            os.remove(temp_path)

    os.replace(temp_path, path)
|