🚀 Maaany things

This commit is contained in:
2024-02-15 14:11:16 +01:00
parent 57bcf0af8e
commit 5be7937ffa
7 changed files with 272 additions and 50 deletions

View File

@@ -1,8 +1,16 @@
import moviepy.editor as mp
import whisper_timestamped as wt
from typing import TypedDict
from torch.cuda import is_available
from abc import ABC, abstractmethod
# Assuming BaseEngine is defined elsewhere in your project
from ..BaseEngine import BaseEngine
class Word(TypedDict):
start: str
end: str
text: str
class BaseTTSEngine(BaseEngine):
@@ -10,7 +18,53 @@ class BaseTTSEngine(BaseEngine):
def synthesize(self, text: str, path: str) -> str:
pass
def time_with_whisper(self, path: str) -> list[Word]:
"""
Transcribes the audio file at the given path using a pre-trained model and returns a list of words.
Args:
path (str): The path to the audio file.
Returns:
list[Word]: A list of Word objects representing the transcribed words.
Example:
```json
[
{
"start": "0.00",
"end": "0.50",
"text": "Hello"
},
{
"start": "0.50",
"end": "1.00",
"text": "world"
}
]
```
"""
device = "cuda" if is_available() else "cpu"
audio = wt.load_audio(path)
model = wt.load_model("tiny", device=device)
result = wt.transcribe(model=model, audio=audio)
results = [word for chunk in result for word in chunk["words"]]
for result in results:
# Not needed for the current use case
del result["confidence"]
return results
def force_duration(self, duration: float, path: str):
"""
Forces the audio clip at the given path to have the specified duration.
Args:
duration (float): The desired duration in seconds.
path (str): The path to the audio clip file.
Returns:
None
"""
audio_clip = mp.AudioFileClip(path)
if audio_clip.duration > duration: