mirror of
https://github.com/Paillat-dev/viralfactory.git
synced 2026-01-02 01:06:19 +00:00
Formatting
This commit is contained in:
@@ -7,52 +7,53 @@ from abc import ABC, abstractmethod
|
|||||||
|
|
||||||
from ..BaseEngine import BaseEngine
|
from ..BaseEngine import BaseEngine
|
||||||
|
|
||||||
|
|
||||||
class Word(TypedDict):
    """A single transcribed word with its timing window.

    All values are strings as produced by the whisper-timestamped
    transcription step (timestamps are decimal seconds, e.g. "0.50").
    """

    start: str  # start time of the word, in seconds (string form)
    end: str  # end time of the word, in seconds (string form)
    text: str  # the transcribed word itself
||||||
|
|
||||||
class BaseTTSEngine(BaseEngine):
|
|
||||||
|
|
||||||
|
class BaseTTSEngine(BaseEngine):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def synthesize(self, text: str, path: str) -> str:
|
def synthesize(self, text: str, path: str) -> str:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def time_with_whisper(self, path: str) -> list[Word]:
|
def time_with_whisper(self, path: str) -> list[Word]:
|
||||||
"""
|
"""
|
||||||
Transcribes the audio file at the given path using a pre-trained model and returns a list of words.
|
Transcribes the audio file at the given path using a pre-trained model and returns a list of words.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
path (str): The path to the audio file.
|
path (str): The path to the audio file.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list[Word]: A list of Word objects representing the transcribed words.
|
list[Word]: A list of Word objects representing the transcribed words.
|
||||||
Example:
|
Example:
|
||||||
```json
|
```json
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"start": "0.00",
|
"start": "0.00",
|
||||||
"end": "0.50",
|
"end": "0.50",
|
||||||
"text": "Hello"
|
"text": "Hello"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"start": "0.50",
|
"start": "0.50",
|
||||||
"end": "1.00",
|
"end": "1.00",
|
||||||
"text": "world"
|
"text": "world"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
device = "cuda" if is_available() else "cpu"
|
device = "cuda" if is_available() else "cpu"
|
||||||
audio = wt.load_audio(path)
|
audio = wt.load_audio(path)
|
||||||
model = wt.load_model("tiny", device=device)
|
model = wt.load_model("tiny", device=device)
|
||||||
|
|
||||||
result = wt.transcribe(model=model, audio=audio)
|
result = wt.transcribe(model=model, audio=audio)
|
||||||
results = [word for chunk in result for word in chunk["words"]]
|
results = [word for chunk in result for word in chunk["words"]]
|
||||||
for result in results:
|
for result in results:
|
||||||
# Not needed for the current use case
|
# Not needed for the current use case
|
||||||
del result["confidence"]
|
del result["confidence"]
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def force_duration(self, duration: float, path: str):
|
def force_duration(self, duration: float, path: str):
|
||||||
"""
|
"""
|
||||||
@@ -70,8 +71,10 @@ class BaseTTSEngine(BaseEngine):
|
|||||||
if audio_clip.duration > duration:
|
if audio_clip.duration > duration:
|
||||||
speed_factor = audio_clip.duration / duration
|
speed_factor = audio_clip.duration / duration
|
||||||
|
|
||||||
new_audio = audio_clip.fx(mp.vfx.speedx, speed_factor, final_duration=duration)
|
new_audio = audio_clip.fx(
|
||||||
|
mp.vfx.speedx, speed_factor, final_duration=duration
|
||||||
|
)
|
||||||
|
|
||||||
new_audio.write_audiofile(path, codec='libmp3lame')
|
new_audio.write_audiofile(path, codec="libmp3lame")
|
||||||
|
|
||||||
audio_clip.close()
|
audio_clip.close()
|
||||||
Reference in New Issue
Block a user