mirror of
https://github.com/Paillat-dev/viralfactory.git
synced 2026-01-02 09:16:19 +00:00
♻️ ✨ Refactor assets handling, add new AI images engines, add new long form videos pipeline, remove import of shorts pipeline awaiting upgrade to use new code.
This commit is contained in:
49
src/engines/TranscriptionEngine/BaseTranscriptionEngine.py
Normal file
49
src/engines/TranscriptionEngine/BaseTranscriptionEngine.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from abc import abstractmethod
|
||||
from typing import TypedDict
|
||||
|
||||
from ..BaseEngine import BaseEngine
|
||||
|
||||
|
||||
class Word(TypedDict):
    """A single transcribed word together with its timing information.

    All three fields are annotated as strings, matching the example given
    in ``BaseTranscriptionEngine.transcribe`` (e.g. ``"0.00"`` / ``"0.50"``).
    """

    # Time at which the word begins (presumably seconds from the start of
    # the audio; confirm against the concrete engine).
    start: str
    # Time at which the word ends, same unit as ``start``.
    end: str
    # The transcribed word itself.
    text: str
|
||||
|
||||
|
||||
class BaseTranscriptionEngine(BaseEngine):
    """Abstract interface for engines that transcribe an audio file.

    Concrete engines must implement :meth:`transcribe`.
    """

    @abstractmethod
    def transcribe(
        self,
        path: str,
        fast: bool = False,
        words: bool = False,
        avoid_hallucinations: bool = False,
    ) -> list[Word]:
        """
        Transcribe the audio file located at ``path``.

        Args:
            path (str): The path to the audio file.
            fast (bool): Whether to use a fast transcription model.
            words (bool): Whether to return the words as a list of Word objects.
            avoid_hallucinations (bool): Ask the engine to reduce hallucinated
                output; the exact strategy is left to the implementation.

        Returns:
            list[Word]: A list of Word objects representing the transcribed words.

        Example:
            ```json
            [
                {
                    "start": "0.00",
                    "end": "0.50",
                    "text": "Hello"
                },
                {
                    "start": "0.50",
                    "end": "1.00",
                    "text": "world"
                }
            ]
            ```
        """
        ...
|
||||
@@ -0,0 +1,73 @@
|
||||
from abc import abstractmethod
|
||||
from typing import TypedDict
|
||||
|
||||
import whisper_timestamped as wt
|
||||
from torch.cuda import is_available
|
||||
|
||||
from . import BaseTranscriptionEngine
|
||||
|
||||
|
||||
class Word(TypedDict):
    """One word of a transcription with its start and end timestamps.

    NOTE(review): the fields are annotated as ``str``, but whisper_timestamped
    documents ``start``/``end`` as float seconds. Confirm which is intended.
    """

    # Time at which the word begins.
    start: str
    # Time at which the word ends.
    end: str
    # The transcribed word itself.
    text: str
|
||||
|
||||
|
||||
class WhisperTranscriptionEngine(BaseTranscriptionEngine):
    """Transcription engine backed by the ``whisper_timestamped`` package."""

    name = "Whisper Transcription Engine"
    description = (
        "A transcription engine that uses the whisper model to transcribe audio files."
    )
    num_options = 0

    def __init__(self, options: list) -> None:
        # ``options`` is not used here: the class declares ``num_options = 0``.
        super().__init__()

    def transcribe(
        self,
        path: str,
        fast: bool = False,
        words: bool = False,
        avoid_hallucinations: bool = False,
    ) -> list[Word] | dict[str, dict[str, str]]:
        """
        Transcribe the audio file at ``path`` with a Whisper model.

        The model is loaded on every call: ``"base"`` when ``fast`` is true,
        ``"large-v3"`` otherwise. It runs on CUDA when available, else on CPU.

        Args:
            path (str): The path to the audio file.
            fast (bool): Whether to use a fast transcription model.
            words (bool): Whether to return the words as a list of Word objects.
            avoid_hallucinations (bool): Forwarded to ``whisper_timestamped``
                as ``vad``, which runs voice activity detection before
                transcription.

        Returns:
            list[Word]: When ``words`` is true, every word of every segment,
                flattened into one list, with the ``"confidence"`` key removed.
            dict: When ``words`` is false, the unmodified result returned by
                ``whisper_timestamped.transcribe``.

        Example (``words=True``):
            ```json
            [
                {
                    "start": "0.00",
                    "end": "0.50",
                    "text": "Hello"
                },
                {
                    "start": "0.50",
                    "end": "1.00",
                    "text": "world"
                }
            ]
            ```

        NOTE(review): whisper_timestamped documents ``start``/``end`` as float
        seconds rather than strings; confirm against the ``Word`` annotations.
        """
        device = "cuda" if is_available() else "cpu"
        audio = wt.load_audio(path)
        model = wt.load_model("base" if fast else "large-v3", device=device)
        transcription = wt.transcribe(
            model=model, audio=audio, vad=avoid_hallucinations
        )

        if not words:
            return transcription

        # Flatten the per-segment word lists into a single list.
        word_entries = [
            word for segment in transcription["segments"] for word in segment["words"]
        ]
        for entry in word_entries:
            # Drop the confidence score so each entry matches ``Word``; ``pop``
            # with a default tolerates entries that carry no confidence key.
            entry.pop("confidence", None)

        return word_entries

    @classmethod
    def get_options(cls):
        """Return this engine's configurable options (none are defined)."""
        return []
|
||||
2
src/engines/TranscriptionEngine/__init__.py
Normal file
2
src/engines/TranscriptionEngine/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .BaseTranscriptionEngine import BaseTranscriptionEngine
|
||||
from .WhisperTranscriptionEngine import WhisperTranscriptionEngine
|
||||
Reference in New Issue
Block a user