♻️ Refactor asset handling, add new AI image engines, add a new long-form video pipeline, and remove the import of the shorts pipeline until it is upgraded to use the new code.

This commit is contained in:
2024-04-21 21:57:16 +02:00
parent a2c6823e89
commit e9a5328d1d
38 changed files with 1492 additions and 565 deletions

View File

@@ -0,0 +1,49 @@
from abc import abstractmethod
from typing import TypedDict
from ..BaseEngine import BaseEngine
class Word(TypedDict):
    """A single transcribed word together with its time span.

    All three fields are declared as strings, for example
    ``{"start": "0.00", "end": "0.50", "text": "Hello"}``.
    """

    # Timestamp at which the word begins (presumably seconds -- TODO confirm).
    start: str
    # Timestamp at which the word ends (presumably seconds -- TODO confirm).
    end: str
    # The transcribed word itself.
    text: str
class BaseTranscriptionEngine(BaseEngine):
    """Interface that every audio transcription engine has to implement."""

    @abstractmethod
    def transcribe(
        self,
        path: str,
        fast: bool = False,
        words: bool = False,
        avoid_hallucinations: bool = False,
    ) -> list[Word] | dict:
        """
        Transcribes the audio file at the given path.

        Args:
            path (str): The path to the audio file.
            fast (bool): Whether to use a fast transcription model.
            words (bool): Whether to return the words as a list of Word objects.
            avoid_hallucinations (bool): Whether the engine should try to
                suppress hallucinated text. How this is achieved is left to
                the implementing engine.

        Returns:
            list[Word] | dict: When ``words`` is True, a list of Word objects
            representing the transcribed words. When ``words`` is False, the
            result is engine-specific and this interface does not fix its
            shape (NOTE(review): confirm the expected shape with callers).

        Example (``words=True``):
            ```json
            [
                {
                    "start": "0.00",
                    "end": "0.50",
                    "text": "Hello"
                },
                {
                    "start": "0.50",
                    "end": "1.00",
                    "text": "world"
                }
            ]
            ```
        """
        ...

View File

@@ -0,0 +1,73 @@
from abc import abstractmethod
from typing import TypedDict
import whisper_timestamped as wt
from torch.cuda import is_available
from . import BaseTranscriptionEngine
class Word(TypedDict):
    """One transcribed word and the time span it covers.

    This duplicates the ``Word`` definition found in the
    ``BaseTranscriptionEngine`` module: three string-typed fields, for example
    ``{"start": "0.50", "end": "1.00", "text": "world"}``.
    """

    # Where the word starts in the audio (presumably seconds -- TODO confirm).
    start: str
    # Where the word ends in the audio (presumably seconds -- TODO confirm).
    end: str
    # Text of the word.
    text: str
class WhisperTranscriptionEngine(BaseTranscriptionEngine):
    """Transcription engine backed by the ``whisper_timestamped`` package."""

    name = "Whisper Transcription Engine"
    description = (
        "A transcription engine that uses the whisper model to transcribe audio files."
    )
    # This engine exposes no user-configurable options (see get_options).
    num_options = 0

    def __init__(self, options: list) -> None:
        """
        Creates the engine.

        Args:
            options (list): Not used by this engine, which declares
                ``num_options = 0``.
        """
        super().__init__()

    def transcribe(
        self,
        path: str,
        fast: bool = False,
        words: bool = False,
        avoid_hallucinations: bool = False,
    ) -> list[Word] | dict:
        """
        Transcribes the audio file at the given path with a Whisper model.

        The model runs on CUDA when available and on the CPU otherwise. It is
        loaded anew on every call.

        Args:
            path (str): The path to the audio file.
            fast (bool): Use the smaller "base" model instead of "large-v3".
            words (bool): Whether to return the words as a flat list of Word
                objects instead of the raw transcription result.
            avoid_hallucinations (bool): Forwarded to ``whisper_timestamped``
                as its ``vad`` argument.

        Returns:
            list[Word] | dict: When ``words`` is True, every word of every
            segment in order, with the "confidence" entry removed. When
            ``words`` is False, the mapping returned by
            ``whisper_timestamped.transcribe`` unchanged; it contains at
            least a "segments" entry whose items each hold a "words" list.

        Example (``words=True``):
            ```json
            [
                {
                    "start": "0.00",
                    "end": "0.50",
                    "text": "Hello"
                },
                {
                    "start": "0.50",
                    "end": "1.00",
                    "text": "world"
                }
            ]
            ```
            NOTE(review): the example shows string timestamps as declared on
            Word; whisper_timestamped appears to report numeric start/end
            values -- confirm which form callers rely on.
        """
        device = "cuda" if is_available() else "cpu"
        model_name = "base" if fast else "large-v3"

        audio = wt.load_audio(path)
        model = wt.load_model(model_name, device=device)
        result = wt.transcribe(model=model, audio=audio, vad=avoid_hallucinations)

        if not words:
            return result

        # Flatten segment -> words into one list, keeping the spoken order.
        flat_words = [
            word for segment in result["segments"] for word in segment["words"]
        ]
        for word in flat_words:
            # Word has no "confidence" field. pop() with a default tolerates
            # entries that carry no confidence score instead of raising.
            word.pop("confidence", None)
        return flat_words

    @classmethod
    def get_options(cls):
        """Returns the configurable options of this engine: an empty list."""
        return []

View File

@@ -0,0 +1,2 @@
from .BaseTranscriptionEngine import BaseTranscriptionEngine
from .WhisperTranscriptionEngine import WhisperTranscriptionEngine