♻️ ✨ Refactor assets handling, add new AI images engines, add new long form videos pipeline, remove import of shorts pipeline awaiting upgrade to use new code.

2026-01-02 09:16:19 +00:00 · 2024-04-21 21:57:16 +02:00
parent a2c6823e89
commit e9a5328d1d
38 changed files with 1492 additions and 565 deletions
--- a/src/engines/TranscriptionEngine/BaseTranscriptionEngine.py
+++ b/src/engines/TranscriptionEngine/BaseTranscriptionEngine.py
@@ -0,0 +1,49 @@
+from abc import abstractmethod
+from typing import TypedDict
+
+from ..BaseEngine import BaseEngine
+
+
+class Word(TypedDict):  # one transcribed word plus its timing; element type of transcribe()'s result
+    start: str  # start timestamp, e.g. "0.00" (presumably seconds — TODO confirm)
+    end: str  # end timestamp, e.g. "0.50" (presumably seconds — TODO confirm)
+    text: str  # the transcribed word itself, e.g. "Hello"
+
+
+class BaseTranscriptionEngine(BaseEngine):
+    """Interface that transcription engines implement on top of BaseEngine."""
+
+    @abstractmethod
+    def transcribe(
+        self,
+        path: str,
+        fast: bool = False,
+        words: bool = False,
+        avoid_hallucinations: bool = False,
+    ) -> list[Word]:
+        """Transcribe the audio file at `path` and return its timed words.
+
+        The body here is only `...`; concrete engines are expected to override
+        this method.
+
+        Args:
+            path (str): The path to the audio file.
+            fast (bool): Whether to use a fast transcription model.
+            words (bool): Whether to return the words as a list of Word objects.
+            avoid_hallucinations (bool): Presumably asks the engine to limit
+                hallucinated output — semantics are left to each engine; TODO confirm.
+
+        Returns:
+            list[Word]: A list of Word objects representing the transcribed words.
+            Example:
+            ```json
+            [
+                {"start": "0.00", "end": "0.50", "text": "Hello"},
+                {"start": "0.50", "end": "1.00", "text": "world"}
+            ]
+            ```
+
+        NOTE(review): the declared return type is list[Word] regardless of
+        `words`; what to return when `words` is False is not stated — confirm.
+        """
+        ...
--- a/src/engines/TranscriptionEngine/WhisperTranscriptionEngine.py
+++ b/src/engines/TranscriptionEngine/WhisperTranscriptionEngine.py
@@ -0,0 +1,73 @@
+from abc import abstractmethod
+from typing import TypedDict
+
+import whisper_timestamped as wt
+from torch.cuda import is_available
+
+from . import BaseTranscriptionEngine
+
+
+class Word(TypedDict):  # NOTE(review): same fields as Word in BaseTranscriptionEngine.py — consider importing it
+    start: str  # start timestamp, e.g. "0.00" (presumably seconds — TODO confirm)
+    end: str  # end timestamp, e.g. "0.50" (presumably seconds — TODO confirm)
+    text: str  # the transcribed word itself, e.g. "Hello"
+
+
+class WhisperTranscriptionEngine(BaseTranscriptionEngine):
+    """Transcription engine backed by the whisper_timestamped package."""
+
+    name = "Whisper Transcription Engine"
+    description = (
+        "A transcription engine that uses the whisper model to transcribe audio files."
+    )
+    num_options = 0
+
+    def __init__(self, options: list) -> None:
+        # `options` is accepted but unused by this engine (num_options is 0).
+        super().__init__()
+
+    def transcribe(
+        self,
+        path: str,
+        fast: bool = False,
+        words: bool = False,
+        avoid_hallucinations: bool = False,
+    ) -> list[Word] | dict[str, dict[str, str]]:
+        """Transcribe the audio file at `path` with a Whisper model.
+
+        Args:
+            path (str): The path to the audio file.
+            fast (bool): Use the "base" model instead of "large-v3".
+            words (bool): Return a flat list of words instead of the raw result.
+            avoid_hallucinations (bool): Forwarded to whisper_timestamped as `vad`.
+
+        Returns:
+            When `words` is True, one dict per word without "confidence", e.g.:
+            ```json
+            [
+                {"start": "0.00", "end": "0.50", "text": "Hello"},
+                {"start": "0.50", "end": "1.00", "text": "world"}
+            ]
+            ```
+            Otherwise the result of `wt.transcribe` is returned unchanged.
+
+        NOTE(review): whisper_timestamped appears to report start/end as
+        numbers rather than strings — confirm against the Word annotations.
+        """
+        device = "cuda" if is_available() else "cpu"
+        audio = wt.load_audio(path)
+        model = wt.load_model("base" if fast else "large-v3", device=device)
+        result = wt.transcribe(model=model, audio=audio, vad=avoid_hallucinations)
+        if not words:
+            return result
+
+        word_entries = [w for seg in result["segments"] for w in seg["words"]]
+        for entry in word_entries:
+            # pop() with a default: a word lacking "confidence" must not raise.
+            entry.pop("confidence", None)
+        return word_entries
+
+    @classmethod
+    def get_options(cls):
+        """Return this engine's option list, which is empty."""
+        return []
--- a/src/engines/TranscriptionEngine/__init__.py
+++ b/src/engines/TranscriptionEngine/__init__.py
@@ -0,0 +1,2 @@
+from .BaseTranscriptionEngine import BaseTranscriptionEngine
+from .WhisperTranscriptionEngine import WhisperTranscriptionEngine