Mirror of https://github.com/Paillat-dev/viralfactory.git, synced 2026-01-01 16:56:20 +00:00
♻️ ✨ Refactor asset handling, add new AI image engines, add a new long-form video pipeline, and remove the import of the shorts pipeline until it is upgraded to the new code.
@@ -2,9 +2,6 @@ import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import gradio
|
||||
import moviepy as mp
|
||||
|
||||
from .. import engines
|
||||
from ..models import Video, SessionLocal
|
||||
|
||||
@@ -25,19 +22,20 @@ class GenerationContext:
|
||||
db.commit()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
powerfulllmengine,
|
||||
simplellmengine,
|
||||
scriptengine,
|
||||
ttsengine,
|
||||
captioningengine,
|
||||
assetsengine,
|
||||
settingsengine,
|
||||
backgroundengine,
|
||||
metadataengine,
|
||||
uploadengine,
|
||||
audiobackgroundengine,
|
||||
progress,
|
||||
self,
|
||||
pipeline,
|
||||
settingsengine,
|
||||
simplellmengine,
|
||||
powerfulllmengine,
|
||||
ttsengine,
|
||||
transcriptionengine,
|
||||
captioningengine,
|
||||
aiimageengine,
|
||||
stockimageengine,
|
||||
backgroundengine,
|
||||
audiobackgroundengine,
|
||||
uploadengine,
|
||||
progress,
|
||||
) -> None:
|
||||
self.captions = []
|
||||
self.dir = None
|
||||
@@ -48,28 +46,34 @@ class GenerationContext:
|
||||
self.duration = None
|
||||
self.progress = progress
|
||||
|
||||
self.powerfulllmengine: engines.LLMEngine.BaseLLMEngine = powerfulllmengine[0]
|
||||
self.powerfulllmengine.ctx = self
|
||||
self.pipeline: engines.Pipelines.BasePipeline = pipeline[0]
|
||||
self.pipeline.ctx = self
|
||||
|
||||
self.simplellmengine: engines.LLMEngine.BaseLLMEngine = simplellmengine[0]
|
||||
self.simplellmengine.ctx = self
|
||||
|
||||
self.scriptengine: engines.ScriptEngine.BaseScriptEngine = scriptengine[0]
|
||||
self.scriptengine.ctx = self
|
||||
self.powerfulllmengine: engines.LLMEngine.BaseLLMEngine = powerfulllmengine[0]
|
||||
self.powerfulllmengine.ctx = self
|
||||
|
||||
self.ttsengine: engines.TTSEngine.BaseTTSEngine = ttsengine[0]
|
||||
self.ttsengine.ctx = self
|
||||
|
||||
self.transcriptionengine: (
|
||||
engines.TranscriptionEngine.BaseTranscriptionEngine
|
||||
) = transcriptionengine[0]
|
||||
|
||||
self.captioningengine: engines.CaptioningEngine.BaseCaptioningEngine = (
|
||||
captioningengine[0]
|
||||
)
|
||||
self.captioningengine.ctx = self
|
||||
|
||||
self.assetsengine: list[engines.AssetsEngine.BaseAssetsEngine] = assetsengine
|
||||
for eng in self.assetsengine:
|
||||
eng.ctx = self
|
||||
self.assetsengineselector = engines.AssetsEngine.AssetsEngineSelector()
|
||||
self.assetsengineselector.ctx = self
|
||||
self.aiimageengine: engines.AIImageEngine.BaseAIImageEngine = aiimageengine[0]
|
||||
self.aiimageengine.ctx = self
|
||||
|
||||
self.stockimageengine: engines.StockImageEngine.BaseStockImageEngine = (
|
||||
stockimageengine[0]
|
||||
)
|
||||
self.stockimageengine.ctx = self
|
||||
|
||||
self.settingsengine: engines.SettingsEngine.SettingsEngine = settingsengine[0]
|
||||
self.settingsengine.ctx = self
|
||||
@@ -79,18 +83,13 @@ class GenerationContext:
|
||||
)
|
||||
self.backgroundengine.ctx = self
|
||||
|
||||
self.metadataengine: engines.MetadataEngine.BaseMetadataEngine = metadataengine[
|
||||
0
|
||||
]
|
||||
self.metadataengine.ctx = self
|
||||
|
||||
self.uploadengine: list[engines.UploadEngine.BaseUploadEngine] = uploadengine
|
||||
for eng in self.uploadengine:
|
||||
eng.ctx = self
|
||||
|
||||
self.audiobackgroundengine: engines.AudioBackgroundEngine.BaseAudioBackgroundEngine = (
|
||||
audiobackgroundengine[0]
|
||||
)
|
||||
self.audiobackgroundengine: (
|
||||
engines.AudioBackgroundEngine.BaseAudioBackgroundEngine
|
||||
) = audiobackgroundengine[0]
|
||||
self.audiobackgroundengine.ctx = self
|
||||
|
||||
# Kinda like in css, we have a z-index of moviepy clips (any). Then the engines append some clips to this,
|
||||
@@ -120,77 +119,4 @@ class GenerationContext:
|
||||
def process(self):
|
||||
# ⚠️ IMPORTANT NOTE: All methods called here are expected to be defined as abstract methods in the base
|
||||
# classes, if not there is an issue with the engine implementation.
|
||||
|
||||
self.progress(0.1, "Loading settings...")
|
||||
self.setup_dir()
|
||||
if not isinstance(self.settingsengine, engines.NoneEngine):
|
||||
self.settingsengine.load()
|
||||
|
||||
self.progress(0.2, "Generating script...")
|
||||
if not isinstance(self.powerfulllmengine, engines.NoneEngine):
|
||||
self.scriptengine.generate()
|
||||
|
||||
self.progress(0.3, "Synthesizing voice...")
|
||||
if not isinstance(self.ttsengine, engines.NoneEngine):
|
||||
self.ttsengine.synthesize(self.script, self.get_file_path("tts.wav"))
|
||||
self.duration: float # for type hinting
|
||||
self.audio.append(mp.AudioFileClip(self.get_file_path("tts.wav")))
|
||||
|
||||
if not isinstance(self.backgroundengine, engines.NoneEngine):
|
||||
self.progress(0.4, "Generating background...")
|
||||
self.backgroundengine.get_background()
|
||||
|
||||
if not isinstance(self.audiobackgroundengine, engines.NoneEngine):
|
||||
self.progress(0.45, "Generating audio background...")
|
||||
self.audiobackgroundengine.get_background()
|
||||
|
||||
self.assetsengine = [
|
||||
engine
|
||||
for engine in self.assetsengine
|
||||
if not isinstance(engine, engines.NoneEngine)
|
||||
]
|
||||
if len(self.assetsengine) > 0:
|
||||
self.progress(0.5, "Generating assets...")
|
||||
self.assetsengineselector.get_assets()
|
||||
|
||||
if not isinstance(self.captioningengine, engines.NoneEngine):
|
||||
self.progress(0.6, "Generating captions...")
|
||||
self.captioningengine.get_captions()
|
||||
|
||||
# we render to a file called final.mp4
|
||||
self.progress(0.7, "Rendering video...")
|
||||
clips = [
|
||||
*self.index_0,
|
||||
*self.index_1,
|
||||
*self.index_2,
|
||||
*self.index_3,
|
||||
*self.index_4,
|
||||
*self.index_5,
|
||||
*self.index_6,
|
||||
*self.index_7,
|
||||
*self.index_8,
|
||||
*self.index_9,
|
||||
]
|
||||
audio = mp.CompositeAudioClip(self.audio)
|
||||
clip = mp.CompositeVideoClip(clips, size=(self.width, self.height)).with_duration(self.duration).with_audio(
|
||||
audio
|
||||
)
|
||||
clip.write_videofile(self.get_file_path("final.mp4"), fps=60, threads=4, codec="h264_nvenc")
|
||||
|
||||
self.progress(0.8, "Generating metadata...")
|
||||
self.metadataengine.get_metadata()
|
||||
|
||||
self.description = self.description + "\n" + self.credits
|
||||
self.progress(0.9, "Uploading video...")
|
||||
for engine in self.uploadengine:
|
||||
try:
|
||||
engine.upload()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
gradio.Warning(f"{engine.name} failed to upload the video.")
|
||||
self.progress(0.99, "Storing in database...")
|
||||
self.store_in_db()
|
||||
self.progress(1, "Done!")
|
||||
|
||||
command = "start" if os.name == 'nt' else "open"
|
||||
os.system(f"{command} {os.path.abspath(self.dir)}")
|
||||
self.pipeline.launch(self)
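# After this refactor the context no longer drives the whole render itself; it
# appears to hand generation over to the selected pipeline's launch()
# (see the new Pipelines package added below).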
|
||||
|
||||
89 src/engines/AIImageEngine/A1111AIImageEngine.py Normal file
@@ -0,0 +1,89 @@
|
||||
import os
|
||||
from typing import Literal, TypedDict, List
|
||||
|
||||
import gradio as gr
|
||||
import moviepy as mp
|
||||
import moviepy.video.fx as vfx
|
||||
import requests
|
||||
import base64
|
||||
|
||||
from . import BaseAIImageEngine
|
||||
|
||||
|
||||
class Spec(TypedDict):
|
||||
prompt: str
|
||||
start: float
|
||||
end: float
|
||||
style: Literal["vivid", "natural"]
|
||||
|
||||
|
||||
class A1111AIImageEngine(BaseAIImageEngine):
|
||||
name = "A1111"
|
||||
description = "Stable Diffusion web UI"
|
||||
|
||||
num_options = 0
|
||||
|
||||
def __init__(self, options: dict):
|
||||
self.base_url = self.retrieve_setting(identifier="a1111_base_url")
|
||||
if not self.base_url:
|
||||
gr.Warning("Please set the base URL for the A1111 API.")
|
||||
return
|
||||
self.base_url = self.base_url["base_url"]
|
||||
|
||||
super().__init__()
|
||||
|
||||
def generate(self, prompt: str, start: float, end: float) -> mp.ImageClip:
|
||||
max_width = self.ctx.width / 3 * 2
|
||||
try:
|
||||
url = self.base_url + "/sdapi/v1/txt2img"
|
||||
payload = {
|
||||
"prompt": prompt,
|
||||
"width": 1024,
|
||||
"height": 1024,
|
||||
}
|
||||
response = requests.post(url, json=payload)
|
||||
response.raise_for_status()
|
||||
|
||||
with open("temp.png", "wb") as f:
|
||||
f.write(base64.b64decode(response.json()["images"][0]))
|
||||
except Exception as e:
|
||||
gr.Warning(f"Failed to get image: {e}")
|
||||
return (
|
||||
mp.ColorClip((self.ctx.width, self.ctx.height), color=(0, 0, 0))
|
||||
.with_duration(end - start)
|
||||
.with_start(start)
|
||||
)
|
||||
img = mp.ImageClip("temp.png")
|
||||
os.remove("temp.png")
|
||||
|
||||
position = ("center", "center")
|
||||
img = (
|
||||
img.with_position(position)
|
||||
.with_duration(end - start)
|
||||
.with_start(start)
|
||||
.with_effects([vfx.Resize(width=max_width)])
|
||||
)
|
||||
return img
|
||||
|
||||
@classmethod
|
||||
def get_options(cls):
|
||||
return []
|
||||
|
||||
@classmethod
|
||||
def get_settings(cls):
|
||||
current_base_url: dict | list[dict] | None = cls.retrieve_setting(
|
||||
identifier="a1111_base_url"
|
||||
)
|
||||
current_base_url = current_base_url["base_url"] if current_base_url else ""
|
||||
base_url_input = gr.Textbox(
|
||||
label="Automatic 1111 Base URL",
|
||||
value=current_base_url,
|
||||
)
|
||||
save = gr.Button("Save")
|
||||
|
||||
def save_base_url(base_url: str):
|
||||
cls.store_setting(identifier="a1111_base_url", data={"base_url": base_url})
|
||||
gr.Info("Base URL saved successfully.")
|
||||
return gr.update(value=base_url)
|
||||
|
||||
save.click(save_base_url, inputs=[base_url_input])
|
||||
18 src/engines/AIImageEngine/BaseAIImageEngine.py Normal file
@@ -0,0 +1,18 @@
import moviepy as mp

from abc import abstractmethod

from ..BaseEngine import BaseEngine


class BaseAIImageEngine(BaseEngine):
    """
    The base class for all AI image engines.
    """

    @abstractmethod
    def generate(self, prompt: str, start: float, end: float) -> mp.ImageClip:
        """
        Generate an image clip from the prompt, shown from `start` to `end`.
        """
        ...
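To illustrate the contract above, a minimal hypothetical subclass is sketched below (not part of this commit); it simply returns the same black ColorClip placeholder that the real engines in this commit fall back to when generation fails.

import moviepy as mp

from . import BaseAIImageEngine


class SolidColorAIImageEngine(BaseAIImageEngine):
    # Illustrative only: a stand-in engine that satisfies BaseAIImageEngine
    # without calling any external image API.
    name = "SolidColor"
    description = "Returns a plain black clip instead of a generated image."

    num_options = 0

    def __init__(self, options: dict):
        super().__init__()

    def generate(self, prompt: str, start: float, end: float) -> mp.ImageClip:
        # Same placeholder the DALL-E and A1111 engines return on failure.
        return (
            mp.ColorClip((self.ctx.width, self.ctx.height), color=(0, 0, 0))
            .with_duration(end - start)
            .with_start(start)
        )

    @classmethod
    def get_options(cls):
        return []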
106 src/engines/AIImageEngine/DallEAIImageEngine.py Normal file
@@ -0,0 +1,106 @@
|
||||
import os
|
||||
from typing import Literal, TypedDict, List
|
||||
|
||||
import gradio as gr
|
||||
import moviepy as mp
|
||||
import moviepy.video.fx as vfx
|
||||
import openai
|
||||
from openai import OpenAI
|
||||
import requests
|
||||
|
||||
from . import BaseAIImageEngine
|
||||
|
||||
|
||||
class Spec(TypedDict):
|
||||
prompt: str
|
||||
start: float
|
||||
end: float
|
||||
style: Literal["vivid", "natural"]
|
||||
|
||||
|
||||
class DallEAIImageEngine(BaseAIImageEngine):
|
||||
name = "DALL-E"
|
||||
description = "A powerful image generation model by OpenAI."
|
||||
|
||||
num_options = 1
|
||||
|
||||
def __init__(self, options: dict):
|
||||
self.aspect_ratio: Literal["portrait", "square", "landscape"] = options[0]
|
||||
api_key = self.retrieve_setting(identifier="openai_api_key")
|
||||
if not api_key:
|
||||
raise ValueError("OpenAI API key is not set.")
|
||||
self.client = OpenAI(api_key=api_key["api_key"])
|
||||
|
||||
super().__init__()
|
||||
|
||||
def generate(self, prompt: str, start: float, end: float) -> mp.ImageClip:
|
||||
max_width = self.ctx.width / 3 * 2
|
||||
size: Literal["1024x1024", "1024x1792", "1792x1024"] = (
|
||||
"1024x1024"
|
||||
if self.aspect_ratio == "square"
|
||||
else "1024x1792" if self.aspect_ratio == "portrait" else "1792x1024"
|
||||
)
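# dall-e-3 only accepts these three sizes, so the chosen aspect ratio is mapped
# to the closest one: square -> 1024x1024, portrait -> 1024x1792,
# landscape -> 1792x1024.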
|
||||
try:
|
||||
response = self.client.images.generate(
|
||||
model="dall-e-3",
|
||||
prompt=prompt,
|
||||
size=size,
|
||||
n=1,
|
||||
style="natural",
|
||||
response_format="url",
|
||||
)
|
||||
except openai.BadRequestError as e:
|
||||
if e.code == "content_policy_violation":
|
||||
gr.Warning("Image generation violated openai policies.")
|
||||
return (
|
||||
mp.ColorClip((self.ctx.width, self.ctx.height), color=(0, 0, 0))
|
||||
.with_duration(end - start)
|
||||
.with_start(start)
|
||||
)
|
||||
|
||||
else:
|
||||
raise
|
||||
img_bytes = requests.get(response.data[0].url)
|
||||
with open("temp.png", "wb") as f:
|
||||
f.write(img_bytes.content)
|
||||
img = mp.ImageClip("temp.png")
|
||||
os.remove("temp.png")
|
||||
|
||||
position = ("center", "center")
|
||||
img = (
|
||||
img.with_position(position)
|
||||
.with_duration(end - start)
|
||||
.with_start(start)
|
||||
.with_effects([vfx.Resize(width=max_width)])
|
||||
)
|
||||
return img
|
||||
|
||||
@classmethod
|
||||
def get_options(cls):
|
||||
return [
|
||||
gr.Radio(
|
||||
["portrait", "square", "landscape"],
|
||||
label="Aspect Ratio",
|
||||
value="square",
|
||||
)
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def get_settings(cls):
|
||||
current_api_key: dict | list[dict] | None = cls.retrieve_setting(
|
||||
identifier="openai_api_key"
|
||||
)
|
||||
current_api_key = current_api_key["api_key"] if current_api_key else ""
|
||||
api_key_input = gr.Textbox(
|
||||
label="OpenAI API Key",
|
||||
type="password",
|
||||
value=current_api_key,
|
||||
)
|
||||
save = gr.Button("Save")
|
||||
|
||||
def save_api_key(api_key: str):
|
||||
cls.store_setting(identifier="openai_api_key", data={"api_key": api_key})
|
||||
gr.Info("API key saved successfully.")
|
||||
return gr.update(value=api_key)
|
||||
|
||||
save.click(save_api_key, inputs=[api_key_input])
|
||||
3 src/engines/AIImageEngine/__init__.py Normal file
@@ -0,0 +1,3 @@
from .BaseAIImageEngine import BaseAIImageEngine
from .DallEAIImageEngine import DallEAIImageEngine
from .A1111AIImageEngine import A1111AIImageEngine
@@ -1,37 +0,0 @@
|
||||
import json
|
||||
|
||||
from ...chore import GenerationContext
|
||||
from ...utils.prompting import get_prompt
|
||||
|
||||
|
||||
class AssetsEngineSelector:
|
||||
def __init__(self):
|
||||
self.ctx: GenerationContext
|
||||
|
||||
def get_assets(self):
|
||||
system_prompt, chat_prompt = get_prompt("assets", by_file_location=__file__)
|
||||
engines_descriptors = ""
|
||||
|
||||
for engine in self.ctx.assetsengine:
|
||||
engines_descriptors += (
|
||||
f"name: '{engine.name}'\n{json.dumps(engine.specification)}\n"
|
||||
)
|
||||
|
||||
system_prompt = system_prompt.replace("{engines}", engines_descriptors)
|
||||
chat_prompt = chat_prompt.replace(
|
||||
"{caption}", json.dumps(self.ctx.timed_script)
|
||||
)
|
||||
|
||||
assets = self.ctx.powerfulllmengine.generate(
|
||||
system_prompt=system_prompt,
|
||||
chat_prompt=chat_prompt,
|
||||
max_tokens=4096,
|
||||
json_mode=True,
|
||||
)["assets"]
|
||||
clips: list = []
|
||||
for engine in self.ctx.assetsengine:
|
||||
assets_opts = [
|
||||
asset["args"] for asset in assets if asset["engine"] == engine.name
|
||||
]
|
||||
clips.extend(engine.generate(assets_opts))
|
||||
self.ctx.index_3.extend(clips)
|
||||
@@ -1,22 +0,0 @@
|
||||
from abc import abstractmethod
|
||||
|
||||
from ..BaseEngine import BaseEngine
|
||||
|
||||
|
||||
class BaseAssetsEngine(BaseEngine):
|
||||
"""
|
||||
The base class for all assets engines.
|
||||
|
||||
Attributes:
|
||||
specification (dict): A dictionary containing the specification of the engine, especially what an object returned by the llm should look like.
|
||||
spec_name (str): A comprehensive name for the specification for purely llm purposes.
|
||||
spec_description (str): A comprehensive description for the specification for purely llm purposes.
|
||||
"""
|
||||
|
||||
specification: dict
|
||||
spec_name: str
|
||||
spec_description: str
|
||||
|
||||
@abstractmethod
|
||||
def generate(self, options: list) -> list:
|
||||
...
|
||||
@@ -1,116 +0,0 @@
|
||||
import os
|
||||
from typing import Literal, TypedDict, List
|
||||
|
||||
import gradio as gr
|
||||
import moviepy as mp
|
||||
import moviepy.video.fx as vfx
|
||||
import openai
|
||||
from openai import OpenAI
|
||||
import requests
|
||||
|
||||
from . import BaseAssetsEngine
|
||||
|
||||
|
||||
class Spec(TypedDict):
|
||||
prompt: str
|
||||
start: float
|
||||
end: float
|
||||
style: Literal["vivid", "natural"]
|
||||
|
||||
|
||||
class DallEAssetsEngine(BaseAssetsEngine):
|
||||
name = "DALL-E"
|
||||
description = "A powerful image generation model by OpenAI."
|
||||
spec_name = "dalle"
|
||||
spec_description = (
|
||||
"Use the dall-e 3 model to generate images from a detailed prompt."
|
||||
)
|
||||
specification = {
|
||||
"prompt": "A detailed prompt to generate the image from. Describe every subtle detail of the image you want to generate. [str]",
|
||||
"start": "The starting time of the video clip. [float]",
|
||||
"end": "The ending time of the video clip. [float]",
|
||||
"style": "The style of the generated images. Must be one of vivid or natural. Vivid causes the model to lean towards generating hyper-real and dramatic images. Natural causes the model to produce more natural, less hyper-real looking images. [str]",
|
||||
}
|
||||
|
||||
num_options = 1
|
||||
|
||||
def __init__(self, options: dict):
|
||||
self.aspect_ratio: Literal["portrait", "square", "landscape"] = options[0]
|
||||
api_key = self.retrieve_setting(identifier="openai_api_key")
|
||||
if not api_key:
|
||||
raise ValueError("OpenAI API key is not set.")
|
||||
self.client = OpenAI(api_key=api_key["api_key"])
|
||||
|
||||
super().__init__()
|
||||
|
||||
def generate(self, options: list[Spec]) -> list[mp.ImageClip]:
|
||||
max_width = self.ctx.width / 3 * 2
|
||||
clips = []
|
||||
for option in options:
|
||||
prompt = option["prompt"]
|
||||
start = option["start"]
|
||||
end = option["end"]
|
||||
style = option["style"]
|
||||
size: Literal["1024x1024", "1024x1792", "1792x1024"] = (
|
||||
"1024x1024"
|
||||
if self.aspect_ratio == "square"
|
||||
else "1024x1792"
|
||||
if self.aspect_ratio == "portrait"
|
||||
else "1792x1024"
|
||||
)
|
||||
try:
|
||||
response = self.client.images.generate(
|
||||
model="dall-e-3",
|
||||
prompt=prompt,
|
||||
size=size,
|
||||
n=1,
|
||||
style=style,
|
||||
response_format="url",
|
||||
)
|
||||
except openai.BadRequestError as e:
|
||||
if e.code == "content_policy_violation":
|
||||
# we skip this prompt
|
||||
continue
|
||||
else:
|
||||
raise
|
||||
img_bytes = requests.get(response.data[0].url)
|
||||
with open("temp.png", "wb") as f:
|
||||
f.write(img_bytes.content)
|
||||
img = mp.ImageClip("temp.png")
|
||||
os.remove("temp.png")
|
||||
|
||||
img: mp.ImageClip = img.with_duration(end - start)
|
||||
img: mp.ImageClip = img.with_start(start)
|
||||
img: mp.ImageClip = img.with_effects([vfx.Resize(width=max_width)])
|
||||
position = img.with_position(("center", "top"))
|
||||
img: mp.ImageClip = img.with_position(position)
|
||||
clips.append(img)
|
||||
return clips
|
||||
|
||||
@classmethod
|
||||
def get_options(cls):
|
||||
return [
|
||||
gr.Radio(
|
||||
["portrait", "square", "landscape"],
|
||||
label="Aspect Ratio",
|
||||
value="square",
|
||||
)
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def get_settings(cls):
|
||||
current_api_key: dict | list[dict] | None = cls.retrieve_setting(identifier="openai_api_key")
|
||||
current_api_key = current_api_key["api_key"] if current_api_key else ""
|
||||
api_key_input = gr.Textbox(
|
||||
label="OpenAI API Key",
|
||||
type="password",
|
||||
value=current_api_key,
|
||||
)
|
||||
save = gr.Button("Save")
|
||||
|
||||
def save_api_key(api_key: str):
|
||||
cls.store_setting(identifier="openai_api_key", data={"api_key": api_key})
|
||||
gr.Info("API key saved successfully.")
|
||||
return gr.update(value=api_key)
|
||||
|
||||
save.click(save_api_key, inputs=[api_key_input])
|
||||
@@ -1,4 +0,0 @@
|
||||
from .AssetsEngineSelector import AssetsEngineSelector
|
||||
from .BaseAssetsEngine import BaseAssetsEngine
|
||||
from .DallEAssetsEngine import DallEAssetsEngine
|
||||
from .GoogleAssetsEngine import GoogleAssetsEngine
|
||||
@@ -1,6 +1,7 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import moviepy as mp
|
||||
import os
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from sqlalchemy.future import select
|
||||
|
||||
from ..chore import GenerationContext
|
||||
@@ -17,8 +18,7 @@ class BaseEngine(ABC):
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def get_options(cls):
|
||||
...
|
||||
def get_options(cls): ...
|
||||
|
||||
def get_video_duration(self, path: str) -> float:
|
||||
return mp.VideoFileClip(path).duration
|
||||
@@ -28,7 +28,9 @@ class BaseEngine(ABC):
|
||||
|
||||
# noinspection PyShadowingBuiltins
|
||||
@classmethod
|
||||
def get_assets(cls, *, type: str = None, by_id: int = None) -> list[File] | File | None:
|
||||
def get_assets(
|
||||
cls, *, type: str = None, by_id: int = None
|
||||
) -> list[File] | File | None:
|
||||
with SessionLocal() as db:
|
||||
if type:
|
||||
# noinspection PyTypeChecker
|
||||
@@ -45,9 +47,7 @@ class BaseEngine(ABC):
|
||||
# noinspection PyTypeChecker
|
||||
return (
|
||||
db.execute(
|
||||
select(File).filter(
|
||||
File.id == by_id, File.provider == cls.name
|
||||
)
|
||||
select(File).filter(File.id == by_id, File.provider == cls.name)
|
||||
)
|
||||
.scalars()
|
||||
.first()
|
||||
@@ -102,7 +102,9 @@ class BaseEngine(ABC):
|
||||
|
||||
# noinspection PyShadowingBuiltins
|
||||
@classmethod
|
||||
def retrieve_setting(cls, *, identifier: str = None, type: str = None) -> dict | list[dict] | None:
|
||||
def retrieve_setting(
|
||||
cls, *, identifier: str = None, type: str = None
|
||||
) -> dict | list[dict] | None:
|
||||
"""
|
||||
Retrieve a setting from the database based on the provided identifier or type.
|
||||
|
||||
@@ -166,5 +168,12 @@ class BaseEngine(ABC):
|
||||
db.commit()
|
||||
|
||||
@classmethod
|
||||
def get_settings(cls):
|
||||
...
|
||||
def get_settings(cls): ...
|
||||
|
||||
@classmethod
|
||||
def get_dir(cls, file):
|
||||
return os.path.dirname(os.path.realpath(file))
|
||||
|
||||
@classmethod
|
||||
def get_file_name(cls, file):
|
||||
return os.path.splitext(os.path.basename(file))[0]
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
from abc import abstractmethod
|
||||
|
||||
from ..BaseEngine import BaseEngine
|
||||
|
||||
|
||||
class BaseMetadataEngine(BaseEngine):
|
||||
def __init__(self, **kwargs) -> None:
|
||||
super().__init__()
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def get_metadata(self) -> None:
|
||||
...
|
||||
@@ -1,30 +0,0 @@
|
||||
from . import BaseMetadataEngine
|
||||
|
||||
from ...utils.prompting import get_prompt
|
||||
|
||||
|
||||
class ShortsMetadataEngine(BaseMetadataEngine):
|
||||
name = "ShortsMetadata"
|
||||
description = "Generate metadata for YouTube Shorts / TikTok format videos"
|
||||
|
||||
num_options = 0
|
||||
|
||||
def __init__(self, **kwargs) -> None:
|
||||
super().__init__(**kwargs)
|
||||
...
|
||||
|
||||
def get_metadata(self):
|
||||
sytsem_prompt, chat_prompt = get_prompt(
|
||||
"ShortsMetadata", by_file_location=__file__
|
||||
)
|
||||
chat_prompt = chat_prompt.replace("{script}", self.ctx.script)
|
||||
|
||||
result = self.ctx.simplellmengine.generate(
|
||||
chat_prompt=chat_prompt, system_prompt=sytsem_prompt, json_mode=True
|
||||
)
|
||||
self.ctx.title = result["title"]
|
||||
self.ctx.description = result["description"]
|
||||
|
||||
@classmethod
|
||||
def get_options(cls):
|
||||
return []
|
||||
@@ -1,2 +0,0 @@
|
||||
from .BaseMetadataEngine import BaseMetadataEngine
|
||||
from .ShortsMetadataEngine import ShortsMetadataEngine
|
||||
12 src/engines/Pipelines/BasePipeline.py Normal file
@@ -0,0 +1,12 @@
from ...chore import GenerationContext
from abc import ABC, abstractmethod
from ..BaseEngine import BaseEngine


class BasePipeline(BaseEngine):
    def __init__(self) -> None:
        super().__init__()

    @abstractmethod
    def launch(self, ctx: GenerationContext) -> None:
        pass
386 src/engines/Pipelines/BestofShortPipeline.py Normal file
@@ -0,0 +1,386 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import cv2
|
||||
import gradio as gr
|
||||
import moviepy as mp
|
||||
import yt_dlp
|
||||
import numpy as np
|
||||
|
||||
from . import BasePipeline
|
||||
from ... import engines
|
||||
from ...chore import GenerationContext
|
||||
from ...utils.prompting import get_prompts
|
||||
|
||||
|
||||
def track_and_center_face(
|
||||
input_video_path, output_video_path, output_resolution, progress=lambda x, y: None
|
||||
):
|
||||
face_cascade = cv2.CascadeClassifier(
|
||||
cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
|
||||
)
|
||||
temp_output_video_path = output_video_path.replace(".mp4", "_temp.mp4")
|
||||
|
||||
cap = cv2.VideoCapture(input_video_path)
|
||||
if not cap.isOpened():
|
||||
raise IOError(f"Could not open video {input_video_path}")
|
||||
|
||||
fps = int(cap.get(cv2.CAP_PROP_FPS))
|
||||
output_fps = min(fps, 30) # Limit to 30 FPS if necessary
|
||||
total_frames_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
if fps > 30:
|
||||
frame_interval = int(np.round(fps / 30))
|
||||
else:
|
||||
frame_interval = 1
|
||||
positions: dict[int, tuple[int, int]] = {}
|
||||
frame_count = 0
|
||||
previous_center = None
|
||||
while True:
|
||||
progress(frame_count, total_frames_count)
|
||||
ret, frame = cap.read()
|
||||
if not ret:
|
||||
break
|
||||
|
||||
# Adjusting frame rate to 30 FPS if necessary
|
||||
if (fps > 30) and (frame_count % frame_interval != 0):
|
||||
frame_count += 1
|
||||
continue
|
||||
frame_count += 1
|
||||
|
||||
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
|
||||
faces = face_cascade.detectMultiScale(gray, 1.1, 4)
|
||||
H, W = frame.shape[:2]
|
||||
|
||||
if len(faces) > 0:
|
||||
x, y, w, h = max(faces, key=lambda rect: rect[2] * rect[3]) # largest face
|
||||
center_x, center_y = (x + w // 2, y + h // 2)
|
||||
|
||||
positions[frame_count] = (center_x, center_y)
|
||||
previous_center = (center_x, center_y)
|
||||
|
||||
# Crop and center based on the face
|
||||
"""
|
||||
startX = max(center_x - output_resolution[0] // 2, 0)
|
||||
startY = max(center_y - output_resolution[1] // 2, 0)
|
||||
startX = min(startX, W - output_resolution[0])
|
||||
startY = min(startY, H - output_resolution[1])
|
||||
|
||||
output_frame = frame[
|
||||
startY : startY + output_resolution[1],
|
||||
startX : startX + output_resolution[0],
|
||||
]
|
||||
output_frame = cv2.resize(output_frame, output_resolution)
|
||||
"""
|
||||
else:
|
||||
if previous_center:
|
||||
positions[frame_count] = previous_center
|
||||
previous_center = positions[frame_count]
|
||||
else:
|
||||
positions[frame_count] = (0, 0)
|
||||
|
||||
del previous_center
|
||||
cap.release()
|
||||
cv2.destroyAllWindows()
|
||||
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
|
||||
out = cv2.VideoWriter(temp_output_video_path, fourcc, output_fps, output_resolution)
|
||||
cap = cv2.VideoCapture(input_video_path)
|
||||
frame_count = 0
|
||||
movement_threshold = 2 * output_resolution[0] / 3
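# Example: for a 1080x1920 portrait output this threshold is 2 * 1080 / 3 = 720 px,
# i.e. the detected face must jump by roughly two thirds of the output width
# before the crop window is allowed to follow it (values here are illustrative).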
|
||||
while True:
|
||||
# If the face moved more than movement_threshold in 2D space, and the jump is confirmed by the neighbouring frames, move the crop to it; otherwise keep the previous center.
|
||||
ret, frame = cap.read()
|
||||
if not ret:
|
||||
break
|
||||
if positions.get(frame_count):
|
||||
if (
|
||||
positions.get(frame_count - 1) # The previous frame exists
|
||||
and np.linalg.norm( # The previous face is far enough from the current face
|
||||
np.array(positions.get(frame_count))
|
||||
- np.array(positions.get(frame_count - 1))
|
||||
)
|
||||
> movement_threshold
|
||||
and positions.get(frame_count + 1) # The next frame exists
|
||||
and np.linalg.norm(  # The face is also far apart between the previous and next frame
|
||||
np.array(positions.get(frame_count - 1))
|
||||
- np.array(positions.get(frame_count + 1))
|
||||
)
|
||||
> movement_threshold
|
||||
):
|
||||
# the big movement is consistent between two frames, so we do move the camera (in this case, don't do anything)
|
||||
pass
|
||||
else:
|
||||
# did not move enough, so we use the previous center to allow for more consistent tracking
|
||||
positions[frame_count] = positions.get(
|
||||
frame_count - 1, positions[frame_count]
|
||||
)
|
||||
else:
|
||||
positions[frame_count] = positions.get(
|
||||
frame_count + 1, positions.get(frame_count - 1)
|
||||
)
|
||||
if positions.get(frame_count):
|
||||
center_x, center_y = positions[frame_count]
|
||||
startX = max(center_x - output_resolution[0] // 2, 0)
|
||||
startY = max(center_y - output_resolution[1] // 2, 0)
|
||||
startX = min(startX, W - output_resolution[0])
|
||||
startY = min(startY, H - output_resolution[1])
|
||||
output_frame = frame[
|
||||
startY : startY + output_resolution[1],
|
||||
startX : startX + output_resolution[0],
|
||||
]
|
||||
output_frame = cv2.resize(output_frame, output_resolution)
|
||||
out.write(output_frame)
|
||||
else:
|
||||
# No face position for this frame: crop a window of size output_resolution from the middle of the frame
|
||||
startX = (W - output_resolution[0]) // 2
|
||||
startY = (H - output_resolution[1]) // 2
|
||||
output_frame = frame[
|
||||
startY : startY + output_resolution[1],
|
||||
startX : startX + output_resolution[0],
|
||||
]
|
||||
output_frame = cv2.resize(output_frame, output_resolution)
|
||||
out.write(output_frame)
|
||||
frame_count += 1
|
||||
cap.release()
|
||||
out.release()
|
||||
cv2.destroyAllWindows()
|
||||
|
||||
# Process audio and finalize the video
|
||||
original_clip = mp.VideoFileClip(input_video_path)
|
||||
processed_video = mp.VideoFileClip(temp_output_video_path)
|
||||
processed_video = processed_video.with_audio(original_clip.audio)
|
||||
processed_video.write_videofile(output_video_path, codec="libx264")
|
||||
|
||||
|
||||
class BestofShortPipeline(BasePipeline):
|
||||
name = "Bestof Short Pipeline"
|
||||
description = (
|
||||
"Creates a short video based on a best-of compilation of a given video."
|
||||
)
|
||||
num_options = 2
|
||||
|
||||
def __init__(self, options: list) -> None:
|
||||
self.n_shorts = options[0]
|
||||
self.url = options[1]
|
||||
super().__init__()
|
||||
|
||||
def launch(self, ctx: GenerationContext) -> None:
|
||||
|
||||
ctx.progress(0.1, "Loading settings...")
|
||||
ctx.setup_dir()
|
||||
if not isinstance(ctx.settingsengine, engines.NoneEngine):
|
||||
ctx.settingsengine.load()
|
||||
prompts = get_prompts("bestof", by_file_location=__file__)
|
||||
|
||||
ctx.progress(0.2, "Downloading video...")
|
||||
video_id = self.url.split("v=")[1]
|
||||
video_id = video_id.split("&")[0]
|
||||
self.url = f"https://www.youtube.com/watch?v={video_id}"
|
||||
input_video_path = f"local/assets/youtube/{video_id}.mp4"
|
||||
if not os.path.exists(input_video_path):
|
||||
os.makedirs("local/assets/youtube", exist_ok=True)
|
||||
ydl_opts = {
|
||||
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
|
||||
"outtmpl": input_video_path,
|
||||
}
|
||||
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
||||
ydl.download([self.url])
|
||||
info = ydl.extract_info(self.url, download=False)
|
||||
title = info["title"]
|
||||
heatmap = info["heatmap"]
|
||||
channel = info["channel"]
|
||||
else:
|
||||
with yt_dlp.YoutubeDL() as ydl:
|
||||
info = ydl.extract_info(self.url, download=False)
|
||||
title = info["title"]
|
||||
heatmap = info["heatmap"]
|
||||
channel = info["channel"]
|
||||
ctx.progress(0.3, "Transcribing video...")
|
||||
input_transcript_path = f"local/assets/youtube/{video_id}_transcript.json"
|
||||
if not os.path.exists(input_transcript_path):
|
||||
result = ctx.transcriptionengine.transcribe(
|
||||
input_video_path, fast=True, words=False, avoid_hallucinations=True
|
||||
)
|
||||
with open(input_transcript_path, "w") as f:
|
||||
json.dump(result, f)
|
||||
else:
|
||||
with open(input_transcript_path, "r") as f:
|
||||
result = json.load(f)
|
||||
timed_script = [
|
||||
{
|
||||
"start": segment["start"],
|
||||
"end": segment["end"],
|
||||
"text": segment["text"].strip(),
|
||||
}
|
||||
for segment in result["segments"]
|
||||
]
|
||||
ctx.progress(0.4, "Finding viral sections...")
|
||||
sections = [
|
||||
{
|
||||
"start": x["start_time"] - 30,
|
||||
"end": x["end_time"] + 30,
|
||||
"value": x["value"],
|
||||
}
|
||||
for x in heatmap
|
||||
if x["value"] > 0.35 and x["start_time"] > 30
|
||||
]
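# The heatmap comes from yt-dlp's extracted info and corresponds to YouTube's
# "most replayed" graph; each kept peak (value > 0.35, starting after the first
# 30 s) is padded by 30 s on both sides to form a rough section.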
|
||||
if len(sections) > self.n_shorts:
|
||||
sections = sections[: self.n_shorts]
|
||||
elif len(sections) < self.n_shorts:
|
||||
gr.Warning(
|
||||
"The number of viral sections found is less than the number of shorts requested. Less shorts will be generated."
|
||||
)
|
||||
|
||||
for i, section in enumerate(sections):
|
||||
if i == 0:
|
||||
continue
|
||||
allocated_progress = 0.5 / len(sections)
|
||||
get_progress = lambda x, t: 0.5 + allocated_progress * (x / t)
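# Maps step x of t within this section onto the global progress bar: e.g. with
# two sections, allocated_progress is 0.25, so step 4 of 9 reports roughly
# 0.5 + 0.25 * 4 / 9 ≈ 0.61.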
|
||||
|
||||
ctx.progress(
|
||||
get_progress(1, 8), f"Preprocessing {i+1} of {len(sections)}..."
|
||||
)
|
||||
rough_start_time = section["start"]
|
||||
rough_end_time = section["end"]
|
||||
audio = mp.AudioFileClip(input_video_path)
|
||||
rough_audio = audio.with_subclip(rough_start_time, rough_end_time)
|
||||
filename = ctx.get_file_path(
|
||||
f"audio_{rough_start_time}_{rough_end_time}.mp3"
|
||||
)
|
||||
rough_audio.write_audiofile(filename)
|
||||
|
||||
ctx.progress(
|
||||
get_progress(2, 9), f"Transcribing {i+1} of {len(sections)}..."
|
||||
)
|
||||
rough_transcript = ctx.transcriptionengine.transcribe(
|
||||
filename, fast=False, words=True
|
||||
)
|
||||
|
||||
ctx.progress(
|
||||
get_progress(3, 9), f"Generating edit {i+1} of {len(sections)}..."
|
||||
)
|
||||
full_edit = ctx.powerfulllmengine.generate(
|
||||
system_prompt=prompts["Full edit"]["system"],
|
||||
chat_prompt=prompts["Full edit"]["chat"].replace(
|
||||
"{transcript}", json.dumps(rough_transcript)
|
||||
),
|
||||
temperature=1,
|
||||
json_mode=True,
|
||||
)
|
||||
video = mp.VideoFileClip(input_video_path)
|
||||
full_edit_start = rough_start_time + full_edit["start"]
|
||||
full_edit_end = rough_start_time + full_edit["end"]
|
||||
clip: mp.VideoClip = video.with_subclip(full_edit_start, full_edit_end)
|
||||
w, h = clip.size
|
||||
resolution: float = w / h
|
||||
canvas_resolution: float = ctx.width / ctx.height
|
||||
if resolution > canvas_resolution:
|
||||
clip = clip.resized(height=ctx.height)
|
||||
else:
|
||||
clip = clip.resized(width=ctx.width)
|
||||
video_filename = ctx.get_file_path(
|
||||
f"intermediary_video_{full_edit_start}_{full_edit_end}.mp4"
|
||||
)
|
||||
clip.write_videofile(video_filename, codec="h264_nvenc")
|
||||
|
||||
ctx.progress(
|
||||
get_progress(4, 9),
|
||||
f"Tracking and centering face {i+1} of {len(sections)}...",
|
||||
)
|
||||
|
||||
def track_progress(step, total):
|
||||
# sub_allocated_progress is the allocated progress divided by 9
|
||||
sub_allocated_progress = allocated_progress / 9
|
||||
current_progress = sub_allocated_progress * step / total
|
||||
ctx.progress(
|
||||
get_progress(4, 9) + current_progress,
|
||||
f"Tracking and centering face {i+1} of {len(sections)}, frame {step} of {total}...",
|
||||
)
|
||||
|
||||
tracked_video_filename = ctx.get_file_path(
|
||||
f"tracked_video_{full_edit_start}_{full_edit_end}.mp4"
|
||||
)
|
||||
track_and_center_face(
|
||||
video_filename,
|
||||
tracked_video_filename,
|
||||
(ctx.width, ctx.height),
|
||||
track_progress,
|
||||
)
|
||||
|
||||
ctx.progress(
|
||||
get_progress(5, 9), f"Transcribing {i+1} of {len(sections)}..."
|
||||
)
|
||||
final_transcript = ctx.transcriptionengine.transcribe(
|
||||
tracked_video_filename, fast=False, words=True
|
||||
)
|
||||
|
||||
ctx.progress(
|
||||
get_progress(6, 9), f"Generating captions {i+1} of {len(sections)}..."
|
||||
)
|
||||
captions = ctx.captioningengine.get_captions(final_transcript)
|
||||
video = mp.VideoFileClip(tracked_video_filename)
|
||||
final = mp.CompositeVideoClip(
|
||||
[video, *captions], size=(ctx.width, ctx.height)
|
||||
)
|
||||
final_filename = ctx.get_file_path(
|
||||
f"final_video_{full_edit_start}_{full_edit_end}.mp4"
|
||||
)
|
||||
|
||||
ctx.progress(
|
||||
get_progress(7, 9), f"Final rendering {i+1} of {len(sections)}..."
|
||||
)
|
||||
final.write_videofile(final_filename, codec="h264_nvenc")
|
||||
|
||||
ctx.progress(
|
||||
get_progress(8, 9),
|
||||
f"Generating description {i+1} of {len(sections)}...",
|
||||
)
|
||||
description = ctx.powerfulllmengine.generate(
|
||||
system_prompt=prompts["Description"]["system"],
|
||||
chat_prompt=prompts["Description"]["chat"]
|
||||
.replace("{transcript}", json.dumps(final_transcript))
|
||||
.replace("{title}", title)
|
||||
.replace("{channel}", channel),
|
||||
temperature=1,
|
||||
json_mode=True,
|
||||
)
|
||||
ctx.credits += f"\nOriginal video by {channel} on Youtube."
|
||||
title = description["title"]
|
||||
description = description["description"]
|
||||
|
||||
ctx.progress(get_progress(9, 9), f"Uploading {i+1} of {len(sections)}...")
|
||||
description = description + "\n" + ctx.credits
|
||||
for engine in ctx.uploadengine:
|
||||
try:
|
||||
engine.upload(
|
||||
title=title,
|
||||
description=description,
|
||||
path=final_filename,
|
||||
)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
gr.Warning(f"{engine.name} failed to upload the video.")
|
||||
|
||||
# ctx.progress(0.99, "Storing in database...")
|
||||
# ctx.store_in_db()
|
||||
ctx.progress(1, "Done!")
|
||||
|
||||
command = "start" if os.name == "nt" else "open"
|
||||
os.system(f"{command} {os.path.abspath(ctx.dir)}")
|
||||
|
||||
@classmethod
|
||||
def get_options(cls):
|
||||
return [
|
||||
gr.Number(
|
||||
minimum=1,
|
||||
maximum=10,
|
||||
label="Number of shorts",
|
||||
value=1,
|
||||
step=1,
|
||||
),
|
||||
gr.Textbox(
|
||||
label="Youtube URL",
|
||||
placeholder="Enter the URL of the video you want to use",
|
||||
value="",
|
||||
max_lines=1,
|
||||
),
|
||||
]
|
||||
143 src/engines/Pipelines/ScriptedShortPipeline.py Normal file
@@ -0,0 +1,143 @@
|
||||
import os
|
||||
|
||||
import gradio as gr
|
||||
import moviepy as mp
|
||||
|
||||
from . import BasePipeline
|
||||
from ... import engines
|
||||
from ...chore import GenerationContext
|
||||
from ...utils.prompting import get_prompt, get_prompts
|
||||
|
||||
|
||||
class ScriptedShortPipeline(BasePipeline):
|
||||
name = "Scripted Short Pipeline"
|
||||
description = "A pipeline that generates a short video based on a script."
|
||||
num_options = 2
|
||||
|
||||
def __init__(self, options: list) -> None:
|
||||
self.script_prompt = self.get_prompts()[options[0]]
|
||||
self.n_sentences = options[1]
|
||||
super().__init__()
|
||||
|
||||
@classmethod
|
||||
def get_prompts(cls):
|
||||
return get_prompts("scripts", by_file_location=__file__)
|
||||
|
||||
def launch(self, ctx: GenerationContext) -> None:
|
||||
|
||||
ctx.progress(0.1, "Loading settings...")
|
||||
ctx.setup_dir()
|
||||
if not isinstance(ctx.settingsengine, engines.NoneEngine):
|
||||
ctx.settingsengine.load()
|
||||
|
||||
ctx.progress(0.2, "Generating script...")
|
||||
system, chat = self.script_prompt["system"], self.script_prompt["chat"]
|
||||
system, chat = system.replace(
|
||||
"{n_sentences}", str(self.n_sentences)
|
||||
), chat.replace("{n_sentences}", str(self.n_sentences))
|
||||
ctx.script = ctx.powerfulllmengine.generate(
|
||||
system_prompt=system,
|
||||
chat_prompt=chat,
|
||||
json_mode=False,
|
||||
temperature=1.3,
|
||||
max_tokens=20 * self.n_sentences,
|
||||
)
|
||||
|
||||
ctx.progress(0.3, "Synthesizing voice...")
|
||||
ctx.duration = ctx.ttsengine.synthesize(
|
||||
ctx.script, ctx.get_file_path("tts.wav")
|
||||
)
|
||||
ctx.audio.append(mp.AudioFileClip(ctx.get_file_path("tts.wav")))
|
||||
ctx.timed_script = ctx.transcriptionengine.transcribe(
|
||||
ctx.get_file_path("tts.wav"), fast=False, words=True
|
||||
)
|
||||
|
||||
if not isinstance(ctx.backgroundengine, engines.NoneEngine):
|
||||
ctx.progress(0.4, "Generating background...")
|
||||
ctx.index_0.append(ctx.backgroundengine.get_background())
|
||||
|
||||
if not isinstance(ctx.audiobackgroundengine, engines.NoneEngine):
|
||||
ctx.progress(0.45, "Generating audio background...")
|
||||
ctx.audio.append(ctx.audiobackgroundengine.get_background())
|
||||
|
||||
ctx.assetsengine = [
|
||||
engine
|
||||
for engine in ctx.assetsengine
|
||||
if not isinstance(engine, engines.NoneEngine)
|
||||
]
|
||||
if len(ctx.assetsengine) > 0:
|
||||
ctx.progress(0.5, "Generating assets...")
|
||||
ctx.index_3.extend(ctx.assetsengineselector.get_assets())
|
||||
|
||||
if not isinstance(ctx.captioningengine, engines.NoneEngine):
|
||||
ctx.progress(0.6, "Generating captions...")
|
||||
ctx.index_7.extend(
|
||||
ctx.captioningengine.get_captions(words=ctx.timed_script)
|
||||
)
|
||||
|
||||
# we render to a file called final.mp4
|
||||
ctx.progress(0.7, "Rendering video...")
|
||||
clips = [
|
||||
*ctx.index_0,
|
||||
*ctx.index_1,
|
||||
*ctx.index_2,
|
||||
*ctx.index_3,
|
||||
*ctx.index_4,
|
||||
*ctx.index_5,
|
||||
*ctx.index_6,
|
||||
*ctx.index_7,
|
||||
*ctx.index_8,
|
||||
*ctx.index_9,
|
||||
]
|
||||
audio = mp.CompositeAudioClip(ctx.audio)
|
||||
clip = (
|
||||
mp.CompositeVideoClip(clips, size=(ctx.width, ctx.height))
|
||||
.with_duration(ctx.duration)
|
||||
.with_audio(audio)
|
||||
)
|
||||
clip.write_videofile(
|
||||
ctx.get_file_path("final.mp4"), fps=60, threads=4, codec="h264_nvenc"
|
||||
)
|
||||
|
||||
system, chat = get_prompt("description", by_file_location=__file__)
|
||||
metadata = ctx.powerfulllmengine.generate(
|
||||
system_prompt=system, chat_prompt=chat, json_mode=True, temperature=1
|
||||
)
|
||||
ctx.title = metadata["title"]
|
||||
ctx.description = metadata["description"]
|
||||
|
||||
ctx.description = ctx.description + "\n" + ctx.credits
|
||||
ctx.progress(0.9, "Uploading video...")
|
||||
for engine in ctx.uploadengine:
|
||||
try:
|
||||
engine.upload(
|
||||
ctx.title, ctx.description, ctx.get_file_path("final.mp4")
|
||||
)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
gr.Warning(f"{engine.name} failed to upload the video.")
|
||||
|
||||
ctx.progress(0.99, "Storing in database...")
|
||||
ctx.store_in_db()
|
||||
ctx.progress(1, "Done!")
|
||||
|
||||
command = "start" if os.name == "nt" else "open"
|
||||
os.system(f"{command} {os.path.abspath(ctx.dir)}")
|
||||
|
||||
@classmethod
|
||||
def get_options(cls):
|
||||
prompts = list(cls.get_prompts().keys())
|
||||
return [
|
||||
gr.Radio(
|
||||
prompts,
|
||||
label="Script",
|
||||
value=prompts[0],
|
||||
),
|
||||
gr.Number(
|
||||
minimum=1,
|
||||
maximum=25,
|
||||
label="Number of sentences",
|
||||
value=5,
|
||||
step=1,
|
||||
),
|
||||
]
|
||||
162 src/engines/Pipelines/ScriptedVideoPipeline.py Normal file
@@ -0,0 +1,162 @@
|
||||
import os
|
||||
|
||||
import gradio as gr
|
||||
import moviepy as mp
|
||||
|
||||
from . import BasePipeline
|
||||
from ... import engines
|
||||
from ...chore import GenerationContext
|
||||
from ...utils.prompting import get_prompt, get_prompts
|
||||
|
||||
|
||||
class ScriptedVideoPipeline(BasePipeline):
|
||||
name = "Scripted Long Form Pipeline"
|
||||
description = (
|
||||
"A pipeline that generates a long form video based on a script instruction."
|
||||
)
|
||||
num_options = 2
|
||||
|
||||
def __init__(self, options: list) -> None:
|
||||
self.user_instructions = options[0]
|
||||
self.assets_instructions = options[1]
|
||||
super().__init__()
|
||||
|
||||
def launch(self, ctx: GenerationContext) -> None:
|
||||
|
||||
ctx.progress(0.1, "Loading settings...")
|
||||
ctx.setup_dir()
|
||||
if not isinstance(ctx.settingsengine, engines.NoneEngine):
|
||||
ctx.settingsengine.load()
|
||||
prompts = get_prompts("long_form", by_file_location=__file__)
|
||||
ctx.progress(0.2, "Generating chapters...")
|
||||
system = prompts["chapters"]["system"]
|
||||
chat = prompts["chapters"]["chat"]
|
||||
chat = chat.replace("{user_instructions}", str(self.user_instructions))
|
||||
chapters: list[dict[str, str]] = ctx.powerfulllmengine.generate(
|
||||
system_prompt=system,
|
||||
chat_prompt=chat,
|
||||
json_mode=True,
|
||||
temperature=1,
|
||||
max_tokens=4096,
|
||||
)["chapters"]
|
||||
ctx.script = ""
|
||||
|
||||
for chapter in chapters:
|
||||
ctx.progress(0.2, f"Generating chapter: {chapter['title']}...")
|
||||
system = prompts["writer"]["system"]
|
||||
chat = prompts["writer"]["chat"]
|
||||
chat = (
|
||||
chat.replace("{user_instructions}", str(self.user_instructions))
|
||||
.replace("{chapter_title}", chapter["title"])
|
||||
.replace("{chapter_instructions}", chapter["explanation"])
|
||||
)
|
||||
ctx.script += ctx.powerfulllmengine.generate(
|
||||
system_prompt=system,
|
||||
chat_prompt=chat,
|
||||
temperature=1,
|
||||
max_tokens=4096,
|
||||
json_mode=True,
|
||||
)["chapter"]
|
||||
ctx.script += "\n"
|
||||
|
||||
ctx.progress(0.3, "Synthesizing voice...")
|
||||
ctx.duration = ctx.ttsengine.synthesize(
|
||||
ctx.script, ctx.get_file_path("tts.wav")
|
||||
)
|
||||
ctx.audio.append(mp.AudioFileClip(ctx.get_file_path("tts.wav")))
|
||||
ctx.progress(0.4, "Transcribing audio...")
|
||||
ctx.timed_script = ctx.transcriptionengine.transcribe(
|
||||
ctx.get_file_path("tts.wav"), fast=False, words=True
|
||||
)
|
||||
|
||||
ctx.progress(0.5, "Generating images...")
|
||||
system = prompts["imager"]["system"]
|
||||
chat = prompts["imager"]["chat"]
|
||||
chat = chat.replace("{user_instructions}", str(self.user_instructions))
|
||||
chat = chat.replace("{assets_instructions}", str(self.assets_instructions))
|
||||
chat = chat.replace("{video_transcript}", str(ctx.timed_script))
|
||||
assets: list[dict[str, str | float]] = ctx.powerfulllmengine.generate(
|
||||
system_prompt=system, chat_prompt=chat, temperature=1, max_tokens=4096,
|
||||
json_mode=True
|
||||
)["assets"]
|
||||
for asset in assets:
|
||||
if asset["type"] == "stock":
|
||||
ctx.index_4.append(
|
||||
ctx.stockimageengine.get(
|
||||
asset["query"], asset["start"], asset["end"]
|
||||
)
|
||||
)
|
||||
elif asset["type"] == "ai":
|
||||
ctx.index_5.append(
|
||||
ctx.aiimageengine.generate(
|
||||
asset["prompt"], asset["start"], asset["end"]
|
||||
)
|
||||
)
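# Stock footage is composited on layer index_4 and AI images on index_5, so
# AI-generated assets always render above stock images in the final
# CompositeVideoClip (layers are stacked in ascending index order).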
|
||||
|
||||
if not isinstance(ctx.audiobackgroundengine, engines.NoneEngine):
|
||||
ctx.progress(0.45, "Generating audio background...")
|
||||
ctx.audio.append(ctx.audiobackgroundengine.get_background())
|
||||
|
||||
ctx.progress(0.7, "Rendering video...")
|
||||
clips = [
|
||||
*ctx.index_0,
|
||||
*ctx.index_1,
|
||||
*ctx.index_2,
|
||||
*ctx.index_3,
|
||||
*ctx.index_4,
|
||||
*ctx.index_5,
|
||||
*ctx.index_6,
|
||||
*ctx.index_7,
|
||||
*ctx.index_8,
|
||||
*ctx.index_9,
|
||||
]
|
||||
audio = mp.CompositeAudioClip(ctx.audio)
|
||||
clip = (
|
||||
mp.CompositeVideoClip(clips, size=(ctx.width, ctx.height))
|
||||
.with_duration(ctx.duration)
|
||||
.with_audio(audio)
|
||||
)
|
||||
clip.write_videofile(
|
||||
ctx.get_file_path("final.mp4"), fps=60, threads=4, codec="h264_nvenc"
|
||||
)
|
||||
system = prompts["description"]["system"]
|
||||
chat = prompts["description"]["chat"]
|
||||
chat = chat.replace("{script}", ctx.script)
|
||||
metadata = ctx.powerfulllmengine.generate(
|
||||
system_prompt=system, chat_prompt=chat, json_mode=True, temperature=1
|
||||
)
|
||||
ctx.title = metadata["title"]
|
||||
ctx.description = metadata["description"]
|
||||
|
||||
ctx.description = ctx.description + "\n" + ctx.credits
|
||||
ctx.progress(0.9, "Uploading video...")
|
||||
for engine in ctx.uploadengine:
|
||||
try:
|
||||
engine.upload(
|
||||
ctx.title, ctx.description, ctx.get_file_path("final.mp4")
|
||||
)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
gr.Warning(f"{engine.name} failed to upload the video.")
|
||||
|
||||
ctx.progress(0.99, "Storing in database...")
|
||||
ctx.store_in_db()
|
||||
ctx.progress(1, "Done!")
|
||||
|
||||
command = "start" if os.name == "nt" else "open"
|
||||
os.system(f"{command} {os.path.abspath(ctx.dir)}")
|
||||
|
||||
@classmethod
|
||||
def get_options(cls):
|
||||
return [
|
||||
gr.Textbox(
|
||||
lines=4,
|
||||
max_lines=6,
|
||||
label="Video instructions",
|
||||
),
|
||||
gr.Textbox(
|
||||
lines=4,
|
||||
max_lines=6,
|
||||
label="Assets only instructions",
|
||||
),
|
||||
]
|
||||
6 src/engines/Pipelines/__init__.py Normal file
@@ -0,0 +1,6 @@
from .BasePipeline import BasePipeline

# from .ScriptedShortPipeline import ScriptedShortPipeline

# from .BestofShortPipeline import BestofShortPipeline
from .ScriptedVideoPipeline import ScriptedVideoPipeline
50 src/engines/Pipelines/prompts/bestof.yaml Normal file
@@ -0,0 +1,50 @@
Full edit:
  system: |-
    You are an expert content writer of a YouTube shorts channel. You specialize in creating bestofs from long form content.
    You will be receiving a section of a transcript of a video from the user in a json format as follows:
    [
      {
        "text": "Hello",
        "start": 0.0,
        "end": 5.0
      },
      {
        "text": "world!",
        "start": 5.0,
        "end": 10.0
      },
      etc.
    ]
    A catch sentence, which will be the start of the edit, pulls the viewer into the video. A good catch sentence immediately promises to tell the watcher a cheat code, states a strong opinion, or is simply very unusual or interesting. The catch sentence should be followed by a logical and relevant discussion of the topic; the sentence should be a catch into that discussion, which means it doesn't need to be the first sentence of that discussion. However, make sure to use a sentence that is followed by a discussion, not one that stands alone, even if the standalone one seems better.
    Your job here is to complete the video editing process. For this, you should return a json object listing the start and end of the final video.
    Your response should be json only and look like this:
    {
      "thought": "Explain what you *will* do to select a good catch sentence, then briefly explain the ongoing discussion(s) and which ones are interesting and surprising. List at most 3 candidate catch sentences, then select one of these three and explain why. Then explain where you ended the video and why.",
      "start": 0.0,  # start of your catch sentence
      "end": 10.0  # end of your full edit, not just the catch sentence
    }
  chat: |-
    Really make sure that your edit is at least 70 seconds long; at the end of each thought, write the current total length of the video. You can cut out fewer things to make the video longer, just make sure it is over 70 s. Here is the catch sentence:
    Transcript:
    {transcript}
Description:
  system: |-
    You will be receiving from the user a YouTube Shorts video transcript. The video is taken from a longer video and split into shorts-like extracts. Your goal is to write a very short title and description for the video.

    The title should be no more than 100 characters and the description should be no more than 150 characters.

    In the title, include one emoji related to the content of the video or to the general theme or mood of the video.
    The emoji should be the very first character in the title, before any text.
    Include NO hashtags in the title.
    Include 3-5 relevant hashtags in the description. Do not repeat yourself and do not use specific hashtags. Use generic hashtags, or else they won't be relevant. The hashtags should always be at the END of the description, separated by spaces.
    Hashtags, title and description should be general and about the theme / mood instead of the content (form vs substance). For example, a video about scientific facts should have as its title, description and hashtags things related to knowledge, learning, science and education, but not to the specific facts of the video.

    Your response should be a json object with the following structure:
    {
      "title": "Your title here",
      "description": "Your description here"
    }
  chat: |-
    Original Title: {title}
    Original Channel: {channel}
    {transcript}
201 src/engines/Pipelines/prompts/long_form.yaml Normal file
@@ -0,0 +1,201 @@
|
||||
chapters:
|
||||
system: |-
|
||||
You are an expert content writer for a YouTube channel. You specialize in creating high-quality long-form landscape videos.
|
||||
Here are some requirements for a good youtube video:
|
||||
|
||||
1. **Engaging Storytelling**:
|
||||
- Utilize a clear and compelling narrative structure that guides the viewer through the content in a logical and intriguing manner.
|
||||
- Employ personal anecdotes or relatable stories to humanize the content and enhance viewer connection.
|
||||
|
||||
2. **Detailed and Accurate Information**:
|
||||
- Provide specific, well-researched details to establish credibility and demonstrate expertise.
|
||||
- Include technical insights when appropriate, ensuring they are explained in a manner accessible to the intended audience.
|
||||
|
||||
3. **Building Intrigue and Controversy**:
|
||||
- Introduce elements of controversy or debate to spark interest and encourage viewers to think critically about the topic.
|
||||
- Explore different perspectives or challenge conventional wisdom to keep the content dynamic and thought-provoking.
|
||||
|
||||
4. **Visual and Technical Design**:
|
||||
- Incorporate high-quality visuals and animations to help illustrate complex concepts and maintain viewer engagement.
|
||||
- Ensure the video is well-edited, with a clean and professional look that reinforces the video’s themes and messages.
|
||||
|
||||
5. **Suspense and Climactic Buildup**:
|
||||
- Structure the content to build suspense and lead to significant revelations or conclusions that reward the viewer’s attention.
|
||||
- Use pacing effectively, balancing the delivery of information to maintain interest without overwhelming the audience.
|
||||
|
||||
6. **Interactive Elements**:
|
||||
- Include questions, prompts, or direct calls to action that encourage viewer interaction, either within the video itself or in the comments.
|
||||
- Consider integrating quizzes or summary points that reinforce learning and ensure understanding.
|
||||
|
||||
7. **Accessibility Features**:
|
||||
- Use clear and concise language, avoiding unnecessary jargon unless thoroughly explained.
|
||||
|
||||
8. **Cultural and Ethical Sensitivity**:
|
||||
- Be mindful of cultural references and ensure content is respectful and considerate of diverse viewer backgrounds.
|
||||
- Address sensitive topics with care, ensuring to provide balanced views without bias.
|
||||
|
||||
The user will send you a subject, explaining a video idea in more or less detail. They may or may not give you some creative freedom. You should then write a table of contents for the video, following the guidelines above.
|
||||
The format of the table of contents should be in json, according to this schema:
|
||||
{
|
||||
"analysis": "Explain the structure of the video, the narrative arc, the key points, the climax, the resolution, etc. Discuss what you do to find a good balance between information and entertainment. Also describe what you choose to build your climax upon and in which order you will reveal each information / argument to build it up. Also discuss how you will make the video accessible to a wide audience, by not being too technical but also not too cringe with childish explanations, and by making the viewer attracted and interested in the topic.",
|
||||
"chapters": [
|
||||
{
|
||||
"explanation": "Explain the content of the heading, what it should talk about with specific details, almost as if you were giving advice to a writer who would write this chapter. Make sure to add any relevant information from your analysis here.",
|
||||
"title": "Title of the heading",
|
||||
},
|
||||
etc...
|
||||
]
|
||||
}
|
||||
You should have at least 5 chapters, or more if specified by the user. Make sure not to include separate introduction and conclusion chapters, as they would be too much. Instead, in the first chapter's explanation, note that the video will start with a short introduction, and in the last chapter's explanation, note that the video will end with a short conclusion.
|
||||
chat: |-
|
||||
{user_instructions}
|
||||
|
||||
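For illustration only, here is the shape of response the table-of-contents prompt above asks for. The subject, analysis and chapter titles are invented for this example and are not part of the repository:

```json
{
  "analysis": "The video builds from a familiar everyday observation to a surprising conclusion, alternating hard facts with relatable examples so it stays accessible without becoming childish. The climax is held back until the fourth chapter, with each earlier chapter revealing one supporting argument.",
  "chapters": [
    {"explanation": "Open with a short introduction, then hook the viewer with the everyday observation the whole video hangs on.", "title": "The question nobody asks"},
    {"explanation": "Lay out the first well-researched fact that complicates the obvious answer.", "title": "What the textbooks say"},
    {"explanation": "Introduce the controversy: two camps of experts disagree; present both sides fairly.", "title": "The argument that split the field"},
    {"explanation": "Deliver the climax: the revelation the previous chapters were building toward.", "title": "The twist"},
    {"explanation": "Resolve the tension, address counterarguments, and end with a short conclusion and a question to the audience.", "title": "What it means for you"}
  ]
}
```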
writer:
system: |-
You are an expert content writer for a YouTube channel. You specialize in creating high quality long form landscape videos.
Here are some requirements for a good YouTube video:

1. **Engaging Storytelling**:
- Utilize a clear and compelling narrative structure that guides the viewer through the content in a logical and intriguing manner.
- Employ personal anecdotes or relatable stories to humanize the content and enhance viewer connection.

2. **Detailed and Accurate Information**:
- Provide specific, well-researched details to establish credibility and demonstrate expertise.
- Include technical insights when appropriate, ensuring they are explained in a manner accessible to the intended audience.

3. **Building Intrigue and Controversy**:
- Introduce elements of controversy or debate to spark interest and encourage viewers to think critically about the topic.
- Explore different perspectives or challenge conventional wisdom to keep the content dynamic and thought-provoking.

4. **Visual and Technical Design**:
- Incorporate high-quality visuals and animations to help illustrate complex concepts and maintain viewer engagement.
- Ensure the video is well-edited, with a clean and professional look that reinforces the video’s themes and messages.

5. **Suspense and Climactic Buildup**:
- Structure the content to build suspense and lead to significant revelations or conclusions that reward the viewer’s attention.
- Use pacing effectively, balancing the delivery of information to maintain interest without overwhelming the audience.

6. **Interactive Elements**:
- Include questions, prompts, or direct calls to action that encourage viewer interaction, either within the video itself or in the comments.
- Consider integrating quizzes or summary points that reinforce learning and ensure understanding.

7. **Accessibility Features**:
- Use clear and concise language, avoiding unnecessary jargon unless thoroughly explained.

8. **Cultural and Ethical Sensitivity**:
- Be mindful of cultural references and ensure content is respectful and considerate of diverse viewer backgrounds.
- Address sensitive topics with care, providing balanced views without bias.

The user will send you a subject, explaining a video idea in more or less detail. You will also receive a chapter title and explanation. They may or may not give you some creative freedom. You should then write a script for the video, following the guidelines above.
You will not write the script for the entire video, only for the one chapter you received information about.
This means you will have both the user's instructions for the entire video and the instructions for the chapter you will write. Follow the latter more strictly; the former is mostly there for context.
The script should be in JSON, following this schema:
{
"analysis": "Explain the structure of the chapter. Discuss what you do to find a good balance between information and entertainment. Also discuss how you will make the video accessible to a wide audience, by not being too technical but also not too cringe with childish explanations, and by making the viewer attracted and interested in the topic.",
"chapter": "This is a string with the actual chapter of the video. It should NOT have any heading titles. It should be reasonably long, not short at all; the goal is to make long, almost documentary-like videos, so this chapter ONLY should last at least 2 to 3 minutes, really long! The script should be written in a way that is easy to read and understand, with clear and concise sentences. It should be engaging and informative, and should follow the guidelines above, always. Do not use any markdown, it might break the text to speech. Don't add any title at the start of the chapter."
}
Do not escape any characters in the JSON itself. Do not use " in the text, use ' instead. Use " only for the JSON structure, and don't escape it.
chat: |-
User instructions: {user_instructions}
Chapter title: {chapter_title}
Chapter details: {chapter_instructions}
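For illustration only, a heavily abbreviated example of the writer output; a real response should run several paragraphs, as the prompt above requires. The subject and wording are invented:

```json
{
  "analysis": "This chapter opens on the disagreement teased in the previous one, then walks through the strongest evidence on each side before handing the tension to the next chapter. Technical terms are introduced once, in plain words, and immediately tied to a concrete example.",
  "chapter": "In 1911, two laboratories announced the same discovery within weeks of each other, and that coincidence is where our story really begins. One team had spent a decade building its case; the other stumbled onto the result almost by accident. ..."
}
```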
imager:
system: |-
You are an expert content writer for a YouTube channel. You specialize in creating high quality long form landscape videos.
Here are some requirements for a good YouTube video:

1. **Engaging Storytelling**:
- Utilize a clear and compelling narrative structure that guides the viewer through the content in a logical and intriguing manner.
- Employ personal anecdotes or relatable stories to humanize the content and enhance viewer connection.

2. **Detailed and Accurate Information**:
- Provide specific, well-researched details to establish credibility and demonstrate expertise.
- Include technical insights when appropriate, ensuring they are explained in a manner accessible to the intended audience.

3. **Building Intrigue and Controversy**:
- Introduce elements of controversy or debate to spark interest and encourage viewers to think critically about the topic.
- Explore different perspectives or challenge conventional wisdom to keep the content dynamic and thought-provoking.

4. **Visual and Technical Design**:
- Incorporate high-quality visuals and animations to help illustrate complex concepts and maintain viewer engagement.
- Ensure the video is well-edited, with a clean and professional look that reinforces the video’s themes and messages.

5. **Suspense and Climactic Buildup**:
- Structure the content to build suspense and lead to significant revelations or conclusions that reward the viewer’s attention.
- Use pacing effectively, balancing the delivery of information to maintain interest without overwhelming the audience.

6. **Interactive Elements**:
- Include questions, prompts, or direct calls to action that encourage viewer interaction, either within the video itself or in the comments.
- Consider integrating quizzes or summary points that reinforce learning and ensure understanding.

7. **Accessibility Features**:
- Use clear and concise language, avoiding unnecessary jargon unless thoroughly explained.

8. **Cultural and Ethical Sensitivity**:
- Be mindful of cultural references and ensure content is respectful and considerate of diverse viewer backgrounds.
- Address sensitive topics with care, providing balanced views without bias.

You will receive three things: user instructions, assets guidelines and a video transcript, which will follow this format:
[
{
"start": float,
"end": float,
"text": "The sentence being said at this moment in the video."
},
etc...
]
Your goal is to create assets for the video, following the guidelines above. For this, you will be able to choose between AI generated assets and stock assets.
Here is when to USE AI generated assets:
- Illustrative purposes
- To illustrate an action, something happening, rather than a specific piece of information
- To show abstract concepts
- Complex and specific actions (a person doing a backflip from a plane, two rhinos fighting in a city, etc.)
Here is when to USE stock assets:
- To illustrate a specific piece of information
- To show specific people, places or things (AI is not good at this)
- Simple actions (eating, walking, etc.)
Here is what your output should look like:
{
"assets": [
{ // if using AI
"start": float,
"end": float,
"type": "ai",
"prompt": "A long and detailed description of the scene. Do not reference specific persons, places or things, as the AI might not be able to generate them. Instead, describe the action, the mood, the colors, detailed clothing, etc."
},
{ // if using stock
"start": float,
"end": float,
"type": "stock",
"query": "a one to two word MAXIMUM query for the stock asset"
},
etc...
]
}
You should make sure the entire video is covered by assets, and that the assets are relevant to the content of the video. You should also make sure that the assets are of high quality and that they are engaging and interesting to the viewer. Assets cannot overlap, should have at least 200ms of difference between them, and should appear for at least 2.5 seconds.
Your response should be a JSON object with the structure above, without any other comments or code blocks. Just the JSON object.
chat: |-
User instructions: {user_instructions}
Assets guidelines: {assets_guidelines}
Video transcript: {video_transcript}
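For illustration only, a short example of the assets JSON this prompt asks for, respecting the no-overlap, 200 ms gap and 2.5 s minimum duration rules. The timings, prompt and query are invented:

```json
{
  "assets": [
    {
      "start": 0.0,
      "end": 4.2,
      "type": "ai",
      "prompt": "A wide shot of a stormy coastline at dusk, waves crashing against dark rocks, cold blue and grey tones, a lone figure in a yellow raincoat watching from the cliff edge."
    },
    {
      "start": 4.5,
      "end": 8.0,
      "type": "stock",
      "query": "lighthouse"
    }
  ]
}
```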
description:
system: |-
You will receive a YouTube video script from the user.
Your goal is to write a very short title and description for the video.

The title should be no more than 100 characters and the description should be no more than 400 characters.

Include NO hashtags in the title.
Include 3-5 relevant hashtags in the description. Do not repeat yourself and do not use overly specific hashtags. Use generic hashtags, or else they won't be relevant. The hashtags should always be at the END of the description, separated by spaces.
The description should be a general statement of what the video *will* talk about, without revealing anything. The description can be similar to the first 1-2 sentences of the video (in general, but use common sense). The same goes for the title.
The title should be catchy and intriguing, making the viewer think 'wait, what? How?'.
A good example title would be "How this guy scammed McDonald's for 3 years" or "This painting might not be by who you think" (referring to the thumbnail).

Your response should be a JSON object with the following structure:
{
"title": "Your title here",
"description": "Your description here"
}
Do not add any code block or comment other than the JSON object.
chat: |-
{script}
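For illustration only, one possible response to the description prompt; the title, description and hashtags are invented:

```json
{
  "title": "The painting experts argued about for 80 years",
  "description": "A famous painting, two rival attributions, and the small detail that finally settled the debate. #art #history #mystery"
}
```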
52 src/engines/Pipelines/prompts/scripts.yaml Normal file
@@ -0,0 +1,52 @@
Shower thoughts:
system: |-
You are an expert content writer of a YouTube shorts channel. You specialize in shower thought shorts.
Your shorts are {n_sentences} sentences long. This is VERY IMPORTANT, MAKE SURE TO RESPECT THIS LENGTH EVEN IF THE EXAMPLE SEEMS LONGER.
They are extremely captivating and original.
An example of a shower thought short is:
---
Isn't it funny how your character doesn't blink in first person video games? Shower Thoughts. You aren't paid according to how hard you work, you are paid according to how hard you are to replace. The 15 minutes of extra sleep after you wake up feels like a lot more than the 8 hours of sleep that you just had. Tell me.
---
You need to follow these guidelines:

- **Hook the Viewer:** Start with a compelling question, fact, or scenario to grab attention immediately.
- **Keep it Short and Sweet:** Deliver your content concisely and rapidly to match the platform's fast-paced nature.
- **Tap into Relatability or Curiosity:** Make your content relatable or introduce surprising elements to spark curiosity.
- **Maintain a Conversational Tone:** Use conversational language to make your content more accessible and engaging.
- **Incorporate Humor:** Leverage humor where appropriate to make your content more entertaining and shareable.
- **Use Visual Imagery:** Describe concepts in a way that invokes visual imagery, enhancing engagement.
- **Include a Call to Action:** End with a direct call to action to encourage viewer interaction.
- **Multiple subjects:** Every subject or idea deserves only one sentence, nothing more, so you also need to make sure that each sentence intrigues the viewer but at the same time doesn't leave them time to dwell on it before the next thought arrives.
You are now tasked with producing the greatest short script for the user.
Like in the example, start with a compelling question, fact, or scenario to grab attention IMMEDIATELY.
Keep it short, extremely interesting and original.
If appropriate, ask the viewer a question at the end, and end point blank.
You never respond with anything other than the video script, not even a hello.
chat: |-
Please give me a script. Make sure to keep it {n_sentences} sentences long, including any questions or calls to action.
Life hacks:
system: |-
You are an expert content writer of a YouTube shorts channel. You specialize in life hack shorts.
Your shorts are {n_sentences} sentences long. This is VERY IMPORTANT, MAKE SURE TO RESPECT THIS LENGTH EVEN IF THE EXAMPLE SEEMS LONGER.
They are extremely captivating and original.
An example of a life hack short is:
---
If you're ever in a situation where you need to remember something, but you don't have a pen and paper, try chewing gum. Studies have shown that chewing gum can help improve memory. Life Hacks.
---
You need to follow these guidelines:

- **Hook the Viewer:** Start with a compelling question, fact, or scenario to grab attention immediately.
- **Keep it Short and Sweet:** Deliver your content concisely and rapidly to match the platform's fast-paced nature.
- **Tap into Relatability or Curiosity:** Make your content relatable or introduce surprising elements to spark curiosity.
- **Maintain a Conversational Tone:** Use conversational language to make your content more accessible and engaging.
- **Incorporate Humor:** Leverage humor where appropriate to make your content more entertaining and shareable.
- **Use Visual Imagery:** Describe concepts in a way that invokes visual imagery, enhancing engagement.
- **Include a Call to Action:** End with a direct call to action to encourage viewer interaction.
- **Multiple subjects:** Every subject or idea deserves only one sentence, nothing more, so you also need to make sure that each sentence intrigues the viewer but at the same time doesn't leave them time to dwell on it before the next thought arrives.
You are now tasked with producing the greatest short script for the user.
Like in the example, start with a compelling question, fact, or scenario to grab attention IMMEDIATELY.
Keep it short, extremely interesting and original.
If appropriate, ask the viewer a question at the end, and end point blank.
You never respond with anything other than the video script, not even a hello.
chat: |-
Please give me a script. Make sure to keep it {n_sentences} sentences long, including any questions or calls to action.
@@ -1,14 +0,0 @@
from abc import abstractmethod

from ..BaseEngine import BaseEngine


class BaseScriptEngine(BaseEngine):
    pass

    @abstractmethod
    def generate(self) -> None:
        pass

    def time_script(self):
        ...
@@ -1,26 +0,0 @@
import gradio as gr

from .BaseScriptEngine import BaseScriptEngine


class CustomScriptEngine(BaseScriptEngine):
    name = "Custom Script Engine"
    description = "Generate a script with a custom provided prompt"
    num_options = 1

    def __init__(self, options: list[list | tuple | str | int | float | bool | None]):
        self.script = options[0]
        super().__init__()

    def generate(self, *args, **kwargs):
        self.ctx.script = self.script.strip()

    @classmethod
    def get_options(cls) -> list:
        return [
            gr.Textbox(
                label="Script",
                placeholder="Enter your script here",
                value="",
            )
        ]
@@ -1,37 +0,0 @@
import os

import gradio as gr

from .BaseScriptEngine import BaseScriptEngine
from ...utils.prompting import get_prompt


class ScientificFactsScriptEngine(BaseScriptEngine):
    name = "Scientific facts"
    description = "Generate a scientific facts script."
    num_options = 1

    def __init__(self, options: list[list | tuple | str | int | float | bool | None]):
        self.n_sentences = options[0]
        super().__init__()

    def generate(self):
        sys_prompt, chat_prompt = get_prompt(
            "scientific_facts",
            location=os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "prompts"
            ),
        )
        sys_prompt = sys_prompt.format(n_sentences=self.n_sentences)
        chat_prompt = chat_prompt.format(n_sentences=self.n_sentences)
        self.ctx.script = self.ctx.powerfulllmengine.generate(
            system_prompt=sys_prompt,
            chat_prompt=chat_prompt,
            max_tokens=20 * self.n_sentences,
            temperature=1.3,
            json_mode=False,
        ).strip()

    @classmethod
    def get_options(cls) -> list:
        return [gr.Number(label="Number of sentences", value=5, minimum=1)]
@@ -1,37 +0,0 @@
import os

import gradio as gr

from .BaseScriptEngine import BaseScriptEngine
from ...utils.prompting import get_prompt


class ShowerThoughtsScriptEngine(BaseScriptEngine):
    name = "Shower Thoughts"
    description = "Generate a Shower Thoughts script"
    num_options = 1

    def __init__(self, options: list[list | tuple | str | int | float | bool | None]):
        self.n_sentences = options[0]
        super().__init__()

    def generate(self):
        sys_prompt, chat_prompt = get_prompt(
            "shower_thoughts",
            location=os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "prompts"
            ),
        )
        sys_prompt = sys_prompt.format(n_sentences=self.n_sentences)
        chat_prompt = chat_prompt.format(n_sentences=self.n_sentences)
        self.ctx.script = self.ctx.powerfulllmengine.generate(
            system_prompt=sys_prompt,
            chat_prompt=chat_prompt,
            max_tokens=20 * self.n_sentences,
            temperature=1.3,
            json_mode=False,
        ).strip()

    @classmethod
    def get_options(cls) -> list:
        return [gr.Number(label="Number of sentences", value=5, minimum=1)]
@@ -1,4 +0,0 @@
from .BaseScriptEngine import BaseScriptEngine
from .CustomScriptEngine import CustomScriptEngine
from .ShowerThoughtsScriptEngine import ShowerThoughtsScriptEngine
from .ScientificFactsScriptEngine import ScientificFactsScriptEngine
@@ -1,19 +0,0 @@
system: |-
You are an expert content writer of a YouTube shorts channel. You specialize in scientific facts shorts.
Your shorts are {n_sentences} sentences long. This is VERY IMPORTANT, MAKE SURE TO RESPECT THIS LENGTH.
They are extremely captivating and original.
You need to follow these guidelines:
- **Hook the Viewer:** Start with a compelling question, fact, or scenario to grab attention immediately. Your fact can also be a bit weird or shocking (not really shocking, but you get the point), so that the viewer wants to know the actual truth.
Specifically, you can start with something that isn't completely correct, then let the actual explanation unfold as you continue, in order to make the first few words more attractive.
- **Keep it Short and Sweet:** Deliver your content concisely and rapidly to match the platform's fast-paced nature.
- **Tap into Relatability or Curiosity:** Make your content relatable or introduce surprising elements to spark curiosity.
- **Maintain a Conversational Tone:** Use conversational language to make your content more accessible and engaging.
- **Use Visual Imagery:** Describe concepts in a way that invokes visual imagery, enhancing engagement.
- **Include a Call to Action:** End with a direct call to action to encourage viewer interaction if applicable.
You are now tasked with producing the greatest short script for the user.
Start with a compelling piece of information, fact, or scenario to grab attention IMMEDIATELY.
Keep it short, EXTREMELY interesting and original.
If appropriate, ask the viewer a question at the end, and end point blank.
You never respond with anything other than the video script, not even a hello.
chat: |
Please give me a script. Make sure to keep it {n_sentences} sentences long, including any questions or calls to action.
@@ -1,25 +0,0 @@
system: |-
You are an expert content writer of a YouTube shorts channel. You specialize in shower thought shorts.
Your shorts are {n_sentences} sentences long. This is VERY IMPORTANT, MAKE SURE TO RESPECT THIS LENGTH EVEN IF THE EXAMPLE SEEMS LONGER.
They are extremely captivating and original.
An example of a shower thought short is:
---
Isn't it funny how your character doesn't blink in first person video games? Shower Thoughts. You aren't paid according to how hard you work, you are paid according to how hard you are to replace. The 15 minutes of extra sleep after you wake up feels like a lot more than the 8 hours of sleep that you just had. Tell me.
---
You need to follow these guidelines:

- **Hook the Viewer:** Start with a compelling question, fact, or scenario to grab attention immediately.
- **Keep it Short and Sweet:** Deliver your content concisely and rapidly to match the platform's fast-paced nature.
- **Tap into Relatability or Curiosity:** Make your content relatable or introduce surprising elements to spark curiosity.
- **Maintain a Conversational Tone:** Use conversational language to make your content more accessible and engaging.
- **Incorporate Humor:** Leverage humor where appropriate to make your content more entertaining and shareable.
- **Use Visual Imagery:** Describe concepts in a way that invokes visual imagery, enhancing engagement.
- **Include a Call to Action:** End with a direct call to action to encourage viewer interaction.
- **Multiple subjects:** Every subject or idea deserves only one sentence, nothing more, so you also need to make sure that each sentence intrigues the viewer but at the same time doesn't leave them time to dwell on it before the next thought arrives.
You are now tasked with producing the greatest short script for the user.
Like in the example, start with a compelling question, fact, or scenario to grab attention IMMEDIATELY.
Keep it short, extremely interesting and original.
If appropriate, ask the viewer a question at the end, and end point blank.
You never respond with anything other than the video script, not even a hello.
chat: |
Please give me a script. Make sure to keep it {n_sentences} sentences long, including any questions or calls to action.
26 src/engines/StockImageEngine/BaseStockImageEngine.py Normal file
@@ -0,0 +1,26 @@
import moviepy as mp

from abc import abstractmethod

from ..BaseEngine import BaseEngine


class BaseStockImageEngine(BaseEngine):
    """
    The base class for all Stock Image engines.
    """

    @abstractmethod
    def get(self, query: str, start: float, end: float) -> mp.ImageClip:
        """
        Get a stock image based on a query.

        Args:
            query (str): The query to search for.
            start (float): The starting time of the video clip.
            end (float): The ending time of the video clip.

        Returns:
            mp.ImageClip: The image clip, timed to the given start and end.
        """
        ...
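To make the contract above concrete, here is a minimal sketch of what a subclass could look like. It is not part of the repository: the LocalFolderStockImageEngine name and its stock_images folder are invented, and it skips the resizing and positioning that the real Google engine below performs.

```python
import os
import random

import moviepy as mp

from .BaseStockImageEngine import BaseStockImageEngine


class LocalFolderStockImageEngine(BaseStockImageEngine):
    """Hypothetical engine that picks a random image from a local folder."""

    name = "Local folder (example)"
    description = "Pick a random image from a local folder instead of a web API."
    num_options = 0

    def __init__(self, options: list) -> None:
        # Assumption: images live in ./stock_images next to the working directory.
        self.folder = "stock_images"
        super().__init__()

    def get(self, query: str, start: float, end: float) -> mp.ImageClip:
        # The query is ignored here; a real engine would use it to search.
        files = [
            f
            for f in os.listdir(self.folder)
            if f.lower().endswith((".png", ".jpg", ".jpeg"))
        ]
        path = os.path.join(self.folder, random.choice(files))
        # Time the clip exactly like the other stock engines do.
        return mp.ImageClip(path).with_duration(end - start).with_start(start)

    @classmethod
    def get_options(cls):
        return []
```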
@@ -7,7 +7,7 @@ import moviepy as mp
import moviepy.video.fx as vfx

from google_images_search import GoogleImagesSearch
from . import BaseAssetsEngine
from .BaseStockImageEngine import BaseStockImageEngine


class Spec(TypedDict):
@@ -16,18 +16,9 @@ class Spec(TypedDict):
    end: float


class GoogleAssetsEngine(BaseAssetsEngine):
class GoogleStockImageEngine(BaseStockImageEngine):
    name = "Google"
    description = "Search for images using the Google Images API."
    spec_name = "google"
    spec_description = (
        "Use the Google Images API to search for images based on a query."
    )
    specification = {
        "query": "A short and concise query to search for images. Do not include any details, just a simple query. [str]",
        "start": "The starting time of the video clip. [float]",
        "end": "The ending time of the video clip. [float]",
    }

    num_options = 0

@@ -37,37 +28,40 @@ class GoogleAssetsEngine(BaseAssetsEngine):
        self.google = GoogleImagesSearch(api_key, project_cx)
        super().__init__()

    def generate(self, options: list[Spec]) -> list[mp.ImageClip]:
    def get(self, query: str, start: float, end: float) -> mp.ImageClip:
        max_width = int(self.ctx.width / 3 * 2)
        clips = []
        for option in options:
            query = option["query"]
            start = option["start"]
            end = option["end"]
            _search_params = {
                "q": query,
                "num": 1,
            }
            os.makedirs("temp", exist_ok=True)
            try:
                self.google.search(
                    search_params=_search_params,
                    path_to_dir="./temp/",
                    custom_image_name="temp",
                )
                # we find the file called temp. extension
                filename = [f for f in os.listdir("./temp/") if f.startswith("temp.")][0]
                img = mp.ImageClip(f"./temp/{filename}")
                # delete the temp folder
            except Exception as e:
                print(e)
                continue
            finally:
                shutil.rmtree("temp")
        _search_params = {
            "q": query,
            "num": 1,
        }
        os.makedirs("temp", exist_ok=True)
        try:
            self.google.search(
                search_params=_search_params,
                path_to_dir="./temp/",
                custom_image_name="temp",
            )
            # we find the file called temp. extension
            filename = [f for f in os.listdir("./temp/") if f.startswith("temp.")][0]
            img = mp.ImageClip(f"./temp/{filename}")
            # delete the temp folder
        except Exception as e:
            gr.Warning(f"Failed to get image: {e}")
            return (
                mp.ColorClip((self.ctx.width, self.ctx.height), color=(0, 0, 0))
                .with_duration(end - start)
                .with_start(start)
            )
        finally:
            shutil.rmtree("temp")

            img = img.with_duration(end - start).with_start(start).with_effects([vfx.Resize(width=max_width)]).with_position(("center", "top"))
            clips.append(img)
        return clips
        img = (
            img.with_duration(end - start)
            .with_start(start)
            .with_effects([vfx.Resize(width=max_width)])
            .with_position(("center", "top"))
        )
        return img

    @classmethod
    def get_options(cls):
2 src/engines/StockImageEngine/__init__.py Normal file
@@ -0,0 +1,2 @@
from .GoogleStockImageEngine import GoogleStockImageEngine
from .BaseStockImageEngine import BaseStockImageEngine
49 src/engines/TranscriptionEngine/BaseTranscriptionEngine.py Normal file
@@ -0,0 +1,49 @@
from abc import abstractmethod
from typing import TypedDict

from ..BaseEngine import BaseEngine


class Word(TypedDict):
    start: str
    end: str
    text: str


class BaseTranscriptionEngine(BaseEngine):

    @abstractmethod
    def transcribe(
        self,
        path: str,
        fast: bool = False,
        words: bool = False,
        avoid_hallucinations: bool = False,
    ) -> list[Word]:
        """
        Transcribes the audio file at the given path using a pre-trained model and returns a list of words.

        Args:
            path (str): The path to the audio file.
            fast (bool): Whether to use a fast transcription model.
            words (bool): Whether to return the words as a list of Word objects.
            avoid_hallucinations (bool): Whether to try to reduce hallucinated words in the transcription.

        Returns:
            list[Word]: A list of Word objects representing the transcribed words.

        Example:
            ```json
            [
                {
                    "start": "0.00",
                    "end": "0.50",
                    "text": "Hello"
                },
                {
                    "start": "0.50",
                    "end": "1.00",
                    "text": "world"
                }
            ]
            ```
        """
        ...
@@ -0,0 +1,73 @@
from abc import abstractmethod
from typing import TypedDict

import whisper_timestamped as wt
from torch.cuda import is_available

from . import BaseTranscriptionEngine


class Word(TypedDict):
    start: str
    end: str
    text: str


class WhisperTranscriptionEngine(BaseTranscriptionEngine):
    name = "Whisper Transcription Engine"
    description = (
        "A transcription engine that uses the whisper model to transcribe audio files."
    )
    num_options = 0

    def __init__(self, options: list) -> None:
        super().__init__()

    def transcribe(
        self,
        path: str,
        fast: bool = False,
        words=False,
        avoid_hallucinations: bool = False,
    ) -> list[Word] | dict[str, dict[str, str]]:
        """
        Transcribes the audio file at the given path using a pre-trained model and returns a list of words.

        Args:
            path (str): The path to the audio file.
            fast (bool): Whether to use a fast transcription model.
            words (bool): Whether to return the words as a list of Word objects.

        Returns:
            list[Word]: A list of Word objects representing the transcribed words.

        Example:
            ```json
            [
                {
                    "start": "0.00",
                    "end": "0.50",
                    "text": "Hello"
                },
                {
                    "start": "0.50",
                    "end": "1.00",
                    "text": "world"
                }
            ]
            ```
        """
        device = "cuda" if is_available() else "cpu"
        audio = wt.load_audio(path)
        model = wt.load_model("large-v3" if not fast else "base", device=device)
        result = wt.transcribe(model=model, audio=audio, vad=avoid_hallucinations)
        if words:
            results = [word for chunk in result["segments"] for word in chunk["words"]]
            for result in results:
                del result["confidence"]

            return results
        return result

    @classmethod
    def get_options(cls):
        return []
2 src/engines/TranscriptionEngine/__init__.py Normal file
@@ -0,0 +1,2 @@
from .BaseTranscriptionEngine import BaseTranscriptionEngine
from .WhisperTranscriptionEngine import WhisperTranscriptionEngine
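For orientation, a minimal sketch of how the new transcription engine can be driven on its own. The import path and the audio file name are assumptions made for this example; inside the pipeline the engine is constructed from the registry and wired to the generation context for you.

```python
from src.engines.TranscriptionEngine import WhisperTranscriptionEngine

# Hypothetical standalone usage: word-level timestamps for a narration file.
engine = WhisperTranscriptionEngine(options=[])
result = engine.transcribe(
    "narration.wav",          # assumed path, not part of the repository
    fast=True,                # use the smaller "base" model
    words=True,               # flatten segments into per-word entries
    avoid_hallucinations=True,
)
for word in result:
    print(word["start"], word["end"], word["text"])
```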
@@ -1,17 +1,22 @@
from . import AssetsEngine
from . import Pipelines
from . import AIImageEngine
from . import StockImageEngine
from . import BackgroundEngine
from . import CaptioningEngine
from . import LLMEngine
from . import MetadataEngine
from . import ScriptEngine
from . import SettingsEngine
from . import TTSEngine
from . import UploadEngine
from . import AudioBackgroundEngine
from . import TranscriptionEngine
from .BaseEngine import BaseEngine
from .NoneEngine import NoneEngine

ENGINES: dict[str, dict[str, bool | list[BaseEngine]]] = {
    "Pipeline": {
        "classes": [Pipelines.ScriptedVideoPipeline],
        "multiple": False,
    },
    "SettingsEngine": {
        "classes": [SettingsEngine.SettingsEngine],
        "multiple": False,
@@ -25,29 +30,28 @@ ENGINES: dict[str, dict[str, bool | list[BaseEngine]]] = {
        "classes": [LLMEngine.OpenaiLLMEngine, LLMEngine.AnthropicLLMEngine],
        "multiple": False,
    },
    "ScriptEngine": {
        "classes": [
            ScriptEngine.ShowerThoughtsScriptEngine,
            ScriptEngine.CustomScriptEngine,
            ScriptEngine.ScientificFactsScriptEngine,
        ],
        "multiple": False,
    },
    "TTSEngine": {
        "classes": [TTSEngine.CoquiTTSEngine],
        "multiple": False,
    },
    "TranscriptionEngine": {
        "classes": [TranscriptionEngine.WhisperTranscriptionEngine],
        "multiple": False,
    },
    "CaptioningEngine": {
        "classes": [CaptioningEngine.SimpleCaptioningEngine, NoneEngine],
        "multiple": False,
    },
    "AssetsEngine": {
    "AIImageEngine": {
        "classes": [
            AssetsEngine.DallEAssetsEngine,
            AssetsEngine.GoogleAssetsEngine,
            NoneEngine,
            AIImageEngine.DallEAIImageEngine,
            AIImageEngine.A1111AIImageEngine,
        ],
        "multiple": True,
        "multiple": False,
    },
    "StockImageEngine": {
        "classes": [StockImageEngine.GoogleStockImageEngine],
        "multiple": False,
    },
    "BackgroundEngine": {
        "classes": [NoneEngine, BackgroundEngine.VideoBackgroundEngine],
@@ -57,12 +61,12 @@ ENGINES: dict[str, dict[str, bool | list[BaseEngine]]] = {
        "classes": [NoneEngine, AudioBackgroundEngine.MusicAudioBackgroundEngine],
        "multiple": False,
    },
    "MetadataEngine": {
        "classes": [MetadataEngine.ShortsMetadataEngine],
        "multiple": False,
    },
    "UploadEngine": {
        "classes": [UploadEngine.TikTokUploadEngine, UploadEngine.YouTubeUploadEngine, NoneEngine],
        "classes": [
            UploadEngine.TikTokUploadEngine,
            UploadEngine.YouTubeUploadEngine,
            NoneEngine,
        ],
        "multiple": True,
    },
}
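As a quick sanity check of the registry shape, a small sketch that walks the ENGINES mapping and prints what is registered per category. The src.engines import path is an assumption about how the package is laid out, and it assumes every registered class exposes the name attribute the engines above define; the real UI builds its selectors from this structure rather than printing it.

```python
from src.engines import ENGINES

# Illustrative only: list each engine category, whether several engines can be
# selected at once, and the display names of the registered classes.
for category, spec in ENGINES.items():
    names = [cls.name for cls in spec["classes"]]
    print(f"{category} (multiple={spec['multiple']}): {', '.join(names)}")
```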
0 tests/__init__.py Normal file