🔄 Update ScriptedVideoPipeline to enhance performance and structure

- ✨ Enhanced script generation by introducing iterative chapter processing. - 🔊 Refactored text-to-speech integration to handle individual chapters separately, improving audio synchronization. - 🎞️ Modified video and asset generation workflow to align with dynamic content from each chapter. - 🐛 Fixed audio transcription segmentation to accurately split sentences, ensuring better subtitle accuracy. - 🧹 Cleaned up and structured the code for better readability and maintenance.
2026-03-03 02:14:54 +00:00 · 2024-05-16 14:02:50 +02:00
parent dbede558c4
commit d817420309
1 changed files with 76 additions and 61 deletions
--- a/src/engines/Pipelines/ScriptedVideoPipeline.py
+++ b/src/engines/Pipelines/ScriptedVideoPipeline.py
@@ -45,7 +45,11 @@ class ScriptedVideoPipeline(BasePipeline):
        )["chapters"]
        ctx.script = ""

-        for chapter in chapters:
+        text_audio = []
+
+        ctx.duration = 0
+
+        for i, chapter in enumerate(chapters):
            ctx.progress(0.2, f"Generating chapter: {chapter['title']}...")
            system = prompts["writer"]["system"]
            chat = prompts["writer"]["chat"]
@@ -54,29 +58,32 @@ class ScriptedVideoPipeline(BasePipeline):
                .replace("{chapter_title}", chapter["title"])
                .replace("{chapter_instructions}", chapter["explanation"])
            )
-            ctx.script += ctx.powerfulllmengine.generate(
+            script = ctx.powerfulllmengine.generate(
                system_prompt=system,
                chat_prompt=chat,
                temperature=1,
                max_tokens=4096,
                json_mode=True,
            )["chapter"]
+            ctx.script += script
            ctx.script += "\n"

            ctx.progress(0.3, "Synthesizing voice...")
-        ctx.duration = ctx.ttsengine.synthesize(
-            ctx.script, ctx.get_file_path("tts.wav")
+            duration = ctx.ttsengine.synthesize(
+                script, ctx.get_file_path(f"tts_{i}.wav")
            )
-        ctx.audio.append(mp.AudioFileClip(ctx.get_file_path("tts.wav")))
-        ctx.progress(0.4, "Transcribing audio...")
-        ctx.timed_script = ctx.transcriptionengine.transcribe(
-            ctx.get_file_path("tts.wav"), fast=False, words=True
+            audioclip = mp.AudioFileClip(ctx.get_file_path(f"tts_{i}.wav"))
+            audioclip = audioclip.with_start(ctx.duration)
+            text_audio.append(audioclip)
+            ctx.progress(0.2, f"Transcribing chapter: {chapter['title']}...")
+            timed_script = ctx.transcriptionengine.transcribe(
+                ctx.get_file_path(f"tts_{i}.wav"), fast=False, words=True
            )

            sentence_split_script = []
            current_sentence = None

-        for word in ctx.timed_script.copy():
+            for word in timed_script.copy():
                if current_sentence is None:
                    # Initialize the first sentence
                    current_sentence = {
@@ -99,7 +106,7 @@ class ScriptedVideoPipeline(BasePipeline):
            if current_sentence is not None:
                sentence_split_script.append(current_sentence)

-        ctx.progress(0.5, "Generating images...")
+            ctx.progress(0.2, f"Generating video for chapter: {chapter['title']}...")
            system = prompts["imager"]["system"]
            chat = prompts["imager"]["chat"]
            chat = chat.replace("{user_instructions}", str(self.user_instructions))
@@ -117,24 +124,30 @@ class ScriptedVideoPipeline(BasePipeline):
                    ctx.progress(0.5, f"Getting stock image {i + 1}...")
                    ctx.index_4.append(
                        ctx.stockimageengine.get(
-                        asset["query"], asset["start"], asset["end"]
+                            asset["query"],
+                            asset["start"] + ctx.duration,
+                            asset["end"] + ctx.duration,
                        )
                    )
                elif asset["type"] == "ai":
                    ctx.progress(0.5, f"Generating AI image {i + 1}...")
                    ctx.index_5.append(
                        ctx.aiimageengine.generate(
-                        asset["prompt"], asset["start"], asset["end"]
+                            asset["prompt"],
+                            asset["start"] + ctx.duration,
+                            asset["end"] + ctx.duration,
                        )
                    )

+            ctx.duration += duration + 0.5
+        ctx.audio.extend(text_audio)
        if not isinstance(ctx.audiobackgroundengine, engines.NoneEngine):
            ctx.progress(0.6, "Generating audio background...")
            ctx.audio.append(ctx.audiobackgroundengine.get_background())

        if not isinstance(ctx.backgroundengine, engines.NoneEngine):
            ctx.progress(0.65, "Generating background...")
-            ctx.audio.append(ctx.backgroundengine.get_background())
+            ctx.index_0.append(ctx.backgroundengine.get_background())

        ctx.progress(0.7, "Rendering video...")
        clips = [
@@ -230,11 +243,13 @@ class ScriptedVideoPipeline(BasePipeline):
                lines=4,
                max_lines=6,
                label="Video instructions",
+                info="Explain what the video should be about, how many chapters, and any specific instructions.",
            ),
            gr.Textbox(
                lines=4,
                max_lines=6,
                label="Assets only instructions",
+                info="Explain how the assets should be used in the video. When, how many, and of what type (stock images, AI or both)",
            ),
            ratio,
            width,