🔄 Update ScriptedVideoPipeline to enhance performance and structure

- Enhanced script generation by introducing iterative chapter processing.
- 🔊 Refactored text-to-speech integration to handle individual chapters separately, improving audio synchronization.
- 🎞️ Modified video and asset generation workflow to align with dynamic content from each chapter.
- 🐛 Fixed audio transcription segmentation to accurately split sentences, ensuring better subtitle accuracy.
- 🧹 Cleaned up and structured the code for better readability and maintenance.
This commit is contained in:
2024-05-16 14:02:50 +02:00
parent dbede558c4
commit d817420309

View File

@@ -45,7 +45,11 @@ class ScriptedVideoPipeline(BasePipeline):
)["chapters"] )["chapters"]
ctx.script = "" ctx.script = ""
for chapter in chapters: text_audio = []
ctx.duration = 0
for i, chapter in enumerate(chapters):
ctx.progress(0.2, f"Generating chapter: {chapter['title']}...") ctx.progress(0.2, f"Generating chapter: {chapter['title']}...")
system = prompts["writer"]["system"] system = prompts["writer"]["system"]
chat = prompts["writer"]["chat"] chat = prompts["writer"]["chat"]
@@ -54,87 +58,96 @@ class ScriptedVideoPipeline(BasePipeline):
.replace("{chapter_title}", chapter["title"]) .replace("{chapter_title}", chapter["title"])
.replace("{chapter_instructions}", chapter["explanation"]) .replace("{chapter_instructions}", chapter["explanation"])
) )
ctx.script += ctx.powerfulllmengine.generate( script = ctx.powerfulllmengine.generate(
system_prompt=system, system_prompt=system,
chat_prompt=chat, chat_prompt=chat,
temperature=1, temperature=1,
max_tokens=4096, max_tokens=4096,
json_mode=True, json_mode=True,
)["chapter"] )["chapter"]
ctx.script += script
ctx.script += "\n" ctx.script += "\n"
ctx.progress(0.3, "Synthesizing voice...") ctx.progress(0.3, "Synthesizing voice...")
ctx.duration = ctx.ttsengine.synthesize( duration = ctx.ttsengine.synthesize(
ctx.script, ctx.get_file_path("tts.wav") script, ctx.get_file_path(f"tts_{i}.wav")
) )
ctx.audio.append(mp.AudioFileClip(ctx.get_file_path("tts.wav"))) audioclip = mp.AudioFileClip(ctx.get_file_path(f"tts_{i}.wav"))
ctx.progress(0.4, "Transcribing audio...") audioclip = audioclip.with_start(ctx.duration)
ctx.timed_script = ctx.transcriptionengine.transcribe( text_audio.append(audioclip)
ctx.get_file_path("tts.wav"), fast=False, words=True ctx.progress(0.2, f"Transcribing chapter: {chapter['title']}...")
) timed_script = ctx.transcriptionengine.transcribe(
ctx.get_file_path(f"tts_{i}.wav"), fast=False, words=True
)
sentence_split_script = [] sentence_split_script = []
current_sentence = None current_sentence = None
for word in ctx.timed_script.copy(): for word in timed_script.copy():
if current_sentence is None: if current_sentence is None:
# Initialize the first sentence # Initialize the first sentence
current_sentence = { current_sentence = {
"text": word["text"], "text": word["text"],
"end": word["end"], "end": word["end"],
"start": word["start"], "start": word["start"],
} }
elif word["text"].endswith((".", "!", "?")): elif word["text"].endswith((".", "!", "?")):
# Add the word to the current sentence and finalize it # Add the word to the current sentence and finalize it
current_sentence["text"] += f" {word['text']}" current_sentence["text"] += f" {word['text']}"
current_sentence["end"] = word["end"] current_sentence["end"] = word["end"]
sentence_split_script.append(current_sentence)
current_sentence = None # Prepare to start a new sentence
else:
# Continue adding words to the current sentence
current_sentence["text"] += f" {word['text']}"
current_sentence["end"] = word["end"]
# If the last sentence didn't end with a punctuation mark
if current_sentence is not None:
sentence_split_script.append(current_sentence) sentence_split_script.append(current_sentence)
current_sentence = None # Prepare to start a new sentence
else:
# Continue adding words to the current sentence
current_sentence["text"] += f" {word['text']}"
current_sentence["end"] = word["end"]
# If the last sentence didn't end with a punctuation mark ctx.progress(0.2, f"Generating video for chapter: {chapter['title']}...")
if current_sentence is not None: system = prompts["imager"]["system"]
sentence_split_script.append(current_sentence) chat = prompts["imager"]["chat"]
chat = chat.replace("{user_instructions}", str(self.user_instructions))
ctx.progress(0.5, "Generating images...") chat = chat.replace("{assets_instructions}", str(self.assets_instructions))
system = prompts["imager"]["system"] chat = chat.replace("{video_transcript}", str(sentence_split_script))
chat = prompts["imager"]["chat"] assets: list[dict[str, str | float]] = ctx.powerfulllmengine.generate(
chat = chat.replace("{user_instructions}", str(self.user_instructions)) system_prompt=system,
chat = chat.replace("{assets_instructions}", str(self.assets_instructions)) chat_prompt=chat,
chat = chat.replace("{video_transcript}", str(sentence_split_script)) temperature=1,
assets: list[dict[str, str | float]] = ctx.powerfulllmengine.generate( max_tokens=4096,
system_prompt=system, json_mode=True,
chat_prompt=chat, )["assets"]
temperature=1, for i, asset in enumerate(assets):
max_tokens=4096, if asset["type"] == "stock":
json_mode=True, ctx.progress(0.5, f"Getting stock image {i + 1}...")
)["assets"] ctx.index_4.append(
for i, asset in enumerate(assets): ctx.stockimageengine.get(
if asset["type"] == "stock": asset["query"],
ctx.progress(0.5, f"Getting stock image {i + 1}...") asset["start"] + ctx.duration,
ctx.index_4.append( asset["end"] + ctx.duration,
ctx.stockimageengine.get( )
asset["query"], asset["start"], asset["end"]
) )
) elif asset["type"] == "ai":
elif asset["type"] == "ai": ctx.progress(0.5, f"Generating AI image {i + 1}...")
ctx.progress(0.5, f"Generating AI image {i + 1}...") ctx.index_5.append(
ctx.index_5.append( ctx.aiimageengine.generate(
ctx.aiimageengine.generate( asset["prompt"],
asset["prompt"], asset["start"], asset["end"] asset["start"] + ctx.duration,
asset["end"] + ctx.duration,
)
) )
)
ctx.duration += duration + 0.5
ctx.audio.extend(text_audio)
if not isinstance(ctx.audiobackgroundengine, engines.NoneEngine): if not isinstance(ctx.audiobackgroundengine, engines.NoneEngine):
ctx.progress(0.6, "Generating audio background...") ctx.progress(0.6, "Generating audio background...")
ctx.audio.append(ctx.audiobackgroundengine.get_background()) ctx.audio.append(ctx.audiobackgroundengine.get_background())
if not isinstance(ctx.backgroundengine, engines.NoneEngine): if not isinstance(ctx.backgroundengine, engines.NoneEngine):
ctx.progress(0.65, "Generating background...") ctx.progress(0.65, "Generating background...")
ctx.audio.append(ctx.backgroundengine.get_background()) ctx.index_0.append(ctx.backgroundengine.get_background())
ctx.progress(0.7, "Rendering video...") ctx.progress(0.7, "Rendering video...")
clips = [ clips = [
@@ -230,11 +243,13 @@ class ScriptedVideoPipeline(BasePipeline):
lines=4, lines=4,
max_lines=6, max_lines=6,
label="Video instructions", label="Video instructions",
info="Explain what the video should be about, how many chapters, and any specific instructions.",
), ),
gr.Textbox( gr.Textbox(
lines=4, lines=4,
max_lines=6, max_lines=6,
label="Assets only instructions", label="Assets only instructions",
info="Explain how the assets should be used in the video. When, how many, and of what type (stock images, AI or both)",
), ),
ratio, ratio,
width, width,