Mirror of https://github.com/Paillat-dev/viralfactory.git, synced 2026-01-02 01:06:19 +00:00
🔄 Update ScriptedVideoPipeline to enhance performance and structure
- ✨ Enhanced script generation by introducing iterative chapter processing.
- 🔊 Refactored text-to-speech integration to handle individual chapters separately, improving audio synchronization.
- 🎞️ Modified the video and asset generation workflow to align with the dynamic content of each chapter.
- 🐛 Fixed audio transcription segmentation to accurately split sentences, ensuring better subtitle accuracy.
- 🧹 Cleaned up and restructured the code for better readability and maintainability.
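Note: the core of this change is a running time offset. Each chapter's TTS clip is placed on the shared timeline with with_start(ctx.duration), the chapter's assets are shifted by the same offset, and ctx.duration then advances by the chapter's audio length plus a 0.5 s gap. A minimal sketch of that scheme, with a hypothetical synthesize() stub standing in for ctx.ttsengine.synthesize (only the offset arithmetic mirrors the pipeline code in the diff below):

    CHAPTER_GAP = 0.5  # seconds of silence left between chapters, as in the diff

    def synthesize(script: str) -> float:
        # Hypothetical stand-in: the real TTS engine writes a wav file
        # and returns the clip duration in seconds.
        return len(script) / 4  # pretend 4 characters per second

    def schedule_chapters(scripts: list[str]) -> list[dict[str, float]]:
        timeline = []
        offset = 0.0  # plays the role of ctx.duration
        for script in scripts:
            duration = synthesize(script)
            # The chapter's audio starts at the running offset; asset times
            # are shifted the same way (asset["start"] + offset, asset["end"] + offset).
            timeline.append({"start": offset, "end": offset + duration})
            offset += duration + CHAPTER_GAP
        return timeline

    print(schedule_chapters(["Intro chapter.", "A longer second chapter."]))
    # [{'start': 0.0, 'end': 3.5}, {'start': 4.0, 'end': 10.0}]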
@@ -45,7 +45,11 @@ class ScriptedVideoPipeline(BasePipeline):
         )["chapters"]
         ctx.script = ""
 
-        for chapter in chapters:
+        text_audio = []
+
+        ctx.duration = 0
+
+        for i, chapter in enumerate(chapters):
             ctx.progress(0.2, f"Generating chapter: {chapter['title']}...")
             system = prompts["writer"]["system"]
             chat = prompts["writer"]["chat"]
@@ -54,87 +58,96 @@ class ScriptedVideoPipeline(BasePipeline):
                 .replace("{chapter_title}", chapter["title"])
                 .replace("{chapter_instructions}", chapter["explanation"])
             )
-            ctx.script += ctx.powerfulllmengine.generate(
+            script = ctx.powerfulllmengine.generate(
                 system_prompt=system,
                 chat_prompt=chat,
                 temperature=1,
                 max_tokens=4096,
                 json_mode=True,
             )["chapter"]
+            ctx.script += script
             ctx.script += "\n"
 
-        ctx.progress(0.3, "Synthesizing voice...")
-        ctx.duration = ctx.ttsengine.synthesize(
-            ctx.script, ctx.get_file_path("tts.wav")
-        )
-        ctx.audio.append(mp.AudioFileClip(ctx.get_file_path("tts.wav")))
-        ctx.progress(0.4, "Transcribing audio...")
-        ctx.timed_script = ctx.transcriptionengine.transcribe(
-            ctx.get_file_path("tts.wav"), fast=False, words=True
-        )
-
-        sentence_split_script = []
-        current_sentence = None
-
-        for word in ctx.timed_script.copy():
-            if current_sentence is None:
-                # Initialize the first sentence
-                current_sentence = {
-                    "text": word["text"],
-                    "end": word["end"],
-                    "start": word["start"],
-                }
-            elif word["text"].endswith((".", "!", "?")):
-                # Add the word to the current sentence and finalize it
-                current_sentence["text"] += f" {word['text']}"
-                current_sentence["end"] = word["end"]
-                sentence_split_script.append(current_sentence)
-                current_sentence = None  # Prepare to start a new sentence
-            else:
-                # Continue adding words to the current sentence
-                current_sentence["text"] += f" {word['text']}"
-                current_sentence["end"] = word["end"]
-
-        # If the last sentence didn't end with a punctuation mark
-        if current_sentence is not None:
-            sentence_split_script.append(current_sentence)
-
-        ctx.progress(0.5, "Generating images...")
-        system = prompts["imager"]["system"]
-        chat = prompts["imager"]["chat"]
-        chat = chat.replace("{user_instructions}", str(self.user_instructions))
-        chat = chat.replace("{assets_instructions}", str(self.assets_instructions))
-        chat = chat.replace("{video_transcript}", str(sentence_split_script))
-        assets: list[dict[str, str | float]] = ctx.powerfulllmengine.generate(
-            system_prompt=system,
-            chat_prompt=chat,
-            temperature=1,
-            max_tokens=4096,
-            json_mode=True,
-        )["assets"]
-        for i, asset in enumerate(assets):
-            if asset["type"] == "stock":
-                ctx.progress(0.5, f"Getting stock image {i + 1}...")
-                ctx.index_4.append(
-                    ctx.stockimageengine.get(
-                        asset["query"], asset["start"], asset["end"]
-                    )
-                )
-            elif asset["type"] == "ai":
-                ctx.progress(0.5, f"Generating AI image {i + 1}...")
-                ctx.index_5.append(
-                    ctx.aiimageengine.generate(
-                        asset["prompt"], asset["start"], asset["end"]
-                    )
-                )
+            ctx.progress(0.3, "Synthesizing voice...")
+            duration = ctx.ttsengine.synthesize(
+                script, ctx.get_file_path(f"tts_{i}.wav")
+            )
+            audioclip = mp.AudioFileClip(ctx.get_file_path(f"tts_{i}.wav"))
+            audioclip = audioclip.with_start(ctx.duration)
+            text_audio.append(audioclip)
+            ctx.progress(0.2, f"Transcribing chapter: {chapter['title']}...")
+            timed_script = ctx.transcriptionengine.transcribe(
+                ctx.get_file_path(f"tts_{i}.wav"), fast=False, words=True
+            )
+
+            sentence_split_script = []
+            current_sentence = None
+
+            for word in timed_script.copy():
+                if current_sentence is None:
+                    # Initialize the first sentence
+                    current_sentence = {
+                        "text": word["text"],
+                        "end": word["end"],
+                        "start": word["start"],
+                    }
+                elif word["text"].endswith((".", "!", "?")):
+                    # Add the word to the current sentence and finalize it
+                    current_sentence["text"] += f" {word['text']}"
+                    current_sentence["end"] = word["end"]
+                    sentence_split_script.append(current_sentence)
+                    current_sentence = None  # Prepare to start a new sentence
+                else:
+                    # Continue adding words to the current sentence
+                    current_sentence["text"] += f" {word['text']}"
+                    current_sentence["end"] = word["end"]
+
+            # If the last sentence didn't end with a punctuation mark
+            if current_sentence is not None:
+                sentence_split_script.append(current_sentence)
+
+            ctx.progress(0.2, f"Generating video for chapter: {chapter['title']}...")
+            system = prompts["imager"]["system"]
+            chat = prompts["imager"]["chat"]
+            chat = chat.replace("{user_instructions}", str(self.user_instructions))
+            chat = chat.replace("{assets_instructions}", str(self.assets_instructions))
+            chat = chat.replace("{video_transcript}", str(sentence_split_script))
+            assets: list[dict[str, str | float]] = ctx.powerfulllmengine.generate(
+                system_prompt=system,
+                chat_prompt=chat,
+                temperature=1,
+                max_tokens=4096,
+                json_mode=True,
+            )["assets"]
+            for i, asset in enumerate(assets):
+                if asset["type"] == "stock":
+                    ctx.progress(0.5, f"Getting stock image {i + 1}...")
+                    ctx.index_4.append(
+                        ctx.stockimageengine.get(
+                            asset["query"],
+                            asset["start"] + ctx.duration,
+                            asset["end"] + ctx.duration,
+                        )
+                    )
+                elif asset["type"] == "ai":
+                    ctx.progress(0.5, f"Generating AI image {i + 1}...")
+                    ctx.index_5.append(
+                        ctx.aiimageengine.generate(
+                            asset["prompt"],
+                            asset["start"] + ctx.duration,
+                            asset["end"] + ctx.duration,
+                        )
+                    )
 
+            ctx.duration += duration + 0.5
+        ctx.audio.extend(text_audio)
         if not isinstance(ctx.audiobackgroundengine, engines.NoneEngine):
             ctx.progress(0.6, "Generating audio background...")
             ctx.audio.append(ctx.audiobackgroundengine.get_background())
 
         if not isinstance(ctx.backgroundengine, engines.NoneEngine):
             ctx.progress(0.65, "Generating background...")
-            ctx.audio.append(ctx.backgroundengine.get_background())
+            ctx.index_0.append(ctx.backgroundengine.get_background())
 
         ctx.progress(0.7, "Rendering video...")
         clips = [
@@ -230,11 +243,13 @@ class ScriptedVideoPipeline(BasePipeline):
                 lines=4,
                 max_lines=6,
                 label="Video instructions",
+                info="Explain what the video should be about, how many chapters, and any specific instructions.",
             ),
             gr.Textbox(
                 lines=4,
                 max_lines=6,
                 label="Assets only instructions",
+                info="Explain how the assets should be used in the video. When, how many, and of what type (stock images, AI or both)",
             ),
             ratio,
             width,
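For reference, here is the sentence splitter added above, lightly adapted into a standalone function and run on a small synthetic word list. The timings are invented for illustration; real input comes from ctx.transcriptionengine.transcribe(..., words=True).

    def split_sentences(words: list[dict]) -> list[dict]:
        sentences = []
        current = None
        for word in words:
            if current is None:
                # Initialize the first sentence
                current = {"text": word["text"], "start": word["start"], "end": word["end"]}
            elif word["text"].endswith((".", "!", "?")):
                # Add the word to the current sentence and finalize it
                current["text"] += f" {word['text']}"
                current["end"] = word["end"]
                sentences.append(current)
                current = None  # Prepare to start a new sentence
            else:
                # Continue adding words to the current sentence
                current["text"] += f" {word['text']}"
                current["end"] = word["end"]
        # If the last sentence didn't end with a punctuation mark
        if current is not None:
            sentences.append(current)
        return sentences

    words = [
        {"text": "Hello", "start": 0.0, "end": 0.4},
        {"text": "world.", "start": 0.5, "end": 0.9},
        {"text": "No", "start": 1.2, "end": 1.4},
        {"text": "final", "start": 1.5, "end": 1.8},
        {"text": "period", "start": 1.9, "end": 2.3},
    ]
    print(split_sentences(words))
    # [{'text': 'Hello world.', 'start': 0.0, 'end': 0.9},
    #  {'text': 'No final period', 'start': 1.2, 'end': 2.3}]

Each output segment carries the start of its first word and the end of its last, which is what the subtitle renderer needs; the trailing unterminated sentence is flushed by the final check rather than dropped.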