chore(.gitignore): add bark_cache directory to gitignore

feat(video.py): use wav format instead of mp3 for generated audio files
feat(montage.py): use Bark TTS instead of 🐸TTS
feat(speak.py): add support for Bark TTS
fix(speak.py): remove unused 🐸TTS import and variable
fix(main.py): fix asyncio.run() call placement
build: add bark and noisereduce dependencies to requirements.txt
This commit is contained in:
Paillat
2023-06-25 21:59:52 +02:00
parent 5498fb7bac
commit 866f1b4138
6 changed files with 74 additions and 30 deletions

1
.gitignore vendored
View File

@@ -162,3 +162,4 @@ ideas/
montageTEMP_MPY_wvf_snd.mp3 montageTEMP_MPY_wvf_snd.mp3
marp.exe marp.exe
channels/ channels/
bark_cache/

View File

@@ -31,7 +31,7 @@ class Video:
os.makedirs( self.path) os.makedirs( self.path)
script = None script = None
if os.path.exists(os.path.join( self.path, "script.json")): if os.path.exists(os.path.join( self.path, "script.json")):
if input("Video script already exists. Do you want to overwrite it ? (y/N) : ") == "y": if input("Video script already exists. Do you want to overwrite it ? (y/N) : ").lower() == "y":
os.remove(os.path.join( self.path, "script.json")) os.remove(os.path.join( self.path, "script.json"))
if not os.path.exists(os.path.join( self.path, "script.json")): if not os.path.exists(os.path.join( self.path, "script.json")):
@@ -41,11 +41,14 @@ class Video:
script_prompt = f.read() script_prompt = f.read()
f.close() f.close()
if script_prompt: if script_prompt:
printm("Using custom script prompt")
script = await generate_script(self.idea['title'], self.idea['description'], script_prompt) script = await generate_script(self.idea['title'], self.idea['description'], script_prompt)
else: else:
printm("Using default script prompt")
script = await generate_script(self.idea['title'], self.idea['description']) script = await generate_script(self.idea['title'], self.idea['description'])
script = json.loads(script)
with open(os.path.join( self.path, "script.json"), "w") as f: with open(os.path.join( self.path, "script.json"), "w") as f:
json.dump(json.loads(script), f) json.dump(script, f)
f.close() f.close()
else: else:
with open(os.path.join(self.path, "script.json"), "r") as f: with open(os.path.join(self.path, "script.json"), "r") as f:

View File

@@ -4,7 +4,7 @@ import requests
import pysrt import pysrt
import random import random
from generators.speak import generate_voice, voices from generators.speak import VoiceGenerator, voices
from moviepy.video.VideoClip import ImageClip from moviepy.video.VideoClip import ImageClip
from moviepy.editor import concatenate_videoclips, CompositeAudioClip, concatenate_audioclips from moviepy.editor import concatenate_videoclips, CompositeAudioClip, concatenate_audioclips
from moviepy.audio.io.AudioFileClip import AudioFileClip from moviepy.audio.io.AudioFileClip import AudioFileClip
@@ -31,10 +31,11 @@ async def prepare(path):
f.close() f.close()
if fresh: if fresh:
choosen_voice = random.choice(voices) choosen_voice = random.choice(voices)
generator = VoiceGenerator(speaker=choosen_voice)
for i in range(len(script)): for i in range(len(script)):
audio_path = path + "/audio/audio" + str(i) + ".mp3" audio_path = path + "/audio/audio" + str(i) + ".wav"
if not os.path.exists(audio_path): if not os.path.exists(audio_path):
generate_voice(audio_path, script[i]['spoken'], choosen_voice) generator.generate_voice(audio_path, script[i]['spoken'])
if "image" in script[i]: if "image" in script[i]:
if os.path.exists(path + "/slides/assets/slide" + str(i) + ".md"): if os.path.exists(path + "/slides/assets/slide" + str(i) + ".md"):
#skip this slide #skip this slide
@@ -70,11 +71,14 @@ async def prepare(path):
with open(path + "/slides/slide" + str(i) + ".md", 'w', encoding='utf-8') as f: with open(path + "/slides/slide" + str(i) + ".md", 'w', encoding='utf-8') as f:
f.write(marp + "\n\n") # blank slide f.write(marp + "\n\n") # blank slide
for i in range(len(script)): for i in range(len(script)):
marrkdown_path = os.path.join(path, f"slides/slide{i}.md") markdown_path = os.path.join(path, f"slides/slide{i}.md")
if os.path.exists(f"./{path}/slides/slide{i}.png"): markdown_path = os.path.abspath(markdown_path)
image_path = os.path.join(path, f"slides/slide{i}.png")
image_path = os.path.abspath(image_path)
if os.path.exists(markdown_path):
#skip this slide #skip this slide
continue continue
command = f"marp.exe {marrkdown_path} -o {path}/slides/slide{i}.png --allow-local-files" command = f'marp.exe "{markdown_path}" -o "{image_path}" --allow-local-files'
os.system(command) os.system(command)
return script return script
@@ -101,7 +105,7 @@ async def mount(path, script):
srt = pysrt.SubRipFile() srt = pysrt.SubRipFile()
total_length = 0 total_length = 0
for i in range(num_slides): for i in range(num_slides):
audio = AudioFileClip(path + "/audio/audio" + str(i) + ".mp3") audio = AudioFileClip(path + "/audio/audio" + str(i) + ".wav")
complete_audio = CompositeAudioClip([ complete_audio = CompositeAudioClip([
AudioFileClip("silence.mp3").set_duration(1), AudioFileClip("silence.mp3").set_duration(1),
audio, audio,
@@ -109,6 +113,8 @@ async def mount(path, script):
]) ])
length = complete_audio.duration length = complete_audio.duration
total_length += length total_length += length
print(script[i])
print(script[i]['spoken'])
srt = subs(length, total_length, script[i]['spoken'], srt, i) srt = subs(length, total_length, script[i]['spoken'], srt, i)
slide = ImageClip(path + "/slides/slide" + str(i) + ".png").set_duration(length) slide = ImageClip(path + "/slides/slide" + str(i) + ".png").set_duration(length)
slide = slide.set_audio(complete_audio) slide = slide.set_audio(complete_audio)

View File

@@ -1,9 +1,6 @@
from TTS.api import TTS
# Running a multi-speaker and multi-lingual model import os
# List available 🐸TTS models and choose the first one
model_best_multi = "tts_models/en/vctk/vits"
fakenames = { fakenames = {
"Alexander": "p230", "Alexander": "p230",
"Benjamin": "p240", "Benjamin": "p240",
@@ -14,17 +11,52 @@ fakenames = {
voices = ["Alexander", "Benjamin", "Amelia", "Katherine", "Johanne"] voices = ["Alexander", "Benjamin", "Amelia", "Katherine", "Johanne"]
# Init TTS class VoiceGenerator:
def __init__(self, mode="Bark", speaker=""):
self.mode = mode
self.speaker = speaker
if mode == "Bark":
os.environ["XDG_CACHE_HOME"] = os.path.join(os.getcwd(), "bark_cache")
from bark import preload_models, generation
def generate_voice(path, text, speaker="Alexander"): preload_models()
model = model_best_multi self.speaker = "v2/en_speaker_6"
speaker = fakenames[speaker] if speaker in fakenames else speaker else:
from TTS.api import TTS
model = "tts_models/en/vctk/vits"
self.speaker = fakenames[speaker] if speaker in fakenames else speaker
print(f"Generating voice for {model} with speaker {speaker}") print(f"Generating voice for {model} with speaker {speaker}")
try: try:
tts = TTS(model, gpu=True) self.tts = TTS(model, gpu=True)
except: except:
tts = TTS(model, gpu=False) self.tts = TTS(model, gpu=False)
tts.tts_to_file(text=text, file_path=path, speaker=speaker, speed=1, emotion="Happy") if self.speaker == "": self.speaker = "p230"
else:
self.speaker = fakenames[self.speaker] if self.speaker in fakenames else fakenames["Alexander"]
def generate_voice(self, path, text):
if self.mode == "Bark":
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import read as wavread, write as wavwrite
import noisereduce as nr
import soundfile
import numpy as np
import nltk
sentences = nltk.sent_tokenize(text)
pieces = []
silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence
for sentence in sentences:
audio_array = generate_audio(sentence, history_prompt=self.speaker)
pieces += [audio_array, silence.copy()]
audio_array = np.concatenate(pieces)
soundfile.write(path, audio_array, SAMPLE_RATE, format="WAV", subtype="PCM_16")
rate, data = wavread(path)
reduced_noise = nr.reduce_noise(y=data, sr=rate)
os.remove(path)
wavwrite(path, rate, reduced_noise)
else:
self.tts.tts_to_file(text=text, file_path=path, speaker=self.speaker, speed=1, emotion="Happy")
if __name__ == "__main__": if __name__ == "__main__":
generate_voice("test/test.mp3", "This is a test. I like the words python, django and flask. Betty bought a bit of butter but the butter was bitter. So she bought some better butter to make the bitter butter better.") generator = VoiceGenerator()
generator.generate_voice("test/test_r.wav", "Hello there!")
generator.generate_voice("test/teste_r.wav", "This is a test. I like the words python, django and flask. Betty bought a bit of butter but the butter was bitter. So she bought some better butter to make the bitter butter better.")

View File

@@ -64,12 +64,12 @@ async def main():
video = await channel.generate_video(idea) video = await channel.generate_video(idea)
printm("Done!") printm("Done!")
printm("Here is the video:") printm("Here is the video:")
printm(video) printm(video.url)
input("Press enter to continue...") input("Press enter to continue...")
if __name__ == "__main__": if __name__ == "__main__":
while True: while True:
asyncio.run(main())
try: try:
asyncio.run(main())
input("Press enter to continue or type ctrl+c to quit : ") input("Press enter to continue or type ctrl+c to quit : ")
clear_screen() clear_screen()
except KeyboardInterrupt: except KeyboardInterrupt:

View File

@@ -6,3 +6,5 @@ openai
pillow pillow
python-dotenv python-dotenv
google-api-python-client google-api-python-client
git+https://github.com/suno-ai/bark.git
noisereduce