diff --git a/requirements.txt b/requirements.txt index 762185b..23b1188 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,66 +1,174 @@ +absl-py==2.1.0 aiofiles==23.2.1 +aiohttp==3.9.3 +aiosignal==1.3.1 altair==5.2.0 annotated-types==0.6.0 +anyascii==0.3.2 anyio==4.2.0 +async-timeout==4.0.3 attrs==23.2.0 +audioread==3.0.1 +Babel==2.14.0 +bangla==0.0.2 +blinker==1.7.0 +blis==0.7.11 +bnnumerizer==0.0.2 +bnunicodenormalizer==0.1.6 +catalogue==2.0.10 certifi==2024.2.2 +cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 +cloudpathlib==0.16.0 colorama==0.4.6 +confection==0.1.4 contourpy==1.2.0 +coqpit==0.0.17 cycler==0.12.1 +cymem==2.0.8 +Cython==3.0.8 +dateparser==1.1.8 +decorator==4.4.2 +distro==1.9.0 +docopt==0.6.2 +dtw-python==1.3.1 +einops==0.7.0 +encodec==0.1.1 +exceptiongroup==1.2.0 fastapi==0.109.2 ffmpy==0.3.2 filelock==3.13.1 +Flask==3.0.2 fonttools==4.48.1 +frozenlist==1.4.1 fsspec==2024.2.0 -gradio==4.18.0 +g2pkk==0.1.2 +gradio==4.19.0 gradio_client==0.10.0 +grpcio==1.60.1 +gruut==2.2.3 +gruut-ipa==0.13.0 +gruut-lang-de==2.0.0 +gruut-lang-en==2.0.0 +gruut-lang-es==2.0.0 +gruut-lang-fr==2.0.2 h11==0.14.0 -httpcore==1.0.2 +hangul-romanize==0.1.0 +httpcore==1.0.3 httpx==0.26.0 huggingface-hub==0.20.3 idna==3.6 +imageio==2.34.0 +imageio-ffmpeg==0.4.9 importlib-resources==6.1.1 +inflect==7.0.0 +itsdangerous==2.1.2 +jamo==0.4.1 +jieba==0.42.1 Jinja2==3.1.3 +joblib==1.3.2 +jsonlines==1.2.0 jsonschema==4.21.1 jsonschema-specifications==2023.12.1 kiwisolver==1.4.5 +langcodes==3.3.0 +lazy_loader==0.3 +librosa==0.10.0 +llvmlite==0.42.0 +Markdown==3.5.2 markdown-it-py==3.0.0 MarkupSafe==2.1.5 -matplotlib==3.8.2 +matplotlib==3.8.3 mdurl==0.1.2 -numpy==1.26.4 -orjson==3.9.13 +more-itertools==10.2.0 +moviepy==1.0.3 +mpmath==1.3.0 +msgpack==1.0.7 +multidict==6.0.5 +murmurhash==1.0.10 +networkx==2.8.8 +nltk==3.8.1 +num2words==0.5.13 +numba==0.59.0 +numpy==1.22.0 +openai==1.12.0 +openai-whisper==20231117 +orjson==3.9.14 packaging==23.2 -pandas==2.2.0 +pandas==1.5.3 
pillow==10.2.0 +platformdirs==4.2.0 +pooch==1.8.0 +preshed==3.0.9 +proglog==0.1.10 +protobuf==4.25.2 +psutil==5.9.8 +pycparser==2.21 pydantic==2.6.1 pydantic_core==2.16.2 pydub==0.25.1 Pygments==2.17.2 +pynndescent==0.5.11 pyparsing==3.1.1 +pypinyin==0.50.0 +pysbd==0.3.4 +python-crfsuite==0.9.10 python-dateutil==2.8.2 python-multipart==0.0.9 pytz==2024.1 PyYAML==6.0.1 referencing==0.33.0 +regex==2023.12.25 requests==2.31.0 rich==13.7.0 -rpds-py==0.17.1 +rpds-py==0.18.0 ruff==0.2.1 +safetensors==0.4.2 +scikit-learn==1.4.0 +scipy==1.11.4 semantic-version==2.10.0 shellingham==1.5.4 six==1.16.0 +smart-open==6.4.0 sniffio==1.3.0 +soundfile==0.12.1 +soxr==0.3.7 +spacy==3.7.2 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.4.8 starlette==0.36.3 +SudachiDict-core==20240109 +SudachiPy==0.6.8 +sympy==1.12 +tensorboard==2.16.1 +tensorboard-data-server==0.7.2 +tf-keras==2.15.0 +thinc==8.2.3 +threadpoolctl==3.3.0 +tiktoken==0.6.0 +tokenizers==0.15.2 tomlkit==0.12.0 toolz==0.12.1 +torch==2.2.0+cu118 +torchaudio==2.2.0+cu118 +torchvision==0.17.0+cu118 tqdm==4.66.2 +trainer==0.0.36 +transformers==4.37.2 +TTS==0.22.0 typer==0.9.0 typing_extensions==4.9.0 tzdata==2024.1 +tzlocal==5.2 +umap-learn==0.5.5 +Unidecode==1.3.8 urllib3==2.2.0 uvicorn==0.27.1 +wasabi==1.1.2 +weasel==0.3.4 websockets==11.0.3 -TTS \ No newline at end of file +Werkzeug==3.0.1 +whisper-timestamped==1.14.4 +yarl==1.9.4 diff --git a/src/engines/LLMEngine/AnthropicLLMEngine.py b/src/engines/LLMEngine/AnthropicLLMEngine.py new file mode 100644 index 0000000..5182bad --- /dev/null +++ b/src/engines/LLMEngine/AnthropicLLMEngine.py @@ -0,0 +1,57 @@ +import anthropic +import gradio as gr +import orjson + +from .BaseLLMEngine import BaseLLMEngine + +# Assuming these are the models supported by Anthropics that you wish to include +ANTHROPIC_POSSIBLE_MODELS = [ + "claude-2.1", + # Add more models as needed +] + +class AnthropicsLLMEngine(BaseLLMEngine): + num_options = 1 + name = "Anthropics" + description = 
"Anthropics language model engine." + + def __init__(self, options: list) -> None: + self.model = options[0] + self.client = anthropic.Anthropic(api_key="YourAnthropicAPIKeyHere") # Ensure API key is securely managed + super().__init__() + + def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int = 1024, temperature: float = 1.0, json_mode: bool = False, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0) -> str | dict: + # Note: Adjust the parameters as per Anthropics API capabilities + prompt = f"""{anthropic.HUMAN_PROMPT} {system_prompt} {anthropic.HUMAN_PROMPT} {chat_prompt} {anthropic.AI_PROMPT}""" + if json_mode: + # Anthropic does not officially support JSON mode, but we can bias the output towards a JSON-like format + prompt += " {" + response: anthropic.types.Completion = self.client.completions.create( + max_tokens_to_sample=max_tokens, + prompt=prompt, + model=self.model, + top_p=top_p, + temperature=temperature, + frequency_penalty=frequency_penalty, + ) + + content = response.completion + if json_mode: + #we add back the opening curly brace which is not included in the response since it is in the prompt + content = "{" + content + #we remove everything after the last closing curly brace + content = content[:content.rfind("}") + 1] + return orjson.loads(content) + else: + return content + + @classmethod + def get_options(cls) -> list: + return [ + gr.Dropdown( + label="Model", + choices=ANTHROPIC_POSSIBLE_MODELS, + max_choices=1, + value=ANTHROPIC_POSSIBLE_MODELS[0] + ) + ] diff --git a/src/engines/LLMEngine/BaseLLMEngine.py b/src/engines/LLMEngine/BaseLLMEngine.py index 1250b01..bdea541 100644 --- a/src/engines/LLMEngine/BaseLLMEngine.py +++ b/src/engines/LLMEngine/BaseLLMEngine.py @@ -6,5 +6,5 @@ import openai class BaseLLMEngine(BaseEngine): @abstractmethod - def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int, temperature: float, top_p: float, frequency_penalty: float, presence_penalty: 
float) -> str: + def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int, temperature: float, top_p: float, frequency_penalty: float, presence_penalty: float) -> str | dict: pass \ No newline at end of file diff --git a/src/engines/LLMEngine/OpenaiLLMEngine.py b/src/engines/LLMEngine/OpenaiLLMEngine.py index 93088a0..b4f7758 100644 --- a/src/engines/LLMEngine/OpenaiLLMEngine.py +++ b/src/engines/LLMEngine/OpenaiLLMEngine.py @@ -1,49 +1,43 @@ -import openai +import anthropic import gradio as gr -import orjson - -from abc import ABC, abstractmethod from .BaseLLMEngine import BaseLLMEngine -OPENAI_POSSIBLE_MODELS = [ - "gpt-3.5-turbo-0125", - "gpt-4-turbo-preview", +# Assuming these are the models supported by Anthropics that you wish to include +ANTHROPIC_POSSIBLE_MODELS = [ + "claude-2.1", + # Add more models as needed ] -class OpenaiLLMEngine(BaseLLMEngine): +class AnthropicsLLMEngine(BaseLLMEngine): num_options = 1 - name = "OpenAI" - description = "OpenAI language model engine." + name = "Anthropics" + description = "Anthropics language model engine." 
def __init__(self, options: list) -> None: self.model = options[0] + self.client = anthropic.Anthropic(api_key="YourAnthropicAPIKeyHere") # Ensure API key is securely managed super().__init__() - - def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int = 512, temperature: float = 1.0, json_mode: bool= False, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0) -> str: - response = openai.chat.completions.create( - model=self.model, + + def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int = 1024, temperature: float = 1.0, json_mode: bool = False, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0) -> str | dict: + # Note: Adjust the parameters as per Anthropics API capabilities + message = self.client.messages.create( + max_tokens=max_tokens, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": chat_prompt}, ], - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - response_format={ "type": "json_object" } if json_mode else openai._types.NOT_GIVEN + model=self.model, ) - return response.choices[0].message.content if not json_mode else orjson.loads(response.choices[0].message.content) - + return message.content @classmethod def get_options(cls) -> list: return [ gr.Dropdown( label="Model", - choices=OPENAI_POSSIBLE_MODELS, + choices=ANTHROPIC_POSSIBLE_MODELS, max_choices=1, - value=OPENAI_POSSIBLE_MODELS[0] + value=ANTHROPIC_POSSIBLE_MODELS[0] ) - ] \ No newline at end of file + ] diff --git a/src/engines/TTSEngine/BaseTTSEngine.py b/src/engines/TTSEngine/BaseTTSEngine.py index 709d933..d595ba0 100644 --- a/src/engines/TTSEngine/BaseTTSEngine.py +++ b/src/engines/TTSEngine/BaseTTSEngine.py @@ -1,8 +1,16 @@ import moviepy.editor as mp +import whisper_timestamped as wt + +from typing import TypedDict +from torch.cuda import is_available from abc import ABC, 
abstractmethod -# Assuming BaseEngine is defined elsewhere in your project + from ..BaseEngine import BaseEngine +class Word(TypedDict): + start: str + end: str + text: str class BaseTTSEngine(BaseEngine): @@ -10,7 +18,53 @@ class BaseTTSEngine(BaseEngine): def synthesize(self, text: str, path: str) -> str: pass + def time_with_whisper(self, path: str) -> list[Word]: + """ + Transcribes the audio file at the given path using a pre-trained model and returns a list of words. + + Args: + path (str): The path to the audio file. + + Returns: + list[Word]: A list of Word objects representing the transcribed words. + Example: + ```json + [ + { + "start": "0.00", + "end": "0.50", + "text": "Hello" + }, + { + "start": "0.50", + "end": "1.00", + "text": "world" + } + ] + ``` + """ + device = "cuda" if is_available() else "cpu" + audio = wt.load_audio(path) + model = wt.load_model("tiny", device=device) + + result = wt.transcribe(model=model, audio=audio) + results = [word for chunk in result for word in chunk["words"]] + for result in results: + # Not needed for the current use case + del result["confidence"] + return results + def force_duration(self, duration: float, path: str): + """ + Forces the audio clip at the given path to have the specified duration. + + Args: + duration (float): The desired duration in seconds. + path (str): The path to the audio clip file. 
+ + Returns: + None + """ audio_clip = mp.AudioFileClip(path) if audio_clip.duration > duration: diff --git a/src/engines/TTSEngine/CoquiTTSEngine.py b/src/engines/TTSEngine/CoquiTTSEngine.py index 030d7da..07cd0bf 100644 --- a/src/engines/TTSEngine/CoquiTTSEngine.py +++ b/src/engines/TTSEngine/CoquiTTSEngine.py @@ -1,9 +1,9 @@ import gradio as gr -# import TTS +import TTS import os -# import torch +import torch from .BaseTTSEngine import BaseTTSEngine @@ -102,15 +102,25 @@ class CoquiTTSEngine(BaseTTSEngine): os.environ["COQUI_TOS_AGREED"] = "1" - # self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2") - # device = "cuda" if torch.cuda.is_available() else "cpu" - # self.tts.to(device) + self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2") + device = "cuda" if torch.cuda.is_available() else "cpu" + self.tts.to(device) - def synthesize(self, text: str, path: str) -> str: - # self.tts.tts_to_file(text=text, file_path=path, lang=self.language, speaker=self.voice) - if self.to_force_duration: - self.force_duration(float(self.duration), path) - return path + def synthesize(self, text: str, path: str): + """ + Synthesizes the given text into speech and saves it to the specified file path. + + Args: + text (str): The text to synthesize into speech. + path (str): The file path to save the synthesized speech. + + Returns: + list[Word]: The word-level timestamps of the synthesized audio, as returned by time_with_whisper. 
+ """ + self.tts.tts_to_file(text=text, file_path=path, lang=self.language, speaker=self.voice) + if self.to_force_duration: + self.force_duration(float(self.duration), path) + return self.time_with_whisper(path) @classmethod def get_options(cls) -> list: @@ -129,12 +139,11 @@ class CoquiTTSEngine(BaseTTSEngine): ), ] - duration_checkbox = gr.Checkbox(value=False) - duration = gr.Number(label="Duration", value=57, step=1, minimum=10, visible=False) + duration_checkbox = gr.Checkbox(label="Force duration", info="Force the duration of the generated audio to be at most the specified value", value=False) + duration = gr.Number(label="Duration [s]", value=57, step=1, minimum=10, visible=False) duration_switch = lambda x: gr.update(visible=x) duration_checkbox.change(duration_switch, inputs=[duration_checkbox], outputs=[duration]) - duration_checkbox_group = gr.CheckboxGroup([duration_checkbox], label="Force duration") - options.append(duration_checkbox_group) + options.append(duration_checkbox) options.append(duration) return options \ No newline at end of file diff --git a/src/engines/__init__.py b/src/engines/__init__.py index 4b57911..f49f99a 100644 --- a/src/engines/__init__.py +++ b/src/engines/__init__.py @@ -1,5 +1,5 @@ -from . import TTSEngine from .BaseEngine import BaseEngine +from . import TTSEngine from . import ScriptEngine from . import LLMEngine