🚀 Maaany things

This commit is contained in:
2024-02-15 14:11:16 +01:00
parent 57bcf0af8e
commit 5be7937ffa
7 changed files with 272 additions and 50 deletions

View File

@@ -1,66 +1,174 @@
absl-py==2.1.0
aiofiles==23.2.1
aiohttp==3.9.3
aiosignal==1.3.1
altair==5.2.0
annotated-types==0.6.0
anyascii==0.3.2
anyio==4.2.0
async-timeout==4.0.3
attrs==23.2.0
audioread==3.0.1
Babel==2.14.0
bangla==0.0.2
blinker==1.7.0
blis==0.7.11
bnnumerizer==0.0.2
bnunicodenormalizer==0.1.6
catalogue==2.0.10
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.16.0
colorama==0.4.6
confection==0.1.4
contourpy==1.2.0
coqpit==0.0.17
cycler==0.12.1
cymem==2.0.8
Cython==3.0.8
dateparser==1.1.8
decorator==4.4.2
distro==1.9.0
docopt==0.6.2
dtw-python==1.3.1
einops==0.7.0
encodec==0.1.1
exceptiongroup==1.2.0
fastapi==0.109.2
ffmpy==0.3.2
filelock==3.13.1
Flask==3.0.2
fonttools==4.48.1
frozenlist==1.4.1
fsspec==2024.2.0
gradio==4.18.0
g2pkk==0.1.2
gradio==4.19.0
gradio_client==0.10.0
grpcio==1.60.1
gruut==2.2.3
gruut-ipa==0.13.0
gruut-lang-de==2.0.0
gruut-lang-en==2.0.0
gruut-lang-es==2.0.0
gruut-lang-fr==2.0.2
h11==0.14.0
httpcore==1.0.2
hangul-romanize==0.1.0
httpcore==1.0.3
httpx==0.26.0
huggingface-hub==0.20.3
idna==3.6
imageio==2.34.0
imageio-ffmpeg==0.4.9
importlib-resources==6.1.1
inflect==7.0.0
itsdangerous==2.1.2
jamo==0.4.1
jieba==0.42.1
Jinja2==3.1.3
joblib==1.3.2
jsonlines==1.2.0
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
kiwisolver==1.4.5
langcodes==3.3.0
lazy_loader==0.3
librosa==0.10.0
llvmlite==0.42.0
Markdown==3.5.2
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.8.2
matplotlib==3.8.3
mdurl==0.1.2
numpy==1.26.4
orjson==3.9.13
more-itertools==10.2.0
moviepy==1.0.3
mpmath==1.3.0
msgpack==1.0.7
multidict==6.0.5
murmurhash==1.0.10
networkx==2.8.8
nltk==3.8.1
num2words==0.5.13
numba==0.59.0
numpy==1.22.0
openai==1.12.0
openai-whisper==20231117
orjson==3.9.14
packaging==23.2
pandas==2.2.0
pandas==1.5.3
pillow==10.2.0
platformdirs==4.2.0
pooch==1.8.0
preshed==3.0.9
proglog==0.1.10
protobuf==4.25.2
psutil==5.9.8
pycparser==2.21
pydantic==2.6.1
pydantic_core==2.16.2
pydub==0.25.1
Pygments==2.17.2
pynndescent==0.5.11
pyparsing==3.1.1
pypinyin==0.50.0
pysbd==0.3.4
python-crfsuite==0.9.10
python-dateutil==2.8.2
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.1
referencing==0.33.0
regex==2023.12.25
requests==2.31.0
rich==13.7.0
rpds-py==0.17.1
rpds-py==0.18.0
ruff==0.2.1
safetensors==0.4.2
scikit-learn==1.4.0
scipy==1.11.4
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
smart-open==6.4.0
sniffio==1.3.0
soundfile==0.12.1
soxr==0.3.7
spacy==3.7.2
spacy-legacy==3.0.12
spacy-loggers==1.0.5
srsly==2.4.8
starlette==0.36.3
SudachiDict-core==20240109
SudachiPy==0.6.8
sympy==1.12
tensorboard==2.16.1
tensorboard-data-server==0.7.2
tf-keras==2.15.0
thinc==8.2.3
threadpoolctl==3.3.0
tiktoken==0.6.0
tokenizers==0.15.2
tomlkit==0.12.0
toolz==0.12.1
torch==2.2.0+cu118
torchaudio==2.2.0+cu118
torchvision==0.17.0+cu118
tqdm==4.66.2
trainer==0.0.36
transformers==4.37.2
TTS==0.22.0
typer==0.9.0
typing_extensions==4.9.0
tzdata==2024.1
tzlocal==5.2
umap-learn==0.5.5
Unidecode==1.3.8
urllib3==2.2.0
uvicorn==0.27.1
wasabi==1.1.2
weasel==0.3.4
websockets==11.0.3
TTS
Werkzeug==3.0.1
whisper-timestamped==1.14.4
yarl==1.9.4

View File

@@ -0,0 +1,57 @@
import anthropic
import gradio as gr
import orjson

from .BaseLLMEngine import BaseLLMEngine

# Assuming these are the models supported by Anthropic that you wish to include
ANTHROPIC_POSSIBLE_MODELS = [
    "claude-2.1",
    # Add more models as needed
]


class AnthropicsLLMEngine(BaseLLMEngine):
    num_options = 1
    name = "Anthropics"
    description = "Anthropics language model engine."

    def __init__(self, options: list) -> None:
        self.model = options[0]
        self.client = anthropic.Anthropic(api_key="YourAnthropicAPIKeyHere")  # Ensure API key is securely managed
        super().__init__()

    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int = 1024, temperature: float = 1.0, json_mode: bool = False, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0) -> str | dict:
        # Note: adjust the parameters to match the Anthropic API's capabilities
        prompt = f"""{anthropic.HUMAN_PROMPT} {system_prompt} {anthropic.HUMAN_PROMPT} {chat_prompt} {anthropic.AI_PROMPT}"""
        if json_mode:
            # Anthropic does not officially support JSON mode, but we can bias the output towards a JSON-like format
            prompt += " {"
        # The legacy Completions API does not accept frequency/presence penalties,
        # so those arguments are kept for interface compatibility but not forwarded.
        response: anthropic.types.Completion = self.client.completions.create(
            max_tokens_to_sample=max_tokens,
            prompt=prompt,
            model=self.model,
            top_p=top_p,
            temperature=temperature,
        )
        content = response.completion
        if json_mode:
            # we add back the opening curly brace, which is not included in the response since it is part of the prompt
            content = "{" + content
            # we remove everything after the last closing curly brace
            content = content[:content.rfind("}") + 1]
            return orjson.loads(content)
        else:
            return content

    @classmethod
    def get_options(cls) -> list:
        return [
            gr.Dropdown(
                label="Model",
                choices=ANTHROPIC_POSSIBLE_MODELS,
                max_choices=1,
                value=ANTHROPIC_POSSIBLE_MODELS[0]
            )
        ]
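
For reference, a minimal usage sketch of the engine above. The import path and API-key handling are assumptions, and the single-element options list mirrors the Model dropdown returned by get_options:

# Hypothetical usage sketch: the package path and key handling are assumptions.
from LLMEngine.AnthropicsLLMEngine import AnthropicsLLMEngine

engine = AnthropicsLLMEngine(["claude-2.1"])  # options[0] selects the model
answer = engine.generate(
    system_prompt="You are a concise assistant.",
    chat_prompt="Summarize this commit in one sentence.",
)
data = engine.generate(
    system_prompt="Reply with a JSON object containing a 'title' key.",
    chat_prompt="Name this commit.",
    json_mode=True,  # biased towards JSON, parsed with orjson, returned as a dict
)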

View File

@@ -6,5 +6,5 @@ import openai
class BaseLLMEngine(BaseEngine):
    @abstractmethod
    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int, temperature: float, top_p: float, frequency_penalty: float, presence_penalty: float) -> str:
    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int, temperature: float, top_p: float, frequency_penalty: float, presence_penalty: float) -> str | dict:
        pass

View File

@@ -1,49 +1,43 @@
import openai
import anthropic
import gradio as gr
import orjson
from abc import ABC, abstractmethod

from .BaseLLMEngine import BaseLLMEngine

OPENAI_POSSIBLE_MODELS = [
    "gpt-3.5-turbo-0125",
    "gpt-4-turbo-preview",
# Assuming these are the models supported by Anthropic that you wish to include
ANTHROPIC_POSSIBLE_MODELS = [
    "claude-2.1",
    # Add more models as needed
]

class OpenaiLLMEngine(BaseLLMEngine):
class AnthropicsLLMEngine(BaseLLMEngine):
    num_options = 1
    name = "OpenAI"
    description = "OpenAI language model engine."
    name = "Anthropics"
    description = "Anthropics language model engine."

    def __init__(self, options: list) -> None:
        self.model = options[0]
        self.client = anthropic.Anthropic(api_key="YourAnthropicAPIKeyHere")  # Ensure API key is securely managed
        super().__init__()

    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int = 512, temperature: float = 1.0, json_mode: bool= False, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0) -> str:
        response = openai.chat.completions.create(
            model=self.model,
    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int = 1024, temperature: float = 1.0, json_mode: bool = False, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0) -> str | dict:
        # Note: adjust the parameters to match the Anthropic API's capabilities
        message = self.client.messages.create(
            max_tokens=max_tokens,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": chat_prompt},
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            response_format={ "type": "json_object" } if json_mode else openai._types.NOT_GIVEN
            model=self.model,
        )
        return response.choices[0].message.content if not json_mode else orjson.loads(response.choices[0].message.content)
        return message.content

    @classmethod
    def get_options(cls) -> list:
        return [
            gr.Dropdown(
                label="Model",
                choices=OPENAI_POSSIBLE_MODELS,
                choices=ANTHROPIC_POSSIBLE_MODELS,
                max_choices=1,
                value=OPENAI_POSSIBLE_MODELS[0]
                value=ANTHROPIC_POSSIBLE_MODELS[0]
            )
        ]
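
A note on the Messages API call above, offered as a hedged sketch rather than part of the commit: in the official anthropic SDK the system prompt is passed as a top-level `system` parameter (a "system" role inside `messages` is rejected), and the reply arrives as a list of content blocks rather than a plain string, so the call and unpacking typically look like this:

# Sketch only, not part of the diff: assumes `client`, `system_prompt` and
# `chat_prompt` as defined in the engine above.
message = client.messages.create(
    model="claude-2.1",
    max_tokens=1024,
    system=system_prompt,
    messages=[{"role": "user", "content": chat_prompt}],
)
text = message.content[0].text  # the first content block holds the generated text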

View File

@@ -1,8 +1,16 @@
import moviepy.editor as mp
import whisper_timestamped as wt
from typing import TypedDict
from torch.cuda import is_available
from abc import ABC, abstractmethod

# Assuming BaseEngine is defined elsewhere in your project
from ..BaseEngine import BaseEngine


class Word(TypedDict):
    start: float
    end: float
    text: str


class BaseTTSEngine(BaseEngine):
@@ -10,7 +18,53 @@ class BaseTTSEngine(BaseEngine):
    def synthesize(self, text: str, path: str) -> str:
        pass

    def time_with_whisper(self, path: str) -> list[Word]:
        """
        Transcribes the audio file at the given path using a pre-trained model and returns a list of words.

        Args:
            path (str): The path to the audio file.

        Returns:
            list[Word]: A list of Word objects representing the transcribed words.

        Example:
            ```json
            [
                {
                    "start": 0.00,
                    "end": 0.50,
                    "text": "Hello"
                },
                {
                    "start": 0.50,
                    "end": 1.00,
                    "text": "world"
                }
            ]
            ```
        """
        device = "cuda" if is_available() else "cpu"
        audio = wt.load_audio(path)
        model = wt.load_model("tiny", device=device)
        result = wt.transcribe(model=model, audio=audio)
        # The word entries live under each segment of the transcription result
        words = [word for segment in result["segments"] for word in segment["words"]]
        for word in words:
            # Confidence scores are not needed for the current use case
            del word["confidence"]
        return words

    def force_duration(self, duration: float, path: str):
        """
        Forces the audio clip at the given path to have the specified duration.

        Args:
            duration (float): The desired duration in seconds.
            path (str): The path to the audio clip file.

        Returns:
            None
        """
        audio_clip = mp.AudioFileClip(path)
        if audio_clip.duration > duration:
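
The hunk is cut off here in the rendered diff, so the body of the branch is not visible. As a hedged sketch only, one plausible completion, matching the "at most the specified value" wording used by CoquiTTSEngine below, would trim the clip with moviepy (the actual implementation may instead speed the audio up or differ entirely):

# Hypothetical continuation of force_duration, not shown in the diff above.
import os
import moviepy.editor as mp

def force_duration(duration: float, path: str) -> None:
    audio_clip = mp.AudioFileClip(path)
    if audio_clip.duration > duration:
        trimmed_path = path + ".trimmed.wav"  # illustrative temporary file name
        audio_clip.subclip(0, duration).write_audiofile(trimmed_path)
        audio_clip.close()
        os.replace(trimmed_path, path)
    else:
        audio_clip.close()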

View File

@@ -1,9 +1,9 @@
import gradio as gr
# import TTS
from TTS.api import TTS  # the TTS class lives in the TTS.api module
import os
# import torch
import torch
from .BaseTTSEngine import BaseTTSEngine
@@ -102,15 +102,25 @@ class CoquiTTSEngine(BaseTTSEngine):
        os.environ["COQUI_TOS_AGREED"] = "1"
        # self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
        # device = "cuda" if torch.cuda.is_available() else "cpu"
        # self.tts.to(device)
        self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts.to(device)

    def synthesize(self, text: str, path: str) -> str:
        # self.tts.tts_to_file(text=text, file_path=path, lang=self.language, speaker=self.voice)
    def synthesize(self, text: str, path: str):
        """
        Synthesizes the given text into speech and saves it to the specified file path.

        Args:
            text (str): The text to synthesize into speech.
            path (str): The file path to save the synthesized speech.

        Returns:
            list[Word]: Word-level timings for the synthesized audio, obtained via time_with_whisper.
        """
        self.tts.tts_to_file(text=text, file_path=path, language=self.language, speaker=self.voice)
        if self.to_force_duration:
            self.force_duration(float(self.duration), path)
        return path
        return self.time_with_whisper(path)

    @classmethod
    def get_options(cls) -> list:
@@ -129,12 +139,11 @@ class CoquiTTSEngine(BaseTTSEngine):
            ),
        ]

        duration_checkbox = gr.Checkbox(value=False)
        duration = gr.Number(label="Duration", value=57, step=1, minimum=10, visible=False)
        duration_checkbox = gr.Checkbox(label="Force duration", info="Force the duration of the generated audio to be at most the specified value", value=False)
        duration = gr.Number(label="Duration [s]", value=57, step=1, minimum=10, visible=False)
        duration_switch = lambda x: gr.update(visible=x)
        duration_checkbox.change(duration_switch, inputs=[duration_checkbox], outputs=[duration])

        duration_checkbox_group = gr.CheckboxGroup([duration_checkbox], label="Force duration")
        options.append(duration_checkbox_group)
        options.append(duration_checkbox)
        options.append(duration)
        return options
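
The visibility toggle wired up above follows the standard Gradio event pattern; a minimal self-contained sketch of the same idea (labels and values are illustrative):

import gradio as gr

with gr.Blocks() as demo:
    force = gr.Checkbox(label="Force duration", value=False)
    seconds = gr.Number(label="Duration [s]", value=57, minimum=10, visible=False)
    # Show the duration field only while the checkbox is ticked.
    force.change(lambda checked: gr.update(visible=checked), inputs=[force], outputs=[seconds])

demo.launch()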

View File

@@ -1,5 +1,5 @@
from . import TTSEngine
from .BaseEngine import BaseEngine
from . import TTSEngine
from . import ScriptEngine
from . import LLMEngine