🚀 Maaany things

2026-01-02 09:16:19 +00:00 · 2024-02-15 14:11:16 +01:00
parent 57bcf0af8e
commit 5be7937ffa
7 changed files with 272 additions and 50 deletions
--- a/src/engines/LLMEngine/AnthropicLLMEngine.py
+++ b/src/engines/LLMEngine/AnthropicLLMEngine.py
@@ -0,0 +1,57 @@
+import anthropic
+import gradio as gr
+import orjson
+
+from .BaseLLMEngine import BaseLLMEngine
+
+# Anthropic models that this engine offers in its model dropdown
+ANTHROPIC_POSSIBLE_MODELS = [
+    "claude-2.1",
+    # Add more models as needed
+]
+
+class AnthropicsLLMEngine(BaseLLMEngine):
+    num_options = 1
+    name = "Anthropics"
+    description = "Anthropics language model engine."
+
+    def __init__(self, options: list) -> None:
+        self.model = options[0]
+        self.client = anthropic.Anthropic(api_key="YourAnthropicAPIKeyHere")  # Ensure API key is securely managed
+        super().__init__()
+
+    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int = 1024, temperature: float = 1.0, json_mode: bool = False, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0) -> str | dict:
+        # NOTE: adjust the parameters to match the Anthropic API's capabilities; presence_penalty is accepted but not forwarded below
+        prompt = f"""{anthropic.HUMAN_PROMPT} {system_prompt} {anthropic.HUMAN_PROMPT} {chat_prompt} {anthropic.AI_PROMPT}"""
+        if json_mode:
+            # Anthropic does not officially support a JSON mode, but we can bias the output towards a JSON-like format
+            prompt += " {"
+        response: anthropic.types.Completion = self.client.completions.create(
+            max_tokens_to_sample=max_tokens,
+            prompt=prompt,
+            model=self.model,
+            top_p=top_p,
+            temperature=temperature,
+            frequency_penalty=frequency_penalty,
+        )
+
+        content = response.completion
+        if json_mode:
+            # We add back the opening curly brace, which is not included in the response since it is part of the prompt
+            content = "{" + content
+            # We remove everything after the last closing curly brace
+            content = content[:content.rfind("}") + 1]
+            return orjson.loads(content)
+        else:
+            return content
+
+    @classmethod
+    def get_options(cls) -> list:
+        return [
+            gr.Dropdown(
+                label="Model",
+                choices=ANTHROPIC_POSSIBLE_MODELS,
+                max_choices=1,
+                value=ANTHROPIC_POSSIBLE_MODELS[0]
+            )
+        ]
--- a/src/engines/LLMEngine/BaseLLMEngine.py
+++ b/src/engines/LLMEngine/BaseLLMEngine.py
@@ -6,5 +6,5 @@ import openai
 class BaseLLMEngine(BaseEngine):

    @abstractmethod
-    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int, temperature: float, top_p: float, frequency_penalty: float, presence_penalty: float) -> str:
+    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int, temperature: float, top_p: float, frequency_penalty: float, presence_penalty: float) -> str | dict:
        pass
--- a/src/engines/LLMEngine/OpenaiLLMEngine.py
+++ b/src/engines/LLMEngine/OpenaiLLMEngine.py
@@ -1,49 +1,43 @@
-import openai
+import anthropic
 import gradio as gr
-import orjson
-
-from abc import ABC, abstractmethod

 from .BaseLLMEngine import BaseLLMEngine

-OPENAI_POSSIBLE_MODELS = [
-    "gpt-3.5-turbo-0125",
-    "gpt-4-turbo-preview",
+# Anthropic models that this engine offers in its model dropdown
+ANTHROPIC_POSSIBLE_MODELS = [
+    "claude-2.1",
+    # Add more models as needed
 ]

-class OpenaiLLMEngine(BaseLLMEngine):
+class AnthropicsLLMEngine(BaseLLMEngine):
    num_options = 1
-    name = "OpenAI"
-    description = "OpenAI language model engine."
+    name = "Anthropics"
+    description = "Anthropics language model engine."

    def __init__(self, options: list) -> None:
        self.model = options[0]
+        self.client = anthropic.Anthropic(api_key="YourAnthropicAPIKeyHere")  # Ensure API key is securely managed
        super().__init__()
-    
-    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int = 512, temperature: float = 1.0, json_mode: bool= False, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0) -> str:
-        response = openai.chat.completions.create(
-            model=self.model,
+
+    def generate(self, system_prompt: str, chat_prompt: str, max_tokens: int = 1024, temperature: float = 1.0, json_mode: bool = False, top_p: float = 1, frequency_penalty: float = 0, presence_penalty: float = 0) -> str | dict:
+        # NOTE: adjust the parameters to match the Anthropic API's capabilities; only max_tokens, messages and model are forwarded below
+        message = self.client.messages.create(
+            max_tokens=max_tokens,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": chat_prompt},
            ],
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            frequency_penalty=frequency_penalty,
-            presence_penalty=presence_penalty,
-            response_format={ "type": "json_object" } if json_mode else openai._types.NOT_GIVEN
+            model=self.model,
        )
-        return response.choices[0].message.content if not json_mode else orjson.loads(response.choices[0].message.content)
-
+        return message.content

    @classmethod
    def get_options(cls) -> list:
        return [
            gr.Dropdown(
                label="Model",
-                choices=OPENAI_POSSIBLE_MODELS,
+                choices=ANTHROPIC_POSSIBLE_MODELS,
                max_choices=1,
-                value=OPENAI_POSSIBLE_MODELS[0]
+                value=ANTHROPIC_POSSIBLE_MODELS[0]
            )
-        ]
+        ]
--- a/src/engines/TTSEngine/BaseTTSEngine.py
+++ b/src/engines/TTSEngine/BaseTTSEngine.py
@@ -1,8 +1,16 @@
 import moviepy.editor as mp
+import whisper_timestamped as wt
+
+from typing import TypedDict
+from torch.cuda import is_available
 from abc import ABC, abstractmethod
-# Assuming BaseEngine is defined elsewhere in your project
+
 from ..BaseEngine import BaseEngine

+class Word(TypedDict):
+    start: str
+    end: str
+    text: str

 class BaseTTSEngine(BaseEngine):

@@ -10,7 +18,53 @@ class BaseTTSEngine(BaseEngine):
    def synthesize(self, text: str, path: str) -> str:
        pass
    
+    def time_with_whisper(self, path: str) -> list[Word]:
+            """
+            Transcribes the audio file at the given path with the "tiny" whisper_timestamped model and returns a list of words.
+
+            Args:
+                path (str): The path to the audio file.
+
+            Returns:
+                list[Word]: A list of Word objects representing the transcribed words.
+                Example:
+                ```json
+                [
+                    {
+                        "start": "0.00",
+                        "end": "0.50",
+                        "text": "Hello"
+                    },
+                    {
+                        "start": "0.50",
+                        "end": "1.00",
+                        "text": "world"
+                    }
+                ]
+                ```
+            """
+            device = "cuda" if is_available() else "cpu"
+            audio = wt.load_audio(path)
+            model = wt.load_model("tiny", device=device)
+            
+            result = wt.transcribe(model=model, audio=audio)
+            results = [word for chunk in result for word in chunk["words"]]
+            for result in results:
+                # Not needed for the current use case
+                del result["confidence"]
+            return results
+
    def force_duration(self, duration: float, path: str):
+        """
+        Forces the audio clip at the given path to have the specified duration.
+
+        Args:
+            duration (float): The desired duration in seconds.
+            path (str): The path to the audio clip file.
+
+        Returns:
+            None
+        """
        audio_clip = mp.AudioFileClip(path)
        
        if audio_clip.duration > duration:
--- a/src/engines/TTSEngine/CoquiTTSEngine.py
+++ b/src/engines/TTSEngine/CoquiTTSEngine.py
@@ -1,9 +1,9 @@
 import gradio as gr

-# import TTS
+import TTS
 import os

-# import torch
+import torch

 from .BaseTTSEngine import BaseTTSEngine

@@ -102,15 +102,25 @@ class CoquiTTSEngine(BaseTTSEngine):

        os.environ["COQUI_TOS_AGREED"] = "1"

-    #        self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
-    #       device = "cuda" if torch.cuda.is_available() else "cpu"
-    #       self.tts.to(device)
+        self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tts.to(device)

-    def synthesize(self, text: str, path: str) -> str:
-        #      self.tts.tts_to_file(text=text, file_path=path, lang=self.language, speaker=self.voice)
-        if self.to_force_duration:
-            self.force_duration(float(self.duration), path)
-        return path
+    def synthesize(self, text: str, path: str):
+            """
+            Synthesizes the given text into speech and saves it to the specified file path.
+
+            Args:
+                text (str): The text to synthesize into speech.
+                path (str): The file path to save the synthesized speech.
+
+            Returns:
+                list[Word]: The word list returned by time_with_whisper for the synthesized audio file.
+            """
+            self.tts.tts_to_file(text=text, file_path=path, lang=self.language, speaker=self.voice)
+            if self.to_force_duration:
+                self.force_duration(float(self.duration), path)
+            return self.time_with_whisper(path)

    @classmethod
    def get_options(cls) -> list:
@@ -129,12 +139,11 @@ class CoquiTTSEngine(BaseTTSEngine):
            ),
        ]
    
-        duration_checkbox = gr.Checkbox(value=False)
-        duration = gr.Number(label="Duration", value=57, step=1, minimum=10, visible=False)
+        duration_checkbox = gr.Checkbox(label="Force duration", info="Force the duration of the generated audio to be at most the specified value", value=False)
+        duration = gr.Number(label="Duration [s]", value=57, step=1, minimum=10, visible=False)
        duration_switch = lambda x: gr.update(visible=x)
        duration_checkbox.change(duration_switch, inputs=[duration_checkbox], outputs=[duration])
-        duration_checkbox_group = gr.CheckboxGroup([duration_checkbox], label="Force duration")

-        options.append(duration_checkbox_group)
+        options.append(duration_checkbox)
        options.append(duration)
        return options
--- a/src/engines/init.py
+++ b/src/engines/init.py
@@ -1,5 +1,5 @@
-from . import TTSEngine
 from .BaseEngine import BaseEngine
+from . import TTSEngine
 from . import ScriptEngine
 from . import LLMEngine