♻️ Refactor implementation to support future parser additions (#16)

2026-03-02 22:14:54 +00:00 · 2025-10-01 20:38:30 +02:00
parent a9d753ed25
commit a1e97fc1b2
9 changed files with 179 additions and 104 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,5 @@
 __pycache__/

 *.pyc
+
+.idea/
--- a/README.md
+++ b/README.md
@@ -33,6 +33,16 @@ The project uses:
  - json5
  - requests

+### Architecture
+
+The codebase follows a modular parser-based architecture:
+- **Base Parser**: Abstract base class (`BuildParser`) defining the interface for all parsers
+- **Parser System**: Extensible design allowing multiple parsers to extract different data types from Discord builds
+- **Current Parsers**:
+  - `EmojisParser`: Extracts emoji data from Discord builds
+
+This architecture makes it easy to add new parsers for extracting other types of data (e.g., stickers, assets) in the future.
+
 ## Output

 The emoji data is saved in `build/emojis.json` in the following format:
--- a/build/.emojis.json.hash
+++ b/build/.emojis.json.hash
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,5 +42,6 @@ extend-ignore = [
    "TRY003",
    "T201",
    "D100",
-    "D400"
+    "D400",
+    "D104"
 ]
--- a/src/main.py
+++ b/src/main.py
@@ -1,14 +1,11 @@
 # Copyright (c) Paillat-dev
 # SPDX-License-Identifier: MIT

-import hashlib
-import json
 import pathlib
 import sys
-from typing import Any

 from download_build import dowload
-from extract import extract_emojis_from_str
+from parsers import PARSERS


 def main() -> None:
@@ -16,34 +13,43 @@ def main() -> None:
    build_path = pathlib.Path.cwd() / "build"
    build_path.mkdir(exist_ok=True)

-    out_path = build_path / "emojis.json"
-    hash_path = build_path / "hash.txt"
+    changes: bool = False

-    if not out_path.exists():
-        out_path.touch()
+    build_download: str = dowload()
+    for parser_cls in PARSERS:
+        parser = parser_cls(build_download)
+        out_path = build_path / parser.FILE_NAME
+        hash_path = build_path / f".{parser.FILE_NAME}.hash"

-    if hash_path.exists():
-        with hash_path.open("r", encoding="utf-8") as hash_file:
-            current_hash = hash_file.read()
-    else:
-        current_hash = ""
+        if not out_path.exists():
+            out_path.touch()

-    new: dict[Any, Any] = extract_emojis_from_str(dowload())  # pyright: ignore[reportExplicitAny]
-    new_dump = json.dumps(new, indent=4, ensure_ascii=False).encode("utf-8")
-    new_hash = hashlib.sha256(string=new_dump).hexdigest()
+        if hash_path.exists():
+            with hash_path.open("r", encoding="utf-8") as hash_file:
+                current_hash = hash_file.read()
+        else:
+            current_hash = ""

-    if current_hash == new_hash:
-        print("No changes")
+        new_dump: bytes
+        new_hash: str
+        new_dump, new_hash = parser()  # serialized output bytes and the hash used for change detection
+
+        if current_hash == new_hash:
+            print(f"No changes for {parser.FILE_NAME}")
+            continue
+
+        with out_path.open("wb") as out_file:
+            out_file.write(new_dump)
+
+        with hash_path.open("w", encoding="utf-8") as hash_file:
+            hash_file.write(new_hash)
+
+        changes = True  # at least one output file was rewritten, so main() must not exit with code 3
+        print(f"Updated {parser.FILE_NAME}")
+
+    if not changes:
        sys.exit(3)  # No changes

-    with out_path.open("wb") as out_file:
-        out_file.write(new_dump)
-
-    with hash_path.open("w", encoding="utf-8") as hash_file:
-        hash_file.write(new_hash)
-
-    print("Updated emojis.json")
-

 if __name__ == "__main__":
    main()
--- a/src/extract.py
+++ b/src/extract.py
@@ -1,80 +1,3 @@
 # Copyright (c) Paillat-dev
 # SPDX-License-Identifier: MIT

-import json
-import re
-from collections.abc import Mapping, Sequence
-from typing import Any
-from warnings import warn
-
-import json5
-
-type AnyDict = dict[Any, Any]  # pyright: ignore[reportExplicitAny]
-type AnyList = list[Any]  # pyright: ignore[reportExplicitAny]
-type AnyTuple = tuple[Any, ...]  # pyright: ignore[reportExplicitAny]
-
-PATTERN = re.compile(r"""(?<=\(')(\{"emojis".*?\})(?='\))""")
-
-
-class ExtractError(Exception):
-    """Base class for all extract errors."""
-
-
-class NotFoundError(ExtractError):
-    """No matches found in the build."""
-
-
-class MultipleFoundError(ExtractError):
-    """Multiple matches found in the build."""
-
-
-_SUR = re.compile(r"[\uD800-\uDFFF]")
-
-
-def report_surrogates(node: AnyDict | AnyList | AnyTuple | str, path: str = "") -> None:
-    r"""Recursively walk *node* (dict / list / tuple / str) and print the location and code-point of every UTF-16 surrogate half it encounters.
-
-    >>> data = {"a": "OK", "b": ["\\uD83D", {"c": "x\\uDE00y"}]}
-    >>> report_surrogates(data)
-    b[0] : U+D83D
-    b[1].c : U+DE00
-    """  # noqa: E501
-    if isinstance(node, str):
-        for m in _SUR.finditer(node):
-            cp = ord(m.group())
-            warn(f"Surrogate found at {path or '<root>'} : U+{cp:04X}", SyntaxWarning, 2)
-        return
-
-    if isinstance(node, Mapping):
-        for k, v in node.items():
-            sub = f"{path}.{k}" if path else str(k)
-            report_surrogates(v, sub)
-        return
-
-    if isinstance(node, Sequence) and not isinstance(node, (str, bytes, bytearray)):  # pyright: ignore[reportUnnecessaryIsInstance]
-        for i, v in enumerate(node):
-            report_surrogates(v, f"{path}[{i}]")
-
-
-def extract_emojis_from_str(content: str) -> AnyDict:
-    """Extract emojis from a string containing the discord build."""
-    print("Searching for emojis...")
-    matches: list[str] = PATTERN.findall(content)
-
-    if len(matches) == 0:
-        raise NotFoundError("No matches found")
-    if len(matches) > 1:
-        raise MultipleFoundError("Multiple matches found")
-
-    match: str = matches[0]
-
-    print("Found emojis!")
-    print("Parsing json")
-
-    # First load with json5 to handle \x escapes, then dump and reload with json to collapse
-    # the surrogate pairs into a single code point. This is necessary because json5 doesn't
-    # support surrogate pairs, and json doesn't support \x escapes.
-    r = json.loads(json.dumps(json5.loads(match)))
-    report_surrogates(r)
-
-    return r
--- a/src/parsers/__init__.py
+++ b/src/parsers/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Paillat-dev
+# SPDX-License-Identifier: MIT
+
+from .base import BuildParser
+from .emojis_parser import EmojisParser
+
+PARSERS: list[type[BuildParser]] = [EmojisParser]  # Parser classes that main() instantiates and runs, in order.
+
+__all__ = ("PARSERS", "EmojisParser")
--- a/src/parsers/base.py
+++ b/src/parsers/base.py
@@ -0,0 +1,32 @@
+# Copyright (c) Paillat-dev
+# SPDX-License-Identifier: MIT
+
+from abc import ABC, abstractmethod
+
+
+class ParseError(Exception, ABC):
+    """Base class for all parsing errors; parser-specific error hierarchies derive from it."""
+
+
+class BuildParser(ABC):
+    """Base class for all build parsers."""
+
+    FILE_NAME: str  # Output file name; main() writes the dump to build/<FILE_NAME> and the hash to build/.<FILE_NAME>.hash.
+
+    def __init__(self, discord_build: str) -> None:
+        """Initialize the parser with the discord build.
+
+        Args:
+            discord_build (str): The content of the discord build to parse.
+
+        """
+        self.discord_build: str = discord_build
+
+    @abstractmethod
+    def __call__(self) -> tuple[bytes, str]:
+        """Return a tuple of the parsed data we want to obtain and a hash for verifying whether the data has changed.
+
+        Returns:
+            tuple[bytes, str]: A tuple containing the parsed data serialized as bytes and a hash string.
+
+        """
--- a/src/parsers/emojis_parser.py
+++ b/src/parsers/emojis_parser.py
@@ -0,0 +1,92 @@
+# Copyright (c) Paillat-dev
+# SPDX-License-Identifier: MIT
+
+import hashlib
+import json
+import re
+from typing import Literal, NotRequired, TypedDict, final, override
+
+import json5
+
+from .base import BuildParser, ParseError
+
+PATTERN = re.compile(r"""(?<=\(')(\{"emojis".*?\})(?='\))""")  # Captures the {"emojis" ...} text found between (' and ').
+
+
+class EmojiParseError(ParseError):
+    """Base class for all emoji parsing errors."""
+
+
+class NotFoundError(EmojiParseError):
+    """Raised when `PATTERN` matches nothing in the build."""
+
+
+class MultipleFoundError(EmojiParseError):
+    """Raised when `PATTERN` matches more than once in the build."""
+
+
+class EmojiData(TypedDict):
+    """Structure of a single entry in `EmojisData.emojis`."""
+
+    names: list[str]
+    surrogates: str
+    unicodeVersion: float
+    spriteIndex: int
+    hasMultiDiversityParent: NotRequired[bool]  # The only key that may be absent from an entry.
+
+
+class EmojisData(TypedDict):
+    """Structure of the extracted emojis data (static typing only; the parsed JSON is not validated against it)."""
+
+    emojis: list[EmojiData]
+    emojisByCategory: dict[str, list[int]]  # presumably category -> indices into `emojis` — TODO confirm
+    nameToEmoji: dict[str, int]  # presumably name -> index into `emojis` — TODO confirm
+    surrogateToEmoji: dict[str, int]  # presumably surrogate string -> index into `emojis` — TODO confirm
+    numDiversitySprites: int
+    numNonDiversitySprites: int
+
+
+_SUR = re.compile(r"[\uD800-\uDFFF]")  # NOTE(review): matches lone UTF-16 surrogate halves; not referenced anywhere in this module — confirm and remove.
+
+
+@final
+class EmojisParser(BuildParser):
+    """Parser for extracting emojis from the discord build."""
+
+    FILE_NAME: Literal["emojis.json"] = "emojis.json"  # pyright: ignore[reportIncompatibleVariableOverride]
+
+    @override
+    def __call__(self) -> tuple[bytes, str]:
+        """Extract emojis from the discord build and return them as a JSON dump and a hash.
+
+        Returns:
+            tuple[bytes, str]: The UTF-8 encoded JSON dump of the emojis and the SHA-256 hex digest of that dump.
+
+        """
+        extracted = self.extract_emojis_from_str(self.discord_build)
+        new_dump = json.dumps(extracted, indent=4, ensure_ascii=False).encode("utf-8")
+        new_hash = hashlib.sha256(new_dump).hexdigest()  # positional: the `string=` keyword name is undocumented
+        return new_dump, new_hash
+
+    @staticmethod
+    def extract_emojis_from_str(content: str) -> EmojisData:
+        """Extract the emojis object from the build content; raise NotFoundError or MultipleFoundError unless exactly one match exists."""
+        print("Searching for emojis...")
+        matches: list[str] = PATTERN.findall(content)
+
+        if not matches:
+            raise NotFoundError("No matches found")
+        if len(matches) > 1:
+            raise MultipleFoundError("Multiple matches found")
+
+        match: str = matches[0]
+
+        print("Found emojis!")
+        print("Parsing json")
+
+        # First load with json5 to handle \x escapes, then dump and reload with json to collapse
+        # the surrogate pairs into a single code point. This is necessary because json5 doesn't
+        # support surrogate pairs, and json doesn't support \x escapes.
+        r: EmojisData = json.loads(json.dumps(json5.loads(match)))
+
+        return r