def main() -> None:
    """Run every registered parser against a fresh build and persist what changed.

    The build is downloaded once and handed to each parser class listed in
    ``PARSERS``. Calling a parser yields ``(dump, digest)``; the dump is written
    to ``build/<FILE_NAME>`` and the digest to ``build/.<FILE_NAME>.hash`` only
    when the digest differs from the one stored by the previous run.

    Exits the process with status ``3`` when no parser produced a change.
    """
    build_dir = pathlib.Path.cwd() / "build"
    build_dir.mkdir(exist_ok=True)

    # Download once; every parser works on the same build text.
    build_download: str = dowload()

    changes: bool = False
    for parser_cls in PARSERS:
        parser = parser_cls(build_download)
        out_path = build_dir / parser.FILE_NAME
        hash_path = build_dir / f".{parser.FILE_NAME}.hash"

        # Keep the historical behaviour of always leaving an output file behind,
        # even if parsing fails further down.
        if not out_path.exists():
            out_path.touch()

        # A missing hash file means "never generated": compare against "".
        current_hash = hash_path.read_text(encoding="utf-8") if hash_path.exists() else ""

        new_dump: bytes
        new_hash: str
        new_dump, new_hash = parser()

        if current_hash == new_hash:
            print(f"No changes for {parser.FILE_NAME}")
            continue

        out_path.write_bytes(new_dump)
        hash_path.write_text(new_hash, encoding="utf-8")

        changes = True
        # Report the file this parser actually wrote instead of a fixed name.
        print(f"Updated {parser.FILE_NAME}")

    if not changes:
        sys.exit(3)  # No changes
# pyright: ignore[reportExplicitAny] - -PATTERN = re.compile(r"""(?<=\(')(\{"emojis".*?\})(?='\))""") - - -class ExtractError(Exception): - """Base class for all extract errors.""" - - -class NotFoundError(ExtractError): - """No matches found in the build.""" - - -class MultipleFoundError(ExtractError): - """Multiple matches found in the build.""" - - -_SUR = re.compile(r"[\uD800-\uDFFF]") - - -def report_surrogates(node: AnyDict | AnyList | AnyTuple | str, path: str = "") -> None: - r"""Recursively walk *node* (dict / list / tuple / str) and print the location and code-point of every UTF-16 surrogate half it encounters. - - >>> data = {"a": "OK", "b": ["\\uD83D", {"c": "x\\uDE00y"}]} - >>> report_surrogates(data) - b[0] : U+D83D - b[1].c : U+DE00 - """ # noqa: E501 - if isinstance(node, str): - for m in _SUR.finditer(node): - cp = ord(m.group()) - warn(f"Surrogate found at {path or ''} : U+{cp:04X}", SyntaxWarning, 2) - return - - if isinstance(node, Mapping): - for k, v in node.items(): - sub = f"{path}.{k}" if path else str(k) - report_surrogates(v, sub) - return - - if isinstance(node, Sequence) and not isinstance(node, (str, bytes, bytearray)): # pyright: ignore[reportUnnecessaryIsInstance] - for i, v in enumerate(node): - report_surrogates(v, f"{path}[{i}]") - - -def extract_emojis_from_str(content: str) -> AnyDict: - """Extract emojis from a string containing the discord build.""" - print("Searching for emojis...") - matches: list[str] = PATTERN.findall(content) - - if len(matches) == 0: - raise NotFoundError("No matches found") - if len(matches) > 1: - raise MultipleFoundError("Multiple matches found") - - match: str = matches[0] - - print("Found emojis!") - print("Parsing json") - - # First load with json5 to handle \x escapes, then dump and reload with json to collapse - # the surrogate pairs into a single code point. This is necessary because json5 doesn't - # support surrogate pairs, and json doesn't support \x escapes. 
class ParseError(Exception, ABC):
    """Base class for all parsing errors."""


class BuildParser(ABC):
    """Base class for all build parsers.

    A parser is constructed with the text of a discord build and, when called,
    returns the serialized data it extracted together with a hash of that data.

    Subclasses are expected to set ``FILE_NAME`` and implement ``__call__``.
    """

    # Name of the output file for this parser; only declared here, so each
    # concrete parser has to assign it.
    FILE_NAME: str

    def __init__(self, discord_build: str) -> None:
        """Initialize the parser with the discord build.

        Args:
            discord_build (str): The content of the discord build to parse.

        """
        self.discord_build: str = discord_build

    @abstractmethod
    def __call__(self) -> tuple[bytes, str]:
        """Return the parsed data and a hash for verifying whether the data has changed.

        Returns:
            tuple[bytes, str]: A tuple containing the serialized parsed data and a hash string.

        """
@staticmethod
def extract_emojis_from_str(content: str) -> EmojisData:
    """Extract emojis from a string containing the discord build.

    The build is searched with ``PATTERN`` and exactly one match is expected.

    Args:
        content (str): The content of the discord build.

    Returns:
        EmojisData: The decoded emoji data.

    Raises:
        NotFoundError: If the pattern does not match anywhere in ``content``.
        MultipleFoundError: If the pattern matches more than once.

    """
    print("Searching for emojis...")
    matches: list[str] = PATTERN.findall(content)

    if not matches:
        raise NotFoundError("No matches found")
    if len(matches) > 1:
        raise MultipleFoundError("Multiple matches found")

    # Exactly one candidate is left after the two guards above.
    (raw_json,) = matches

    print("Found emojis!")
    print("Parsing json")

    # First load with json5 to handle \x escapes, then dump and reload with json to collapse
    # the surrogate pairs into a single code point. This is necessary because json5 doesn't
    # support surrogate pairs, and json doesn't support \x escapes.
    r: EmojisData = json.loads(json.dumps(json5.loads(raw_json)))

    return r