♻️ Refactor implementation to make adding new parsers easier in the future (#16)

This commit is contained in:
2025-10-01 20:38:30 +02:00
committed by GitHub
parent a9d753ed25
commit a1e97fc1b2
9 changed files with 179 additions and 104 deletions

2
.gitignore vendored
View File

@@ -2,3 +2,5 @@
__pycache__/
*.pyc
.idea/

View File

@@ -33,6 +33,16 @@ The project uses:
- json5
- requests
### Architecture
The codebase follows a modular parser-based architecture:
- **Base Parser**: Abstract base class (`BuildParser`) defining the interface for all parsers
- **Parser System**: Extensible design allowing multiple parsers to extract different data types from Discord builds
- **Current Parsers**:
- `EmojisParser`: Extracts emoji data from Discord builds
This architecture makes it easy to add new parsers for extracting other types of data (e.g., stickers, assets) in the future.
## Output
The emoji data is saved in `build/emojis.json` in the following format:

View File

@@ -42,5 +42,6 @@ extend-ignore = [
"TRY003",
"T201",
"D100",
"D400"
"D400",
"D104"
]

View File

@@ -1,14 +1,11 @@
# Copyright (c) Paillat-dev
# SPDX-License-Identifier: MIT
import hashlib
import json
import pathlib
import sys
from typing import Any
from download_build import dowload
from extract import extract_emojis_from_str
from parsers import PARSERS
def main() -> None:
@@ -16,34 +13,43 @@ def main() -> None:
build_path = pathlib.Path.cwd() / "build"
build_path.mkdir(exist_ok=True)
out_path = build_path / "emojis.json"
hash_path = build_path / "hash.txt"
changes: bool = False
if not out_path.exists():
out_path.touch()
build_download: str = dowload()
for parser_cls in PARSERS:
parser = parser_cls(build_download)
out_path = build_path / parser.FILE_NAME
hash_path = build_path / f".{parser.FILE_NAME}.hash"
if hash_path.exists():
with hash_path.open("r", encoding="utf-8") as hash_file:
current_hash = hash_file.read()
else:
current_hash = ""
if not out_path.exists():
out_path.touch()
new: dict[Any, Any] = extract_emojis_from_str(dowload()) # pyright: ignore[reportExplicitAny]
new_dump = json.dumps(new, indent=4, ensure_ascii=False).encode("utf-8")
new_hash = hashlib.sha256(string=new_dump).hexdigest()
if hash_path.exists():
with hash_path.open("r", encoding="utf-8") as hash_file:
current_hash = hash_file.read()
else:
current_hash = ""
if current_hash == new_hash:
print("No changes")
new_dump: bytes
new_hash: str
new_dump, new_hash = parser()
if current_hash == new_hash:
print(f"No changes for {parser.FILE_NAME}")
continue
with out_path.open("wb") as out_file:
out_file.write(new_dump)
with hash_path.open("w", encoding="utf-8") as hash_file:
hash_file.write(new_hash)
changes = True
print("Updated emojis.json")
if not changes:
sys.exit(3) # No changes
with out_path.open("wb") as out_file:
out_file.write(new_dump)
with hash_path.open("w", encoding="utf-8") as hash_file:
hash_file.write(new_hash)
print("Updated emojis.json")
if __name__ == "__main__":
main()

View File

@@ -1,80 +1,3 @@
# Copyright (c) Paillat-dev
# SPDX-License-Identifier: MIT
import json
import re
from collections.abc import Mapping, Sequence
from typing import Any
from warnings import warn
import json5
# Loose container aliases used to annotate the decoded emoji payload and the
# values that report_surrogates walks.
type AnyDict = dict[Any, Any] # pyright: ignore[reportExplicitAny]
type AnyList = list[Any] # pyright: ignore[reportExplicitAny]
type AnyTuple = tuple[Any, ...] # pyright: ignore[reportExplicitAny]
# Captures a `{"emojis" ... }` object literal sitting between `('` and `')`.
# The lookarounds keep those delimiters out of the match, and the lazy `.*?`
# stops at the first `}` that is directly followed by `')`.
PATTERN = re.compile(r"""(?<=\(')(\{"emojis".*?\})(?='\))""")
class ExtractError(Exception):
    """Root of the exception hierarchy raised while extracting data from a build."""
class NotFoundError(ExtractError):
    """Raised when the build contains no match at all."""
class MultipleFoundError(ExtractError):
    """Raised when the build contains more than one match."""
# Matches a single code point in the UTF-16 surrogate range (U+D800-U+DFFF).
_SUR = re.compile(r"[\uD800-\uDFFF]")
def report_surrogates(node: AnyDict | AnyList | AnyTuple | str, path: str = "") -> None:
    """Warn about every UTF-16 surrogate half found anywhere inside *node*.

    Strings are scanned directly. Mappings and sequences are walked recursively while
    *path* accumulates the location of the value being inspected (``key``,
    ``parent.key``, ``parent[index]``). Values of any other type are ignored.

    Nothing is printed or returned: each hit is emitted as a ``SyntaxWarning`` whose
    message reads ``Surrogate found at <path> : U+XXXX``, where ``<root>`` stands in
    for an empty path.

    Args:
        node: The value to inspect.
        path: Location of *node* inside the structure being walked; empty at the top level.
    """
    if isinstance(node, str):
        location = path or "<root>"
        for hit in _SUR.finditer(node):
            warn(f"Surrogate found at {location} : U+{ord(hit.group()):04X}", SyntaxWarning, 2)
    elif isinstance(node, Mapping):
        for key, value in node.items():
            # Only nested keys get a dotted prefix; top-level keys stand alone.
            report_surrogates(value, f"{path}.{key}" if path else str(key))
    elif isinstance(node, Sequence) and not isinstance(node, (str, bytes, bytearray)):  # pyright: ignore[reportUnnecessaryIsInstance]
        # Byte-like sequences are skipped on purpose; only element containers are walked.
        for index, item in enumerate(node):
            report_surrogates(item, f"{path}[{index}]")
def extract_emojis_from_str(content: str) -> AnyDict:
    """Locate the single emoji payload embedded in the discord build and decode it.

    Args:
        content: Full text of the discord build.

    Returns:
        The decoded emoji data.

    Raises:
        NotFoundError: If the pattern matches nothing in *content*.
        MultipleFoundError: If the pattern matches more than once in *content*.
    """
    print("Searching for emojis...")
    candidates: list[str] = PATTERN.findall(content)
    if not candidates:
        raise NotFoundError("No matches found")
    if len(candidates) > 1:
        raise MultipleFoundError("Multiple matches found")
    payload: str = candidates[0]
    print("Found emojis!")
    print("Parsing json")
    # Two-step decode: json5 handles the \x escapes that json rejects, and the
    # json dump/load round trip then collapses surrogate pairs into single code
    # points, which json5 does not do on its own.
    relaxed = json5.loads(payload)
    decoded = json.loads(json.dumps(relaxed))
    # Flag any surrogate halves that survived the round trip.
    report_surrogates(decoded)
    return decoded

9
src/parsers/__init__.py Normal file
View File

@@ -0,0 +1,9 @@
# Copyright (c) Paillat-dev
# SPDX-License-Identifier: MIT
from .base import BuildParser
from .emojis_parser import EmojisParser
# Registry of available parser classes; add a new BuildParser subclass here to enable it.
PARSERS: list[type[BuildParser]] = [EmojisParser]
# Names exported by `from parsers import *`.
__all__ = ("PARSERS", "EmojisParser")

32
src/parsers/base.py Normal file
View File

@@ -0,0 +1,32 @@
# Copyright (c) Paillat-dev
# SPDX-License-Identifier: MIT
from abc import ABC, abstractmethod
class ParseError(Exception, ABC):
    """Common ancestor of every error a build parser can raise."""
class BuildParser(ABC):
    """Contract shared by every parser that extracts data from a discord build.

    Subclasses provide ``FILE_NAME`` and implement ``__call__``.
    """

    # File name associated with this parser's output; each subclass sets it.
    FILE_NAME: str

    def __init__(self, discord_build: str) -> None:
        """Store the discord build that will be parsed.

        Args:
            discord_build (str): Raw text content of the discord build.
        """
        self.discord_build: str = discord_build

    @abstractmethod
    def __call__(self) -> tuple[bytes, str]:
        """Parse the stored build.

        Returns:
            tuple[bytes, str]: The parsed data as bytes, plus a hash string used to
                check whether that data has changed.
        """

View File

@@ -0,0 +1,92 @@
# Copyright (c) Paillat-dev
# SPDX-License-Identifier: MIT
import hashlib
import json
import re
from typing import Literal, NotRequired, TypedDict, final, override
import json5
from .base import BuildParser, ParseError
# Captures a `{"emojis" ... }` object literal sitting between `('` and `')`.
# The lookarounds keep those delimiters out of the match, and the lazy `.*?`
# stops at the first `}` that is directly followed by `')`.
PATTERN = re.compile(r"""(?<=\(')(\{"emojis".*?\})(?='\))""")
class EmojiParseError(ParseError):
    """Root of the exception hierarchy raised while parsing emojis."""
class NotFoundError(EmojiParseError):
    """Raised when the build contains no match at all."""
class MultipleFoundError(EmojiParseError):
    """Raised when the build contains more than one match."""
class EmojiData(TypedDict):
"""Structure of an emoji's data."""
names: list[str]
surrogates: str
unicodeVersion: float
spriteIndex: int
hasMultiDiversityParent: NotRequired[bool]
class EmojisData(TypedDict):
    """Top-level shape of the emoji payload extracted from the build."""

    emojis: list[EmojiData]
    emojisByCategory: dict[str, list[int]]
    nameToEmoji: dict[str, int]
    surrogateToEmoji: dict[str, int]
    numDiversitySprites: int
    numNonDiversitySprites: int
# Matches a single code point in the UTF-16 surrogate range (U+D800-U+DFFF).
# NOTE(review): nothing in the visible part of this module references it - confirm it is still needed.
_SUR = re.compile(r"[\uD800-\uDFFF]")
@final
class EmojisParser(BuildParser):
    """Parser that extracts the emoji data from the discord build."""

    FILE_NAME: Literal["emojis.json"] = "emojis.json"  # pyright: ignore[reportIncompatibleVariableOverride]

    @override
    def __call__(self) -> tuple[bytes, str]:
        """Extract the emojis from the stored build and serialize them.

        Returns:
            tuple[bytes, str]: The UTF-8 encoded JSON dump of the emojis and the
                SHA-256 hex digest of that dump.
        """
        extracted = self.extract_emojis_from_str(self.discord_build)
        new_dump = json.dumps(extracted, indent=4, ensure_ascii=False).encode("utf-8")
        # Pass the payload positionally: the documented parameter is ``data``, and the
        # ``string=`` keyword spelling is not part of the documented signature.
        new_hash = hashlib.sha256(new_dump).hexdigest()
        return new_dump, new_hash

    @staticmethod
    def extract_emojis_from_str(content: str) -> EmojisData:
        """Locate the single emoji payload embedded in the discord build and decode it.

        Args:
            content: Full text of the discord build.

        Returns:
            The decoded emoji data.

        Raises:
            NotFoundError: If the pattern matches nothing in *content*.
            MultipleFoundError: If the pattern matches more than once in *content*.
        """
        print("Searching for emojis...")
        matches: list[str] = PATTERN.findall(content)
        if not matches:
            raise NotFoundError("No matches found")
        if len(matches) > 1:
            raise MultipleFoundError("Multiple matches found")
        match: str = matches[0]
        print("Found emojis!")
        print("Parsing json")
        # First load with json5 to handle \x escapes, then dump and reload with json to collapse
        # the surrogate pairs into a single code point. This is necessary because json5 doesn't
        # support surrogate pairs, and json doesn't support \x escapes.
        r: EmojisData = json.loads(json.dumps(json5.loads(match)))
        return r