First commit

2026-03-02 22:14:54 +00:00 · 2025-05-03 16:22:11 +02:00
commit 5c622e590f
11 changed files with 39956 additions and 0 deletions
--- a/src/main.py
+++ b/src/main.py
@@ -0,0 +1,47 @@
+import os
+import pathlib
+import hashlib
+import json
+
+import sys
+from typing import Any
+
+from extract import extract_emojis_from_str
+from download_build import dowload
+
+
+def main() -> None:
+    """Rebuild ``build/emojis.json`` from the latest Discord build.
+
+    The extracted emoji data is dumped as UTF-8 JSON and its SHA-256 digest is
+    stored next to it in ``build/hash.txt``. When the freshly computed digest
+    equals the stored one and the output file is present, nothing is written
+    and the process exits with status 2 so callers can tell "no changes" apart
+    from a successful update.
+
+    Raises:
+        SystemExit: With code 2 when the emoji data is unchanged.
+    """
+    build_path = pathlib.Path(os.getcwd()) / "build"
+    build_path.mkdir(exist_ok=True)
+
+    out_path = build_path / "emojis.json"
+    hash_path = build_path / "hash.txt"
+
+    # Surrounding whitespace is stripped so a trailing newline added by an
+    # editor or another tool cannot make an identical digest look different.
+    if hash_path.exists():
+        current_hash = hash_path.read_text(encoding="utf-8").strip()
+    else:
+        current_hash = ""
+
+    new: dict[Any, Any] = extract_emojis_from_str(dowload())  # pyright: ignore[reportExplicitAny]
+    new_dump = json.dumps(new, indent=4, ensure_ascii=False).encode("utf-8")
+    # The data is passed positionally: the ``string=`` keyword is not part of
+    # the documented hashlib constructor signature.
+    new_hash = hashlib.sha256(new_dump).hexdigest()
+
+    # The stored digest is only trusted while the file it describes exists;
+    # otherwise a leftover hash.txt would keep emojis.json from being rebuilt.
+    if out_path.is_file() and current_hash == new_hash:
+        print("No changes")
+        sys.exit(2)  # No changes
+
+    # The output is created here rather than up front, so a failed download or
+    # extraction never leaves an empty emojis.json behind.
+    out_path.write_bytes(new_dump)
+    hash_path.write_text(new_hash, encoding="utf-8")
+
+    print("Updated emojis.json")
+
+
+# Run the update only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/src/download_build.py
+++ b/src/download_build.py
@@ -0,0 +1,10 @@
+import requests
+
+# Raw-content URL of ``current.js`` on the master branch of the
+# Discord-Datamining repository; this is what dowload() fetches.
+URL = "https://raw.githubusercontent.com/Discord-Datamining/Discord-Datamining/refs/heads/master/current.js"
+
+
+def dowload(timeout: float = 30.0) -> str:
+    """Download the file at ``URL`` and return its decoded text.
+
+    Args:
+        timeout: Seconds to wait for the server before giving up. Requests
+            applies no timeout unless one is passed, so without it a stalled
+            connection would block the caller indefinitely.
+
+    Returns:
+        The response body as text.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        requests.Timeout: If the server does not respond within *timeout*.
+    """
+    print("Downloading the latest discord build")
+    response = requests.get(URL, timeout=timeout)
+    response.raise_for_status()
+    return response.text
--- a/src/extract.py
+++ b/src/extract.py
@@ -0,0 +1,78 @@
+import json
+import re
+from warnings import warn
+import json5
+from typing import Any
+from collections.abc import Mapping, Sequence
+
+
+# Loosely typed container aliases used for the parsed JSON data.
+type AnyDict = dict[Any, Any]  # pyright: ignore[reportExplicitAny]
+type AnyList = list[Any]  # pyright: ignore[reportExplicitAny]
+type AnyTuple = tuple[Any, ...]  # pyright: ignore[reportExplicitAny]
+
+# Captures an object literal beginning with {"emojis" that sits between (' and ')
+# - i.e. a single-quoted string passed as a call argument. The lookbehind and
+# lookahead keep the quotes and parentheses out of the capture, and the lazy
+# .*? ends at the first } directly followed by '). Without re.DOTALL the whole
+# literal has to be on one line.
+PATTERN = re.compile(r"""(?<=\(')(\{"emojis".*?\})(?='\))""")
+
+
+class ExtractError(Exception):
+    """Base class for the errors raised while extracting the emoji data."""
+
+
+class NotFoundError(ExtractError):
+    """The emoji payload pattern did not match anywhere in the input."""
+
+
+class MultipleFoundError(ExtractError):
+    """The emoji payload pattern matched more than once in the input."""
+
+
+# Matches one character in the UTF-16 surrogate range U+D800-U+DFFF, i.e. a
+# high or low surrogate half that appears in a str as its own code point.
+_SUR = re.compile(r"[\uD800-\uDFFF]")
+
+
+def report_surrogates(node: AnyDict | AnyList | AnyTuple | str, path: str = "") -> None:
+    """
+    Recursively walk *node* (dict / list / tuple / str) and emit a warning with
+    the location and code point of every UTF-16 surrogate half it encounters.
+
+    Findings are reported through ``warnings.warn``; nothing is printed or
+    returned. String keys of mappings are inspected as well as their values.
+    Leaves that are neither strings nor containers are ignored.
+
+    For ``{"a": "OK", "b": ["\\uD83D", {"c": "x\\uDE00y"}]}`` the warnings are::
+
+        Surrogate found at b[0] : U+D83D
+        Surrogate found at b[1].c : U+DE00
+
+    Args:
+        node: The value to inspect.
+        path: Location of *node* inside the top-level structure. Empty for the
+            root, which is reported as ``<root>``.
+    """
+    if isinstance(node, str):
+        for m in _SUR.finditer(node):
+            cp = ord(m.group())
+            warn(f"Surrogate found at {path or '<root>'} : U+{cp:04X}")
+        return
+
+    if isinstance(node, Mapping):
+        for k, v in node.items():
+            sub = f"{path}.{k}" if path else str(k)
+            # A lone surrogate can hide in a key just as well as in a value;
+            # the "(key)" suffix tells the two kinds of finding apart.
+            if isinstance(k, str):
+                report_surrogates(k, f"{sub} (key)")
+            report_surrogates(v, sub)
+        return
+
+    # str is handled above; bytes-like objects are sequences of ints and are
+    # deliberately not descended into.
+    if isinstance(node, Sequence) and not isinstance(node, (str, bytes, bytearray)):  # pyright: ignore[reportUnnecessaryIsInstance]
+        for i, v in enumerate(node):
+            report_surrogates(v, f"{path}[{i}]")
+
+
+def extract_emojis_from_str(content: str) -> AnyDict:
+    """Locate the embedded emoji JSON literal in *content* and parse it.
+
+    Args:
+        content: The text to search.
+
+    Returns:
+        The parsed emoji data.
+
+    Raises:
+        NotFoundError: If ``PATTERN`` matches nowhere in *content*.
+        MultipleFoundError: If ``PATTERN`` matches more than once.
+    """
+    print("Searching for emojis...")
+    candidates: list[str] = PATTERN.findall(content)
+
+    # Exactly one candidate is acceptable; anything else is ambiguous.
+    if not candidates:
+        raise NotFoundError("No matches found")
+    if len(candidates) > 1:
+        raise MultipleFoundError("Multiple matches found")
+
+    (payload,) = candidates
+
+    print("Found emojis!")
+    print("Parsing json")
+
+    # Two-stage parse: json5 accepts the \x escapes that the strict json module
+    # rejects, but it leaves surrogate pairs as two separate halves. Dumping
+    # the result and re-reading it with json merges each pair into a single
+    # code point.
+    relaxed = json5.loads(payload)
+    ascii_json = json.dumps(relaxed)
+    emojis = json.loads(ascii_json)
+
+    # Anything still reported here is a surrogate half that had no partner.
+    report_surrogates(emojis)
+
+    return emojis