Refactor code structure and enhance documentation in main modules

- Updated `pyproject.toml` to include configuration for `ruff` linting tool.
- Improved `download_build.py` by adding a docstring to the `dowload` function and a 10-second timeout to its HTTP request.
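- Refactored `__main__.py` for clarity and consistency in path handling: sorted the imports, replaced `os.getcwd()` with `pathlib.Path.cwd()`, and added a docstring to `main`.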
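- Enhanced `extract.py` with docstrings for the error classes and functions, sorted its imports, made `report_surrogates` warn with an explicit `SyntaxWarning` category and a stack level of 2, and replaced a redundant `elif` after `raise` with `if`.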
This commit is contained in:
2025-05-03 18:13:58 +02:00
parent d6df9c76d0
commit 1749dbffc7
4 changed files with 47 additions and 18 deletions

View File

@@ -14,6 +14,33 @@ dev = [
] ]
[tool.basedpyright] [tool.basedpyright]
pythonVersion = "3.13"
typeCheckingMode = "all" typeCheckingMode = "all"
reportUnusedCallResult = false reportUnusedCallResult = false
reportAny = false reportAny = false
[tool.ruff]
target-version = "py313"
line-length = 120
indent-width = 4
[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"
docstring-code-format = false
docstring-code-line-length = "dynamic"
[tool.ruff.lint]
select = ["ALL"]
extend-ignore = [
"D203",
"D213",
"COM812",
"EM101",
"TRY003",
"T201",
"D100",
"D400"
]

View File

@@ -1,20 +1,19 @@
# Copyright (c) Paillat-dev # Copyright (c) Paillat-dev
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
import os
import pathlib
import hashlib import hashlib
import json import json
import pathlib
import sys import sys
from typing import Any from typing import Any
from extract import extract_emojis_from_str
from download_build import dowload from download_build import dowload
from extract import extract_emojis_from_str
def main() -> None: def main() -> None:
build_path = pathlib.Path(os.getcwd()) / "build" """Download the latest discord build and extract emojis."""
build_path = pathlib.Path.cwd() / "build"
build_path.mkdir(exist_ok=True) build_path.mkdir(exist_ok=True)
out_path = build_path / "emojis.json" out_path = build_path / "emojis.json"

View File

@@ -7,7 +7,11 @@ URL = "https://raw.githubusercontent.com/Discord-Datamining/Discord-Datamining/r
def dowload() -> str: def dowload() -> str:
"""Download the latest discord build from the datamining repository.
Returns the content of the file as a string.
"""
print("Downloading the latest discord build") print("Downloading the latest discord build")
response = requests.get(URL) response = requests.get(URL, timeout=10)
response.raise_for_status() response.raise_for_status()
return response.text return response.text

View File

@@ -3,11 +3,11 @@
import json import json
import re import re
from warnings import warn
import json5
from typing import Any
from collections.abc import Mapping, Sequence from collections.abc import Mapping, Sequence
from typing import Any
from warnings import warn
import json5
type AnyDict = dict[Any, Any] # pyright: ignore[reportExplicitAny] type AnyDict = dict[Any, Any] # pyright: ignore[reportExplicitAny]
type AnyList = list[Any] # pyright: ignore[reportExplicitAny] type AnyList = list[Any] # pyright: ignore[reportExplicitAny]
@@ -17,34 +17,32 @@ PATTERN = re.compile(r"""(?<=\(')(\{"emojis".*?\})(?='\))""")
class ExtractError(Exception): class ExtractError(Exception):
pass """Base class for all extract errors."""
class NotFoundError(ExtractError): class NotFoundError(ExtractError):
pass """No matches found in the build."""
class MultipleFoundError(ExtractError): class MultipleFoundError(ExtractError):
pass """Multiple matches found in the build."""
_SUR = re.compile(r"[\uD800-\uDFFF]") _SUR = re.compile(r"[\uD800-\uDFFF]")
def report_surrogates(node: AnyDict | AnyList | AnyTuple | str, path: str = "") -> None: def report_surrogates(node: AnyDict | AnyList | AnyTuple | str, path: str = "") -> None:
""" r"""Recursively walk *node* (dict / list / tuple / str) and print the location and code-point of every UTF-16 surrogate half it encounters.
Recursively walk *node* (dict / list / tuple / str) and print the location
and code-point of every UTF-16 surrogate half it encounters.
>>> data = {"a": "OK", "b": ["\\uD83D", {"c": "x\\uDE00y"}]} >>> data = {"a": "OK", "b": ["\\uD83D", {"c": "x\\uDE00y"}]}
>>> report_surrogates(data) >>> report_surrogates(data)
b[0] : U+D83D b[0] : U+D83D
b[1].c : U+DE00 b[1].c : U+DE00
""" """ # noqa: E501
if isinstance(node, str): if isinstance(node, str):
for m in _SUR.finditer(node): for m in _SUR.finditer(node):
cp = ord(m.group()) cp = ord(m.group())
warn(f"Surrogate found at {path or '<root>'} : U+{cp:04X}") warn(f"Surrogate found at {path or '<root>'} : U+{cp:04X}", SyntaxWarning, 2)
return return
if isinstance(node, Mapping): if isinstance(node, Mapping):
@@ -59,12 +57,13 @@ def report_surrogates(node: AnyDict | AnyList | AnyTuple | str, path: str = "")
def extract_emojis_from_str(content: str) -> AnyDict: def extract_emojis_from_str(content: str) -> AnyDict:
"""Extract emojis from a string containing the discord build."""
print("Searching for emojis...") print("Searching for emojis...")
matches: list[str] = PATTERN.findall(content) matches: list[str] = PATTERN.findall(content)
if len(matches) == 0: if len(matches) == 0:
raise NotFoundError("No matches found") raise NotFoundError("No matches found")
elif len(matches) > 1: if len(matches) > 1:
raise MultipleFoundError("Multiple matches found") raise MultipleFoundError("Multiple matches found")
match: str = matches[0] match: str = matches[0]