Source code for xpfcorpus.io.repository

"""Package repository for bundled JSON data."""

from __future__ import annotations

import json
from functools import lru_cache
from importlib import resources
from typing import Optional

from ..engine.rules import LanguageData
from ..exceptions import LanguageNotFoundError
from .json_loader import JSONLoader
from .yaml_exporter import export_to_yaml


[docs] class PackageRepository: """ Access bundled language data from the package. This uses importlib.resources to access JSON files bundled with the package. """ _index: Optional[dict[str, dict]] = None @classmethod def _get_index(cls) -> dict[str, dict]: """Load and cache the language index.""" if cls._index is None: try: # Python 3.9+ style files = resources.files("xpfcorpus.data") index_file = files.joinpath("index.json") content = index_file.read_text(encoding="utf-8") cls._index = json.loads(content) except (FileNotFoundError, AttributeError, TypeError): # Fallback: no bundled data yet cls._index = {} return cls._index
[docs] @classmethod def available_languages(cls) -> dict[str, dict]: """ Get a dictionary of available languages. Returns: Dict mapping language codes to metadata: { "es": {"scripts": ["latin"], "default": "latin"}, "tt": {"scripts": ["latin", "cyrillic"], "default": null}, ... } """ return cls._get_index().copy()
[docs] @classmethod def has_language(cls, code: str) -> bool: """ Check if a language is available. Checks both the index and the filesystem for language files. This allows variants (e.g., es-ES.json) to exist without being in index.json. """ # First check the index if code in cls._get_index(): return True # Also check if the file exists (for variants not in index) try: files = resources.files("xpfcorpus.data.languages") lang_file = files.joinpath(f"{code}.json") # Try to read to check if it exists _ = lang_file.read_text(encoding="utf-8") return True except (FileNotFoundError, AttributeError, TypeError): return False
[docs] @classmethod def get_scripts(cls, code: str) -> list[str]: """ Get available scripts for a language. Raises: LanguageNotFoundError: If the language is not available. """ index = cls._get_index() if code not in index: raise LanguageNotFoundError(code, list(index.keys())) return index[code].get("scripts", [])
[docs] @classmethod def get_default_script(cls, code: str) -> Optional[str]: """ Get the default script for a language. Returns None if no default is set. Raises: LanguageNotFoundError: If the language is not available. """ index = cls._get_index() if code not in index: raise LanguageNotFoundError(code, list(index.keys())) return index[code].get("default")
[docs] @classmethod @lru_cache(maxsize=128) def load_language(cls, code: str) -> LanguageData: """ Load language data from the bundled JSON files. Args: code: Language code (e.g., "es", "tt"). Returns: LanguageData object. Raises: LanguageNotFoundError: If the language is not available. """ index = cls._get_index() if code not in index: raise LanguageNotFoundError(code, list(index.keys())) try: files = resources.files("xpfcorpus.data.languages") lang_file = files.joinpath(f"{code}.json") content = lang_file.read_text(encoding="utf-8") data = json.loads(content) # Ensure code is set in metadata if "metadata" not in data: data["metadata"] = {} if "code" not in data["metadata"]: data["metadata"]["code"] = code # Add default_script from index if not in file if "default_script" not in data["metadata"]: data["metadata"]["default_script"] = index[code].get("default") return JSONLoader.from_dict(data) except (FileNotFoundError, AttributeError, TypeError) as e: raise LanguageNotFoundError(code) from e
[docs] @classmethod def export_language_yaml(cls, code: str) -> str: """ Export a language's data as a YAML string. Args: code: Language code. Returns: YAML-formatted string. """ lang_data = cls.load_language(code) return export_to_yaml(lang_data)
[docs] @classmethod def clear_cache(cls): """Clear the loaded language cache.""" cls._index = None cls.load_language.cache_clear()