Source code for xpfcorpus.io.json_loader

"""JSON format loader for xpfcorpus - no external dependencies."""

from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any

from ..engine.rules import (
    LanguageData,
    RuleSet,
    ScriptData,
    SubRule,
    VerifyEntry,
)


class JSONLoader:
    """Load language data from JSON files."""

    # Matches any ``{...}`` span on a single line, i.e. a class reference
    # (or brace escape) that still has to go through ``str.format``.
    _CLASS_REF = re.compile(r"\{.*\}")

    # Upper bound on expansion passes for one string. Each pass resolves one
    # level of class nesting, so real data needs only a handful; the cap turns
    # cyclic class definitions into an error instead of an endless loop.
    _MAX_EXPANSION_PASSES = 20

    @classmethod
    def load(cls, path: Path | str) -> LanguageData:
        """
        Load language data from a JSON file.

        Args:
            path: Path to the JSON file (read as UTF-8).

        Returns:
            LanguageData object.
        """
        path = Path(path)
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls.from_dict(data)

    @classmethod
    def load_string(cls, content: str) -> LanguageData:
        """Load language data from a JSON string."""
        data = json.loads(content)
        return cls.from_dict(data)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> LanguageData:
        """
        Convert a dictionary to LanguageData.

        Expected structure:
            {
                "metadata": {
                    "code": "es",
                    "name": "Spanish",
                    "family": "...",
                    "macroarea": "...",
                    "compromised": false,
                    "default_script": "latin"
                },
                "scripts": {
                    "latin": {
                        "verify": [...],
                        "rules": {...}
                    }
                }
            }

        An optional ``"classes"`` mapping inside ``"metadata"`` is passed to
        every script as its set of global classes.

        Args:
            data: Parsed JSON document.

        Returns:
            LanguageData built from the metadata and the parsed scripts.
            Missing string metadata falls back to ``""``; missing
            ``compromised`` / ``default_script`` are passed through as None.
        """
        metadata = data.get("metadata", {})
        # Global classes are the same for every script; look them up once.
        global_classes = metadata.get("classes", {})

        scripts: dict[str, ScriptData] = {
            script_name: cls._parse_script_data(script_data, global_classes)
            for script_name, script_data in data.get("scripts", {}).items()
        }

        return LanguageData(
            code=metadata.get("code", ""),
            name=metadata.get("name", ""),
            family=metadata.get("family", ""),
            macroarea=metadata.get("macroarea", ""),
            compromised=metadata.get("compromised"),
            default_script=metadata.get("default_script"),
            scripts=scripts,
        )

    @classmethod
    def _parse_script_data(
        cls, data: dict[str, Any], global_classes: dict[str, str]
    ) -> ScriptData:
        """Parse a script's rules and verify data."""
        rules_data = data.get("rules", {})
        verify_data = data.get("verify", [])

        # Parse verify entries
        verify_entries = [
            VerifyEntry(
                word=v.get("word", ""),
                phonemes=v.get("phonemes", ""),
                comment=v.get("comment", ""),
            )
            for v in verify_data
        ]

        # Parse rules
        rules = cls._parse_rules(rules_data, global_classes)

        return ScriptData(rules=rules, verify=verify_entries)

    @classmethod
    def _expand_classes(cls, text: str, classes: dict[str, str]) -> str:
        """
        Expand ``{name}`` class references in ``text`` until none remain.

        Expansion is repeated because a class value may itself contain
        references to other classes.

        Args:
            text: String that may contain ``{name}`` references.
            classes: Mapping of class name to replacement text.

        Returns:
            ``text`` with all references substituted.

        Raises:
            KeyError: If a referenced class name is not in ``classes``.
            ValueError: If references are still present after
                ``_MAX_EXPANSION_PASSES`` passes (e.g. cyclic definitions).
        """
        original = text
        passes = 0
        while cls._CLASS_REF.search(text):
            if passes >= cls._MAX_EXPANSION_PASSES:
                raise ValueError(
                    f"Class expansion of {original!r} did not terminate after "
                    f"{cls._MAX_EXPANSION_PASSES} passes; "
                    "check for cyclic class references"
                )
            text = text.format(**classes)
            passes += 1
        return text

    @classmethod
    def _parse_rules(
        cls, data: dict[str, Any], global_classes: dict[str, str]
    ) -> RuleSet:
        """Parse rules from a dictionary."""
        # Merge global classes with script-specific classes
        # (script-specific entries win on name clashes).
        classes = {**global_classes, **data.get("classes", {})}

        # Parse pre rules
        pre: dict[str, str] = {}
        for p in data.get("pre", []):
            sfrom = p.get("from", "")
            sto = p.get("to", "")
            # Pre rules map character-to-character; zip stops at the shorter
            # string, so unmatched trailing characters are ignored.
            for char, target in zip(sfrom, sto):
                pre[char] = target

        # Parse match rules (simple character mappings).
        # Only the "to" side has class references expanded.
        matches: dict[str, str] = {}
        for m in data.get("match", []):
            sfrom = m.get("from", "")
            sto = m.get("to", "")
            matches[sfrom] = cls._expand_classes(sto, classes)

        # Parse sub rules
        subs = [cls._parse_sub_rule(s, classes) for s in data.get("sub", [])]

        # Parse ipasub rules
        ipasubs = [
            cls._parse_sub_rule(s, classes) for s in data.get("ipasub", [])
        ]

        # Parse word rules: phonemes are a whitespace-separated string.
        words: dict[str, list[str]] = {}
        for w in data.get("word", []):
            word = w.get("word", "")
            phonemes = w.get("phonemes", "")
            words[word] = phonemes.split()

        return RuleSet(
            classes=classes,
            pre=pre,
            matches=matches,
            subs=subs,
            ipasubs=ipasubs,
            words=words,
        )

    @classmethod
    def _parse_sub_rule(
        cls, data: dict[str, Any], classes: dict[str, str]
    ) -> SubRule:
        """
        Parse a single sub rule, expanding class references.

        Args:
            data: Rule dictionary with optional "from", "to", "precede",
                "follow" (default ``""``) and "weight" (default 1.0) keys.
            classes: Mapping of class name to replacement text.

        Returns:
            SubRule with all four string fields expanded.
        """
        weight = float(data.get("weight", 1.0))

        # Expand class references in every string field of the rule.
        sfrom = cls._expand_classes(data.get("from", ""), classes)
        sto = cls._expand_classes(data.get("to", ""), classes)
        precede = cls._expand_classes(data.get("precede", ""), classes)
        follow = cls._expand_classes(data.get("follow", ""), classes)

        return SubRule(
            sfrom=sfrom,
            sto=sto,
            weight=weight,
            precede=precede,
            follow=follow,
        )