# Source code for xpfcorpus.io.json_loader

"""JSON format loader for xpfcorpus - no external dependencies."""

from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any

from ..engine.rules import (
    LanguageData,
    RuleSet,
    ScriptData,
    SubRule,
    VerifyEntry,
)



# [docs] -- Sphinx viewcode link marker from the rendered page; not part of the module source.
class JSONLoader:
    """Load language data from JSON files.

    The loader keeps no instance state: every entry point is a classmethod.
    ``load`` and ``load_string`` decode JSON text and pass the resulting
    dictionary to ``from_dict``, which builds the ``LanguageData`` object.
    """

    # Matches text that still contains a ``{...}`` span, i.e. a class
    # reference that has not been expanded yet.
    _CLASS_REF_RE = re.compile(r"\{.*\}")

    # Safety cap on expansion passes over a single string.  Each pass
    # resolves one level of class nesting, so well-formed data needs only a
    # handful; the cap exists to turn a circular class definition into an
    # error instead of an endless loop.
    _MAX_EXPANSION_PASSES = 100

    @classmethod
    def load(cls, path: Path | str) -> LanguageData:
        """
        Load language data from a JSON file.

        The file is read as UTF-8.

        Args:
            path: Path to the JSON file.

        Returns:
            LanguageData object.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(Path(path), "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls.from_dict(data)

    @classmethod
    def load_string(cls, content: str) -> LanguageData:
        """
        Load language data from a JSON string.

        Args:
            content: JSON document as text.

        Returns:
            LanguageData object.

        Raises:
            json.JSONDecodeError: If ``content`` is not valid JSON.
        """
        return cls.from_dict(json.loads(content))

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> LanguageData:
        """
        Convert a dictionary to LanguageData.

        Expected structure:
        {
            "metadata": {
                "code": "es",
                "name": "Spanish",
                "family": "...",
                "macroarea": "...",
                "compromised": false,
                "default_script": "latin",
                "classes": {...}
            },
            "scripts": {
                "latin": {
                    "verify": [...],
                    "rules": {...}
                }
            }
        }

        Every key is optional.  Missing ``code``, ``name``, ``family`` and
        ``macroarea`` default to ``""``; missing ``compromised`` and
        ``default_script`` are passed on as ``None``.  ``metadata.classes``
        holds class definitions shared by all scripts; a script's own
        ``rules.classes`` entries override them on a name clash.

        Args:
            data: Decoded JSON document.

        Returns:
            LanguageData object with one ScriptData per entry of
            ``scripts``.
        """
        metadata = data.get("metadata", {})
        shared_classes = metadata.get("classes", {})

        scripts: dict[str, ScriptData] = {
            script_name: cls._parse_script_data(script_data, shared_classes)
            for script_name, script_data in data.get("scripts", {}).items()
        }

        return LanguageData(
            code=metadata.get("code", ""),
            name=metadata.get("name", ""),
            family=metadata.get("family", ""),
            macroarea=metadata.get("macroarea", ""),
            compromised=metadata.get("compromised"),
            default_script=metadata.get("default_script"),
            scripts=scripts,
        )

    @classmethod
    def _expand_classes(cls, text: str, classes: dict[str, str]) -> str:
        """
        Expand ``{name}`` class references in ``text`` until none remain.

        Expansion is done with ``str.format(**classes)`` and repeated, so a
        class whose value refers to another class is resolved as well.

        Args:
            text: String that may contain ``{name}`` references.
            classes: Mapping of class name to replacement text.

        Returns:
            ``text`` with no ``{...}`` span left.

        Raises:
            KeyError: If a referenced name is not in ``classes`` (raised by
                ``str.format``; other ``str.format`` errors propagate too).
            ValueError: If references are still present after
                ``_MAX_EXPANSION_PASSES`` passes, which indicates a circular
                class definition.
        """
        for _ in range(cls._MAX_EXPANSION_PASSES):
            if not cls._CLASS_REF_RE.search(text):
                return text
            text = text.format(**classes)
        if cls._CLASS_REF_RE.search(text):
            raise ValueError(
                "class references still unresolved after "
                f"{cls._MAX_EXPANSION_PASSES} expansion passes "
                f"(circular class definition?): {text[:80]!r}"
            )
        return text

    @classmethod
    def _parse_script_data(
        cls, data: dict[str, Any], global_classes: dict[str, str]
    ) -> ScriptData:
        """
        Parse a script's rules and verify data.

        Args:
            data: One value of the top-level ``scripts`` mapping; its
                ``rules`` (dict) and ``verify`` (list of dicts) keys are
                both optional.
            global_classes: Class definitions shared by all scripts.

        Returns:
            ScriptData holding the parsed RuleSet and one VerifyEntry per
            item of ``verify`` (missing ``word``, ``phonemes`` or
            ``comment`` default to ``""``).
        """
        verify_entries = [
            VerifyEntry(
                word=v.get("word", ""),
                phonemes=v.get("phonemes", ""),
                comment=v.get("comment", ""),
            )
            for v in data.get("verify", [])
        ]
        rules = cls._parse_rules(data.get("rules", {}), global_classes)
        return ScriptData(rules=rules, verify=verify_entries)

    @classmethod
    def _parse_rules(
        cls, data: dict[str, Any], global_classes: dict[str, str]
    ) -> RuleSet:
        """
        Parse rules from a dictionary.

        Recognised keys of ``data`` (all optional):
            classes: name -> text, merged over ``global_classes``.
            pre: list of ``{"from", "to"}`` character-to-character maps.
            match: list of ``{"from", "to"}`` mappings; class references
                are expanded in ``to`` only.
            sub / ipasub: lists of sub-rule dicts (see ``_parse_sub_rule``).
            word: list of ``{"word", "phonemes"}``; ``phonemes`` is split
                on whitespace.

        Args:
            data: The ``rules`` dictionary of one script.
            global_classes: Class definitions shared by all scripts.

        Returns:
            RuleSet built from the parsed sections.

        Raises:
            KeyError: If a rule references an undefined class.
            ValueError: If class definitions are circular.
        """
        # Script-specific classes win over the shared ones on a name clash.
        classes = {**global_classes, **data.get("classes", {})}

        # Pre rules pair the i-th character of "from" with the i-th
        # character of "to"; zip drops any unpaired tail of the longer one.
        pre: dict[str, str] = {}
        for entry in data.get("pre", []):
            pre.update(zip(entry.get("from", ""), entry.get("to", "")))

        # Match rules: the source side is kept as written, only the target
        # side has its class references expanded.
        matches: dict[str, str] = {
            m.get("from", ""): cls._expand_classes(m.get("to", ""), classes)
            for m in data.get("match", [])
        }

        subs = [cls._parse_sub_rule(s, classes) for s in data.get("sub", [])]
        ipasubs = [
            cls._parse_sub_rule(s, classes) for s in data.get("ipasub", [])
        ]

        words: dict[str, list[str]] = {
            w.get("word", ""): w.get("phonemes", "").split()
            for w in data.get("word", [])
        }

        return RuleSet(
            classes=classes,
            pre=pre,
            matches=matches,
            subs=subs,
            ipasubs=ipasubs,
            words=words,
        )

    @classmethod
    def _parse_sub_rule(
        cls, data: dict[str, Any], classes: dict[str, str]
    ) -> SubRule:
        """
        Parse a single sub rule, expanding class references.

        Args:
            data: Rule dictionary with optional keys ``from``, ``to``,
                ``precede``, ``follow`` (each defaulting to ``""``) and
                ``weight`` (defaulting to ``1.0``, converted with
                ``float``).
            classes: Mapping of class name to replacement text.

        Returns:
            SubRule with class references expanded in all four string
            fields.

        Raises:
            KeyError: If a field references an undefined class.
            ValueError: If class definitions are circular, or ``weight``
                cannot be converted to a float.
        """
        weight = float(data.get("weight", 1.0))

        # Each field is expanded exactly once, in a fixed order, so the
        # first offending field is the one that raises.
        sfrom = cls._expand_classes(data.get("from", ""), classes)
        sto = cls._expand_classes(data.get("to", ""), classes)
        precede = cls._expand_classes(data.get("precede", ""), classes)
        follow = cls._expand_classes(data.get("follow", ""), classes)

        return SubRule(
            sfrom=sfrom,
            sto=sto,
            weight=weight,
            precede=precede,
            follow=follow,
        )