# Source code for xpfcorpus.engine.rules

"""Data structures for xpfcorpus rules and language data."""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import Optional



[docs]
@dataclass
class SubRule:
    """
    A substitution rule with pattern matching and context.

    Wraps a regex-based substitution with optional precede/follow context
    and a weight for rule prioritization.

    Attributes:
        sfrom: Regex source for the text this rule rewrites.
        sto: Replacement template handed to ``re.sub``.
        weight: Value returned by :meth:`matches` on success; also the sole
            key used by ``<`` ordering.
        precede: Regex that must match at the END of the preceding context.
        follow: Regex that must match at the START of the following context.
    """
    sfrom: str
    sto: str
    weight: float = 1.0
    precede: str = ""
    follow: str = ""

    # Compiled regexes (computed on first use). They are excluded from repr
    # and equality so two rules with the same source strings compare equal
    # regardless of whether their caches have been populated yet.
    _sfrom_re: Optional[re.Pattern] = field(default=None, repr=False, compare=False)
    _precede_re: Optional[re.Pattern] = field(default=None, repr=False, compare=False)
    _follow_re: Optional[re.Pattern] = field(default=None, repr=False, compare=False)

    @property
    def sfrom_re(self) -> re.Pattern:
        """Compiled form of ``sfrom``, built lazily and cached."""
        if self._sfrom_re is None:
            self._sfrom_re = re.compile(self.sfrom)
        return self._sfrom_re

    @property
    def precede_re(self) -> re.Pattern:
        """Compiled ``precede`` pattern anchored to the end of the text."""
        if self._precede_re is None:
            # Group before anchoring: with plain concatenation a pattern such
            # as "x|y" would become "x|y$", where "$" only constrains "y".
            self._precede_re = re.compile(f"(?:{self.precede})$")
        return self._precede_re

    @property
    def follow_re(self) -> re.Pattern:
        """Compiled ``follow`` pattern anchored to the start of the text."""
        if self._follow_re is None:
            # Group before anchoring: "^" + "p|q" would only anchor "p".
            self._follow_re = re.compile(f"^(?:{self.follow})")
        return self._follow_re

    def matches(self, sfrom: str, precede: str, follow: str) -> Optional[float]:
        """
        Check if this rule matches the given context.

        Args:
            sfrom: Candidate text; tested with ``re.match``, so the rule's
                ``sfrom`` pattern must match starting at its first character.
            precede: Text before the candidate; the rule's ``precede``
                pattern must match at its end.
            follow: Text after the candidate; the rule's ``follow`` pattern
                must match at its start.

        Returns:
            The rule weight if all three checks succeed, None otherwise.
            Note that a weight of ``0.0`` is a successful match, so callers
            should compare against ``None`` rather than test truthiness.
        """
        if self.sfrom_re.match(sfrom) is None:
            return None
        if self.precede_re.search(precede) is None:
            return None
        if self.follow_re.search(follow) is None:
            return None
        return self.weight

    def substitute(self, text: str) -> str:
        """Apply this rule's substitution to the given text.

        Every non-overlapping match of ``sfrom`` in ``text`` is replaced by
        ``sto``; the precede/follow context is not consulted here.
        """
        return self.sfrom_re.sub(self.sto, text)

    def __lt__(self, other: SubRule) -> bool:
        """Order rules by ``weight`` only (lighter rules sort first)."""
        if not isinstance(other, SubRule):
            # Let Python try the reflected operation; if that also fails the
            # interpreter raises TypeError on our behalf.
            return NotImplemented
        return self.weight < other.weight




[docs]
@dataclass
class RuleSet:
    """
    A complete set of rules for translating a script.

    Contains:
    - classes: character class definitions for use in other rules
    - pre: character-level preprocessing (as a translation table)
    - matches: simple character-to-phoneme mappings (no context)
    - subs: context-sensitive substitution rules
    - ipasubs: post-processing substitution rules on IPA output
    - words: whole-word exception mappings
    """
    classes: dict[str, str] = field(default_factory=dict)
    pre: dict[str, str] = field(default_factory=dict)  # sfrom -> sto for maketrans
    matches: dict[str, str] = field(default_factory=dict)
    subs: list[SubRule] = field(default_factory=list)
    ipasubs: list[SubRule] = field(default_factory=list)
    words: dict[str, list[str]] = field(default_factory=dict)

    def get_pre_translation_table(self) -> dict[int, int]:
        """Build a ``str.translate`` table from the pre rules.

        Each ``pre`` entry maps a string of source characters to an
        equally long string of replacements, paired up position by
        position. Entries are applied in insertion order, so a character
        that appears in several entries takes its mapping from the last one.

        Returns:
            A mapping from source code point to replacement code point;
            empty when there are no pre rules.

        Raises:
            ValueError: If an entry's source and target strings differ in
                length.
        """
        table: dict[int, int] = {}
        for sfrom, sto in self.pre.items():
            # Validate pair by pair: concatenating all keys and all values
            # first would let mismatched pairs whose lengths happen to
            # cancel out shift every later character onto the wrong target.
            if len(sfrom) != len(sto):
                raise ValueError(
                    f"pre rule {sfrom!r} -> {sto!r} must map strings of "
                    f"equal length (got {len(sfrom)} and {len(sto)})"
                )
            table.update(str.maketrans(sfrom, sto))
        return table





[docs]
@dataclass
class VerifyEntry:
    """One verification case pairing a word with its expected phonemes.

    Attributes:
        word: The input word.
        phonemes: The phoneme string expected for ``word``.
        comment: Optional free-text note; empty when omitted.
    """

    word: str
    phonemes: str
    comment: str = ""




[docs]
@dataclass
class ScriptData:
    """Everything held for one script of a language.

    Attributes:
        rules: The rule set for this script.
        verify: Verification entries for this script; a fresh empty list
            per instance when not supplied.
    """

    rules: RuleSet
    verify: list[VerifyEntry] = field(default_factory=list)




[docs]
@dataclass
class LanguageData:
    """
    All data held for one language, keyed by script.

    A language can carry several scripts (e.g., tt-latin, tt-cyrillic).
    When ``default_script`` is set, it is the script looked up whenever a
    caller does not name one explicitly.
    """

    code: str
    name: str = ""
    family: str = ""
    macroarea: str = ""
    compromised: Optional[dict | bool] = None
    default_script: Optional[str] = None
    scripts: dict[str, ScriptData] = field(default_factory=dict)

    def get_script_data(self, script: Optional[str] = None) -> ScriptData:
        """
        Return the ScriptData for ``script``, falling back to the default.

        Raises ValueError when no script is given and no default is set,
        or when the resolved script name is not present in ``scripts``.
        """
        # Resolve the name first: an explicit argument wins over the default.
        chosen = self.default_script if script is None else script

        if chosen is None:
            names = ", ".join(self.scripts)
            raise ValueError(
                f"Language '{self.code}' has no default script; "
                f"specify one of: {names}"
            )

        if chosen in self.scripts:
            return self.scripts[chosen]

        names = ", ".join(self.scripts)
        raise ValueError(
            f"Script '{chosen}' not found for language '{self.code}'; "
            f"available scripts: {names}"
        )