# Source code for xpfcorpus.io.legacy_loader

"""Legacy format loader for .rules and .verify files."""

from __future__ import annotations

import csv
import re
from pathlib import Path
from typing import TextIO

from ..engine.rules import RuleSet, ScriptData, SubRule, VerifyEntry
from ..exceptions import RulesParseError


def _sniff_dialect(filestream: TextIO) -> tuple[list[str], csv.Dialect]:
    """
    Determine the CSV dialect of a file.

    Returns (lines, dialect) where lines excludes comments and empty lines.
    """
    lines = [
        line for line in filestream
        if not (line.startswith("#") or len(line.strip()) == 0)
    ]

    if all(line.find("\t") >= 0 for line in lines):
        dialect = csv.get_dialect("excel-tab")
    else:
        sample = "\n".join(lines)
        try:
            dialect = csv.Sniffer().sniff(sample)
        except csv.Error:
            dialect = csv.get_dialect("excel")

    return lines, dialect


class LegacyLoader:
    """Load language data from .rules and .verify files."""

    # Matches text that still contains a ``{...}`` class reference.
    _CLASS_REF = re.compile(r"\{.*\}")
    # Cap on expansion passes so that a class whose value refers back to
    # itself cannot keep the loader spinning forever.
    _MAX_EXPANSION_PASSES = 10

    @classmethod
    def load_rules(cls, path: Path | str) -> RuleSet:
        """
        Load rules from a .rules file.

        Each row is dispatched on its ``type`` column: ``class``, ``pre``,
        ``match``, ``sub``, ``ipasub`` or ``word``. Rows of any other type
        are ignored, and a row that raises while being processed is
        skipped.

        Args:
            path: Path to the .rules file.

        Returns:
            RuleSet object.

        Raises:
            RulesParseError: If ``path`` does not exist.
        """
        path = Path(path)

        if not path.exists():
            raise RulesParseError(str(path), "File not found")

        classes: dict[str, str] = {}
        pre: dict[str, str] = {}
        matches: dict[str, str] = {}
        subs: list[SubRule] = []
        ipasubs: list[SubRule] = []
        words: dict[str, list[str]] = {}

        with open(path, "r", encoding="utf-8") as f:
            lines, dialect = _sniff_dialect(f)
            reader = csv.DictReader(lines, dialect=dialect)

            for row in reader:
                # Deliberately best-effort: the whole row is handled inside
                # one try so that a malformed rule is skipped rather than
                # aborting the load.
                try:
                    rule_type = row.get("type", "").strip()

                    if rule_type == "class":
                        classes[row.get("sfrom", "")] = row.get("sto", "")

                    elif rule_type == "pre":
                        # Pre rules map character-to-character; zip stops
                        # at the shorter side, so surplus source
                        # characters are left unmapped.
                        pre.update(
                            zip(row.get("sfrom", ""), row.get("sto", ""))
                        )

                    elif rule_type == "match":
                        sfrom = row.get("sfrom", "")
                        # Bounded expansion, same limit as sub rules.
                        matches[sfrom] = cls._expand_classes(
                            row.get("sto", ""), classes
                        )

                    elif rule_type in ("sub", "ipasub"):
                        target = subs if rule_type == "sub" else ipasubs
                        target.append(cls._parse_sub_rule(row, classes))

                    elif rule_type == "word":
                        word = row.get("sfrom", "")
                        words[word] = row.get("sto", "").split()

                except Exception:
                    # Skip malformed rules. Kept broad on purpose: the
                    # exceptions SubRule construction may raise are not
                    # visible from this module.
                    continue

        return RuleSet(
            classes=classes,
            pre=pre,
            matches=matches,
            subs=subs,
            ipasubs=ipasubs,
            words=words,
        )

    @classmethod
    def _expand_classes(cls, text: str, classes: dict[str, str]) -> str:
        """
        Expand ``{name}`` class references in ``text``.

        ``str.format`` is applied repeatedly so that classes defined in
        terms of other classes are resolved too. Expansion stops as soon
        as no ``{...}`` remains, or after ``_MAX_EXPANSION_PASSES`` passes.

        Args:
            text: String that may contain class references.
            classes: Mapping of class name to replacement text.

        Returns:
            The expanded string. It may still contain braces if the pass
            limit was reached.

        Raises:
            KeyError, IndexError, ValueError: Propagated from
                ``str.format`` for unknown names or malformed fields.
        """
        for _ in range(cls._MAX_EXPANSION_PASSES):
            if not cls._CLASS_REF.search(text):
                break
            text = text.format(**classes)
        return text

    @classmethod
    def _parse_sub_rule(
        cls, row: dict[str, str], classes: dict[str, str]
    ) -> SubRule:
        """
        Parse a sub/ipasub rule from a CSV row.

        Args:
            row: CSV row; the ``sfrom``, ``sto``, ``precede``, ``follow``
                and ``weight`` columns are read.
            classes: Class definitions used to expand ``{name}`` references.

        Returns:
            SubRule with class references expanded in all four pattern
            fields. The weight is 1.0 when the column is missing, empty
            or not a valid float.
        """
        weight_str = row.get("weight", "1.0")
        try:
            weight = float(weight_str) if weight_str else 1.0
        except ValueError:
            weight = 1.0

        return SubRule(
            sfrom=cls._expand_classes(row.get("sfrom", ""), classes),
            sto=cls._expand_classes(row.get("sto", ""), classes),
            weight=weight,
            precede=cls._expand_classes(row.get("precede", ""), classes),
            follow=cls._expand_classes(row.get("follow", ""), classes),
        )

    @classmethod
    def load_verify(cls, path: Path | str) -> list[VerifyEntry]:
        """
        Load verification entries from a .verify file.

        Rows with fewer than two columns, or with an empty word or empty
        phonemes after stripping, are skipped.

        Args:
            path: Path to the .verify or .verify.csv file.

        Returns:
            List of VerifyEntry objects; empty if ``path`` does not exist.
        """
        path = Path(path)

        if not path.exists():
            return []

        entries: list[VerifyEntry] = []

        with open(path, "r", encoding="utf-8") as f:
            lines, dialect = _sniff_dialect(f)
            reader = csv.reader(lines, dialect=dialect)

            for row in reader:
                if len(row) < 2:
                    continue

                word = row[0].strip()
                phonemes = row[1].strip()
                # The third column is an optional comment.
                comment = row[2].strip() if len(row) > 2 else ""

                if word and phonemes:
                    entries.append(
                        VerifyEntry(
                            word=word,
                            phonemes=phonemes,
                            comment=comment,
                        )
                    )

        return entries

    @classmethod
    def load_from_files(
        cls,
        rules_path: Path | str,
        verify_path: Path | str | None = None,
    ) -> ScriptData:
        """
        Load script data from .rules and .verify files.

        Args:
            rules_path: Path to the .rules file.
            verify_path: Path to the .verify file (optional).

        Returns:
            ScriptData object. Its verify list is empty when no
            ``verify_path`` is given.
        """
        rules = cls.load_rules(rules_path)
        verify: list[VerifyEntry] = []

        if verify_path:
            verify = cls.load_verify(verify_path)

        return ScriptData(rules=rules, verify=verify)