# Source code for xpfcorpus.io.legacy_loader

"""Legacy format loader for .rules and .verify files."""

from __future__ import annotations

import csv
import re
from pathlib import Path
from typing import TextIO

from ..engine.rules import RuleSet, ScriptData, SubRule, VerifyEntry
from ..exceptions import RulesParseError


def _sniff_dialect(filestream: TextIO) -> tuple[list[str], csv.Dialect]:
    """
    Determine the CSV dialect of a file.

    Returns (lines, dialect) where lines excludes comments and empty lines.
    """
    lines = [
        line for line in filestream
        if not (line.startswith("#") or len(line.strip()) == 0)
    ]

    if all(line.find("\t") >= 0 for line in lines):
        dialect = csv.get_dialect("excel-tab")
    else:
        sample = "\n".join(lines)
        try:
            dialect = csv.Sniffer().sniff(sample)
        except csv.Error:
            dialect = csv.get_dialect("excel")

    return lines, dialect



# [docs]
class LegacyLoader:
    """Load language data from legacy ``.rules`` and ``.verify`` files.

    All entry points are classmethods; the class holds no instance state.
    """

    # Upper bound on class-expansion passes, so that a class whose
    # expansion still contains a ``{...}`` reference (for example one that
    # refers to itself) cannot make loading loop forever.
    _MAX_EXPANSION_PASSES = 10

    @classmethod
    def _expand_classes(cls, text: str, classes: dict[str, str]) -> str:
        """
        Substitute ``{name}`` class references in ``text``.

        ``str.format`` is re-applied while the text still contains a
        ``{...}`` span, so references nested inside a class expansion are
        resolved too, up to ``_MAX_EXPANSION_PASSES`` passes.

        Args:
            text: Rule field that may contain ``{name}`` references.
            classes: Class name to expansion mapping.

        Returns:
            The text after expansion. If the pass limit is reached the
            partially expanded text is returned as is.

        Raises:
            Whatever ``str.format`` raises for a field it cannot fill
            (e.g. ``KeyError`` for an unknown class name); ``TypeError``
            if ``text`` is not a string.
        """
        for _ in range(cls._MAX_EXPANSION_PASSES):
            if not re.search(r"\{.*\}", text):
                break
            text = text.format(**classes)
        return text

    @classmethod
    def load_rules(cls, path: Path | str) -> RuleSet:
        """
        Load rules from a .rules file.

        The first non-comment line is taken as the CSV header. Each row is
        dispatched on its ``type`` column (``class``, ``pre``, ``match``,
        ``sub``, ``ipasub`` or ``word``); rows with any other type are
        ignored. A row that raises while being processed is skipped.

        Args:
            path: Path to the .rules file.

        Returns:
            RuleSet object.

        Raises:
            RulesParseError: If ``path`` does not exist.
        """
        path = Path(path)
        if not path.exists():
            raise RulesParseError(str(path), "File not found")

        classes: dict[str, str] = {}
        pre: dict[str, str] = {}
        matches: dict[str, str] = {}
        subs: list[SubRule] = []
        ipasubs: list[SubRule] = []
        words: dict[str, list[str]] = {}

        with open(path, "r", encoding="utf-8") as f:
            lines, dialect = _sniff_dialect(f)

        for row in csv.DictReader(lines, dialect=dialect):
            try:
                rule_type = row.get("type", "").strip()

                if rule_type == "class":
                    classes[row.get("sfrom", "")] = row.get("sto", "")

                elif rule_type == "pre":
                    # Pre rules map character-to-character; surplus
                    # characters on either side are ignored.
                    pre.update(zip(row.get("sfrom", ""), row.get("sto", "")))

                elif rule_type == "match":
                    # Only classes defined on earlier rows are visible
                    # here. Expansion is bounded, like for sub rules.
                    matches[row.get("sfrom", "")] = cls._expand_classes(
                        row.get("sto", ""), classes
                    )

                elif rule_type == "sub":
                    subs.append(cls._parse_sub_rule(row, classes))

                elif rule_type == "ipasub":
                    ipasubs.append(cls._parse_sub_rule(row, classes))

                elif rule_type == "word":
                    words[row.get("sfrom", "")] = row.get("sto", "").split()

            except Exception:
                # Deliberate best-effort: skip malformed rules (missing
                # required columns, unknown class references, ...).
                continue

        return RuleSet(
            classes=classes,
            pre=pre,
            matches=matches,
            subs=subs,
            ipasubs=ipasubs,
            words=words,
        )

    @classmethod
    def _parse_sub_rule(
        cls, row: dict[str, str], classes: dict[str, str]
    ) -> SubRule:
        """
        Parse a sub/ipasub rule from a CSV row.

        Args:
            row: Row as produced by ``csv.DictReader``.
            classes: Class name to expansion mapping used to resolve
                ``{name}`` references in the rule fields.

        Returns:
            SubRule with class references expanded. The weight falls back
            to 1.0 when the column is empty, missing or not a number.
        """
        sfrom = row.get("sfrom", "")
        sto = row.get("sto", "")
        # DictReader fills columns missing from a short row with None, so
        # the optional context columns are coerced to "" instead of
        # letting the whole rule be discarded.
        precede = row.get("precede") or ""
        follow = row.get("follow") or ""
        weight_str = row.get("weight", "1.0")

        try:
            weight = float(weight_str) if weight_str else 1.0
        except ValueError:
            weight = 1.0

        return SubRule(
            sfrom=cls._expand_classes(sfrom, classes),
            sto=cls._expand_classes(sto, classes),
            weight=weight,
            precede=cls._expand_classes(precede, classes),
            follow=cls._expand_classes(follow, classes),
        )

    @classmethod
    def load_verify(cls, path: Path | str) -> list[VerifyEntry]:
        """
        Load verification entries from a .verify file.

        Rows with fewer than two columns, or with an empty word or phoneme
        column after stripping, are ignored.

        Args:
            path: Path to the .verify or .verify.csv file.

        Returns:
            List of VerifyEntry objects; empty if ``path`` does not exist.
        """
        path = Path(path)
        if not path.exists():
            return []

        entries: list[VerifyEntry] = []

        with open(path, "r", encoding="utf-8") as f:
            lines, dialect = _sniff_dialect(f)

        for row in csv.reader(lines, dialect=dialect):
            if len(row) < 2:
                continue

            word = row[0].strip()
            phonemes = row[1].strip()
            # The third column is an optional free-text comment.
            comment = row[2].strip() if len(row) > 2 else ""

            if word and phonemes:
                entries.append(VerifyEntry(
                    word=word,
                    phonemes=phonemes,
                    comment=comment,
                ))

        return entries

    @classmethod
    def load_from_files(
        cls,
        rules_path: Path | str,
        verify_path: Path | str | None = None,
    ) -> ScriptData:
        """
        Load script data from .rules and .verify files.

        Args:
            rules_path: Path to the .rules file.
            verify_path: Path to the .verify file (optional). When omitted
                the verification list is empty.

        Returns:
            ScriptData object.

        Raises:
            RulesParseError: If ``rules_path`` does not exist.
        """
        rules = cls.load_rules(rules_path)

        verify: list[VerifyEntry] = []
        if verify_path:
            verify = cls.load_verify(verify_path)

        return ScriptData(rules=rules, verify=verify)