Source code for xpfcorpus.engine.processor

"""Transcription processor - the core algorithm adapted from translate04.py."""

from __future__ import annotations

import re
from typing import Optional

from .rules import RuleSet, SubRule, VerifyEntry



[docs]
class TranscriptionProcessor:
    """
    Core transcription engine that converts graphemes to phonemes.

    This is a pure transcription class with no I/O operations.
    The algorithm is adapted from XPF Corpus's translate04.py.
    """

    # Detects a remaining ``{...}`` class reference in a match target.
    # Compiled once here instead of on every expansion pass.
    _CLASS_REF = re.compile(r"\{.*\}")

    def __init__(self, rules: RuleSet, missing: str = "@"):
        """
        Initialize the processor with a rule set.

        Every target in ``rules.matches`` has its ``{class}`` references
        expanded (repeatedly, so classes may refer to other classes) via
        ``str.format(**rules.classes)``.

        Args:
            rules: The RuleSet containing all transcription rules.
            missing: Character to use for untranscribable graphemes.

        Raises:
            ValueError: If the class references of a match target still
                have not resolved after ``len(rules.classes) + 1``
                expansion passes, which indicates a cyclic class
                definition (previously this looped forever).
            KeyError: If a target references a class that is not defined
                in ``rules.classes`` (raised by ``str.format``).
        """
        self.rules = rules
        self.missing = missing

        # An acyclic chain of class references is at most len(classes)
        # deep, so it resolves within that many passes; one extra pass is
        # allowed as slack before the definitions are declared cyclic.
        max_passes = len(rules.classes) + 1

        # Expand class references in matches
        self._matches: dict[str, str] = {}
        for sfrom, sto in rules.matches.items():
            passes = 0
            while self._CLASS_REF.search(sto):
                if passes >= max_passes:
                    raise ValueError(
                        f"class references in match {sfrom!r} did not "
                        f"resolve after {max_passes} passes "
                        "(cyclic class definition?)"
                    )
                sto = sto.format(**rules.classes)
                passes += 1
            self._matches[sfrom] = sto

        # Build pre-transcription table
        self._pre_table = rules.get_pre_translation_table()

    def transcribe(self, word: str) -> list[str]:
        """
        Transcribe a word from graphemes to phonemes.

        Steps, in order:

        1. If ``word`` (exactly as given) is a whole-word exception in
           ``rules.words``, a copy of that entry is returned.
        2. The word is passed through the pre-transcription table and
           then lowercased.
        3. Each character is transcribed: a direct entry in the expanded
           match table wins; otherwise every rule in ``rules.subs`` is
           tried and the highest-weight match is used; if nothing
           matches, ``missing`` is emitted.  Empty transcriptions are
           dropped.
        4. The space-joined result is passed through every rule in
           ``rules.ipasubs``, in order of descending weight.

        Args:
            word: The word to transcribe.

        Returns:
            List of phoneme strings.
        """
        # Whole-word exceptions bypass the algorithm entirely; copy so the
        # caller cannot mutate the rule set's own entry.
        if word in self.rules.words:
            return self.rules.words[word].copy()

        # Preprocess: apply pre-transcription, then lowercase
        source = word.translate(self._pre_table).lower()

        target_list: list[str] = []
        for idx, char in enumerate(source):
            if char in self._matches:
                # Direct match rule: use it and skip context matching
                transcription = self._matches[char]
            else:
                # Context handed to the rules: everything before and
                # everything after the current character.
                precede = source[:idx]
                follow = source[idx + 1:]

                # Collect (weight, transcription) for every matching rule
                candidates = []
                for rule in self.rules.subs:
                    weight = rule.matches(char, precede, follow)
                    if weight is not None:
                        candidates.append((weight, rule.substitute(char)))

                if candidates:
                    # Highest weight wins; equal weights are ordered by
                    # the transcription itself (tuple comparison), which
                    # is the same element ``sorted(...)[-1]`` would pick.
                    transcription = max(candidates)[1]
                else:
                    transcription = self.missing

            # Skip empty transcriptions
            if transcription:
                target_list.append(transcription)

        # Join with spaces so the ipasub rules see the whole phoneme string
        target_string = " ".join(target_list)

        # Apply ipasub rules in order of descending weight
        for rule in sorted(self.rules.ipasubs, key=lambda r: -r.weight):
            target_string = rule.substitute(target_string)

        return target_string.split()

    def verify(
        self,
        entries: list[VerifyEntry],
        *,
        stop_on_first: bool = False,
    ) -> tuple[bool, list[str]]:
        """
        Verify transcription against expected outputs.

        Each entry's ``word`` is transcribed, the phonemes are joined
        with single spaces, and the result is compared for equality with
        the entry's ``phonemes``.

        Args:
            entries: List of VerifyEntry objects with word/phonemes pairs.
            stop_on_first: If True, stop at the first failure.

        Returns:
            Tuple of (all_passed, list_of_error_messages).
        """
        errors: list[str] = []

        for entry in entries:
            transcribed_str = " ".join(self.transcribe(entry.word))
            expected = entry.phonemes

            if transcribed_str == expected:
                continue

            errors.append(
                f"'{entry.word}' -> '{transcribed_str}' "
                f"(expected: '{expected}')"
            )
            if stop_on_first:
                return False, errors

        return not errors, errors