"""High-level Transcriber facade for xpfcorpus."""
from __future__ import annotations
from pathlib import Path
from typing import Optional
from .engine.processor import TranscriptionProcessor
from .engine.rules import LanguageData, ScriptData
from .exceptions import (
LanguageNotFoundError,
ScriptNotFoundError,
ScriptRequiredError,
VerificationError,
)
from .io.language_code import parse_language_code
from .io.legacy_loader import LegacyLoader
from .io.repository import PackageRepository
class Transcriber:
    """
    High-level grapheme-to-phoneme transcriber.

    Supports multiple data sources:

    - Package repository (default): bundled JSON data
    - External YAML file: custom language definitions
    - Legacy format: .rules and .verify files

    Examples:
        # Basic usage - language with default script
        >>> es = Transcriber("es")
        >>> es.transcribe("ejemplo")
        ['e', 'x', 'e', 'm', 'p', 'l', 'o']

        # Explicit script
        >>> tt = Transcriber("tt", "cyrillic")

        # External YAML file
        >>> custom = Transcriber("custom", yaml_file="my_lang.yaml")

        # Legacy format
        >>> legacy = Transcriber("test", rules_file="es.rules")
    """

    def __init__(
        self,
        language: str,
        script: Optional[str] = None,
        *,
        verify: bool = True,
        yaml_file: Optional[Path | str] = None,
        rules_file: Optional[Path | str] = None,
        verify_file: Optional[Path | str] = None,
    ):
        """
        Initialize a Transcriber for a language.

        The data source is chosen in this order: ``yaml_file`` if given,
        otherwise ``rules_file`` if given, otherwise the package repository.

        Args:
            language: Language code (e.g., "es", "tt", "aak").
                Supports BCP-47 style codes with script and region:
                - "es-ES" (region preserved, treated as variant)
                - "yi-Latn" (script extracted)
                - "tt-cyrillic" (script extracted)
                - "zh-Hans-CN" (script extracted, region preserved)
            script: Script to use (e.g., "latin", "cyrillic").
                Required for multi-script languages without a default.
                If provided, overrides any script in the language code.
            verify: If True, verify the rules on initialization.
                Verification is skipped when the loaded script carries no
                verification data.
            yaml_file: Path to an external YAML file (requires PyYAML).
            rules_file: Path to a legacy .rules file.
            verify_file: Path to a legacy .verify file. Only used together
                with ``rules_file``.

        Raises:
            LanguageNotFoundError: The language is not in the package
                repository (repository source only).
            ScriptNotFoundError: The requested script is not defined for
                the language.
            ScriptRequiredError: The language has several scripts, no
                default, and none was requested.
            VerificationError: ``verify`` is True and verification fails.
        """
        # Parse language code to extract language, script, and region
        parsed_lang, parsed_script, parsed_region = parse_language_code(
            language, script
        )

        # Use explicit script if provided, otherwise use parsed script
        effective_script = script if script is not None else parsed_script

        self._language = parsed_lang
        self._variant: Optional[str] = parsed_region  # variant (region code)
        self._original_code = language  # original code, kept for reference
        self._script: Optional[str] = None
        self._lang_data: Optional[LanguageData] = None
        self._script_data: Optional[ScriptData] = None
        self._processor: Optional[TranscriptionProcessor] = None

        # Load data from the appropriate source
        if yaml_file is not None:
            self._load_from_yaml(yaml_file, effective_script)
        elif rules_file is not None:
            self._load_from_legacy(rules_file, verify_file)
        else:
            self._load_from_repository(
                parsed_lang, effective_script, parsed_region
            )

        # Verify if requested (and only if there is something to verify)
        if verify and self._script_data and self._script_data.verify:
            passed, errors = self._processor.verify(self._script_data.verify)
            if not passed:
                raise VerificationError(parsed_lang, errors)

    def _load_from_yaml(
        self,
        yaml_file: Path | str,
        script: Optional[str],
    ) -> None:
        """Load language data from an external YAML file and pick a script."""
        # Imported lazily so the YAML loader is only needed for this source.
        from .io.yaml_loader import YAMLLoader

        yaml_file = Path(yaml_file)
        self._lang_data = YAMLLoader.load(yaml_file)
        self._resolve_script(script)
        self._init_processor()

    def _load_from_legacy(
        self,
        rules_file: Path | str,
        verify_file: Optional[Path | str],
    ) -> None:
        """
        Load from legacy .rules/.verify files.

        Legacy files yield script data directly (no language-level data),
        so the script is recorded under the fixed name "default".
        """
        self._script_data = LegacyLoader.load_from_files(rules_file, verify_file)
        self._script = "default"
        self._init_processor()

    def _load_from_repository(
        self,
        language: str,
        script: Optional[str],
        region: Optional[str] = None,
    ) -> None:
        """
        Load from the package repository.

        Tries to load variant-specific file (e.g., es-ES.json) first if region
        is provided, then falls back to base language (e.g., es.json) with a
        warning. When falling back, sets self._variant to None.

        Raises:
            LanguageNotFoundError: The base language is not available.
        """
        import warnings

        # Try to load variant first if region is specified
        if region is not None:
            variant_code = f"{language}-{region}"
            if PackageRepository.has_language(variant_code):
                self._lang_data = PackageRepository.load_language(variant_code)
                self._resolve_script(script)
                self._init_processor()
                # Variant successfully loaded, keep self._variant as-is
                return

            # Variant not found, fall back to base language with warning.
            # stacklevel=3 skips this method and __init__ so the warning is
            # attributed to the code that constructed the Transcriber.
            warnings.warn(
                f"Language variant '{variant_code}' not found. "
                f"Falling back to base language '{language}'. "
                f"To create a variant, add {variant_code}.json to the data/languages directory.",
                UserWarning,
                stacklevel=3,
            )
            # Clear variant since we're falling back to base language
            self._variant = None

        # Load base language
        if not PackageRepository.has_language(language):
            available = list(PackageRepository.available_languages().keys())
            raise LanguageNotFoundError(language, available)

        self._lang_data = PackageRepository.load_language(language)
        self._resolve_script(script)
        self._init_processor()

    def _resolve_script(self, script: Optional[str]) -> None:
        """
        Resolve the script to use from language data.

        Resolution order: the explicitly requested script, then the
        language's default script, then the only available script. Does
        nothing when no language data has been loaded.

        Raises:
            ScriptNotFoundError: ``script`` was requested but is not defined.
            ScriptRequiredError: No script requested, no default, and the
                language does not have exactly one script.
        """
        if self._lang_data is None:
            return

        available_scripts = list(self._lang_data.scripts.keys())

        if script is not None:
            # Explicit script requested
            if script not in self._lang_data.scripts:
                raise ScriptNotFoundError(
                    self._language, script, available_scripts
                )
            self._script = script
        elif self._lang_data.default_script is not None:
            # Use default script
            self._script = self._lang_data.default_script
        elif len(available_scripts) == 1:
            # Only one script available
            self._script = available_scripts[0]
        else:
            # No default, multiple scripts - must specify
            raise ScriptRequiredError(self._language, available_scripts)

        self._script_data = self._lang_data.scripts[self._script]

    def _init_processor(self) -> None:
        """Create the transcription processor from the resolved script rules."""
        if self._script_data is not None:
            self._processor = TranscriptionProcessor(self._script_data.rules)

    def transcribe(self, word: str) -> list[str]:
        """
        Transcribe a word from graphemes to phonemes.

        Args:
            word: The word to transcribe.

        Returns:
            List of phoneme strings. An empty list if no processor has
            been initialized.
        """
        if self._processor is None:
            return []
        return self._processor.transcribe(word)

    def verify(self) -> tuple[bool, list[str]]:
        """
        Verify the loaded rules against the verification data.

        Returns:
            Tuple of (all_passed, list_of_error_messages). ``(True, [])``
            when there is no processor, no script data, or no verification
            data to check against.
        """
        if self._processor is None or self._script_data is None:
            return True, []

        verification_data = self._script_data.verify
        # Same guard as __init__: nothing to verify means trivially passing,
        # rather than handing empty/None data to the processor.
        if not verification_data:
            return True, []
        return self._processor.verify(verification_data)

    @property
    def language(self) -> str:
        """The language code."""
        return self._language

    @property
    def script(self) -> Optional[str]:
        """The script being used."""
        return self._script

    @property
    def variant(self) -> Optional[str]:
        """
        The language variant (region code) if a variant file was loaded.

        Returns:
            Region code (e.g., "ES", "MX") if variant file exists,
            otherwise None.

        Examples:
            >>> es = Transcriber("es")
            >>> es.variant  # None (base language)
            >>>
            >>> # If es-ES.json exists:
            >>> es_es = Transcriber("es-ES")
            >>> es_es.variant  # "ES"
            >>>
            >>> # If es-ES.json doesn't exist (falls back to es.json):
            >>> es_es = Transcriber("es-ES")
            >>> es_es.variant  # None
        """
        return self._variant

    @property
    def name(self) -> str:
        """The language name, or "" when no language data is loaded."""
        if self._lang_data is not None:
            return self._lang_data.name
        return ""

    @property
    def family(self) -> str:
        """The language family, or "" when no language data is loaded."""
        if self._lang_data is not None:
            return self._lang_data.family
        return ""

    @property
    def is_compromised(self) -> bool:
        """Whether this language has known issues."""
        if self._lang_data is not None:
            return self._lang_data.compromised is not None
        return False

    def __repr__(self) -> str:
        """Return ``Transcriber('<lang>')``, with the script when one is set."""
        script_part = f", script={self._script!r}" if self._script else ""
        return f"Transcriber({self._language!r}{script_part})"
def available_languages() -> dict[str, dict]:
    """
    Get all available languages from the package repository.

    Thin module-level wrapper around
    ``PackageRepository.available_languages()``.

    Returns:
        Dict mapping language codes to metadata:
        {
            "es": {"scripts": ["latin"], "default": "latin"},
            "tt": {"scripts": ["latin", "cyrillic"], "default": None},
            ...
        }
    """
    return PackageRepository.available_languages()