Source code for xpfcorpus.io.language_code

"""Language code parsing utilities for BCP-47 style codes."""

from __future__ import annotations

import re
from typing import Optional, Tuple


# Maps every accepted spelling (lowercased ISO 15924 code or descriptive
# name) to the canonical script name returned by normalize_script().
_SCRIPT_ALIASES = {
    "latn": "latin",
    "latin": "latin",
    "cyrl": "cyrillic",
    "cyrillic": "cyrillic",
    "syll": "syllabics",
    "syllabics": "syllabics",
    "hebr": "hebrew",
    "hebrew": "hebrew",
    "arab": "arabic",
    "arabic": "arabic",
    "hans": "hans",
    "simplified": "hans",
    "hant": "hant",
    "traditional": "hant",
}


def normalize_script(script: str) -> str:
    """
    Map a script code or name onto its canonical lowercase form.

    Both ISO 15924 four-letter codes and a handful of descriptive names
    are recognized, case-insensitively. Anything unrecognized is returned
    lowercased but otherwise untouched.

    Args:
        script: Script name or code (e.g., "Latn", "latin", "cyrillic", "Cyrl")

    Returns:
        The canonical lowercase script name, or the lowercased input when
        no alias is known for it.

    Examples:
        >>> normalize_script("Latn")
        'latin'
        >>> normalize_script("cyrillic")
        'cyrillic'
        >>> normalize_script("Syll")
        'syllabics'
    """
    lowered = script.lower()
    return _SCRIPT_ALIASES.get(lowered, lowered)
# Lowercased script subtags that are recognized regardless of letter case:
# ISO 15924 codes plus every descriptive alias normalize_script() accepts.
# Kept in sync with normalize_script() so that anything it can normalize
# (including "simplified"/"traditional") is also detected inside a code.
_KNOWN_SCRIPT_SUBTAGS = frozenset(
    {
        "latn",
        "latin",
        "cyrl",
        "cyrillic",
        "syll",
        "syllabics",
        "hebr",
        "hebrew",
        "arab",
        "arabic",
        "hans",
        "simplified",
        "hant",
        "traditional",
    }
)


def parse_language_code(
    code: str,
    explicit_script: Optional[str] = None,
) -> Tuple[str, Optional[str], Optional[str]]:
    """
    Parse a language code with optional script and region components.

    Supports BCP-47 style codes like:
    - "es" → ("es", None, None)
    - "es-ES" → ("es", None, "ES") - region preserved
    - "yi-Latn" → ("yi", "latin", None) - script extracted
    - "tt-cyrillic" → ("tt", "cyrillic", None) - script extracted
    - "zh-Hans-CN" → ("zh", "hans", "CN") - script extracted, region preserved
    - "zh-simplified" → ("zh", "hans", None) - descriptive alias accepted

    The first subtag is always taken as the language and lowercased. Each
    remaining subtag is classified as follows; only the first script and the
    first region found are kept, and unrecognized subtags are ignored:

    - script: four ASCII-style letters starting with an uppercase letter
      (ISO 15924 form such as "Latn"), or any known script code/name in
      any letter case;
    - region: two uppercase letters, or three digits (UN M49).

    If an explicit script is provided, it always takes precedence over
    any script extracted from the code.

    Args:
        code: Language code, possibly with script/region subtags.
        explicit_script: Optional explicit script that overrides extracted script.

    Returns:
        Tuple of (language_code, script_or_none, region_or_none).

    Examples:
        >>> parse_language_code("es")
        ('es', None, None)
        >>> parse_language_code("es-ES")
        ('es', None, 'ES')
        >>> parse_language_code("yi-Latn")
        ('yi', 'latin', None)
        >>> parse_language_code("tt-cyrillic")
        ('tt', 'cyrillic', None)
        >>> parse_language_code("zh-Hans-CN")
        ('zh', 'hans', 'CN')
        >>> parse_language_code("yi-Latn", "hebrew")
        ('yi', 'hebrew', None)
    """
    language, *subtags = code.split("-")
    language = language.lower()

    extracted_script: Optional[str] = None
    extracted_region: Optional[str] = None

    for part in subtags:
        # Script and region shapes never overlap (scripts are >= 4 chars,
        # regions are 2 or 3), so the two checks are independent.
        looks_like_script = (
            # isalpha() keeps non-script 4-char subtags such as "X123" out.
            (len(part) == 4 and part.isalpha() and part[0].isupper())
            or part.lower() in _KNOWN_SCRIPT_SUBTAGS
        )
        looks_like_region = (
            # isalpha() rejects mixed subtags such as "A1", which isupper()
            # alone would accept.
            (len(part) == 2 and part.isalpha() and part.isupper())
            or (len(part) == 3 and part.isdigit())  # UN M49 numeric region code
        )

        if looks_like_script and extracted_script is None:
            extracted_script = normalize_script(part)
        elif looks_like_region and extracted_region is None:
            extracted_region = part

    # An explicitly supplied script always wins over the one in the code.
    if explicit_script is not None:
        extracted_script = normalize_script(explicit_script)

    return language, extracted_script, extracted_region