"""Czech text utilities — diacritics normalization and month parsing."""

import re
import unicodedata

# Czech month names in nominative / genitive / locative form, already
# stripped of diacritics and lowercased (i.e. as produced by normalize()).
CZECH_MONTHS = {
    "leden": 1, "ledna": 1, "lednu": 1,
    "unor": 2, "unora": 2, "unoru": 2,
    "brezen": 3, "brezna": 3, "breznu": 3,
    "duben": 4, "dubna": 4, "dubnu": 4,
    "kveten": 5, "kvetna": 5, "kvetnu": 5,
    "cerven": 6, "cervna": 6, "cervnu": 6,
    "cervenec": 7, "cervence": 7, "cervenci": 7,
    "srpen": 8, "srpna": 8, "srpnu": 8,
    "zari": 9,
    "rijen": 10, "rijna": 10, "rijnu": 10,
    "listopad": 11, "listopadu": 11,
    "prosinec": 12, "prosince": 12, "prosinci": 12,
}

# Longest names first so that e.g. "cervenec" is tried before "cerven".
# NOTE: the patterns below are built once at import time from CZECH_MONTHS.
_MONTH_ALTERNATION = "|".join(sorted(CZECH_MONTHS, key=len, reverse=True))

# "11+12/2025", "01/26", "1/2026": '+'-joined months, then a 2- or 4-digit
# year that is not immediately followed by another digit.
_NUMERIC_RE = re.compile(r"([\d+]+)\s*/\s*(\d{4}|\d{2})(?!\d)")

# "12.2025": month.year, not embedded in a longer run of digits.
_DOT_RE = re.compile(r"(?<!\d)(\d{1,2})\s*\.\s*(\d{4})(?!\d)")

# "listopad-leden": two whole-word month names joined by a hyphen, en dash
# or em dash (NFKD normalization leaves the dashes untouched).
_RANGE_RE = re.compile(
    rf"\b({_MONTH_ALTERNATION})\s*[-\u2013\u2014]\s*({_MONTH_ALTERNATION})\b"
)

# A single whole-word month name.
_NAME_RE = re.compile(rf"\b({_MONTH_ALTERNATION})\b")


def normalize(text: str) -> str:
    """Return *text* lowercased with diacritics removed.

    The text is NFKD-decomposed, every combining character is dropped and
    the remainder is lowercased, e.g. ``"Říjen"`` -> ``"rijen"``.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(c for c in decomposed if not unicodedata.combining(c)).lower()


def parse_month_references(text: str, default_year: int = 2026) -> list[str]:
    """Extract YYYY-MM month references from Czech free text.

    Recognized forms (matched against ``normalize(text)``):
    - Numeric month/year, optionally several months joined by ``+``:
      "01/26", "1/2026", "11+12/2025". Two-digit years are read as 20YY.
    - Numeric month.year: "12.2025".
    - Czech month names in any declension listed in ``CZECH_MONTHS``:
      "leden", "února", "prosinci".
    - Month-name ranges joined by a hyphen or en/em dash: "listopad-leden"
      (November through January).

    Month names carry no year of their own, so one is derived from
    *default_year*:
    - a range that wraps over the year end (start month > end month) puts
      the months from the start month through December into
      ``default_year - 1`` and the rest into ``default_year``;
    - a non-wrapping range lies entirely in ``default_year``;
    - a standalone October-December name is assumed to refer to
      ``default_year - 1``; any other standalone name to ``default_year``.

    Args:
        text: Free text to scan.
        default_year: Year used for month names without an explicit year.

    Returns:
        Sorted list of unique "YYYY-MM" strings; empty if nothing matched.
    """
    normalized = normalize(text)
    results: set[str] = set()

    # Numeric months with an explicit year after a slash.
    for months_part, year_str in _NUMERIC_RE.findall(normalized):
        year = int(year_str)
        if year < 100:
            year += 2000
        for m_str in months_part.split("+"):
            # Stray '+' signs yield empty pieces; isdigit() filters them out.
            if m_str.isdigit() and 1 <= int(m_str) <= 12:
                results.add(f"{year:04d}-{int(m_str):02d}")

    # Numeric month.year.
    for m_str, year_str in _DOT_RE.findall(normalized):
        m, year = int(m_str), int(year_str)
        if 1 <= m <= 12:
            results.add(f"{year:04d}-{m:02d}")

    # Month-name ranges, expanded month by month.
    found_in_ranges: set[str] = set()
    for start_name, end_name in _RANGE_RE.findall(normalized):
        found_in_ranges.update((start_name, end_name))
        start_m = CZECH_MONTHS[start_name]
        end_m = CZECH_MONTHS[end_name]
        wraps = start_m > end_m
        # Number of steps from start to end, going around December→January.
        span = (end_m - start_m) % 12
        for offset in range(span + 1):
            m = (start_m - 1 + offset) % 12 + 1
            # In a wrapping range the months up to December belong to the
            # previous year.
            year = default_year - 1 if wraps and m >= start_m else default_year
            results.add(f"{year:04d}-{m:02d}")

    # Standalone month names. Names that took part in a range are skipped
    # so the standalone year heuristic cannot contradict the range result.
    for match in _NAME_RE.finditer(normalized):
        name = match.group(1)
        if name in found_in_ranges:
            continue
        m = CZECH_MONTHS[name]
        # Heuristic: a bare Oct-Dec mention most likely refers to the
        # previous year.
        year = default_year - 1 if m >= 10 else default_year
        results.add(f"{year:04d}-{m:02d}")

    return sorted(results)