# fuj-management/scripts/czech_utils.py

"""Czech text utilities — diacritics normalization and month parsing."""

import re
import unicodedata

# Maps Czech month names to their month number (1-12).
#
# Keys are lowercase and diacritics-free so they can be looked up in text
# that has been stripped of diacritics and lowercased (e.g. "únor" -> "unor").
# Each month lists its nominative, genitive and dative/locative forms;
# "zari" and "listopadu" each cover more than one of those cases.
CZECH_MONTHS = {
    "leden": 1, "ledna": 1, "lednu": 1,
    "unor": 2, "unora": 2, "unoru": 2,
    "brezen": 3, "brezna": 3, "breznu": 3,
    "duben": 4, "dubna": 4, "dubnu": 4,
    "kveten": 5, "kvetna": 5, "kvetnu": 5,
    "cerven": 6, "cervna": 6, "cervnu": 6,
    # Genitive is "července" -> "cervence" (was misspelled "cervnce").
    "cervenec": 7, "cervence": 7, "cervenci": 7,
    "srpen": 8, "srpna": 8, "srpnu": 8,
    "zari": 9,
    "rijen": 10, "rijna": 10, "rijnu": 10,
    "listopad": 11, "listopadu": 11,
    "prosinec": 12, "prosince": 12, "prosinci": 12,
}


def normalize(text: str) -> str:
    """Return *text* lowercased with all diacritical marks removed.

    The text is decomposed with Unicode NFKD so that an accented letter
    becomes a base letter followed by combining marks; the combining marks
    are then dropped and the remainder is lowercased.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars).lower()


def parse_month_references(text: str, default_year: int = 2026) -> list[str]:
    """Extract YYYY-MM month references from Czech free text.

    The text is first passed through ``normalize`` and then scanned for:

    - Numeric month(s) with a year: "01/26", "1/2026".
    - Plus-joined numeric months sharing one year: "11+12/2025".
    - Numeric month.year: "12.2025".
    - Ranges of month names: "listopad-leden" (November through January).
    - Single month names in any form listed in ``CZECH_MONTHS``:
      "leden", "únor", "prosince", ...

    Numeric references carry their own year; a two-digit year YY is read
    as 20YY, and only two- or four-digit years are accepted. Month names
    carry no year, so one is inferred from ``default_year``:

    - Standalone name: October-December map to ``default_year - 1``,
      every other month to ``default_year``.
    - Range that wraps past December (start month > end month): months
      from the start month through December map to ``default_year - 1``,
      the rest to ``default_year``.
    - Range that does not wrap: every month maps to ``default_year``.

    NOTE(review): a non-wrapping range such as "rijen-prosinec" therefore
    lands in ``default_year`` while a standalone "rijen" lands in
    ``default_year - 1`` -- confirm this asymmetry is intended.

    A month name that appears in any matched range is not processed again
    as a standalone name, even where it occurs outside that range.

    Args:
        text: Free text, with or without diacritics.
        default_year: Reference year used when a month name has no year.

    Returns:
        Sorted list of unique "YYYY-MM" strings.
    """
    normalized = normalize(text)
    results: set[str] = set()

    # Numeric months with a year, e.g. "11+12/2025", "01/26", "1/2026".
    # The year must be exactly 2 or 4 digits and must not run on into
    # further digits, so "1/202" or "1/20265" are not read as years.
    numeric_re = r"([\d+]+)\s*/\s*(\d{4}|\d{2})(?!\d)"
    for months_part, year_str in re.findall(numeric_re, normalized):
        year = int(year_str)
        if year < 100:
            year += 2000
        for m_str in months_part.split("+"):
            # Empty pieces (from "++" or a leading "+") fail isdigit().
            if m_str.isdigit():
                m = int(m_str)
                if 1 <= m <= 12:
                    results.add(f"{year:04d}-{m:02d}")

    # Standalone numeric month.year, e.g. "12.2025".
    for m_str, year_str in re.findall(r"(\d{1,2})\s*\.\s*(\d{4})", normalized):
        m, year = int(m_str), int(year_str)
        if 1 <= m <= 12:
            results.add(f"{year:04d}-{m:02d}")

    # Alternation of all known month names, longest first so that e.g.
    # "cervenec" is tried before its prefix "cerven".
    month_name_re = "|".join(sorted(CZECH_MONTHS, key=len, reverse=True))

    # Ranges like "listopad-leden". Word boundaries on both sides keep the
    # range from matching a month name that is only part of a longer word,
    # consistent with the standalone-name match below.
    range_re = rf"\b({month_name_re})\s*-\s*({month_name_re})\b"
    found_in_ranges: set[str] = set()
    for start_name, end_name in re.findall(range_re, normalized):
        found_in_ranges.update((start_name, end_name))
        start_m = CZECH_MONTHS[start_name]
        end_m = CZECH_MONTHS[end_name]
        wraps = start_m > end_m
        # Walk from start to end, wrapping around December -> January.
        m = start_m
        while True:
            # In a wrapping range (e.g. Nov-Jan) the months from the start
            # month up to December belong to the previous year.
            year = default_year - 1 if wraps and m >= start_m else default_year
            results.add(f"{year:04d}-{m:02d}")
            if m == end_m:
                break
            m = m % 12 + 1

    # Standalone month names whose name did not take part in any range.
    for match in re.finditer(rf"\b({month_name_re})\b", normalized):
        name = match.group(1)
        if name in found_in_ranges:
            continue
        m = CZECH_MONTHS[name]
        # Heuristic: October-December are assumed to refer to the year
        # before default_year.
        year = default_year - 1 if m >= 10 else default_year
        results.add(f"{year:04d}-{m:02d}")

    return sorted(results)