Files
fuj-management/scripts/czech_utils.py
Jan Novak 3bfea4e0a4 feat: initial dashboard implementation and robust attendance parsing
- Added a Makefile to easily run project scripts (fees, match, web, image)
- Modified attendance.py to dynamically handle a variable number of header rows from the Google Sheet
- Updated both attendance calculations and calculate_fees terminal output to show actual attendance counts (e.g., '750 CZK (3)')
- Created a Flask web dashboard (app.py and templates/fees.html) to view member fees in an attractive, condensed, terminal-like UI
- Bound the Flask server to port 5000 and added a routing alias from '/' to '/fees'
- Configured Python virtual environment (.venv) creation directly into the Makefile to resolve global pip install errors on macOS

Co-authored-by: Antigravity <antigravity@deepmind.com>
2026-02-27 13:20:42 +01:00

102 lines
3.6 KiB
Python

"""Czech text utilities — diacritics normalization and month parsing."""
import re
import unicodedata
# Lookup from Czech month-name forms to month numbers (1-12).
# Keys are lowercase and diacritic-free, i.e. the form produced by
# normalize(), and cover the base form plus common declined forms.
CZECH_MONTHS: dict[str, int] = {
    "leden": 1, "ledna": 1, "lednu": 1,
    "unor": 2, "unora": 2, "unoru": 2,
    "brezen": 3, "brezna": 3, "breznu": 3,
    "duben": 4, "dubna": 4, "dubnu": 4,
    "kveten": 5, "kvetna": 5, "kvetnu": 5,
    "cerven": 6, "cervna": 6, "cervnu": 6,
    # "cervence" is the normalized genitive "července".
    "cervenec": 7, "cervence": 7, "cervenci": 7,
    # Misspelling of "cervence"; retained so existing lookups keep working.
    "cervnce": 7,
    "srpen": 8, "srpna": 8, "srpnu": 8,
    "zari": 9,
    "rijen": 10, "rijna": 10, "rijnu": 10,
    "listopad": 11, "listopadu": 11,
    "prosinec": 12, "prosince": 12, "prosinci": 12,
}
def normalize(text: str) -> str:
    """Return *text* lowercased with combining marks removed.

    The input is decomposed with Unicode NFKD, every character that
    ``unicodedata.combining`` reports as a combining mark is dropped, and
    the remainder is lowercased.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars).lower()
def parse_month_references(text: str, default_year: int = 2026) -> list[str]:
    """Extract YYYY-MM month references from Czech free text.

    The text is first passed through normalize(), so matching is
    case-insensitive and ignores diacritics.

    Handles:
    - Numeric month/year: "01/26", "1/2026". Several months may share one
      year when joined with "+", e.g. "11+12/2025". Years below 100 are
      taken to be 20xx.
    - Numeric month.year with a four-digit year: "12.2025".
    - Czech month names listed in CZECH_MONTHS: "leden", "února", ...
    - Month-name ranges: "listopad-leden" (November through January). The
      separator may be an ASCII hyphen or a Unicode dash/minus sign.

    Month names carry no year, so one is derived from ``default_year``:
    - In a range that wraps past December (start month > end month), the
      months from the start through December get ``default_year - 1`` and
      the remaining months get ``default_year``. Non-wrapping ranges use
      ``default_year`` throughout.
    - A standalone October, November or December gets ``default_year - 1``;
      any other standalone month gets ``default_year``.

    Args:
        text: Free text to scan.
        default_year: Reference year used for month names.

    Returns:
        Sorted list of unique "YYYY-MM" strings; empty if nothing matched.
    """
    normalized = normalize(text)
    results: set[str] = set()

    # Numeric months sharing one year: "11+12/2025", "01/26", "1/2026".
    for months_part, year_str in re.findall(
        r"([\d+]+)\s*/\s*(\d{2,4})", normalized
    ):
        year = int(year_str)
        if year < 100:
            year += 2000
        for m_str in months_part.split("+"):
            # Stray "+" signs leave empty pieces; isdigit() filters them out.
            if m_str.isdigit():
                m = int(m_str)
                if 1 <= m <= 12:
                    results.add(f"{year:04d}-{m:02d}")

    # Numeric month.year, e.g. "12.2025".
    for m_str, year_str in re.findall(r"(\d{1,2})\s*\.\s*(\d{4})", normalized):
        m, year = int(m_str), int(year_str)
        if 1 <= m <= 12:
            results.add(f"{year:04d}-{m:02d}")

    # Longest names first so "cervenec" is tried before its prefix "cerven".
    month_name_re = "|".join(sorted(CZECH_MONTHS, key=len, reverse=True))
    # ASCII hyphen first (literal inside a character class), then the
    # Unicode hyphen, figure dash, en dash, em dash and minus sign, so that
    # typographic ranges such as "listopad–leden" are recognised too.
    range_dash = "[-\u2010\u2012\u2013\u2014\u2212]"

    # Month-name ranges such as "listopad-leden".
    # NOTE(review): a non-wrapping range like "rijen-prosinec" is assigned
    # default_year, while the same months written standalone are assigned
    # default_year - 1 below — confirm this difference is intended.
    found_in_ranges: set[str] = set()
    for start_name, end_name in re.findall(
        rf"({month_name_re})\s*{range_dash}\s*({month_name_re})", normalized
    ):
        found_in_ranges.update((start_name, end_name))
        start_m = CZECH_MONTHS[start_name]
        end_m = CZECH_MONTHS[end_name]
        wraps = start_m > end_m
        # Walk from start to end inclusive, wrapping December -> January.
        m = start_m
        while True:
            # In a wrapping range the part up to December is the earlier year.
            year = default_year - 1 if wraps and m >= start_m else default_year
            results.add(f"{year:04d}-{m:02d}")
            if m == end_m:
                break
            m = m % 12 + 1

    # Standalone month names; any name form that was a range endpoint is
    # skipped because the range already contributed it.
    for match in re.finditer(rf"\b({month_name_re})\b", normalized):
        name = match.group(1)
        if name in found_in_ranges:
            continue
        m = CZECH_MONTHS[name]
        # Heuristic: October-December are attributed to the previous year.
        year = default_year - 1 if m >= 10 else default_year
        results.add(f"{year:04d}-{m:02d}")

    return sorted(results)