package czech

import (
	"fmt"
	"regexp"
	"sort"
	"strconv"
	"strings"
)

// czechMonths maps lower-case, diacritic-free Czech month names to their
// month number (1-12). Several inflected forms are listed per month
// (e.g. leden / ledna / lednu) so that phrases such as "1. ledna" or
// "v lednu" are recognised as well as the bare name.
//
// NOTE(review): the keys presumably mirror the output of Normalize (lower-case,
// accents stripped) — confirm against that function.
var czechMonths = map[string]int{
	"leden": 1, "ledna": 1, "lednu": 1,
	"unor": 2, "unora": 2, "unoru": 2,
	"brezen": 3, "brezna": 3, "breznu": 3,
	"duben": 4, "dubna": 4, "dubnu": 4,
	"kveten": 5, "kvetna": 5, "kvetnu": 5,
	"cerven": 6, "cervna": 6, "cervnu": 6,
	// "cervence" (července) was previously misspelled "cervnce", so the
	// genitive of July was never matched.
	"cervenec": 7, "cervence": 7, "cervenci": 7,
	"srpen": 8, "srpna": 8, "srpnu": 8,
	"zari": 9,
	"rijen": 10, "rijna": 10, "rijnu": 10,
	"listopad": 11, "listopadu": 11,
	"prosinec": 12, "prosince": 12, "prosinci": 12,
}

// Patterns used by ParseMonthReferences. They are compiled once in init
// because the month-name patterns are derived from the keys of czechMonths.
var (
	// numericRe matches "<months>/<year>", where <months> is a run of digits
	// and '+' (e.g. "11+12") and <year> has two to four digits.
	numericRe *regexp.Regexp
	// dotRe matches "<month>.<year>" with a one- or two-digit month and a
	// four-digit year, e.g. "12.2025".
	dotRe *regexp.Regexp
	// rangeRe matches two month names joined by a hyphen, e.g. "listopad-leden".
	rangeRe *regexp.Regexp
	// standRe matches a single month name delimited by word boundaries.
	standRe *regexp.Regexp
)

// init compiles the package-level patterns.
func init() {
	// Sort by descending length so longer alternatives win in RE2 leftmost-first
	// matching (e.g. "cervenec" is tried before "cerven"). Ties are broken
	// alphabetically so the generated pattern does not depend on map
	// iteration order.
	names := make([]string, 0, len(czechMonths))
	for name := range czechMonths {
		names = append(names, name)
	}
	sort.Slice(names, func(i, j int) bool {
		if len(names[i]) != len(names[j]) {
			return len(names[i]) > len(names[j])
		}
		return names[i] < names[j]
	})
	alt := strings.Join(names, "|")

	numericRe = regexp.MustCompile(`([\d+]+)\s*/\s*(\d{2,4})`)
	dotRe = regexp.MustCompile(`(\d{1,2})\s*\.\s*(\d{4})`)
	rangeRe = regexp.MustCompile(`(` + alt + `)\s*-\s*(` + alt + `)`)
	standRe = regexp.MustCompile(`\b(` + alt + `)\b`)
}

// ParseMonthReferences extracts YYYY-MM month references from Czech free text.
//
// defaultYear seeds two heuristics: standalone month names with a month
// number >= 10 are treated as defaultYear-1 (out-of-year backfill), and
// wrap-around ranges (e.g. listopad-leden) place the months from the start
// month through December in defaultYear-1.
//
// Returns a sorted, deduplicated slice of "YYYY-MM" strings.
func ParseMonthReferences(text string, defaultYear int) []string { normalized := Normalize(text) seen := map[string]struct{}{} add := func(year, m int) { if m >= 1 && m <= 12 { seen[fmt.Sprintf("%04d-%02d", year, m)] = struct{}{} } } // Pass 1: numeric months — "11+12/2025", "01/26", "1/2026" for _, groups := range numericRe.FindAllStringSubmatch(normalized, -1) { monthsPart, yearStr := groups[1], groups[2] year, err := strconv.Atoi(yearStr) if err != nil { continue } if year < 100 { year += 2000 } for mStr := range strings.SplitSeq(monthsPart, "+") { mStr = strings.TrimSpace(mStr) if mStr == "" { continue } allDigits := true for _, c := range mStr { if c < '0' || c > '9' { allDigits = false break } } if !allDigits { continue } m, err := strconv.Atoi(mStr) if err != nil { continue } add(year, m) } } // Pass 2: dot-separated month.year — "12.2025" (4-digit year only) for _, groups := range dotRe.FindAllStringSubmatch(normalized, -1) { m, _ := strconv.Atoi(groups[1]) year, _ := strconv.Atoi(groups[2]) add(year, m) } // Pass 3a: Czech month name ranges — "listopad-leden" foundInRanges := map[string]struct{}{} for _, groups := range rangeRe.FindAllStringSubmatch(normalized, -1) { startName, endName := groups[1], groups[2] foundInRanges[startName] = struct{}{} foundInRanges[endName] = struct{}{} startM := czechMonths[startName] endM := czechMonths[endName] wraps := startM > endM m := startM for range 12 { year := defaultYear if wraps && m >= startM { year = defaultYear - 1 } add(year, m) if m == endM { break } m = m%12 + 1 } } // Pass 3b: standalone Czech month names (not part of a range) for _, groups := range standRe.FindAllStringSubmatch(normalized, -1) { name := groups[1] if _, inRange := foundInRanges[name]; inRange { continue } m := czechMonths[name] year := defaultYear if m >= 10 { year = defaultYear - 1 } add(year, m) } result := make([]string, 0, len(seen)) for k := range seen { result = append(result, k) } sort.Strings(result) return result }