fuj-management/go/internal/domain/czech/parse_month_references.go

package czech

import (
	"fmt"
	"regexp"
	"sort"
	"strconv"
	"strings"
)

// czechMonths maps Czech month names to their month number (1-12).
//
// Keys are lowercase and free of diacritics, and cover the inflected forms
// listed on each line (e.g. "leden", "ledna", "lednu"). Lookups are therefore
// only meaningful on text that has been lowercased and stripped of diacritics
// beforehand. The keys are also used to build the month-name alternation in
// the package's regular expressions, so every key added here becomes
// matchable without further changes.
var czechMonths = map[string]int{
	"leden": 1, "ledna": 1, "lednu": 1,
	"unor": 2, "unora": 2, "unoru": 2,
	"brezen": 3, "brezna": 3, "breznu": 3,
	"duben": 4, "dubna": 4, "dubnu": 4,
	"kveten": 5, "kvetna": 5, "kvetnu": 5,
	"cerven": 6, "cervna": 6, "cervnu": 6,
	"cervenec": 7, "cervence": 7, "cervenci": 7,
	"srpen": 8, "srpna": 8, "srpnu": 8,
	"zari":  9,
	"rijen": 10, "rijna": 10, "rijnu": 10,
	"listopad": 11, "listopadu": 11,
	"prosinec": 12, "prosince": 12, "prosinci": 12,
}

// Patterns used by ParseMonthReferences. All four are compiled in init;
// rangeRe and standRe embed an alternation of the czechMonths keys.
//
//   - numericRe: "+"-joined month numbers, a slash, then a 2-4 digit year.
//   - dotRe: a 1-2 digit month, a dot, then a 4-digit year.
//   - rangeRe: two month names separated by a hyphen.
//   - standRe: a single month name delimited by word boundaries.
var numericRe, dotRe, rangeRe, standRe *regexp.Regexp

// init compiles the package-level patterns.
//
// The month-name alternation is generated from the keys of czechMonths,
// ordered longest first with ties broken alphabetically. Go's regexp package
// prefers the earliest alternative that leads to a match, so putting longer
// names first stops a short name such as "cerven" from shadowing "cervenec"
// where nothing after the group forces a longer match.
func init() {
	monthNames := make([]string, 0, len(czechMonths))
	for key := range czechMonths {
		monthNames = append(monthNames, key)
	}
	sort.Slice(monthNames, func(a, b int) bool {
		left, right := monthNames[a], monthNames[b]
		if len(left) == len(right) {
			return left < right
		}
		return len(left) > len(right)
	})

	// A single capturing group holding every month name.
	monthGroup := "(" + strings.Join(monthNames, "|") + ")"

	standRe = regexp.MustCompile(`\b` + monthGroup + `\b`)
	rangeRe = regexp.MustCompile(monthGroup + `\s*-\s*` + monthGroup)
	dotRe = regexp.MustCompile(`(\d{1,2})\s*\.\s*(\d{4})`)
	numericRe = regexp.MustCompile(`([\d+]+)\s*/\s*(\d{2,4})`)
}

// ParseMonthReferences extracts YYYY-MM month references from Czech free text.
//
// The text is first passed through Normalize (presumably lowercasing and
// stripping diacritics, since the month-name table is ASCII lowercase) and is
// then scanned for numeric "months/year" forms, dotted "month.year" forms,
// month-name ranges and standalone month names. Month numbers outside 1-12
// are dropped.
//
// Numeric and dotted forms carry their own year. Two-digit years are read as
// 20YY; three-digit years are ignored because they cannot denote a real
// period. Month names carry no year, so defaultYear seeds two heuristics:
// standalone month names with m >= 10 are treated as defaultYear-1
// (out-of-year backfill), and wrap-around ranges (e.g. listopad-leden) place
// the months from the range's start month through December in defaultYear-1
// and the remainder in defaultYear. Non-wrapping ranges lie entirely in
// defaultYear.
//
// Returns a sorted, deduplicated slice of "YYYY-MM" strings; the slice is
// empty (non-nil) when nothing is found.
func ParseMonthReferences(text string, defaultYear int) []string {
	normalized := Normalize(text)
	seen := map[string]struct{}{}

	// add records one year-month pair, ignoring impossible month numbers.
	add := func(year, m int) {
		if m >= 1 && m <= 12 {
			seen[fmt.Sprintf("%04d-%02d", year, m)] = struct{}{}
		}
	}

	// Pass 1: numeric months — "11+12/2025", "01/26", "1/2026"
	for _, groups := range numericRe.FindAllStringSubmatch(normalized, -1) {
		year, err := strconv.Atoi(groups[2])
		if err != nil {
			continue
		}
		switch {
		case year < 100:
			year += 2000
		case year < 1000:
			// The pattern admits three digits, but a year such as 202 is
			// never a valid reference; skip the whole match rather than
			// emit something like "0202-11".
			continue
		}
		for mStr := range strings.SplitSeq(groups[1], "+") {
			// The capture holds only ASCII digits and '+', so each piece is
			// either empty (stray '+') or all digits. Atoi rejects the empty
			// piece and any value too large for int.
			m, err := strconv.Atoi(mStr)
			if err != nil {
				continue
			}
			add(year, m)
		}
	}

	// Pass 2: dot-separated month.year — "12.2025" (4-digit year only)
	for _, groups := range dotRe.FindAllStringSubmatch(normalized, -1) {
		m, mErr := strconv.Atoi(groups[1])
		year, yErr := strconv.Atoi(groups[2])
		if mErr != nil || yErr != nil {
			continue
		}
		add(year, m)
	}

	// Pass 3a: Czech month name ranges — "listopad-leden"
	//
	// foundInRanges remembers every name used as a range endpoint so that
	// pass 3b does not count it again as a standalone month. The exclusion is
	// by name, not by position in the text.
	foundInRanges := map[string]struct{}{}
	for _, groups := range rangeRe.FindAllStringSubmatch(normalized, -1) {
		startName, endName := groups[1], groups[2]
		foundInRanges[startName] = struct{}{}
		foundInRanges[endName] = struct{}{}
		startM := czechMonths[startName]
		endM := czechMonths[endName]
		wraps := startM > endM
		m := startM
		// Walk forward month by month, wrapping December to January. The
		// bound of 12 guarantees termination even if endM is never reached.
		for range 12 {
			year := defaultYear
			if wraps && m >= startM {
				year = defaultYear - 1
			}
			add(year, m)
			if m == endM {
				break
			}
			m = m%12 + 1
		}
	}

	// Pass 3b: standalone Czech month names (not part of a range)
	for _, groups := range standRe.FindAllStringSubmatch(normalized, -1) {
		name := groups[1]
		if _, inRange := foundInRanges[name]; inRange {
			continue
		}
		m := czechMonths[name]
		year := defaultYear
		if m >= 10 {
			year = defaultYear - 1
		}
		add(year, m)
	}

	result := make([]string, 0, len(seen))
	for k := range seen {
		result = append(result, k)
	}
	sort.Strings(result)
	return result
}