feat(go/M2.2): port czech.ParseMonthReferences

Three-pass regex parser matching python/czech_utils.py parse_month_references: (1) numeric slash notation — "11+12/2025", "01/26"; 2-digit year → +2000; (2) dot notation — "12.2025" (4-digit year only); (3) Czech month names — range walk (listopad-leden wrap logic), then standalone names with the m≥10 → defaultYear-1 heuristic; longest-match alternation (names sorted by descending length) handles cervenec vs cerven. 35 table-driven tests; all expected outputs were verified against live Python on 2026-05-05 before locking. Plan at docs/plans/2026-05-05-2337-go-rewrite-m2-2-parse-month-references.md. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-06 00:05:40 +02:00
parent 3460f57c62
commit 6d971b61d4
3 changed files with 603 additions and 0 deletions
--- a/go/internal/domain/czech/parse_month_references.go
+++ b/go/internal/domain/czech/parse_month_references.go
@@ -0,0 +1,154 @@
+package czech
+
+import (
+	"fmt"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+var czechMonths = map[string]int{ // ASCII lowercase month spellings (base form plus declined forms) -> month number 1-12
+	"leden": 1, "ledna": 1, "lednu": 1,
+	"unor": 2, "unora": 2, "unoru": 2,
+	"brezen": 3, "brezna": 3, "breznu": 3,
+	"duben": 4, "dubna": 4, "dubnu": 4,
+	"kveten": 5, "kvetna": 5, "kvetnu": 5,
+	"cerven": 6, "cervna": 6, "cervnu": 6,
+	"cervenec": 7, "cervence": 7, "cervenci": 7, "cervnce": 7, // "cervence" = genitive "července" (was missing); misspelt "cervnce" kept for backward compatibility
+	"srpen": 8, "srpna": 8, "srpnu": 8,
+	"zari":  9,
+	"rijen": 10, "rijna": 10, "rijnu": 10,
+	"listopad": 11, "listopadu": 11,
+	"prosinec": 12, "prosince": 12, "prosinci": 12,
+}
+
+var ( // assigned in init, because rangeRe and standRe are assembled from the czechMonths keys
+	numericRe *regexp.Regexp // "11+12/2025", "01/26": run of digits and '+', a slash, then a 2-4 digit year
+	dotRe     *regexp.Regexp // "12.2025": 1-2 digit month, a dot, then a 4-digit year
+	rangeRe   *regexp.Regexp // "listopad-leden": two month names joined by a dash
+	standRe   *regexp.Regexp // a single month name delimited by word boundaries
+)
+
+func init() { // compiles the four package-level regexes; MustCompile panics on an invalid pattern
+	// Sort by descending length so longer alternatives win in RE2 leftmost-first
+	// matching (e.g. "cervenec" is tried before "cerven").
+	names := make([]string, 0, len(czechMonths))
+	for name := range czechMonths {
+		names = append(names, name)
+	}
+	sort.Slice(names, func(i, j int) bool {
+		if len(names[i]) != len(names[j]) {
+			return len(names[i]) > len(names[j])
+		}
+		return names[i] < names[j] // alphabetical tie-break: map iteration order is random, the pattern must not be
+	})
+	alt := strings.Join(names, "|")
+
+	numericRe = regexp.MustCompile(`([\d+]+)\s*/\s*(\d{2,4})`) // group 1: months joined by '+', group 2: year
+	dotRe = regexp.MustCompile(`(\d{1,2})\s*\.\s*(\d{4})`) // group 1: month, group 2: year
+	rangeRe = regexp.MustCompile(`(` + alt + `)\s*-\s*(` + alt + `)`) // no \b anchors here, unlike standRe
+	standRe = regexp.MustCompile(`\b(` + alt + `)\b`)
+}
+
+// ParseMonthReferences extracts YYYY-MM month references from Czech free text.
+// The text is passed through Normalize, then scanned in three passes. Numeric
+// ("11+12/2025", "01/26"; years below 100 get +2000) and dotted ("12.2025")
+// forms carry their own year. defaultYear only affects month names: standalone
+// names with m >= 10 map to defaultYear-1 (out-of-year backfill), wrap-around
+// ranges (e.g. listopad-leden) place months >= start_m in defaultYear-1, and
+// every other named month uses defaultYear. Returns a sorted, deduplicated, non-nil slice.
+func ParseMonthReferences(text string, defaultYear int) []string {
+	normalized := Normalize(text)
+	seen := map[string]struct{}{} // set of "YYYY-MM" keys shared by all passes; this is what de-duplicates
+
+	add := func(year, m int) {
+		if m >= 1 && m <= 12 { // out-of-range months are dropped silently (e.g. "2026 / 02" read as month 2026)
+			seen[fmt.Sprintf("%04d-%02d", year, m)] = struct{}{}
+		}
+	}
+
+	// Pass 1: numeric months — "11+12/2025", "01/26", "1/2026"
+	for _, groups := range numericRe.FindAllStringSubmatch(normalized, -1) {
+		monthsPart, yearStr := groups[1], groups[2]
+		year, err := strconv.Atoi(yearStr)
+		if err != nil {
+			continue
+		}
+		if year < 100 { // 2-digit year is taken as 20xx; a 3-digit year is kept as written
+			year += 2000
+		}
+		for mStr := range strings.SplitSeq(monthsPart, "+") {
+			mStr = strings.TrimSpace(mStr)
+			if mStr == "" { // produced by leading, trailing or doubled '+'
+				continue
+			}
+			allDigits := true // defensive: numericRe's [\d+] class already limits each part to digits
+			for _, c := range mStr {
+				if c < '0' || c > '9' {
+					allDigits = false
+					break
+				}
+			}
+			if !allDigits {
+				continue
+			}
+			m, err := strconv.Atoi(mStr)
+			if err != nil { // e.g. a digit run too long for int
+				continue
+			}
+			add(year, m)
+		}
+	}
+
+	// Pass 2: dot-separated month.year — "12.2025" (4-digit year only)
+	for _, groups := range dotRe.FindAllStringSubmatch(normalized, -1) {
+		m, _ := strconv.Atoi(groups[1]) // error ignored: dotRe only captures 1-2 digits here
+		year, _ := strconv.Atoi(groups[2]) // error ignored: dotRe only captures exactly 4 digits here
+		add(year, m)
+	}
+
+	// Pass 3a: Czech month name ranges — "listopad-leden"
+	foundInRanges := map[string]struct{}{} // exact spellings seen as a range endpoint; Pass 3b skips them
+	for _, groups := range rangeRe.FindAllStringSubmatch(normalized, -1) {
+		startName, endName := groups[1], groups[2]
+		foundInRanges[startName] = struct{}{}
+		foundInRanges[endName] = struct{}{}
+		startM := czechMonths[startName]
+		endM := czechMonths[endName]
+		wraps := startM > endM // start after end: the range crosses the turn of the year
+		m := startM
+		for range 12 { // bounded walk from startM to endM, at most one full year
+			year := defaultYear
+			if wraps && m >= startM { // the part before New Year belongs to the previous year
+				year = defaultYear - 1
+			}
+			add(year, m)
+			if m == endM {
+				break
+			}
+			m = m%12 + 1 // next month, 12 rolls over to 1
+		}
+	}
+
+	// Pass 3b: standalone Czech month names (not part of a range)
+	for _, groups := range standRe.FindAllStringSubmatch(normalized, -1) {
+		name := groups[1]
+		if _, inRange := foundInRanges[name]; inRange { // skip is by spelling, so it applies to every occurrence of that name
+			continue
+		}
+		m := czechMonths[name]
+		year := defaultYear
+		if m >= 10 { // Oct-Dec named on their own are assumed to refer to the previous year
+			year = defaultYear - 1
+		}
+		add(year, m)
+	}
+
+	result := make([]string, 0, len(seen))
+	for k := range seen {
+		result = append(result, k)
+	}
+	sort.Strings(result) // zero-padded "YYYY-MM" keys sort chronologically as plain strings
+	return result
+}
--- a/go/internal/domain/czech/parse_month_references_test.go
+++ b/go/internal/domain/czech/parse_month_references_test.go
@@ -0,0 +1,244 @@
+package czech
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestParseMonthReferences(t *testing.T) {
+	t.Parallel()
+
+	// All expected outputs were verified against the live Python implementation on 2026-05-05 with:
+	//   PYTHONPATH=scripts:. python -c 'from czech_utils import parse_month_references; print(parse_month_references("<input>", 2026))'
+	tests := []struct {
+		name        string
+		input       string
+		defaultYear int
+		want        []string
+	}{
+		{
+			name:        "empty",
+			input:       "",
+			defaultYear: 2026,
+			want:        []string{},
+		},
+		{
+			name:        "numeric plus-split two months full year",
+			input:       "11+12/2025",
+			defaultYear: 2026,
+			want:        []string{"2025-11", "2025-12"},
+		},
+		{
+			name:        "numeric single month full year",
+			input:       "1/2026",
+			defaultYear: 2026,
+			want:        []string{"2026-01"},
+		},
+		{
+			name:        "numeric 2-digit year",
+			input:       "01/26",
+			defaultYear: 2026,
+			want:        []string{"2026-01"},
+		},
+		{
+			name:        "numeric plus-split with 2-digit year",
+			input:       "11+12/25",
+			defaultYear: 2026,
+			want:        []string{"2025-11", "2025-12"},
+		},
+		{
+			name:        "numeric three months sorted",
+			input:       "12+1+2/2026",
+			defaultYear: 2026,
+			want:        []string{"2026-01", "2026-02", "2026-12"},
+		},
+		{
+			name:        "dot pattern",
+			input:       "12.2025",
+			defaultYear: 2026,
+			want:        []string{"2025-12"},
+		},
+		{
+			name:        "dot pattern requires 4-digit year",
+			input:       "1.26",
+			defaultYear: 2026,
+			want:        []string{},
+		},
+		{
+			name:        "standalone month below m10 threshold",
+			input:       "leden",
+			defaultYear: 2026,
+			want:        []string{"2026-01"},
+		},
+		{
+			name:        "standalone month m10 heuristic",
+			input:       "prosinec",
+			defaultYear: 2026,
+			want:        []string{"2025-12"},
+		},
+		{
+			name:        "declension prosince",
+			input:       "prosince",
+			defaultYear: 2026,
+			want:        []string{"2025-12"},
+		},
+		{
+			name:        "declension lednu",
+			input:       "lednu",
+			defaultYear: 2026,
+			want:        []string{"2026-01"},
+		},
+		{
+			name:        "standalone m10 boundary (rijen = October)",
+			input:       "rijen",
+			defaultYear: 2026,
+			want:        []string{"2025-10"},
+		},
+		{
+			name:        "standalone m9 just below boundary (zari = September)",
+			input:       "zari",
+			defaultYear: 2026,
+			want:        []string{"2026-09"},
+		},
+		{
+			name:        "range wrap Nov-Jan",
+			input:       "listopad-leden",
+			defaultYear: 2026,
+			want:        []string{"2025-11", "2025-12", "2026-01"},
+		},
+		{
+			name:        "range wrap starting at October",
+			input:       "rijen-leden",
+			defaultYear: 2026,
+			want:        []string{"2025-10", "2025-11", "2025-12", "2026-01"},
+		},
+		{
+			name:        "range no wrap",
+			input:       "unor-kveten",
+			defaultYear: 2026,
+			want:        []string{"2026-02", "2026-03", "2026-04", "2026-05"},
+		},
+		{
+			name:        "degenerate range same month",
+			input:       "leden-leden",
+			defaultYear: 2026,
+			want:        []string{"2026-01"},
+		},
+		{
+			name:        "range spanning m10 — heuristic does NOT fire for range members",
+			input:       "unor-listopad",
+			defaultYear: 2026,
+			want:        []string{"2026-02", "2026-03", "2026-04", "2026-05", "2026-06", "2026-07", "2026-08", "2026-09", "2026-10", "2026-11"},
+		},
+		{
+			name:        "longest-match alternation cervenec beats cerven",
+			input:       "cervenec-srpen",
+			defaultYear: 2026,
+			want:        []string{"2026-07", "2026-08"},
+		},
+		{
+			name:        "range plus standalone — range excludes, dedup",
+			input:       "listopad-leden, prosinec",
+			defaultYear: 2026,
+			want:        []string{"2025-11", "2025-12", "2026-01"},
+		},
+		{
+			name:        "two standalones no range",
+			input:       "prosinec leden",
+			defaultYear: 2026,
+			want:        []string{"2025-12", "2026-01"},
+		},
+		{
+			name:        "numeric plus range mix",
+			input:       "11+12/2025, leden-brezen",
+			defaultYear: 2026,
+			want:        []string{"2025-11", "2025-12", "2026-01", "2026-02", "2026-03"},
+		},
+		{
+			name:        "dedup across numeric and standalone passes",
+			input:       "11+12/25 a listopad",
+			defaultYear: 2026,
+			want:        []string{"2025-11", "2025-12"},
+		},
+		{
+			name:        "no digits before slash — standalone fires instead",
+			input:       "prosince/2025",
+			defaultYear: 2026,
+			want:        []string{"2025-12"}, // year comes from the m >= 10 heuristic (2026-1), not from "/2025"
+		},
+		{
+			name:        "range with trailing slash-year — numeric fails, range wins",
+			input:       "listopad-prosinec/2025",
+			defaultYear: 2026,
+			want:        []string{"2026-11", "2026-12"}, // "/2025" is ignored: a non-wrapping range always uses defaultYear
+		},
+		{
+			name:        "dot pattern only — numeric matches but month out of 1-12 range",
+			input:       "01.2026 / 02.2026",
+			defaultYear: 2026,
+			want:        []string{"2026-01", "2026-02"}, // numeric pass reads "2026 / 02" as month 2026, which is discarded
+		},
+		{
+			name:        "leading slash — numeric matches at second slash",
+			input:       "/12/2025",
+			defaultYear: 2026,
+			want:        []string{"2025-12"},
+		},
+		{
+			name:        "uppercase input normalized",
+			input:       "PROSINEC",
+			defaultYear: 2026,
+			want:        []string{"2025-12"},
+		},
+		{
+			name:        "diacritics stripped by Normalize",
+			input:       "Žluťoučký prosinec",
+			defaultYear: 2026,
+			want:        []string{"2025-12"},
+		},
+		{
+			name:        "diacritics in range with spaces around dash",
+			input:       "Únor - květen",
+			defaultYear: 2026,
+			want:        []string{"2026-02", "2026-03", "2026-04", "2026-05"},
+		},
+		{
+			name:        "natural language mixed with numeric and standalone",
+			input:       "platba 11/2025 a leden",
+			defaultYear: 2026,
+			want:        []string{"2025-11", "2026-01"},
+		},
+		{
+			name:        "English month name not recognized",
+			input:       "December",
+			defaultYear: 2026,
+			want:        []string{},
+		},
+		{
+			name:        "duplicate input deduped",
+			input:       "11+12/2025 11+12/2025",
+			defaultYear: 2026,
+			want:        []string{"2025-11", "2025-12"},
+		},
+		{
+			name:        "trailing year without separator ignored",
+			input:       "leden 2026",
+			defaultYear: 2026,
+			want:        []string{"2026-01"},
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+			got := ParseMonthReferences(tc.input, tc.defaultYear)
+			if got == nil { // reflect.DeepEqual treats a nil slice and an empty slice as different
+				got = []string{}
+			}
+			if !reflect.DeepEqual(got, tc.want) {
+				t.Errorf("ParseMonthReferences(%q, %d)\n  got  %v\n  want %v",
+					tc.input, tc.defaultYear, got, tc.want)
+			}
+		})
+	}
+}