feat(go/M2.2): port czech.ParseMonthReferences
All checks were successful
Deploy to K8s / deploy (push) Successful in 8s

Three-pass regex parser matching python/czech_utils.py parse_month_references:
1. Numeric slash notation — "11+12/2025", "01/26"; 2-digit year → +2000
2. Dot notation — "12.2025" (4-digit year only)
3. Czech month names — range walk (listopad-leden wrap logic) then
   standalone with m≥10 → defaultYear-1 heuristic; longest-match
   alternation (sorted desc by name length) handles cervenec vs cerven

35 table-driven tests, all expected outputs verified against live Python
on 2026-05-05 before locking. Plan at
docs/plans/2026-05-05-2337-go-rewrite-m2-2-parse-month-references.md.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-06 00:05:40 +02:00
parent 3460f57c62
commit 6d971b61d4
3 changed files with 603 additions and 0 deletions

View File

@@ -0,0 +1,154 @@
package czech
import (
"fmt"
"regexp"
"sort"
"strconv"
"strings"
)
// czechMonths maps every recognised spelling of a Czech month name to its
// month number (1-12).
//
// Keys are lowercase ASCII without diacritics, because ParseMonthReferences
// looks names up only after passing the text through Normalize (expected to
// lowercase and strip diacritics). Each month lists the nominative form
// followed by the declined forms that occur in free text; "zari" has a single
// entry because it is spelled the same in every case.
var czechMonths = map[string]int{
	"leden": 1, "ledna": 1, "lednu": 1,
	"unor": 2, "unora": 2, "unoru": 2,
	"brezen": 3, "brezna": 3, "breznu": 3,
	"duben": 4, "dubna": 4, "dubnu": 4,
	"kveten": 5, "kvetna": 5, "kvetnu": 5,
	"cerven": 6, "cervna": 6, "cervnu": 6,
	// "cervence" is the genitive of "cervenec" (července). "cervnce" is not a
	// Czech word form and looks like a typo for it; it is kept so that any
	// input that matched before still matches.
	// NOTE(review): the Python reference this was ported from may need the
	// same addition to keep both implementations in agreement.
	"cervenec": 7, "cervence": 7, "cervnce": 7, "cervenci": 7,
	"srpen": 8, "srpna": 8, "srpnu": 8,
	"zari": 9,
	"rijen": 10, "rijna": 10, "rijnu": 10,
	"listopad": 11, "listopadu": 11,
	"prosinec": 12, "prosince": 12, "prosinci": 12,
}
// Compiled patterns used by ParseMonthReferences. All four are assigned in
// init, because the two month-name patterns are assembled at startup from the
// keys of czechMonths.
var (
// numericRe matches slash notation such as "11+12/2025" or "01/26".
numericRe *regexp.Regexp
// dotRe matches dot notation with a four-digit year, such as "12.2025".
dotRe *regexp.Regexp
// rangeRe matches two month names joined by a dash, e.g. "listopad-leden".
rangeRe *regexp.Regexp
// standRe matches a single month name delimited by word boundaries.
standRe *regexp.Regexp
)
// init compiles the package's patterns. The month-name patterns are built
// from the keys of czechMonths, so they cannot be written as fixed literals.
func init() {
	spellings := make([]string, 0, len(czechMonths))
	for spelling := range czechMonths {
		spellings = append(spellings, spelling)
	}

	// Go's regexp picks the first alternative that matches (leftmost-first),
	// so longer spellings must precede their own prefixes: "cervenec" has to
	// be tried before "cerven". Ties are broken alphabetically, which keeps
	// the generated pattern stable despite random map iteration order.
	sort.Slice(spellings, func(a, b int) bool {
		x, y := spellings[a], spellings[b]
		if len(x) == len(y) {
			return x < y
		}
		return len(x) > len(y)
	})
	monthGroup := `(` + strings.Join(spellings, "|") + `)`

	numericRe = regexp.MustCompile(`([\d+]+)\s*/\s*(\d{2,4})`)
	dotRe = regexp.MustCompile(`(\d{1,2})\s*\.\s*(\d{4})`)
	rangeRe = regexp.MustCompile(monthGroup + `\s*-\s*` + monthGroup)
	standRe = regexp.MustCompile(`\b` + monthGroup + `\b`)
}
// ParseMonthReferences scans Czech free text for month mentions and reports
// each one as a "YYYY-MM" string.
//
// The text is first passed through Normalize and then examined in three
// passes: numeric "months/year" notation (several months may be joined with
// "+", and a year below 100 has 2000 added), "month.year" dot notation with a
// four-digit year, and Czech month names, first as "name-name" ranges and
// then as standalone words. A name that took part in any range is not counted
// again as a standalone word.
//
// defaultYear supplies the year for month names, which carry none of their
// own: a standalone name for month 10 or later is assigned defaultYear-1, and
// in a range whose start month is later than its end month, the months from
// the start through December are assigned defaultYear-1 while the remaining
// ones get defaultYear.
//
// The result is sorted, free of duplicates, and never nil.
func ParseMonthReferences(text string, defaultYear int) []string {
	input := Normalize(text)
	found := make(map[string]struct{})

	// record stores one month; month numbers outside 1-12 are dropped.
	record := func(y, month int) {
		if month < 1 || month > 12 {
			return
		}
		found[fmt.Sprintf("%04d-%02d", y, month)] = struct{}{}
	}
	notDigit := func(r rune) bool { return r < '0' || r > '9' }

	// Pass 1: slash notation, e.g. "11+12/2025", "01/26", "1/2026".
	for _, match := range numericRe.FindAllStringSubmatch(input, -1) {
		y, err := strconv.Atoi(match[2])
		if err != nil {
			continue
		}
		if y < 100 {
			y += 2000
		}
		for piece := range strings.SplitSeq(match[1], "+") {
			piece = strings.TrimSpace(piece)
			if piece == "" || strings.IndexFunc(piece, notDigit) >= 0 {
				continue
			}
			if month, err := strconv.Atoi(piece); err == nil {
				record(y, month)
			}
		}
	}

	// Pass 2: dot notation with a four-digit year, e.g. "12.2025".
	for _, match := range dotRe.FindAllStringSubmatch(input, -1) {
		// dotRe captures only short runs of ASCII digits, so the conversion
		// errors are deliberately ignored.
		month, _ := strconv.Atoi(match[1])
		y, _ := strconv.Atoi(match[2])
		record(y, month)
	}

	// Pass 3a: month-name ranges, e.g. "listopad-leden".
	inRange := make(map[string]struct{})
	for _, match := range rangeRe.FindAllStringSubmatch(input, -1) {
		from, to := match[1], match[2]
		inRange[from] = struct{}{}
		inRange[to] = struct{}{}

		first, last := czechMonths[from], czechMonths[to]
		crossesNewYear := first > last
		// Walk forward month by month, rolling over after December; twelve
		// steps is the upper bound for any range.
		for step, cur := 0, first; step < 12; step, cur = step+1, cur%12+1 {
			y := defaultYear
			if crossesNewYear && cur >= first {
				y--
			}
			record(y, cur)
			if cur == last {
				break
			}
		}
	}

	// Pass 3b: standalone month names that did not appear in any range.
	for _, match := range standRe.FindAllStringSubmatch(input, -1) {
		word := match[1]
		if _, skip := inRange[word]; skip {
			continue
		}
		month := czechMonths[word]
		y := defaultYear
		if month >= 10 {
			y = defaultYear - 1
		}
		record(y, month)
	}

	out := make([]string, 0, len(found))
	for key := range found {
		out = append(out, key)
	}
	sort.Strings(out)
	return out
}

View File

@@ -0,0 +1,244 @@
package czech
import (
"reflect"
"testing"
)
// TestParseMonthReferences runs ParseMonthReferences over a table of inputs
// covering the numeric, dot, range and standalone-name passes and how they
// interact, comparing each result with a fixed expected slice.
func TestParseMonthReferences(t *testing.T) {
	t.Parallel()

	type testCase struct {
		name        string
		input       string
		defaultYear int
		want        []string
	}

	// All expected outputs verified against live Python implementation on 2026-05-05:
	// PYTHONPATH=scripts:. python -c 'from czech_utils import parse_month_references; print(parse_month_references("<input>", 2026))'
	cases := []testCase{
		{name: "empty", input: "", defaultYear: 2026, want: []string{}},

		// Pass 1: numeric slash notation.
		{name: "numeric plus-split two months full year", input: "11+12/2025", defaultYear: 2026, want: []string{"2025-11", "2025-12"}},
		{name: "numeric single month full year", input: "1/2026", defaultYear: 2026, want: []string{"2026-01"}},
		{name: "numeric 2-digit year", input: "01/26", defaultYear: 2026, want: []string{"2026-01"}},
		{name: "numeric plus-split with 2-digit year", input: "11+12/25", defaultYear: 2026, want: []string{"2025-11", "2025-12"}},
		{name: "numeric three months sorted", input: "12+1+2/2026", defaultYear: 2026, want: []string{"2026-01", "2026-02", "2026-12"}},

		// Pass 2: dot notation.
		{name: "dot pattern", input: "12.2025", defaultYear: 2026, want: []string{"2025-12"}},
		{name: "dot pattern requires 4-digit year", input: "1.26", defaultYear: 2026, want: []string{}},

		// Pass 3b: standalone month names.
		{name: "standalone month below m10 threshold", input: "leden", defaultYear: 2026, want: []string{"2026-01"}},
		{name: "standalone month m10 heuristic", input: "prosinec", defaultYear: 2026, want: []string{"2025-12"}},
		{name: "declension prosince", input: "prosince", defaultYear: 2026, want: []string{"2025-12"}},
		{name: "declension lednu", input: "lednu", defaultYear: 2026, want: []string{"2026-01"}},
		{name: "standalone m10 boundary (rijen = October)", input: "rijen", defaultYear: 2026, want: []string{"2025-10"}},
		{name: "standalone m9 just below boundary (zari = September)", input: "zari", defaultYear: 2026, want: []string{"2026-09"}},

		// Pass 3a: month-name ranges.
		{name: "range wrap Nov-Jan", input: "listopad-leden", defaultYear: 2026, want: []string{"2025-11", "2025-12", "2026-01"}},
		{name: "range wrap starting at October", input: "rijen-leden", defaultYear: 2026, want: []string{"2025-10", "2025-11", "2025-12", "2026-01"}},
		{name: "range no wrap", input: "unor-kveten", defaultYear: 2026, want: []string{"2026-02", "2026-03", "2026-04", "2026-05"}},
		{name: "degenerate range same month", input: "leden-leden", defaultYear: 2026, want: []string{"2026-01"}},
		{
			name:        "range spanning m10 — heuristic does NOT fire for range members",
			input:       "unor-listopad",
			defaultYear: 2026,
			want: []string{
				"2026-02", "2026-03", "2026-04", "2026-05", "2026-06",
				"2026-07", "2026-08", "2026-09", "2026-10", "2026-11",
			},
		},
		{name: "longest-match alternation cervenec beats cerven", input: "cervenec-srpen", defaultYear: 2026, want: []string{"2026-07", "2026-08"}},

		// Interactions between passes.
		{name: "range plus standalone — range excludes, dedup", input: "listopad-leden, prosinec", defaultYear: 2026, want: []string{"2025-11", "2025-12", "2026-01"}},
		{name: "two standalones no range", input: "prosinec leden", defaultYear: 2026, want: []string{"2025-12", "2026-01"}},
		{name: "numeric plus range mix", input: "11+12/2025, leden-brezen", defaultYear: 2026, want: []string{"2025-11", "2025-12", "2026-01", "2026-02", "2026-03"}},
		{name: "dedup across numeric and standalone passes", input: "11+12/25 a listopad", defaultYear: 2026, want: []string{"2025-11", "2025-12"}},
		{name: "no digits before slash — standalone fires instead", input: "prosince/2025", defaultYear: 2026, want: []string{"2025-12"}},
		{name: "range with trailing slash-year — numeric fails, range wins", input: "listopad-prosinec/2025", defaultYear: 2026, want: []string{"2026-11", "2026-12"}},
		{name: "dot pattern only — numeric matches but month out of 1-12 range", input: "01.2026 / 02.2026", defaultYear: 2026, want: []string{"2026-01", "2026-02"}},
		{name: "leading slash — numeric matches at second slash", input: "/12/2025", defaultYear: 2026, want: []string{"2025-12"}},

		// Normalization of case and diacritics.
		{name: "uppercase input normalized", input: "PROSINEC", defaultYear: 2026, want: []string{"2025-12"}},
		{name: "diacritics stripped by Normalize", input: "Žluťoučký prosinec", defaultYear: 2026, want: []string{"2025-12"}},
		{name: "diacritics in range with spaces around dash", input: "Únor - květen", defaultYear: 2026, want: []string{"2026-02", "2026-03", "2026-04", "2026-05"}},

		// Free text and unrecognised input.
		{name: "natural language mixed with numeric and standalone", input: "platba 11/2025 a leden", defaultYear: 2026, want: []string{"2025-11", "2026-01"}},
		{name: "English month name not recognized", input: "December", defaultYear: 2026, want: []string{}},
		{name: "duplicate input deduped", input: "11+12/2025 11+12/2025", defaultYear: 2026, want: []string{"2025-11", "2025-12"}},
		{name: "trailing year without separator ignored", input: "leden 2026", defaultYear: 2026, want: []string{"2026-01"}},
	}

	for i := range cases {
		c := cases[i]
		t.Run(c.name, func(t *testing.T) {
			t.Parallel()

			got := ParseMonthReferences(c.input, c.defaultYear)
			// reflect.DeepEqual treats a nil slice and an empty slice as
			// different, so a nil result is folded into the empty form that
			// the table uses.
			if got == nil {
				got = []string{}
			}
			if reflect.DeepEqual(got, c.want) {
				return
			}
			t.Errorf("ParseMonthReferences(%q, %d)\n got %v\n want %v",
				c.input, c.defaultYear, got, c.want)
		})
	}
}