All checks were successful
Deploy to K8s / deploy (push) Successful in 8s
Three-pass regex parser matching python/czech_utils.py parse_month_references: (1) numeric slash notation — "11+12/2025", "01/26"; a 2-digit year is mapped to 2000 + year; (2) dot notation — "12.2025" (4-digit year only); (3) Czech month names — a range walk (listopad-leden wrap logic), then standalone names with the m ≥ 10 → defaultYear-1 heuristic; longest-match alternation (names sorted by descending length) handles cervenec vs. cerven. 35 table-driven tests; all expected outputs were verified against live Python on 2026-05-05 before locking. Plan at docs/plans/2026-05-05-2337-go-rewrite-m2-2-parse-month-references.md. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
155 lines
3.8 KiB
Go
155 lines
3.8 KiB
Go
package czech
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// czechMonths maps lowercase, diacritic-free Czech month names to their month
// number (1-12). Each month lists the declined forms that should be
// recognized: nominative, genitive and dative/locative where they differ.
// The keys of this map are what init turns into the month-name alternation
// used by rangeRe and standRe.
var czechMonths = map[string]int{
	"leden": 1, "ledna": 1, "lednu": 1,
	"unor": 2, "unora": 2, "unoru": 2,
	"brezen": 3, "brezna": 3, "breznu": 3,
	"duben": 4, "dubna": 4, "dubnu": 4,
	"kveten": 5, "kvetna": 5, "kvetnu": 5,
	"cerven": 6, "cervna": 6, "cervnu": 6,
	// "cervence" is the genitive of cervenec (as in "od cervence"). The
	// misspelled "cervnce" is retained only so inputs that previously
	// resolved through it keep resolving.
	"cervenec": 7, "cervence": 7, "cervnce": 7, "cervenci": 7,
	"srpen": 8, "srpna": 8, "srpnu": 8,
	"zari": 9,
	"rijen": 10, "rijna": 10, "rijnu": 10,
	"listopad": 11, "listopadu": 11,
	"prosinec": 12, "prosince": 12, "prosinci": 12,
}
|
|
|
|
// Patterns used by ParseMonthReferences. They are declared nil here and
// compiled in init, because the month-name patterns are assembled at start-up
// from the keys of czechMonths.

// numericRe matches slash notation ("11+12/2025"); dotRe matches dot notation
// ("12.2025").
var numericRe, dotRe *regexp.Regexp

// rangeRe matches a month-name range ("listopad-leden"); standRe matches a
// single month name delimited by word boundaries.
var rangeRe, standRe *regexp.Regexp
|
|
|
|
func init() {
|
|
// Sort by descending length so longer alternatives win in RE2 leftmost-first
|
|
// matching (e.g. "cervenec" is tried before "cerven").
|
|
names := make([]string, 0, len(czechMonths))
|
|
for name := range czechMonths {
|
|
names = append(names, name)
|
|
}
|
|
sort.Slice(names, func(i, j int) bool {
|
|
if len(names[i]) != len(names[j]) {
|
|
return len(names[i]) > len(names[j])
|
|
}
|
|
return names[i] < names[j]
|
|
})
|
|
alt := strings.Join(names, "|")
|
|
|
|
numericRe = regexp.MustCompile(`([\d+]+)\s*/\s*(\d{2,4})`)
|
|
dotRe = regexp.MustCompile(`(\d{1,2})\s*\.\s*(\d{4})`)
|
|
rangeRe = regexp.MustCompile(`(` + alt + `)\s*-\s*(` + alt + `)`)
|
|
standRe = regexp.MustCompile(`\b(` + alt + `)\b`)
|
|
}
|
|
|
|
// ParseMonthReferences extracts YYYY-MM month references from Czech free text.
|
|
//
|
|
// defaultYear seeds two heuristics: standalone month names with m >= 10 are
|
|
// treated as defaultYear-1 (out-of-year backfill), and wrap-around ranges
|
|
// (e.g. listopad-leden) place months >= start_m in defaultYear-1.
|
|
//
|
|
// Returns a sorted, deduplicated slice of "YYYY-MM" strings.
|
|
func ParseMonthReferences(text string, defaultYear int) []string {
|
|
normalized := Normalize(text)
|
|
seen := map[string]struct{}{}
|
|
|
|
add := func(year, m int) {
|
|
if m >= 1 && m <= 12 {
|
|
seen[fmt.Sprintf("%04d-%02d", year, m)] = struct{}{}
|
|
}
|
|
}
|
|
|
|
// Pass 1: numeric months — "11+12/2025", "01/26", "1/2026"
|
|
for _, groups := range numericRe.FindAllStringSubmatch(normalized, -1) {
|
|
monthsPart, yearStr := groups[1], groups[2]
|
|
year, err := strconv.Atoi(yearStr)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if year < 100 {
|
|
year += 2000
|
|
}
|
|
for mStr := range strings.SplitSeq(monthsPart, "+") {
|
|
mStr = strings.TrimSpace(mStr)
|
|
if mStr == "" {
|
|
continue
|
|
}
|
|
allDigits := true
|
|
for _, c := range mStr {
|
|
if c < '0' || c > '9' {
|
|
allDigits = false
|
|
break
|
|
}
|
|
}
|
|
if !allDigits {
|
|
continue
|
|
}
|
|
m, err := strconv.Atoi(mStr)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
add(year, m)
|
|
}
|
|
}
|
|
|
|
// Pass 2: dot-separated month.year — "12.2025" (4-digit year only)
|
|
for _, groups := range dotRe.FindAllStringSubmatch(normalized, -1) {
|
|
m, _ := strconv.Atoi(groups[1])
|
|
year, _ := strconv.Atoi(groups[2])
|
|
add(year, m)
|
|
}
|
|
|
|
// Pass 3a: Czech month name ranges — "listopad-leden"
|
|
foundInRanges := map[string]struct{}{}
|
|
for _, groups := range rangeRe.FindAllStringSubmatch(normalized, -1) {
|
|
startName, endName := groups[1], groups[2]
|
|
foundInRanges[startName] = struct{}{}
|
|
foundInRanges[endName] = struct{}{}
|
|
startM := czechMonths[startName]
|
|
endM := czechMonths[endName]
|
|
wraps := startM > endM
|
|
m := startM
|
|
for range 12 {
|
|
year := defaultYear
|
|
if wraps && m >= startM {
|
|
year = defaultYear - 1
|
|
}
|
|
add(year, m)
|
|
if m == endM {
|
|
break
|
|
}
|
|
m = m%12 + 1
|
|
}
|
|
}
|
|
|
|
// Pass 3b: standalone Czech month names (not part of a range)
|
|
for _, groups := range standRe.FindAllStringSubmatch(normalized, -1) {
|
|
name := groups[1]
|
|
if _, inRange := foundInRanges[name]; inRange {
|
|
continue
|
|
}
|
|
m := czechMonths[name]
|
|
year := defaultYear
|
|
if m >= 10 {
|
|
year = defaultYear - 1
|
|
}
|
|
add(year, m)
|
|
}
|
|
|
|
result := make([]string, 0, len(seen))
|
|
for k := range seen {
|
|
result = append(result, k)
|
|
}
|
|
sort.Strings(result)
|
|
return result
|
|
}
|