diff --git a/docs/plans/2026-05-05-2204-go-rewrite-m2-1-czech-normalize.md b/docs/plans/2026-05-05-2204-go-rewrite-m2-1-czech-normalize.md new file mode 100644 index 0000000..58f03b5 --- /dev/null +++ b/docs/plans/2026-05-05-2204-go-rewrite-m2-1-czech-normalize.md @@ -0,0 +1,154 @@ +# Plan: Go rewrite — M2.1 `domain/czech.Normalize` + +## Context + +The Go rewrite finished M1 (skeleton, tooling, hello server) in commit +`cf0f176` on 2026-05-04. The next milestone, **M2 — Pure-domain helpers**, +is current per [progress tracker](2026-05-03-2349-go-backend-rewrite-progress.md) +but has no work landed yet (all 12 sub-tasks unchecked). + +This plan covers only the **first** M2 task: porting Python's +`normalize` from [scripts/czech_utils.py](../../scripts/czech_utils.py) +to Go as `internal/domain/czech.Normalize`. It is the lowest-level helper +in the domain — `parse_month_references`, `_build_name_variants`, +`match_members`, exception keys, and `reconcile` all transitively depend +on it. Getting it byte-equivalent first removes a class of "why does my +match not fire" failures from every later M2 task. + +**Decision (confirmed in plan-mode Q):** start with hand-written Go unit +tests for fresh Czech edge cases. Defer parity-fixture wiring until +M3.1/M3.2 land (separate task); add the parity test for `Normalize` +retroactively at that point. + +## Scope + +- New package `go/internal/domain/czech/` with `Normalize` and unit tests. +- Add `golang.org/x/text` dependency to `go/go.mod` (currently zero deps). +- **Out of scope:** `ParseMonthReferences` (M2.2), fixture tooling + (M3.1/M3.2), CLI subcommand wiring (M2.11/M2.12), parity test runner. + +## Recommended approach + +### Python contract to match + +```python +def normalize(text: str) -> str: + nfkd = unicodedata.normalize("NFKD", text) + return "".join(c for c in nfkd if not unicodedata.combining(c)).lower() +``` + +Three semantic operations: +1. NFKD decompose +2. 
Drop characters where `unicodedata.combining(c)` is non-zero +3. Lowercase + +### Go implementation + +`go/internal/domain/czech/normalize.go`: + +```go +package czech + +import ( + "strings" + "unicode" + "golang.org/x/text/unicode/norm" +) + +func Normalize(s string) string { + decomposed := norm.NFKD.String(s) + var b strings.Builder + b.Grow(len(decomposed)) + for _, r := range decomposed { + if unicode.In(r, unicode.Mn) { + continue + } + b.WriteRune(r) + } + return strings.ToLower(b.String()) +} +``` + +**Two precision points worth flagging:** + +1. **`unicode.Mn` not `unicode.IsMark`.** The plan's library-choices + table mentions `unicode.IsMark`, but that covers Mn + Mc + Me. Python + `unicodedata.combining()` returns the canonical combining class, which + is 0 for almost all Mc/Me characters, so for Czech/Latin input it + effectively filters only Mn. Use `unicode.In(r, unicode.Mn)` for + byte-equivalence with Python on that input. Caveat: general category + and combining class are distinct properties — some Mn characters have + class 0 (e.g. U+FE0F VARIATION SELECTOR-16) and a few Mc characters + have a non-zero class (e.g. U+302E) — so the two filters diverge + outside Latin text; exact parity would test the combining class + itself (`norm.Properties.CCC()`). Cite + this in a one-line code comment; it's the kind of thing a future + reader will second-guess. +2. **`strings.ToLower` vs Go's locale-aware tools.** Python's `.lower()` + on already-decomposed Latin is straight ASCII lowercase for Czech. + Stdlib `strings.ToLower` matches; do not pull in `golang.org/x/text/cases`. + +### Tests + +`go/internal/domain/czech/normalize_test.go` — table-driven, covers: + +- ASCII passthrough: `"Honza" → "honza"` +- Czech lowercase diacritics: `"žluťoučký" → "zlutoucky"` +- Mixed case + diacritics: `"Příliš" → "prilis"` +- Czech caron + ring: `"Dvořák" → "dvorak"`, `"Růžena" → "ruzena"` +- Hard letters: `"Čeněk" → "cenek"`, `"Kačer" → "kacer"` +- Empty string: `"" → ""` +- Already-normalized: `"prilis" → "prilis"` (idempotence) +- Pre-composed vs decomposed input both produce the same output (NFC + `"é"` and NFD `"é"` both → `"e"`) +- Whitespace preserved: `"Jan Novák" → "jan novak"` + +Run a one-shot cross-check against the live Python implementation for +each test input before locking the table: +``` +PYTHONPATH=scripts:. 
python -c \ + 'from czech_utils import normalize; print(repr(normalize("Dvořák")))' +``` +This is the manual stand-in for the M3 parity fixtures. + +### Wire-up + +- `go get golang.org/x/text@latest` (run from `go/`); `go mod tidy`. +- No CLI changes — `cmd/fuj` already stubs `fees`/`reconcile` with + exit code 2; no need to touch dispatcher for this task. `Normalize` + is consumed by other domain code, not by users directly. + +## Critical files + +- New: [go/internal/domain/czech/normalize.go](../../go/internal/domain/czech/normalize.go) +- New: [go/internal/domain/czech/normalize_test.go](../../go/internal/domain/czech/normalize_test.go) +- Modified: [go/go.mod](../../go/go.mod), `go/go.sum` (new) +- Reference (read-only): [scripts/czech_utils.py](../../scripts/czech_utils.py) — the porting source +- Reference (read-only): [docs/plans/2026-05-03-2349-go-backend-rewrite.md](2026-05-03-2349-go-backend-rewrite.md) — risk #3 (NFKD edge cases) + +## Verification + +End-to-end checks before marking M2.1 done: + +1. `cd go && go build ./...` — clean compile. +2. `cd go && go test ./internal/domain/czech/...` — all table cases green. +3. `cd go && go test -race ./...` — race-clean. +4. `cd go && golangci-lint run` (or `make go-lint` from repo root) — clean. +5. **Spot parity** (manual, will be automated in M3): for each Go test + input, run the Python `normalize` via `PYTHONPATH=scripts:. python -c + '...'` and confirm bytes match. Capture the diff in the commit + message if anything surprises. +6. `make go-build && make go-test && make go-lint` from repo root — proves + the existing M1 gate still passes. + +## Branching & follow-up + +Per [CLAUDE.md](../../CLAUDE.md), this is feature work → branch + Gitea MR: + +- Branch: `feat/m2-1-czech-normalize` off `main`. +- Single commit, Co-Authored-By trailer. +- Push with `-u`, print compare URL + `https://gitea.home.hrajfrisbee.cz/kacerr/fuj-management/compare/main...feat/m2-1-czech-normalize` +- User opens/merges the MR. 
+- After merge: tick `M2.1` in the progress tracker with the commit SHA; + add a one-line CHANGELOG entry; record any porting surprise in the + tracker's "Notes & decisions" section (e.g. the `Mn`-vs-`IsMark` + precision point if it bears noting). + +Next task after this lands is **M2.2 `ParseMonthReferences`** — the +larger, edge-case-heavier sibling. Whether to start it before or after +M3.1/M3.2 is a separate decision the user can make then. diff --git a/go/go.mod b/go/go.mod index 609a963..718127a 100644 --- a/go/go.mod +++ b/go/go.mod @@ -1,3 +1,5 @@ module fuj-management/go go 1.26.1 + +require golang.org/x/text v0.36.0 diff --git a/go/go.sum b/go/go.sum new file mode 100644 index 0000000..65d3299 --- /dev/null +++ b/go/go.sum @@ -0,0 +1,2 @@ +golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= +golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= diff --git a/go/internal/domain/czech/normalize.go b/go/internal/domain/czech/normalize.go new file mode 100644 index 0000000..0a26733 --- /dev/null +++ b/go/internal/domain/czech/normalize.go @@ -0,0 +1,26 @@ +package czech + +import ( + "strings" + "unicode" + + "golang.org/x/text/unicode/norm" +) + +// Normalize strips diacritics and lowercases s. +// +// Matches Python: unicodedata.normalize("NFKD", s) then filter out +// combining characters (unicode.Mn only — not Mc/Me, which have +// combining class 0 in Python's unicodedata.combining()). 
+func Normalize(s string) string { + decomposed := norm.NFKD.String(s) + var b strings.Builder + b.Grow(len(decomposed)) + for _, r := range decomposed { + if unicode.In(r, unicode.Mn) { + continue + } + b.WriteRune(r) + } + return strings.ToLower(b.String()) +} diff --git a/go/internal/domain/czech/normalize_test.go b/go/internal/domain/czech/normalize_test.go new file mode 100644 index 0000000..bcb8627 --- /dev/null +++ b/go/internal/domain/czech/normalize_test.go @@ -0,0 +1,31 @@ +package czech + +import "testing" + +func TestNormalize(t *testing.T) { + cases := []struct { + in string + want string + }{ + {"Honza", "honza"}, + {"žluťoučký", "zlutoucky"}, + {"Příliš", "prilis"}, + {"Dvořák", "dvorak"}, + {"Růžena", "ruzena"}, + {"Čeněk", "cenek"}, + {"Kačer", "kacer"}, + {"", ""}, + {"prilis", "prilis"}, // idempotent + {"Jan Novák", "jan novak"}, // whitespace preserved + {"é", "e"}, // precomposed é (NFC) + {"é", "e"}, // decomposed e + combining acute + {"Ondřej Procházka", "ondrej prochazka"}, // realistic full name + } + + for _, tc := range cases { + got := Normalize(tc.in) + if got != tc.want { + t.Errorf("Normalize(%q) = %q, want %q", tc.in, got, tc.want) + } + } +}