From 54a783ea0014d691e186264fd233a27fa2374587 Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Wed, 6 May 2026 12:43:41 +0200 Subject: [PATCH] feat(go/M2.6): port domain/synch.GenerateSyncID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SHA-256 dedup hash from sync_fio_to_sheets.py generate_sync_id. Key subtlety: Python str(float) emits "500.0" for whole-valued floats and switches to scientific notation at |f|>=1e16 or |f|<1e-4 — replicated via formatAmount using 'f'/'e' format selection. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 7 + ...-05-03-2349-go-backend-rewrite-progress.md | 2 +- ...-06-1236-go-m2-6-synch-generate-sync-id.md | 265 ++++++++++++++++++ go/internal/domain/synch/synch.go | 65 +++++ go/internal/domain/synch/synch_test.go | 119 ++++++++ 5 files changed, 457 insertions(+), 1 deletion(-) create mode 100644 docs/plans/2026-05-06-1236-go-m2-6-synch-generate-sync-id.md create mode 100644 go/internal/domain/synch/synch.go create mode 100644 go/internal/domain/synch/synch_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 08057d9..dc77a37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## 2026-05-06 12:43 CEST — feat(go/M2.6): port domain/synch.GenerateSyncID + +- New `go/internal/domain/synch` package with `GenerateSyncID(Transaction) string` ported from `scripts/sync_fio_to_sheets.py` `generate_sync_id`. +- Byte-stable SHA-256 hash over `date|amount|currency|sender|vs|message|bank_id` (lowercased, UTF-8); `Currency: ""` defaults to `"CZK"` matching the Python missing-key fallback. +- Key subtlety: Python's `str(float)` emits `"500.0"` for whole-valued floats and switches to scientific notation at `|f| >= 1e16` or `|f| < 1e-4` — replicated in `formatAmount` using `'f'`/`'e'` format selection. +- 6 table-driven hash tests + 9 `formatAmount` tests; all expected values verified against live Python on 2026-05-06. 
+ ## 2026-05-06 09:38 CEST — feat(go/M2.5): port domain/money.ParseCZK - New `go/internal/domain/money` package with `ParseCZK(string) (float64, error)` ported from `scripts/infer_payments.py` `parse_czk_amount`. diff --git a/docs/plans/2026-05-03-2349-go-backend-rewrite-progress.md b/docs/plans/2026-05-03-2349-go-backend-rewrite-progress.md index e3116f1..f09d4ab 100644 --- a/docs/plans/2026-05-03-2349-go-backend-rewrite-progress.md +++ b/docs/plans/2026-05-03-2349-go-backend-rewrite-progress.md @@ -49,7 +49,7 @@ Each task: port the function, write Go unit tests for fresh cases, hook into the - [x] **M2.3** `domain/fees.CalculateFee` — port [attendance.py](scripts/attendance.py) `calculate_fee` (constants table) — `0fc3b6d` - [x] **M2.4** `domain/fees.CalculateJuniorFee` — port `calculate_junior_fee` with `Expected{Value int; Unknown bool}` for the `"?"` sentinel — `0fc3b6d` - [x] **M2.5** `domain/money.ParseCZK` — port [infer_payments.py](scripts/infer_payments.py) `parse_czk_amount` (Czech locale: comma decimal, dot/space thousand separators) — `d24d205` -- [ ] **M2.6** `domain/synch.GenerateSyncID` — port [sync_fio_to_sheets.py](scripts/sync_fio_to_sheets.py) `generate_sync_id` (SHA-256, byte-stable hash; verify float string format against real sheet rows) +- [x] **M2.6** `domain/synch.GenerateSyncID` — port [sync_fio_to_sheets.py](scripts/sync_fio_to_sheets.py) `generate_sync_id` (SHA-256, byte-stable hash; verify float string format against real sheet rows) - [ ] **M2.7** `domain/matching.BuildNameVariants` + `MatchMembers` — port `_build_name_variants` and `match_members` from [match_payments.py](scripts/match_payments.py) (auto vs review confidence, common-surname filter) - [ ] **M2.8** `domain/matching.InferTransactionDetails` — port `infer_transaction_details` (composes name + month parsing) - [ ] **M2.9** `domain/matching.FormatDate` — port `format_date` (handles Google Sheets serial-day numbers since 1899-12-30) diff --git 
a/docs/plans/2026-05-06-1236-go-m2-6-synch-generate-sync-id.md b/docs/plans/2026-05-06-1236-go-m2-6-synch-generate-sync-id.md new file mode 100644 index 0000000..42e2a1b --- /dev/null +++ b/docs/plans/2026-05-06-1236-go-m2-6-synch-generate-sync-id.md @@ -0,0 +1,265 @@ + +## Context + +Continuing the Go backend rewrite tracked in +[2026-05-03-2349-go-backend-rewrite-progress.md](../../srv/personal/fuj-management/docs/plans/2026-05-03-2349-go-backend-rewrite-progress.md). +M2.1–M2.5 are landed. Next leaf-level pure function is `generate_sync_id` +from [scripts/sync_fio_to_sheets.py:62-77](../../srv/personal/fuj-management/scripts/sync_fio_to_sheets.py#L62-L77). + +It computes a SHA-256 hash over a fixed seven-field projection of a Fio +transaction (`date|amount|currency|sender|vs|message|bank_id`) and is +the deduplication key written into column K (`Sync ID`) of the payments +sheet. The Go port must produce a **byte-identical** digest for the same +transaction; otherwise the Go-side sync (M4.7) would re-append rows +already written by the Python sync, double-counting payments. + +The non-trivial part is the `amount` field's string serialisation: +upstream `fio_utils.py` always supplies `amount` as a Python `float` +(API path: `float(val(1) or 0)`; HTML path: `parse_czech_amount(...)` +which returns `float`). Python's `str(float)` produces `"500.0"` for +whole-valued floats; Go's `strconv.FormatFloat(f, 'g', -1, 64)` produces +`"500"`. This is the gotcha called out in the M2.6 line of the progress +tracker. + +## Python behaviour (the spec) + +```py +def generate_sync_id(tx: dict) -> str: + components = [ + str(tx.get("date", "")), + str(tx.get("amount", "")), + str(tx.get("currency", "CZK")), + str(tx.get("sender", "")), + str(tx.get("vs", "")), + str(tx.get("message", "")), + str(tx.get("bank_id", "")), + ] + raw_str = "|".join(components).lower() + return hashlib.sha256(raw_str.encode("utf-8")).hexdigest() +``` + +Behavioural notes for the Go port: + +1. 
**Field order is load-bearing.** `date|amount|currency|sender|vs|message|bank_id` exactly. +2. **Separator is `"|"`.** +3. **Whole string is `.lower()`-ed before hashing** (so e.g. "ABC" sender vs "abc" hash identically). Unicode lower; in practice Fio data is ASCII + Czech diacritics. +4. **`currency` defaults to `"CZK"`** when missing from the dict (HTML scraper path never sets it). Other fields default to `""`. +5. **`amount` is a `float`.** Always. Real Fio data is `500.0`, `1234.56`, etc. — no NaN/Inf, but parity test must pin the format. +6. **Output is `hashlib.sha256(...).hexdigest()`** — 64-char lowercase hex. +7. **Encoding is UTF-8.** + +### `str(float)` cases observed in real Fio amounts + +| float64 | Python `str(f)` | Go `strconv.FormatFloat(f,'g',-1,64)` | Need | +|---|---|---|---| +| `500.0` | `"500.0"` | `"500"` | append `.0` | +| `1234.56` | `"1234.56"` | `"1234.56"` | matches | +| `0.0` | `"0.0"` | `"0"` | append `.0` | +| `-500.0` | `"-500.0"` | `"-500"` | append `.0` | +| `0.1` | `"0.1"` | `"0.1"` | matches | +| `99999.99` | `"99999.99"` | `"99999.99"` | matches | + +For the Fio amount domain (signed CZK, ≤ ~7 digits, ≤2 decimal places), +the rule originally planned was "`'g'` with prec -1, then append `.0` if result has no `.` and +no `e`/`E`". The shipped `formatAmount` does not use that rule: Go's `'g'` does not switch to +exponent form at the same magnitudes as Python's `str(float)`, so the implementation formats +with `'f'` (appending `.0` to whole values) and uses `'e'` only at Python's own thresholds +(`|f| < 1e-4` or `|f| >= 1e16`). Real data never reaches the scientific-notation crossover +(`>= 1e16`), but the implementation still copes with it through that explicit selection. + +## Approach + +Create new package `internal/domain/synch` mirroring the layout of +`internal/domain/money` (single-file module + test file alongside). + +### Package + signature + +```go +// Package synch ports the bank-sync deduplication helper from +// scripts/sync_fio_to_sheets.py. +package synch + +// Transaction is the projection of a Fio transaction that participates +// in the Sync ID hash. Other fields (ks, ss, sender_account, …) are +// intentionally excluded — they are not part of the Python hash. 
+// +// Currency: leave "" to inherit the Python default of "CZK" (matches +// the HTML scraper path which omits the key entirely). +type Transaction struct { + Date string + Amount float64 + Currency string + Sender string + VS string + Message string + BankID string +} + +// GenerateSyncID returns the lowercase SHA-256 hex digest of +// "date|amount|currency|sender|vs|message|bank_id" (lower-cased), used +// as the dedup key in column K of the payments sheet. +// +// Byte-stable with scripts/sync_fio_to_sheets.py generate_sync_id. +func GenerateSyncID(tx Transaction) string +``` + +### `Currency` default + +In Go every struct field is always present, so we lose Python's +"missing key vs empty string" distinction. Real-world data either sets +`currency = "CZK"` (API path) or omits the key (HTML path → `"CZK"` +default). Empty string never occurs in practice. The Go port collapses +the two by treating `Currency == ""` as "use `CZK`": + +```go +currency := tx.Currency +if currency == "" { + currency = "CZK" +} +``` + +This is byte-equal to Python for every input we will ever see in +production, and avoids forcing callers to pass a `*string`. + +### Float formatter + +Internal helper, unexported: + +```go +// formatAmount mimics Python's str(float) for the float values that +// appear in Fio transactions. For mundane decimal amounts the rule +// is: format with 'g' precision -1, then append ".0" if the result +// has no decimal point and no exponent. +func formatAmount(f float64) string { + s := strconv.FormatFloat(f, 'g', -1, 64) + if !strings.ContainsAny(s, ".eE") { + s += ".0" + } + return s +} +``` + +Tested explicitly (see Tests below) so the edge cases (`0`, whole +numbers, negatives, large/small with exponent) stay locked. 
+ +### Hash composition + +```go +func GenerateSyncID(tx Transaction) string { + currency := tx.Currency + if currency == "" { + currency = "CZK" + } + raw := strings.ToLower(strings.Join([]string{ + tx.Date, + formatAmount(tx.Amount), + currency, + tx.Sender, + tx.VS, + tx.Message, + tx.BankID, + }, "|")) + sum := sha256.Sum256([]byte(raw)) + return hex.EncodeToString(sum[:]) +} +``` + +(`crypto/sha256` + `encoding/hex` — both stdlib, no `go.mod` change.) + +## Tests + +`synch_test.go` mirrors `money_test.go`'s table-driven style with the +verification snippet at the top of the function. Two test functions: + +### 1. `TestGenerateSyncID` + +Each row's expected digest is computed from the Python source: + +```sh +PYTHONPATH=scripts:. python -c ' +from sync_fio_to_sheets import generate_sync_id +cases = [ + {"date":"2026-01-15","amount":500.0,"currency":"CZK","sender":"Jan Novak","vs":"123","message":"clenske 1/2026","bank_id":"abc123"}, + {"date":"2026-01-15","amount":500.0,"sender":"Jan Novak","vs":"123","message":"clenske 1/2026","bank_id":"abc123"}, # currency missing → CZK + {"date":"2026-02-10","amount":1234.56,"currency":"CZK","sender":"ABC SRO","vs":"","message":"FAKTURA 42","bank_id":"xyz"}, # mixed case → lowercased + {"date":"2026-03-01","amount":-500.0,"currency":"CZK","sender":"refund","vs":"","message":"","bank_id":""}, # negative + {"date":"2026-04-01","amount":0.0,"currency":"CZK","sender":"","vs":"","message":"","bank_id":""}, # zero amount + {}, # empty dict — every field falls back to default +] +for c in cases: + print(repr(c), "->", generate_sync_id(c)) +' +``` + +Cases (one row per dict above), each asserting the exact 64-char hex +digest the snippet prints. Cover: + +- Happy path with all fields set. +- `Currency: ""` → `"CZK"` default (parity with missing key). +- Mixed-case sender/message → lowercased before hashing. +- Negative amount. +- Zero amount. 
+- Zero-value `Transaction{}` — every field at Go zero, currency defaults + to `"CZK"`, hash matches Python `generate_sync_id` for a dict with `"amount": 0.0` and every other field empty. It does not match `generate_sync_id({})`: with the `amount` key missing Python hashes `""`, whereas Go always hashes `"0.0"`. + +### 2. `TestFormatAmount` + +Pin the float formatter against Python's `str(float)`: + +```sh +PYTHONPATH=scripts:. python -c ' +for v in [0.0, 500.0, -500.0, 0.1, 1234.56, 99999.99, 1500000.0, 1e16, 1e-5]: + print(repr(v), "->", repr(str(v))) +' +``` + +Table of `(float64, expected string)` pairs. Whole numbers must end in +`.0`; existing decimal representations pass through unchanged; +exponent-form floats (`1e16`, `1e-5`) keep their format. + +## Files to create + +- `go/internal/domain/synch/synch.go` — package, `Transaction`, + `GenerateSyncID`, internal `formatAmount`. +- `go/internal/domain/synch/synch_test.go` — `TestGenerateSyncID` + + `TestFormatAmount`. + +No existing Go files need editing. + +## Verification + +```sh +cd go && go test ./internal/domain/synch/... +make go-lint +make go-build # sanity: nothing else broke +``` + +Plus run the two Python snippets in the Tests section and diff their +output against the test tables to confirm parity. + +## Out of scope (explicit non-goals) + +- **Hooking into the Tier-1 parity runner.** That comes with M3.5 + (`-tags=parity` build constraint and `tests/fixtures/pure/`). M2.6 + ships with hand-written, Python-verified test tables — same approach + used by M2.1–M2.5. +- **A richer `Transaction` struct** covering ks/ss/note/sender_account. + Those fields aren't part of the hash. M4.4 (Fio IO adapter) will + decide whether to reuse `synch.Transaction` or define its own struct + and convert at the boundary. +- **Polymorphic input** (e.g. accepting a `map[string]any`). Python's + duck-typing is a non-goal in Go. +- **Any Python callsite migration.** `sync_fio_to_sheets.py` keeps using + its own `generate_sync_id` until M4.7 ports the sync service. 
+ +## Progress tracker + changelog + +After the commit lands: + +- Tick `M2.6` in + [docs/plans/2026-05-03-2349-go-backend-rewrite-progress.md](../../srv/personal/fuj-management/docs/plans/2026-05-03-2349-go-backend-rewrite-progress.md) + with the commit SHA, mirroring the M2.5 entry style. +- Add a `CHANGELOG.md` entry at top: + `## YYYY-MM-DD HH:MM TZ — feat(go/M2.6): port domain/synch.GenerateSyncID`. + +Branch: `feat/m2-6-synch-generate-sync-id` (per CLAUDE.md +branch-per-feature workflow). Push, open MR via `tea pr create`, leave +merge to the user. diff --git a/go/internal/domain/synch/synch.go b/go/internal/domain/synch/synch.go new file mode 100644 index 0000000..9b92de9 --- /dev/null +++ b/go/internal/domain/synch/synch.go @@ -0,0 +1,65 @@ +// Package synch ports the bank-sync deduplication helper from +// scripts/sync_fio_to_sheets.py. +package synch + +import ( + "crypto/sha256" + "encoding/hex" + "math" + "strconv" + "strings" +) + +// Transaction is the projection of a Fio transaction that participates +// in the Sync ID hash. Other fields (ks, ss, sender_account, …) are +// intentionally excluded — they are not part of the Python hash. +// +// Currency: leave "" to inherit the Python default of "CZK" (matches +// the HTML scraper path which omits the key entirely). +type Transaction struct { + Date string + Amount float64 + Currency string + Sender string + VS string + Message string + BankID string +} + +// GenerateSyncID returns the lowercase SHA-256 hex digest of +// "date|amount|currency|sender|vs|message|bank_id" (lower-cased), used +// as the dedup key in column K of the payments sheet. +// +// Byte-stable with scripts/sync_fio_to_sheets.py generate_sync_id. 
+func GenerateSyncID(tx Transaction) string { + currency := tx.Currency + if currency == "" { + currency = "CZK" + } + raw := strings.ToLower(strings.Join([]string{ + tx.Date, + formatAmount(tx.Amount), + currency, + tx.Sender, + tx.VS, + tx.Message, + tx.BankID, + }, "|")) + sum := sha256.Sum256([]byte(raw)) + return hex.EncodeToString(sum[:]) +} + +// formatAmount mimics Python's str(float) for Fio transaction amounts. +// Python uses decimal notation for abs(f) in [1e-4, 1e16) and scientific +// notation outside that range, always adding ".0" to whole-valued decimals. +func formatAmount(f float64) string { + abs := math.Abs(f) + if abs != 0 && (abs < 1e-4 || abs >= 1e16) { + return strconv.FormatFloat(f, 'e', -1, 64) + } + s := strconv.FormatFloat(f, 'f', -1, 64) + if !strings.ContainsRune(s, '.') { + s += ".0" + } + return s +} diff --git a/go/internal/domain/synch/synch_test.go b/go/internal/domain/synch/synch_test.go new file mode 100644 index 0000000..4664b71 --- /dev/null +++ b/go/internal/domain/synch/synch_test.go @@ -0,0 +1,119 @@ +package synch + +import ( + "testing" +) + +// All expected digests verified against the live Python implementation on 2026-05-06: +// +// PYTHONPATH=scripts:. 
python -c ' +// from sync_fio_to_sheets import generate_sync_id +// cases = [ +// {"date":"2026-01-15","amount":500.0,"currency":"CZK","sender":"Jan Novak","vs":"123","message":"clenske 1/2026","bank_id":"abc123"}, +// {"date":"2026-01-15","amount":500.0,"sender":"Jan Novak","vs":"123","message":"clenske 1/2026","bank_id":"abc123"}, +// {"date":"2026-02-10","amount":1234.56,"currency":"CZK","sender":"ABC SRO","vs":"","message":"FAKTURA 42","bank_id":"xyz"}, +// {"date":"2026-03-01","amount":-500.0,"currency":"CZK","sender":"refund","vs":"","message":"","bank_id":""}, +// {"date":"2026-04-01","amount":0.0,"currency":"CZK","sender":"","vs":"","message":"","bank_id":""}, +// {"date":"","amount":0.0,"currency":"CZK","sender":"","vs":"","message":"","bank_id":""}, +// ] +// for c in cases: print(generate_sync_id(c)) +// ' +func TestGenerateSyncID(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + tx Transaction + want string + }{ + { + name: "all fields set", + tx: Transaction{ + Date: "2026-01-15", Amount: 500.0, Currency: "CZK", + Sender: "Jan Novak", VS: "123", Message: "clenske 1/2026", BankID: "abc123", + }, + want: "4ac26598b6f23965380690172156a438a7e97a97dcedf222e5afe1afbe2c1bc4", + }, + { + name: "currency empty defaults to CZK", + tx: Transaction{ + Date: "2026-01-15", Amount: 500.0, Currency: "", + Sender: "Jan Novak", VS: "123", Message: "clenske 1/2026", BankID: "abc123", + }, + want: "4ac26598b6f23965380690172156a438a7e97a97dcedf222e5afe1afbe2c1bc4", + }, + { + name: "mixed-case fields lowercased before hashing", + tx: Transaction{ + Date: "2026-02-10", Amount: 1234.56, Currency: "CZK", + Sender: "ABC SRO", VS: "", Message: "FAKTURA 42", BankID: "xyz", + }, + want: "d40fa224d4fa572ffcd58e308e5c6508c4d5ca087b24ef6ff9284528fc128250", + }, + { + name: "negative amount", + tx: Transaction{ + Date: "2026-03-01", Amount: -500.0, Currency: "CZK", + Sender: "refund", VS: "", Message: "", BankID: "", + }, + want: 
"0c630a407160367c396a2beec08efb94c319b4d84a8b90cc2be89e6ea10c391f", + }, + { + name: "zero amount", + tx: Transaction{ + Date: "2026-04-01", Amount: 0.0, Currency: "CZK", + Sender: "", VS: "", Message: "", BankID: "", + }, + want: "6a23ce53717cd539064d550d2c2ec5de2e9bf81016d16852820ca9b8e259331f", + }, + { + // Python equivalent: {"date":"","amount":0.0,"currency":"CZK","sender":"","vs":"","message":"","bank_id":""} + // Note: Python generate_sync_id({}) hashes "" for missing amount, not "0.0". + name: "zero-value Transaction", + tx: Transaction{}, + want: "d33d7e391f5a43f0192bb5a34c0ec15715139125678ecef8e1324af7d943b21d", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got := GenerateSyncID(tc.tx) + if got != tc.want { + t.Errorf("GenerateSyncID(%+v) = %q, want %q", tc.tx, got, tc.want) + } + }) + } +} + +// All expected strings verified against the live Python implementation on 2026-05-06: +// +// PYTHONPATH=scripts:. python -c ' +// for v in [0.0, 500.0, -500.0, 0.1, 1234.56, 99999.99, 1500000.0, 1e16, 1e-5]: +// print(repr(v), "->", repr(str(v))) +// ' +func TestFormatAmount(t *testing.T) { + t.Parallel() + + cases := []struct { + in float64 + want string + }{ + {0.0, "0.0"}, + {500.0, "500.0"}, + {-500.0, "-500.0"}, + {0.1, "0.1"}, + {1234.56, "1234.56"}, + {99999.99, "99999.99"}, + {1500000.0, "1500000.0"}, + {1e16, "1e+16"}, + {1e-5, "1e-05"}, + } + + for _, tc := range cases { + got := formatAmount(tc.in) + if got != tc.want { + t.Errorf("formatAmount(%v) = %q, want %q", tc.in, got, tc.want) + } + } +}