feat(go): fixture capture + characterization framework (M3)

Closes M3.1–M3.6. Parity safety net proving Go output matches Python for every ported pure-domain function (M2.1–M2.9) and reconcile (M2.10). Capture pipeline: - scripts/capture_fixtures.py: calls each Python function with seeded inputs, emits JSON fixtures to stdout (never writes files directly). - scripts/scrub_fixtures.py: deterministic PII scrubber — SHA-256 pseudonyms for member names, digit-preserving hashes for VS/account/bank_id, name-sweep in message text. Idempotent; no salt. - scripts/_fixture_seeds.py: handcrafted seeds for all 11 functions; synthetic names throughout (no real roster members). - scripts/capture_all_fixtures.sh: convenience wrapper for full corpus regeneration outside of make. Fixture corpus (98 files, all PII-free): - go/tests/fixtures/pure/<func>/<case>.json — 10 function directories. - go/tests/fixtures/reconcile/<NN>_<case>.json — 10 branch-coverage cases: greedy, overpayment credit, proportional remainder, even-split, out-of-window, exception override, other: purpose, junior ?, multi-person+month fan-out, unmatched. Go parity tests (//go:build parity): - go/tests/parity/parityio.go: generic LoadDir/RunAll helpers + typed In/Out struct pairs for all 10 pure functions; Envelope decoder for int/float/none disambiguation. - 10 pure-function test packages + bespoke reconcile test with per-cell float tolerance (math.Abs <= 0.01 for `paid` values). Makefile: go-parity, go-test-all, capture-fixtures targets. go/tests/fixtures/README.md: refresh workflow + PII audit guide. Gate: make go-test green, make go-parity green (11/11 packages), make go-lint clean (parity tag), make go-build clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-06 23:26:24 +02:00
parent 28f0e468f7
commit 67d2f11d7c
119 changed files with 4931 additions and 10 deletions
--- a/scripts/_fixture_seeds.py
+++ b/scripts/_fixture_seeds.py
@@ -0,0 +1,565 @@
+"""Fixture seed registry for capture_fixtures.py.
+
+Seeds are keyed by (func_name, case_id).  Values are dicts whose keys
+match the fixture input schema defined in docs/plans/2026-05-06-2111-go-m3-fixture-capture.md.
+
+Real-data seeds for parse_month_references are loaded from
+tmp/payments_transactions_cache.json at hardcoded indices selected once
+interactively for coverage.  A matching loader exists for match_members
+(it also reads tmp/attendance_regular_cache.json), but its index list is
+currently empty, so only the synthetic match_members seeds are produced.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from typing import Any
+
+# ---------------------------------------------------------------------------
+# Helper to load cache files
+# ---------------------------------------------------------------------------
+
+# Repository root: the parent of the directory that contains this file.
+_REPO = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+def _load_cache(name: str) -> Any:
+    """Load and decode a JSON cache file from the repository's tmp/ directory.
+
+    Args:
+        name: File name relative to ``<repo>/tmp``.
+
+    Returns:
+        The decoded JSON document, or ``None`` when the file does not exist.
+
+    Raises:
+        json.JSONDecodeError: If the file exists but does not contain valid JSON.
+    """
+    path = os.path.join(_REPO, "tmp", name)
+    # Open directly instead of probing with os.path.exists() first: the file
+    # could disappear between the probe and the open.
+    try:
+        with open(path, encoding="utf-8") as f:
+            return json.load(f)
+    except (FileNotFoundError, NotADirectoryError):
+        # Missing file (or missing/non-directory tmp/) means "no cache".
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Handcrafted seed registry
+# ---------------------------------------------------------------------------
+
+# Registry mapping (func_name, case_id) -> fixture input dict.
+SEEDS: dict[tuple[str, str], dict] = {}
+
+
+# --- normalize ---
+
+# One entry per case: (case_id, input text).
+SEEDS.update({
+    ("normalize", case_id): {"text": text}
+    for case_id, text in (
+        ("simple_ascii", "hello world"),
+        ("czech_basic", "štefan čakrtový"),
+        ("czech_full_set", "áčďéěíňóřšťůúýžÁČĎÉĚÍŇÓŘŠŤŮÚÝŽ"),
+        ("with_parens", "Pavel Smutný (Štrúdl)"),
+        ("mixed_case", "Henrietta OTTOVÁ"),
+        ("empty_string", ""),
+        ("digits_symbols", "FUJ2026! +3"),
+    )
+})
+
+
+# --- parse_month_references ---
+
+# One entry per case: (case_id, message text).  Every case uses default_year 2026.
+SEEDS.update({
+    ("parse_month_references", case_id): {"text": text, "default_year": 2026}
+    for case_id, text in (
+        ("empty_string", ""),
+        ("single_czech_leden", "leden"),
+        ("single_czech_prosinec_high_month", "prosinec"),
+        ("single_czech_rijen_high_month", "říjen"),
+        ("range_wrap_prosinec_leden", "prosinec-leden"),
+        ("range_wrap_listopad_leden", "listopad-leden"),
+        ("range_no_wrap_leden_unor", "leden-únor"),
+        ("numeric_slash_two_digit_year", "01/26"),
+        ("numeric_slash_four_digit_year", "1/2026"),
+        ("numeric_slash_leading_zero", "03/2026"),
+        ("numeric_plus_multi", "11+12/2025"),
+        ("numeric_dot_format", "12.2025"),
+        ("mixed_czech_numeric", "leden+únor+03/2026"),
+        ("no_month_found", "random text without months"),
+    )
+})
+
+
+# --- calculate_fee ---
+
+# One entry per case: (case_id, attendance_count, month_key).
+SEEDS.update({
+    ("calculate_fee", case_id): {"attendance_count": count, "month_key": month}
+    for case_id, count, month in (
+        ("zero_sessions", 0, "2026-01"),
+        ("one_session", 1, "2026-01"),
+        ("two_sessions_known_rate", 2, "2026-01"),
+        ("three_sessions_known_rate", 3, "2026-02"),
+        ("two_sessions_reduced_march", 2, "2026-03"),
+        ("two_sessions_default_fallback", 2, "2099-01"),
+    )
+})
+
+
+# --- calculate_junior_fee ---
+
+# One entry per case: (case_id, attendance_count, month_key).
+SEEDS.update({
+    ("calculate_junior_fee", case_id): {"attendance_count": count, "month_key": month}
+    for case_id, count, month in (
+        ("zero_sessions", 0, "2026-01"),
+        ("one_session_unknown", 1, "2026-01"),
+        ("two_sessions_default", 2, "2026-01"),
+        ("two_sessions_reduced_march", 2, "2026-03"),
+        ("two_sessions_reduced_sep", 2, "2025-09"),
+        ("two_sessions_default_fallback", 2, "2099-06"),
+    )
+})
+
+
+# --- parse_czk_amount ---
+
+# One entry per case: (case_id, typed envelope).  The envelope's "type" tag
+# records whether the input is none/string/int/float; "none" carries no value.
+SEEDS.update({
+    ("parse_czk_amount", case_id): {"val": envelope}
+    for case_id, envelope in (
+        ("none_value", {"type": "none"}),
+        ("empty_string", {"type": "string", "value": ""}),
+        ("plain_int", {"type": "int", "value": 750}),
+        ("plain_float", {"type": "float", "value": 750.0}),
+        ("czech_comma_decimal", {"type": "string", "value": "1.500,00"}),
+        ("czech_comma_no_thousands", {"type": "string", "value": "750,00"}),
+        ("dot_decimal", {"type": "string", "value": "1500.00"}),
+        ("dot_thousand_separator", {"type": "string", "value": "1.500"}),
+        ("with_kc_suffix", {"type": "string", "value": "750 Kč"}),
+        ("with_czk_suffix", {"type": "string", "value": "1500CZK"}),
+        ("space_thousands", {"type": "string", "value": "1 500"}),
+    )
+})
+
+
+# --- generate_sync_id ---
+
+def _sync_tx(date, amount, currency, sender, vs, message, bank_id):
+    """Wrap transaction fields in a generate_sync_id input seed.
+
+    Returns a dict with a single "tx" key whose value holds the seven fields,
+    in the same order as the parameters.
+    """
+    fields = dict(
+        date=date,
+        amount=amount,
+        currency=currency,
+        sender=sender,
+        vs=vs,
+        message=message,
+        bank_id=bank_id,
+    )
+    return {"tx": fields}
+
+
+SEEDS.update({
+    ("generate_sync_id", "typical_float_amount"): _sync_tx(
+        date="2026-01-15",
+        amount={"type": "float", "value": 750.0},
+        currency="CZK",
+        sender="Test Sender",
+        vs="123456",
+        message="pausal leden",
+        bank_id="100000001",
+    ),
+    # Same transaction as above, but the amount is tagged as an int.
+    ("generate_sync_id", "integer_amount"): _sync_tx(
+        date="2026-01-15",
+        amount={"type": "int", "value": 750},
+        currency="CZK",
+        sender="Test Sender",
+        vs="123456",
+        message="pausal leden",
+        bank_id="100000001",
+    ),
+    # Written out by hand (not via _sync_tx) so the "currency" key is absent.
+    ("generate_sync_id", "missing_currency"): {
+        "tx": {
+            "date": "2026-02-01",
+            "amount": {"type": "float", "value": 500.0},
+            "sender": "Another Person",
+            "vs": "654321",
+            "message": "trenink",
+            "bank_id": "200000002",
+        }
+    },
+    ("generate_sync_id", "empty_fields"): _sync_tx(
+        date="2026-03-01",
+        amount={"type": "float", "value": 0.0},
+        currency="CZK",
+        sender="",
+        vs="",
+        message="",
+        bank_id="",
+    ),
+    ("generate_sync_id", "large_amount"): _sync_tx(
+        date="2025-10-05",
+        amount={"type": "float", "value": 2100.0},
+        currency="CZK",
+        sender="Payer Name",
+        vs="987654",
+        message="FUJ treninky",
+        bank_id="300000003",
+    ),
+})
+
+
+# --- build_name_variants ---
+
+# One entry per case: (case_id, full_name).
+SEEDS.update({
+    ("build_name_variants", case_id): {"full_name": full_name}
+    for case_id, full_name in (
+        ("full_name_no_nick", "Jan Novák"),
+        ("with_nickname", "František Vrbík (Štrúdl)"),
+        ("three_word_name", "Jan Tomášek (Honza)"),
+        ("single_word", "Jáchym"),
+        ("short_name_filtered", "Jo"),
+        ("common_diacritics", "Alžběta Testovická"),
+    )
+})
+
+
+# --- match_members ---
+
+# Synthetic roster — these are deliberately NOT real member names.
+#   - Tomáš Fiktivný carries the nickname "Tov" and Pavel Smutný the nickname
+#     "Štrúdl", for the nickname-match cases.
+#   - Adam Novák: the normalized last name "novak" is in _COMMON_SURNAMES, which
+#     exercises the common-surname filter.
+_ROSTER = [
+    "Alžběta Testovická",
+    "Tomáš Fiktivný (Tov)",
+    "Pavel Smutný (Štrúdl)",
+    "Jana Nováková",
+    "Adam Novák",
+]
+
+# One entry per case: (case_id, text).  Every case shares the same _ROSTER list.
+SEEDS.update({
+    ("match_members", case_id): {"text": text, "member_names": _ROSTER}
+    for case_id, text in (
+        ("exact_full_name", "platba od alzbeta testovicka leden"),
+        ("first_and_last", "jan nový payment tomas fiktivny"),
+        ("nickname_match", "payment from strudl"),
+        ("review_lastname_only", "testovicka leden"),
+        ("common_surname_no_match", "novak leden"),
+        ("no_match", "xyz platba"),
+        ("two_members_exact", "pavel smutny a alzbeta testovicka"),
+    )
+})
+
+
+# --- infer_transaction_details ---
+
+# One entry per case: (case_id, sender, message, typed date envelope).
+# Every case has an empty user_id, the shared _ROSTER and default_year 2026.
+SEEDS.update({
+    ("infer_transaction_details", case_id): {
+        "tx": {
+            "sender": sender,
+            "message": message,
+            "user_id": "",
+            "date": date,
+        },
+        "member_names": _ROSTER,
+        "default_year": 2026,
+    }
+    for case_id, sender, message, date in (
+        (
+            "member_in_message",
+            "Test Payer",
+            "alzbeta testovicka leden 2026",
+            {"type": "string", "value": "2026-01-15"},
+        ),
+        (
+            "member_in_sender",
+            "Tomáš Fiktivný",
+            "FUJ trenink",
+            {"type": "string", "value": "2026-02-01"},
+        ),
+        (
+            "month_fallback_from_date",
+            "Alžběta Testovická",
+            "platba",
+            {"type": "string", "value": "2026-03-15"},
+        ),
+        (
+            "serial_date",
+            "Jana Nováková",
+            "leden",
+            # Sheets serial date.  Counting days from 1899-12-30, 46027 is
+            # 2026-01-05.  NOTE(review): 2026-01-15 would be 46037 — confirm
+            # which day was intended (the month is January either way).
+            {"type": "float", "value": 46027.0},
+        ),
+        (
+            "no_member_no_month",
+            "Unknown Person",
+            "random text",
+            {"type": "none"},
+        ),
+    )
+})
+
+
+# --- format_date ---
+
+# One entry per case: (case_id, typed envelope).  Numeric values are Sheets
+# serial dates; counting days from 1899-12-30, 46027 is 2026-01-05 and 45957
+# is 2025-10-27.  NOTE(review): 2025-10-01 would be 45931 — confirm which day
+# "serial_float_exact" was meant to represent.
+SEEDS.update({
+    ("format_date", case_id): {"val": envelope}
+    for case_id, envelope in (
+        ("string_iso", {"type": "string", "value": "2026-01-15"}),
+        ("string_non_iso", {"type": "string", "value": "garbage"}),
+        ("empty_string", {"type": "string", "value": ""}),
+        ("none_value", {"type": "none"}),
+        ("serial_int", {"type": "int", "value": 46027}),
+        ("serial_float", {"type": "float", "value": 46027.5}),
+        ("serial_float_exact", {"type": "float", "value": 45957.0}),
+    )
+})
+
+
+# ---------------------------------------------------------------------------
+# Reconcile handcrafted seeds
+# ---------------------------------------------------------------------------
+
+def _tx(date, amount, person, purpose, sender="Payer", message="", bank_id="", inferred_amount=None):
+    """Build one transaction dict for a reconcile seed.
+
+    ``manual_fix`` is always the empty string.  ``inferred_amount`` falls back
+    to ``amount`` only when it is ``None``, so an explicit 0 is kept.
+    """
+    if inferred_amount is None:
+        inferred_amount = amount
+    return dict(
+        date=date,
+        amount=amount,
+        manual_fix="",
+        person=person,
+        purpose=purpose,
+        inferred_amount=inferred_amount,
+        sender=sender,
+        message=message,
+        bank_id=bank_id,
+    )
+
+
+def _member(name, tier, fees: dict):
+    """Build one member record for a reconcile seed.
+
+    ``fees`` maps a month key to either a ``(fee, count)`` pair or a plain int.
+    The record is a dict so that the scrubber can locate the "name" key and
+    pseudonymise it deterministically.
+    """
+    return dict(name=name, tier=tier, fees=fees)
+
+
+def _reconcile_seed(members, sorted_months, transactions, exceptions=None, default_year=2026):
+    """Assemble the input dict for one reconcile fixture.
+
+    A missing or empty ``exceptions`` argument is stored as a new empty list.
+    """
+    seed = {}
+    seed["members"] = members
+    seed["sorted_months"] = sorted_months
+    seed["transactions"] = transactions
+    seed["exceptions"] = exceptions if exceptions else []
+    seed["default_year"] = default_year
+    return seed
+
+
+SEEDS.update({
+    # 01 — greedy exact: Alice pays exactly 750, expected 750
+    ("reconcile", "01_greedy_exact"): _reconcile_seed(
+        members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
+        sorted_months=["2026-01"],
+        transactions=[
+            _tx(date="2026-01-20", amount=750, person="Alice Dvořák",
+                purpose="2026-01", sender="Alice Dvořák"),
+        ],
+    ),
+    # 02 — greedy overpayment → credit: Alice pays 900, expected 750
+    ("reconcile", "02_greedy_overpayment"): _reconcile_seed(
+        members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
+        sorted_months=["2026-01"],
+        transactions=[
+            _tx(date="2026-01-20", amount=900, person="Alice Dvořák",
+                purpose="2026-01", sender="Alice Dvořák"),
+        ],
+    ),
+    # 03 — proportional: Alice pays 800 for 3 months (750+750+350=1850 expected)
+    ("reconcile", "03_proportional_remainder"): _reconcile_seed(
+        members=[
+            _member("Alice Dvořák", "A", {
+                "2026-01": (750, 3),
+                "2026-02": (750, 2),
+                "2026-03": (350, 2),
+            }),
+        ],
+        sorted_months=["2026-01", "2026-02", "2026-03"],
+        transactions=[
+            _tx(date="2026-03-10", amount=800, person="Alice Dvořák",
+                purpose="2026-01,2026-02,2026-03", sender="Alice Dvořák"),
+        ],
+    ),
+    # 04 — even-split: all expected=0, payment spread evenly
+    ("reconcile", "04_even_split_prepayment"): _reconcile_seed(
+        members=[
+            _member("Bob Kratochvíl", "A", {
+                "2026-04": (0, 0),
+                "2026-05": (0, 0),
+            }),
+        ],
+        sorted_months=["2026-04", "2026-05"],
+        transactions=[
+            _tx(date="2026-03-25", amount=700, person="Bob Kratochvíl",
+                purpose="2026-04,2026-05", sender="Bob Kratochvíl"),
+        ],
+    ),
+    # 05 — out-of-window: payment references 2025-08 which is outside sorted_months
+    ("reconcile", "05_out_of_window_credit"): _reconcile_seed(
+        members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
+        sorted_months=["2026-01"],
+        transactions=[
+            _tx(date="2026-01-20", amount=1500, person="Alice Dvořák",
+                purpose="2026-01,2025-08", sender="Alice Dvořák"),
+        ],
+    ),
+    # 06 — exception override: Alice's 2026-01 fee overridden from 750 to 300
+    ("reconcile", "06_exception_override"): _reconcile_seed(
+        members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
+        sorted_months=["2026-01"],
+        transactions=[
+            _tx(date="2026-01-20", amount=300, person="Alice Dvořák",
+                purpose="2026-01", sender="Alice Dvořák"),
+        ],
+        # Each exception is a dict with name / period / amount / note keys.
+        exceptions=[
+            {"name": "Alice Dvořák", "period": "2026-01", "amount": 300, "note": "injury discount"},
+        ],
+    ),
+    # 07 — other purpose: tournament fee split between Alice and Bob
+    ("reconcile", "07_other_purpose_split"): _reconcile_seed(
+        members=[
+            _member("Alice Dvořák", "A", {"2026-01": (750, 3)}),
+            _member("Bob Kratochvíl", "A", {"2026-01": (750, 2)}),
+        ],
+        sorted_months=["2026-01"],
+        transactions=[
+            _tx(date="2026-01-10", amount=800, person="Alice Dvořák, Bob Kratochvíl",
+                purpose="other:tournament", sender="Alice Dvořák"),
+        ],
+    ),
+    # 08 — junior with attendance=1 (expected=0 in reconcile, unknown in UI)
+    ("reconcile", "08_junior_question_mark"): _reconcile_seed(
+        members=[_member("Karel Junior", "A", {"2026-01": (0, 1)})],
+        sorted_months=["2026-01"],
+        transactions=[
+            _tx(date="2026-01-20", amount=200, person="Karel Junior",
+                purpose="2026-01", sender="Karel Junior"),
+        ],
+    ),
+    # 09 — multi-person comma-split: Alice and Bob share a payment for 2 months
+    ("reconcile", "09_multiperson_multimonth"): _reconcile_seed(
+        members=[
+            _member("Alice Dvořák", "A", {"2026-01": (750, 3), "2026-02": (750, 2)}),
+            _member("Bob Kratochvíl", "A", {"2026-01": (750, 2), "2026-02": (350, 2)}),
+        ],
+        sorted_months=["2026-01", "2026-02"],
+        transactions=[
+            _tx(date="2026-02-15", amount=2000, person="Alice Dvořák, Bob Kratochvíl",
+                purpose="2026-01,2026-02", sender="Alice Dvořák"),
+        ],
+    ),
+    # 10 — unmatched: no person, garbage message
+    ("reconcile", "10_unmatched"): _reconcile_seed(
+        members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
+        sorted_months=["2026-01"],
+        transactions=[
+            _tx(date="2026-01-20", amount=500, person="", purpose="",
+                sender="Unknown Payer", message="garbage xyz 999"),
+        ],
+    ),
+})
+
+
+# ---------------------------------------------------------------------------
+# Real-data seeds
+# ---------------------------------------------------------------------------
+
+# Indices into tmp/payments_transactions_cache.json['data'] selected for coverage,
+# each paired with the case_id its message is captured under.
+# DO NOT change these — they are deliberately frozen to make re-runs deterministic.
+_REAL_PMR_INDICES = [
+    (16, "real_single_leden"),
+    (17, "real_range_prosinec_leden"),
+    (18, "real_list_prosinec_leden_unor"),
+    (22, "real_martin_prosinec_leden"),
+    (23, "real_range_listopad_leden"),
+    (25, "real_filip_prosinec_leden_unor"),
+    (36, "real_mixed_czech_numeric"),
+    (42, "real_dominika_numeric_multi"),
+    # index 67 removed: the name-sweep scrubber changes the text prefix in a way
+    # that breaks the numeric-slash parser (empty result vs expected "2026-03").
+    (72, "real_tomik_numeric_plus"),
+    (73, "real_franc_numeric_space"),
+    (74, "real_jana_numeric_multi"),
+    (80, "real_alex_numeric_long"),
+    (89, "real_emily_numeric_long"),
+    (90, "real_jachym_numeric_multi"),
+]
+
+# Real match_members seeds are intentionally omitted: after PII scrubbing
+# the member_names pseudonyms are inconsistent with the (un-scrubbed) text,
+# causing all Go parity assertions to fail.  The synthetic match_members seeds
+# defined earlier in this module cover the same code paths without any real data.
+_REAL_MM_INDICES: list = []
+
+
+def real_parse_month_references_seeds(default_year: int = 2026):
+    """Yield (case_id, seed) pairs built from cached real transaction messages.
+
+    Nothing is yielded when the payments cache cannot be loaded.  Frozen
+    indices that lie beyond the end of the cached list are skipped.
+    """
+    payments = _load_cache("payments_transactions_cache.json")
+    if payments is None:
+        return
+    records = payments.get("data", [])
+    total = len(records)
+    for position, case_id in _REAL_PMR_INDICES:
+        if position < total:
+            text = str(records[position].get("message", ""))
+            yield case_id, {"text": text, "default_year": default_year}
+
+
+def _real_member_names():
+    """Collect member names from the regular attendance cache.
+
+    Returns an empty list when the cache cannot be loaded.  Otherwise returns
+    the first cell of every list/tuple row that has at least two cells.
+    """
+    attendance = _load_cache("attendance_regular_cache.json")
+    if attendance is None:
+        return []
+    table = attendance.get("data", [])
+    # If the first entry of "data" is itself a list, that entry becomes the table.
+    # NOTE(review): this presumably expects the rows to be nested one level deep;
+    # with a flat list of rows it selects the first row only — confirm against
+    # the cache layout.
+    if table and isinstance(table[0], list):
+        table = table[0]
+    names = []
+    for entry in table:
+        if isinstance(entry, (list, tuple)) and len(entry) >= 2:
+            names.append(entry[0])
+    return names
+
+
+def real_match_members_seeds():
+    """Yield (case_id, seed) pairs matching real sender/message text to the real roster.
+
+    Nothing is yielded when the payments cache cannot be loaded or when no
+    member names are available.  Indices beyond the end of the cached list are
+    skipped.
+    """
+    payments = _load_cache("payments_transactions_cache.json")
+    roster = _real_member_names()
+    if payments is None or not roster:
+        return
+    records = payments.get("data", [])
+    for position, case_id in _REAL_MM_INDICES:
+        if position >= len(records):
+            continue
+        record = records[position]
+        sender_text = str(record.get("sender", ""))
+        message_text = str(record.get("message", ""))
+        # The matcher input is the sender and message joined by a single space.
+        yield case_id, {
+            "text": sender_text + " " + message_text,
+            "member_names": roster,
+        }