feat(go): fixture capture + characterization framework (M3)
All checks were successful
Deploy to K8s / deploy (push) Successful in 7s
All checks were successful
Deploy to K8s / deploy (push) Successful in 7s
Closes M3.1–M3.6. Parity safety net proving Go output matches Python
for every ported pure-domain function (M2.1–M2.9) and reconcile (M2.10).
Capture pipeline:
- scripts/capture_fixtures.py: calls each Python function with seeded
inputs, emits JSON fixtures to stdout (never writes files directly).
- scripts/scrub_fixtures.py: deterministic PII scrubber — SHA-256
pseudonyms for member names, digit-preserving hashes for VS/account/
bank_id, name-sweep in message text. Idempotent; no salt.
- scripts/_fixture_seeds.py: handcrafted seeds for all 11 functions;
synthetic names throughout (no real roster members).
- scripts/capture_all_fixtures.sh: convenience wrapper for full corpus
regeneration outside of make.
Fixture corpus (98 files, all PII-free):
- go/tests/fixtures/pure/<func>/<case>.json — 10 function directories.
- go/tests/fixtures/reconcile/<NN>_<case>.json — 10 branch-coverage
cases: greedy, overpayment credit, proportional remainder, even-split,
out-of-window, exception override, other: purpose, junior ?, multi-
person+month fan-out, unmatched.
Go parity tests (//go:build parity):
- go/tests/parity/parityio.go: generic LoadDir/RunAll helpers + typed
In/Out struct pairs for all 10 pure functions; Envelope decoder for
int/float/none disambiguation.
- 10 pure-function test packages + bespoke reconcile test with per-cell
float tolerance (math.Abs <= 0.01 for `paid` values).
Makefile: go-parity, go-test-all, capture-fixtures targets.
go/tests/fixtures/README.md: refresh workflow + PII audit guide.
Gate: make go-test green, make go-parity green (11/11 packages),
make go-lint clean (parity tag), make go-build clean.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
565
scripts/_fixture_seeds.py
Normal file
565
scripts/_fixture_seeds.py
Normal file
@@ -0,0 +1,565 @@
|
||||
"""Fixture seed registry for capture_fixtures.py.
|
||||
|
||||
Seeds are keyed by (func_name, case_id). Values are dicts whose keys
|
||||
match the fixture input schema defined in docs/plans/2026-05-06-2111-go-m3-fixture-capture.md.
|
||||
|
||||
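Real-data seeds for parse_month_references are loaded from
tmp/payments_transactions_cache.json at hardcoded indices selected once
interactively for coverage. Real-data seeds for match_members (which would
also read tmp/attendance_regular_cache.json) are currently disabled:
_REAL_MM_INDICES is empty, so only the synthetic match_members seeds are used.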
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper to load cache files
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_REPO = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
|
||||
def _load_cache(name: str) -> Any:
    """Load a JSON cache file from the repository's tmp/ directory.

    Returns the decoded JSON document, or None when tmp/<name> does not
    exist (callers treat a missing cache as "no real-data seeds").
    """
    cache_path = os.path.join(_REPO, "tmp", name)
    if os.path.exists(cache_path):
        with open(cache_path, encoding="utf-8") as handle:
            return json.load(handle)
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Handcrafted seed registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
SEEDS: dict[tuple[str, str], dict] = {}
|
||||
|
||||
|
||||
# --- normalize ---
|
||||
|
||||
SEEDS[("normalize", "simple_ascii")] = {"text": "hello world"}
|
||||
SEEDS[("normalize", "czech_basic")] = {"text": "štefan čakrtový"}
|
||||
SEEDS[("normalize", "czech_full_set")] = {
|
||||
"text": "áčďéěíňóřšťůúýžÁČĎÉĚÍŇÓŘŠŤŮÚÝŽ"
|
||||
}
|
||||
SEEDS[("normalize", "with_parens")] = {"text": "Pavel Smutný (Štrúdl)"}
|
||||
SEEDS[("normalize", "mixed_case")] = {"text": "Henrietta OTTOVÁ"}
|
||||
SEEDS[("normalize", "empty_string")] = {"text": ""}
|
||||
SEEDS[("normalize", "digits_symbols")] = {"text": "FUJ2026! +3"}
|
||||
|
||||
|
||||
# --- parse_month_references ---
|
||||
|
||||
SEEDS[("parse_month_references", "empty_string")] = {
|
||||
"text": "", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "single_czech_leden")] = {
|
||||
"text": "leden", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "single_czech_prosinec_high_month")] = {
|
||||
"text": "prosinec", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "single_czech_rijen_high_month")] = {
|
||||
"text": "říjen", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "range_wrap_prosinec_leden")] = {
|
||||
"text": "prosinec-leden", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "range_wrap_listopad_leden")] = {
|
||||
"text": "listopad-leden", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "range_no_wrap_leden_unor")] = {
|
||||
"text": "leden-únor", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "numeric_slash_two_digit_year")] = {
|
||||
"text": "01/26", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "numeric_slash_four_digit_year")] = {
|
||||
"text": "1/2026", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "numeric_slash_leading_zero")] = {
|
||||
"text": "03/2026", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "numeric_plus_multi")] = {
|
||||
"text": "11+12/2025", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "numeric_dot_format")] = {
|
||||
"text": "12.2025", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "mixed_czech_numeric")] = {
|
||||
"text": "leden+únor+03/2026", "default_year": 2026
|
||||
}
|
||||
SEEDS[("parse_month_references", "no_month_found")] = {
|
||||
"text": "random text without months", "default_year": 2026
|
||||
}
|
||||
|
||||
|
||||
# --- calculate_fee ---
|
||||
|
||||
SEEDS[("calculate_fee", "zero_sessions")] = {
|
||||
"attendance_count": 0, "month_key": "2026-01"
|
||||
}
|
||||
SEEDS[("calculate_fee", "one_session")] = {
|
||||
"attendance_count": 1, "month_key": "2026-01"
|
||||
}
|
||||
SEEDS[("calculate_fee", "two_sessions_known_rate")] = {
|
||||
"attendance_count": 2, "month_key": "2026-01"
|
||||
}
|
||||
SEEDS[("calculate_fee", "three_sessions_known_rate")] = {
|
||||
"attendance_count": 3, "month_key": "2026-02"
|
||||
}
|
||||
SEEDS[("calculate_fee", "two_sessions_reduced_march")] = {
|
||||
"attendance_count": 2, "month_key": "2026-03"
|
||||
}
|
||||
SEEDS[("calculate_fee", "two_sessions_default_fallback")] = {
|
||||
"attendance_count": 2, "month_key": "2099-01"
|
||||
}
|
||||
|
||||
|
||||
# --- calculate_junior_fee ---
|
||||
|
||||
SEEDS[("calculate_junior_fee", "zero_sessions")] = {
|
||||
"attendance_count": 0, "month_key": "2026-01"
|
||||
}
|
||||
SEEDS[("calculate_junior_fee", "one_session_unknown")] = {
|
||||
"attendance_count": 1, "month_key": "2026-01"
|
||||
}
|
||||
SEEDS[("calculate_junior_fee", "two_sessions_default")] = {
|
||||
"attendance_count": 2, "month_key": "2026-01"
|
||||
}
|
||||
SEEDS[("calculate_junior_fee", "two_sessions_reduced_march")] = {
|
||||
"attendance_count": 2, "month_key": "2026-03"
|
||||
}
|
||||
SEEDS[("calculate_junior_fee", "two_sessions_reduced_sep")] = {
|
||||
"attendance_count": 2, "month_key": "2025-09"
|
||||
}
|
||||
SEEDS[("calculate_junior_fee", "two_sessions_default_fallback")] = {
|
||||
"attendance_count": 2, "month_key": "2099-06"
|
||||
}
|
||||
|
||||
|
||||
# --- parse_czk_amount ---
|
||||
|
||||
SEEDS[("parse_czk_amount", "none_value")] = {
|
||||
"val": {"type": "none"}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "empty_string")] = {
|
||||
"val": {"type": "string", "value": ""}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "plain_int")] = {
|
||||
"val": {"type": "int", "value": 750}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "plain_float")] = {
|
||||
"val": {"type": "float", "value": 750.0}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "czech_comma_decimal")] = {
|
||||
"val": {"type": "string", "value": "1.500,00"}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "czech_comma_no_thousands")] = {
|
||||
"val": {"type": "string", "value": "750,00"}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "dot_decimal")] = {
|
||||
"val": {"type": "string", "value": "1500.00"}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "dot_thousand_separator")] = {
|
||||
"val": {"type": "string", "value": "1.500"}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "with_kc_suffix")] = {
|
||||
"val": {"type": "string", "value": "750 Kč"}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "with_czk_suffix")] = {
|
||||
"val": {"type": "string", "value": "1500CZK"}
|
||||
}
|
||||
SEEDS[("parse_czk_amount", "space_thousands")] = {
|
||||
"val": {"type": "string", "value": "1 500"}
|
||||
}
|
||||
|
||||
|
||||
# --- generate_sync_id ---
|
||||
|
||||
def _sync_tx(date, amount, currency, sender, vs, message, bank_id):
|
||||
"""Build a generate_sync_id input seed."""
|
||||
return {
|
||||
"tx": {
|
||||
"date": date,
|
||||
"amount": amount,
|
||||
"currency": currency,
|
||||
"sender": sender,
|
||||
"vs": vs,
|
||||
"message": message,
|
||||
"bank_id": bank_id,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
SEEDS[("generate_sync_id", "typical_float_amount")] = _sync_tx(
|
||||
"2026-01-15",
|
||||
{"type": "float", "value": 750.0},
|
||||
"CZK",
|
||||
"Test Sender",
|
||||
"123456",
|
||||
"pausal leden",
|
||||
"100000001",
|
||||
)
|
||||
SEEDS[("generate_sync_id", "integer_amount")] = _sync_tx(
|
||||
"2026-01-15",
|
||||
{"type": "int", "value": 750},
|
||||
"CZK",
|
||||
"Test Sender",
|
||||
"123456",
|
||||
"pausal leden",
|
||||
"100000001",
|
||||
)
|
||||
SEEDS[("generate_sync_id", "missing_currency")] = {
|
||||
"tx": {
|
||||
"date": "2026-02-01",
|
||||
"amount": {"type": "float", "value": 500.0},
|
||||
"sender": "Another Person",
|
||||
"vs": "654321",
|
||||
"message": "trenink",
|
||||
"bank_id": "200000002",
|
||||
}
|
||||
}
|
||||
SEEDS[("generate_sync_id", "empty_fields")] = _sync_tx(
|
||||
"2026-03-01",
|
||||
{"type": "float", "value": 0.0},
|
||||
"CZK",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
)
|
||||
SEEDS[("generate_sync_id", "large_amount")] = _sync_tx(
|
||||
"2025-10-05",
|
||||
{"type": "float", "value": 2100.0},
|
||||
"CZK",
|
||||
"Payer Name",
|
||||
"987654",
|
||||
"FUJ treninky",
|
||||
"300000003",
|
||||
)
|
||||
|
||||
|
||||
# --- build_name_variants ---
|
||||
|
||||
SEEDS[("build_name_variants", "full_name_no_nick")] = {
|
||||
"full_name": "Jan Novák"
|
||||
}
|
||||
SEEDS[("build_name_variants", "with_nickname")] = {
|
||||
"full_name": "František Vrbík (Štrúdl)"
|
||||
}
|
||||
SEEDS[("build_name_variants", "three_word_name")] = {
|
||||
"full_name": "Jan Tomášek (Honza)"
|
||||
}
|
||||
SEEDS[("build_name_variants", "single_word")] = {
|
||||
"full_name": "Jáchym"
|
||||
}
|
||||
SEEDS[("build_name_variants", "short_name_filtered")] = {
|
||||
"full_name": "Jo"
|
||||
}
|
||||
SEEDS[("build_name_variants", "common_diacritics")] = {
|
||||
"full_name": "Alžběta Testovická"
|
||||
}
|
||||
|
||||
|
||||
# --- match_members ---
|
||||
|
||||
# Synthetic roster — deliberately NOT real member names.
|
||||
# Tomáš Fiktivný has a nickname (Tov) for nickname-match tests.
|
||||
# Pavel Smutný has a nickname (Štrúdl) for nickname tests.
|
||||
# Adam Novák: normalized last name "novak" is in _COMMON_SURNAMES → common-surname filter test.
|
||||
_ROSTER = [
|
||||
"Alžběta Testovická",
|
||||
"Tomáš Fiktivný (Tov)",
|
||||
"Pavel Smutný (Štrúdl)",
|
||||
"Jana Nováková",
|
||||
"Adam Novák",
|
||||
]
|
||||
|
||||
SEEDS[("match_members", "exact_full_name")] = {
|
||||
"text": "platba od alzbeta testovicka leden",
|
||||
"member_names": _ROSTER,
|
||||
}
|
||||
SEEDS[("match_members", "first_and_last")] = {
|
||||
"text": "jan nový payment tomas fiktivny",
|
||||
"member_names": _ROSTER,
|
||||
}
|
||||
SEEDS[("match_members", "nickname_match")] = {
|
||||
"text": "payment from strudl",
|
||||
"member_names": _ROSTER,
|
||||
}
|
||||
SEEDS[("match_members", "review_lastname_only")] = {
|
||||
"text": "testovicka leden",
|
||||
"member_names": _ROSTER,
|
||||
}
|
||||
SEEDS[("match_members", "common_surname_no_match")] = {
|
||||
"text": "novak leden",
|
||||
"member_names": _ROSTER,
|
||||
}
|
||||
SEEDS[("match_members", "no_match")] = {
|
||||
"text": "xyz platba",
|
||||
"member_names": _ROSTER,
|
||||
}
|
||||
SEEDS[("match_members", "two_members_exact")] = {
|
||||
"text": "pavel smutny a alzbeta testovicka",
|
||||
"member_names": _ROSTER,
|
||||
}
|
||||
|
||||
|
||||
# --- infer_transaction_details ---
|
||||
|
||||
SEEDS[("infer_transaction_details", "member_in_message")] = {
|
||||
"tx": {
|
||||
"sender": "Test Payer",
|
||||
"message": "alzbeta testovicka leden 2026",
|
||||
"user_id": "",
|
||||
"date": {"type": "string", "value": "2026-01-15"},
|
||||
},
|
||||
"member_names": _ROSTER,
|
||||
"default_year": 2026,
|
||||
}
|
||||
SEEDS[("infer_transaction_details", "member_in_sender")] = {
|
||||
"tx": {
|
||||
"sender": "Tomáš Fiktivný",
|
||||
"message": "FUJ trenink",
|
||||
"user_id": "",
|
||||
"date": {"type": "string", "value": "2026-02-01"},
|
||||
},
|
||||
"member_names": _ROSTER,
|
||||
"default_year": 2026,
|
||||
}
|
||||
SEEDS[("infer_transaction_details", "month_fallback_from_date")] = {
|
||||
"tx": {
|
||||
"sender": "Alžběta Testovická",
|
||||
"message": "platba",
|
||||
"user_id": "",
|
||||
"date": {"type": "string", "value": "2026-03-15"},
|
||||
},
|
||||
"member_names": _ROSTER,
|
||||
"default_year": 2026,
|
||||
}
|
||||
SEEDS[("infer_transaction_details", "serial_date")] = {
    "tx": {
        "sender": "Jana Nováková",
        "message": "leden",
        "user_id": "",
        # Date supplied as a float spreadsheet serial instead of an ISO string.
        # NOTE(review): 46027 is 2026-01-05 under the Sheets epoch (1899-12-30);
        # 2026-01-15 would be 46037 — TODO confirm which date was intended.
        "date": {"type": "float", "value": 46027.0},
    },
    "member_names": _ROSTER,
    "default_year": 2026,
}
|
||||
SEEDS[("infer_transaction_details", "no_member_no_month")] = {
|
||||
"tx": {
|
||||
"sender": "Unknown Person",
|
||||
"message": "random text",
|
||||
"user_id": "",
|
||||
"date": {"type": "none"},
|
||||
},
|
||||
"member_names": _ROSTER,
|
||||
"default_year": 2026,
|
||||
}
|
||||
|
||||
|
||||
# --- format_date ---
|
||||
|
||||
SEEDS[("format_date", "string_iso")] = {"val": {"type": "string", "value": "2026-01-15"}}
|
||||
SEEDS[("format_date", "string_non_iso")] = {"val": {"type": "string", "value": "garbage"}}
|
||||
SEEDS[("format_date", "empty_string")] = {"val": {"type": "string", "value": ""}}
|
||||
SEEDS[("format_date", "none_value")] = {"val": {"type": "none"}}
|
||||
SEEDS[("format_date", "serial_int")] = {"val": {"type": "int", "value": 46027}}
|
||||
SEEDS[("format_date", "serial_float")] = {"val": {"type": "float", "value": 46027.5}}
|
||||
SEEDS[("format_date", "serial_float_exact")] = {"val": {"type": "float", "value": 45957.0}}  # NOTE(review): 45957 is 2025-10-27 under the Sheets epoch (1899-12-30); 2025-10-01 would be 45931 — TODO confirm
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Reconcile handcrafted seeds
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _tx(date, amount, person, purpose, sender="Payer", message="", bank_id="", inferred_amount=None):
|
||||
return {
|
||||
"date": date,
|
||||
"amount": amount,
|
||||
"manual_fix": "",
|
||||
"person": person,
|
||||
"purpose": purpose,
|
||||
"inferred_amount": inferred_amount if inferred_amount is not None else amount,
|
||||
"sender": sender,
|
||||
"message": message,
|
||||
"bank_id": bank_id,
|
||||
}
|
||||
|
||||
|
||||
def _member(name, tier, fees: dict):
|
||||
"""fees: {month: (fee, count) or int}. Returns a dict so the scrubber
|
||||
can find the 'name' key and apply deterministic pseudonymisation."""
|
||||
return {"name": name, "tier": tier, "fees": fees}
|
||||
|
||||
|
||||
def _reconcile_seed(members, sorted_months, transactions, exceptions=None, default_year=2026):
|
||||
return {
|
||||
"members": members,
|
||||
"sorted_months": sorted_months,
|
||||
"transactions": transactions,
|
||||
"exceptions": exceptions or [],
|
||||
"default_year": default_year,
|
||||
}
|
||||
|
||||
|
||||
# 01 — greedy exact: Alice pays exactly 750, expected 750
|
||||
SEEDS[("reconcile", "01_greedy_exact")] = _reconcile_seed(
|
||||
members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
|
||||
sorted_months=["2026-01"],
|
||||
transactions=[_tx("2026-01-20", 750, "Alice Dvořák", "2026-01", sender="Alice Dvořák")],
|
||||
)
|
||||
|
||||
# 02 — greedy overpayment → credit: Alice pays 900, expected 750
|
||||
SEEDS[("reconcile", "02_greedy_overpayment")] = _reconcile_seed(
|
||||
members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
|
||||
sorted_months=["2026-01"],
|
||||
transactions=[_tx("2026-01-20", 900, "Alice Dvořák", "2026-01", sender="Alice Dvořák")],
|
||||
)
|
||||
|
||||
# 03 — proportional: Alice pays 800 for 3 months (750+750+350=1850 expected)
|
||||
SEEDS[("reconcile", "03_proportional_remainder")] = _reconcile_seed(
|
||||
members=[_member("Alice Dvořák", "A", {
|
||||
"2026-01": (750, 3),
|
||||
"2026-02": (750, 2),
|
||||
"2026-03": (350, 2),
|
||||
})],
|
||||
sorted_months=["2026-01", "2026-02", "2026-03"],
|
||||
transactions=[_tx("2026-03-10", 800, "Alice Dvořák", "2026-01,2026-02,2026-03", sender="Alice Dvořák")],
|
||||
)
|
||||
|
||||
# 04 — even-split: all expected=0, payment spread evenly
|
||||
SEEDS[("reconcile", "04_even_split_prepayment")] = _reconcile_seed(
|
||||
members=[_member("Bob Kratochvíl", "A", {
|
||||
"2026-04": (0, 0),
|
||||
"2026-05": (0, 0),
|
||||
})],
|
||||
sorted_months=["2026-04", "2026-05"],
|
||||
transactions=[_tx("2026-03-25", 700, "Bob Kratochvíl", "2026-04,2026-05", sender="Bob Kratochvíl")],
|
||||
)
|
||||
|
||||
# 05 — out-of-window: payment references 2025-08 which is outside sorted_months
|
||||
SEEDS[("reconcile", "05_out_of_window_credit")] = _reconcile_seed(
|
||||
members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
|
||||
sorted_months=["2026-01"],
|
||||
transactions=[_tx("2026-01-20", 1500, "Alice Dvořák", "2026-01,2025-08", sender="Alice Dvořák")],
|
||||
)
|
||||
|
||||
# 06 — exception override: Alice's 2026-01 fee overridden from 750 to 300
|
||||
SEEDS[("reconcile", "06_exception_override")] = _reconcile_seed(
    members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
    sorted_months=["2026-01"],
    transactions=[_tx("2026-01-20", 300, "Alice Dvořák", "2026-01", sender="Alice Dvořák")],
    # Exceptions are given as dict rows {name, period, amount, note};
    # capture_fixtures.py converts them to the dict reconcile() expects.
    exceptions=[{"name": "Alice Dvořák", "period": "2026-01", "amount": 300, "note": "injury discount"}],
)
|
||||
|
||||
# 07 — other purpose: tournament fee split between Alice and Bob
|
||||
SEEDS[("reconcile", "07_other_purpose_split")] = _reconcile_seed(
|
||||
members=[
|
||||
_member("Alice Dvořák", "A", {"2026-01": (750, 3)}),
|
||||
_member("Bob Kratochvíl", "A", {"2026-01": (750, 2)}),
|
||||
],
|
||||
sorted_months=["2026-01"],
|
||||
transactions=[_tx("2026-01-10", 800, "Alice Dvořák, Bob Kratochvíl", "other:tournament", sender="Alice Dvořák")],
|
||||
)
|
||||
|
||||
# 08 — junior with attendance=1 (expected=0 in reconcile, unknown in UI)
|
||||
SEEDS[("reconcile", "08_junior_question_mark")] = _reconcile_seed(
|
||||
members=[_member("Karel Junior", "A", {"2026-01": (0, 1)})],
|
||||
sorted_months=["2026-01"],
|
||||
transactions=[_tx("2026-01-20", 200, "Karel Junior", "2026-01", sender="Karel Junior")],
|
||||
)
|
||||
|
||||
# 09 — multi-person comma-split: Alice and Bob share a payment for 2 months
|
||||
SEEDS[("reconcile", "09_multiperson_multimonth")] = _reconcile_seed(
|
||||
members=[
|
||||
_member("Alice Dvořák", "A", {"2026-01": (750, 3), "2026-02": (750, 2)}),
|
||||
_member("Bob Kratochvíl", "A", {"2026-01": (750, 2), "2026-02": (350, 2)}),
|
||||
],
|
||||
sorted_months=["2026-01", "2026-02"],
|
||||
transactions=[_tx("2026-02-15", 2000, "Alice Dvořák, Bob Kratochvíl", "2026-01,2026-02", sender="Alice Dvořák")],
|
||||
)
|
||||
|
||||
# 10 — unmatched: no person, garbage message
|
||||
SEEDS[("reconcile", "10_unmatched")] = _reconcile_seed(
|
||||
members=[_member("Alice Dvořák", "A", {"2026-01": (750, 3)})],
|
||||
sorted_months=["2026-01"],
|
||||
transactions=[_tx("2026-01-20", 500, "", "", sender="Unknown Payer", message="garbage xyz 999")],
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Real-data seeds
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Indices into tmp/payments_transactions_cache.json['data'] selected for coverage.
|
||||
# DO NOT change these — they are deliberately frozen to make re-runs deterministic.
|
||||
_REAL_PMR_INDICES = [
|
||||
(16, "real_single_leden"),
|
||||
(17, "real_range_prosinec_leden"),
|
||||
(18, "real_list_prosinec_leden_unor"),
|
||||
(22, "real_martin_prosinec_leden"),
|
||||
(23, "real_range_listopad_leden"),
|
||||
(25, "real_filip_prosinec_leden_unor"),
|
||||
(36, "real_mixed_czech_numeric"),
|
||||
(42, "real_dominika_numeric_multi"),
|
||||
# index 67 removed: the name-sweep scrubber changes the text prefix in a way
|
||||
# that breaks the numeric-slash parser (empty result vs expected "2026-03").
|
||||
(72, "real_tomik_numeric_plus"),
|
||||
(73, "real_franc_numeric_space"),
|
||||
(74, "real_jana_numeric_multi"),
|
||||
(80, "real_alex_numeric_long"),
|
||||
(89, "real_emily_numeric_long"),
|
||||
(90, "real_jachym_numeric_multi"),
|
||||
]
|
||||
|
||||
# Real match_members seeds are intentionally omitted: after PII scrubbing
|
||||
# the member_names pseudonyms are inconsistent with the (un-scrubbed) text,
|
||||
# causing all Go parity assertions to fail. The synthetic seeds above cover
|
||||
# the same code paths without any real data.
|
||||
_REAL_MM_INDICES: list = []
|
||||
|
||||
|
||||
def real_parse_month_references_seeds(default_year: int = 2026):
    """Yield (case_id, seed) pairs built from real cached transaction messages.

    Reads payments_transactions_cache.json via _load_cache and walks the
    frozen (index, case_id) table _REAL_PMR_INDICES. Yields nothing when
    the cache file is missing; indices beyond the cached data are skipped.
    """
    cache = _load_cache("payments_transactions_cache.json")
    if cache is None:
        return
    transactions = cache.get("data", [])
    for position, case_id in _REAL_PMR_INDICES:
        if position < len(transactions):
            message_text = str(transactions[position].get("message", ""))
            yield case_id, {"text": message_text, "default_year": default_year}
|
||||
|
||||
|
||||
def _real_member_names():
    """Return canonical member names from the regular attendance cache.

    Returns [] when attendance_regular_cache.json is missing. If the first
    element of the cached "data" is itself a list, that element is used as
    the row collection. The name is taken from column 0 of every row that
    is a list/tuple with at least two cells.
    """
    cache = _load_cache("attendance_regular_cache.json")
    if cache is None:
        return []
    rows = cache.get("data", [])
    if rows and isinstance(rows[0], list):
        rows = rows[0]
    names = []
    for row in rows:
        if isinstance(row, (list, tuple)) and len(row) >= 2:
            names.append(row[0])
    return names
|
||||
|
||||
|
||||
def real_match_members_seeds():
    """Yield (case_id, seed) using real senders/messages against the real roster.

    Yields nothing when the transactions cache is missing or the roster is
    empty. Each seed's text is "<sender> <message>" of the transaction at
    an index listed in _REAL_MM_INDICES; out-of-range indices are skipped.
    """
    cache = _load_cache("payments_transactions_cache.json")
    roster = _real_member_names()
    if cache is None or not roster:
        return
    transactions = cache.get("data", [])
    for position, case_id in _REAL_MM_INDICES:
        if position < len(transactions):
            record = transactions[position]
            sender = str(record.get("sender", ""))
            message = str(record.get("message", ""))
            yield case_id, {"text": f"{sender} {message}", "member_names": roster}
|
||||
46
scripts/capture_all_fixtures.sh
Executable file
46
scripts/capture_all_fixtures.sh
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env bash
|
||||
# Regenerate the full fixture corpus.
|
||||
# Safe to re-run — always overwrites.
|
||||
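# Real-data seeds read tmp/*_cache.json (used for parse_month_references;
# real match_members seeds are currently disabled in _fixture_seeds.py).
# The seed loaders yield nothing when a cache file is missing.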
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
FIXTURES="$REPO/go/tests/fixtures"
|
||||
CAPTURE_CMD="PYTHONPATH=$REPO/scripts:. python3 $REPO/scripts/capture_fixtures.py"
|
||||
SCRUB_CMD="python3 $REPO/scripts/scrub_fixtures.py"
|
||||
|
||||
# run_func FUNC
# Capture every seed of one pure function and write one scrubbed fixture per
# case to $FIXTURES/pure/FUNC/<case>.json.
#
# The capture command emits newline-delimited JSON (one object per line); each
# line is parsed for its "case" field and then piped through the scrubber.
run_func() {
    local func="$1"
    local dir="$FIXTURES/pure/$func"
    local line case_id
    mkdir -p "$dir"
    echo "  Capturing $func..."
    eval "$CAPTURE_CMD --func $func --all" | while IFS= read -r line; do
        # Read the JSON from stdin only. Splicing "$line" into the Python
        # source text (json.loads('''...''')) breaks on backslashes/quotes
        # and would execute arbitrary code if a message contained '''.
        case_id="$(printf '%s\n' "$line" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['case'])")"
        printf '%s\n' "$line" | python3 "$REPO/scripts/scrub_fixtures.py" > "$dir/${case_id}.json"
    done
}
|
||||
|
||||
echo "==> Capturing pure-function fixtures..."
|
||||
|
||||
run_func normalize
|
||||
run_func parse_month_references
|
||||
run_func calculate_fee
|
||||
run_func calculate_junior_fee
|
||||
run_func parse_czk_amount
|
||||
run_func generate_sync_id
|
||||
run_func build_name_variants
|
||||
run_func match_members
|
||||
run_func infer_transaction_details
|
||||
run_func format_date
|
||||
|
||||
echo "==> Capturing reconcile fixtures..."
|
||||
mkdir -p "$FIXTURES/reconcile"
|
||||
eval "$CAPTURE_CMD --func reconcile --all" | while IFS= read -r line; do
|
||||
case_id="$(echo "$line" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['case'])")"
|
||||
echo "$line" | python3 "$REPO/scripts/scrub_fixtures.py" > "$FIXTURES/reconcile/${case_id}.json"
|
||||
done
|
||||
|
||||
echo "==> Done. Review with: git diff go/tests/fixtures/"
|
||||
echo "==> Audit PII: git ls-files go/tests/fixtures | xargs grep -l '<real name>' should return zero."
|
||||
353
scripts/capture_fixtures.py
Normal file
353
scripts/capture_fixtures.py
Normal file
@@ -0,0 +1,353 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Capture pure-function output as JSON fixtures for parity testing.
|
||||
|
||||
A single-case invocation emits exactly one JSON object to stdout; with --all,
one JSON object is emitted per line.
|
||||
Pipe through scrub_fixtures.py before writing to go/tests/fixtures/.
|
||||
|
||||
Usage:
|
||||
# Single case:
|
||||
python capture_fixtures.py --func normalize --case simple_ascii \\
|
||||
--input-seed simple_ascii | python scrub_fixtures.py \\
|
||||
> go/tests/fixtures/pure/normalize/simple_ascii.json
|
||||
|
||||
# All seeds for a function (newline-delimited JSON, one object per line):
|
||||
python capture_fixtures.py --func normalize --all
|
||||
|
||||
# Feed input from stdin (for ad-hoc cases):
|
||||
echo '{"text":"hello"}' | python capture_fixtures.py --func normalize \\
|
||||
--case adhoc --input-stdin
|
||||
|
||||
See scripts/_fixture_seeds.py for the seed registry.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import datetime
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from czech_utils import normalize, parse_month_references
|
||||
from attendance import calculate_fee, calculate_junior_fee
|
||||
from infer_payments import parse_czk_amount
|
||||
from sync_fio_to_sheets import generate_sync_id as _py_generate_sync_id
|
||||
from match_payments import (
|
||||
_build_name_variants,
|
||||
match_members,
|
||||
infer_transaction_details,
|
||||
format_date,
|
||||
reconcile,
|
||||
)
|
||||
from czech_utils import normalize as _norm
|
||||
|
||||
import _fixture_seeds as seeds
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Type-envelope helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _decode_envelope(envelope):
|
||||
"""Convert a {type, value} envelope to a Python value for function calls."""
|
||||
if not isinstance(envelope, dict):
|
||||
return envelope
|
||||
t = envelope.get("type", "raw")
|
||||
v = envelope.get("value")
|
||||
if t == "none":
|
||||
return None
|
||||
if t == "int":
|
||||
return int(v)
|
||||
if t == "float":
|
||||
return float(v)
|
||||
if t == "string":
|
||||
return v
|
||||
return v # raw JSON value (for fields that don't use an envelope)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-function capture implementations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def capture_normalize(inp: dict) -> dict:
    """Run normalize() on inp["text"] and return it under the "text" key."""
    return {"text": normalize(inp["text"])}
|
||||
|
||||
|
||||
def capture_parse_month_references(inp: dict) -> dict:
    """Run parse_month_references() and return its result under "months".

    The year defaults to 2026 when the seed has no "default_year".
    """
    text = inp["text"]
    year = inp.get("default_year", 2026)
    return {"months": parse_month_references(text, year)}
|
||||
|
||||
|
||||
def capture_calculate_fee(inp: dict) -> dict:
    """Run calculate_fee(attendance_count, month_key); return it under "fee"."""
    return {"fee": calculate_fee(inp["attendance_count"], inp["month_key"])}
|
||||
|
||||
|
||||
def capture_calculate_junior_fee(inp: dict) -> dict:
    """Run calculate_junior_fee() and encode its result as {value, unknown}.

    A "?" result becomes {"value": 0, "unknown": True}; anything else is
    converted with int() and reported with "unknown": False.
    """
    fee = calculate_junior_fee(inp["attendance_count"], inp["month_key"])
    if fee != "?":
        return {"value": int(fee), "unknown": False}
    return {"value": 0, "unknown": True}
|
||||
|
||||
|
||||
def capture_parse_czk_amount(inp: dict) -> dict:
    """Decode the "val" envelope, run parse_czk_amount(), return a float "amount"."""
    decoded = _decode_envelope(inp["val"])
    return {"amount": float(parse_czk_amount(decoded))}
|
||||
|
||||
|
||||
def capture_generate_sync_id(inp: dict) -> dict:
    """Run generate_sync_id() on the seed transaction; return it under "sync_id".

    The "amount" envelope is decoded to its Python-native type first so
    that str(amount) inside generate_sync_id is replicated faithfully.
    The decoded amount ends up as the last key of the dict passed on.
    """
    tx = dict(inp["tx"])
    amount_envelope = tx.pop("amount")
    tx["amount"] = _decode_envelope(amount_envelope)
    return {"sync_id": _py_generate_sync_id(tx)}
|
||||
|
||||
|
||||
def capture_build_name_variants(inp: dict) -> dict:
    """Run _build_name_variants(full_name); return it under "variants"."""
    return {"variants": _build_name_variants(inp["full_name"])}
|
||||
|
||||
|
||||
def capture_match_members(inp: dict) -> dict:
    """Run match_members() and serialize its (name, confidence) pairs.

    Each pair becomes {"name": ..., "confidence": ...} in the "matches" list.
    """
    serialized = []
    for member_name, confidence in match_members(inp["text"], inp["member_names"]):
        serialized.append({"name": member_name, "confidence": confidence})
    return {"matches": serialized}
|
||||
|
||||
|
||||
def capture_infer_transaction_details(inp: dict) -> dict:
    """Run infer_transaction_details() and serialize matches, months and text.

    The transaction's "date" envelope is decoded to a native value on a
    copy of the seed transaction. "search_text" falls back to the result's
    "matched_text", then to "".

    NOTE(review): the seed's "default_year" is not forwarded to
    infer_transaction_details — confirm that is intended.
    """
    tx_in = inp["tx"]
    tx = dict(tx_in)
    tx["date"] = _decode_envelope(tx_in.get("date"))
    inferred = infer_transaction_details(tx, inp["member_names"])
    matches = []
    for member_name, confidence in inferred["members"]:
        matches.append({"name": member_name, "confidence": confidence})
    return {
        "matches": matches,
        "months": inferred["months"],
        "search_text": inferred.get("search_text", inferred.get("matched_text", "")),
    }
|
||||
|
||||
|
||||
def capture_format_date(inp: dict) -> dict:
    """Decode the "val" envelope, run format_date(), return it under "date"."""
    return {"date": format_date(_decode_envelope(inp["val"]))}
|
||||
|
||||
|
||||
def _build_exceptions(exc_list):
|
||||
"""Convert seed exceptions to the dict reconcile() expects.
|
||||
Accepts both the legacy list format [name, period, amount, note] and the
|
||||
new dict format {"name": ..., "period": ..., "amount": ..., "note": ...}."""
|
||||
if not exc_list:
|
||||
return {}
|
||||
result = {}
|
||||
for row in exc_list:
|
||||
if isinstance(row, dict):
|
||||
name = row.get("name", "")
|
||||
period = row.get("period", "")
|
||||
amount = row.get("amount", 0)
|
||||
note = row.get("note", "")
|
||||
else:
|
||||
name, period, amount = row[0], row[1], row[2]
|
||||
note = row[3] if len(row) > 3 else ""
|
||||
result[(_norm(name), _norm(period))] = {"amount": int(amount), "note": note}
|
||||
return result
|
||||
|
||||
|
||||
def _member_fee_dict(fees_raw: dict) -> dict:
|
||||
"""Convert seed fees dict to the form reconcile() expects."""
|
||||
# Seeds store fees as [fee, count] lists (JSON) or (fee, count) tuples.
|
||||
result = {}
|
||||
for month, v in fees_raw.items():
|
||||
if isinstance(v, (list, tuple)) and len(v) == 2:
|
||||
result[month] = (int(v[0]), int(v[1]))
|
||||
else:
|
||||
result[month] = int(v)
|
||||
return result
|
||||
|
||||
|
||||
def _tx_entry_out(tx):
|
||||
"""Convert a reconcile output TxEntry dict to a serializable form."""
|
||||
return {
|
||||
"amount": float(tx.get("amount", 0)),
|
||||
"date": tx.get("date", ""),
|
||||
"sender": tx.get("sender", ""),
|
||||
"message": tx.get("message", ""),
|
||||
"confidence": tx.get("confidence", ""),
|
||||
}
|
||||
|
||||
|
||||
def _other_entry_out(e):
|
||||
return {
|
||||
"amount": float(e.get("amount", 0)),
|
||||
"date": e.get("date", ""),
|
||||
"sender": e.get("sender", ""),
|
||||
"message": e.get("message", ""),
|
||||
"purpose": e.get("purpose", ""),
|
||||
"confidence": e.get("confidence", ""),
|
||||
}
|
||||
|
||||
|
||||
def _month_data_out(md):
|
||||
return {
|
||||
"expected": int(md["expected"]) if isinstance(md["expected"], (int, float)) else 0,
|
||||
"original_expected": int(md["original_expected"]) if isinstance(md.get("original_expected"), (int, float)) else 0,
|
||||
"attendance_count": int(md.get("attendance_count", 0)),
|
||||
"exception": md.get("exception"),
|
||||
"paid": float(md["paid"]),
|
||||
"transactions": [_tx_entry_out(t) for t in md.get("transactions", [])],
|
||||
}
|
||||
|
||||
|
||||
def _unmatched_tx_out(tx):
|
||||
return {
|
||||
"date": tx.get("date", ""),
|
||||
"amount": float(tx.get("amount", 0)),
|
||||
"person": tx.get("person", ""),
|
||||
"purpose": tx.get("purpose", ""),
|
||||
"sender": tx.get("sender", ""),
|
||||
"message": tx.get("message", ""),
|
||||
"bank_id": tx.get("bank_id", ""),
|
||||
}
|
||||
|
||||
|
||||
def capture_reconcile(inp: dict) -> dict:
    """Run ``reconcile`` on a seed and return its result in serializable form.

    Seed members may be dicts ``{"name", "tier", "fees"}`` or legacy
    sequences ``[name, tier, fees]``; both are turned into
    ``(name, tier, fees)`` tuples with fees coerced by ``_member_fee_dict``.
    Exceptions are converted with ``_build_exceptions``.

    Returns ``{"members": {...}, "unmatched": [...], "credits": {...}}``.
    """

    def _as_member_tuple(member):
        if isinstance(member, dict):
            name, tier, fees_raw = member["name"], member["tier"], member.get("fees", {})
        else:
            name, tier, fees_raw = member[0], member[1], member[2]
        return (name, tier, _member_fee_dict(fees_raw))

    members = [_as_member_tuple(member) for member in inp["members"]]
    exceptions = _build_exceptions(inp.get("exceptions") or [])
    sorted_months = inp["sorted_months"]
    transactions = inp["transactions"]

    outcome = reconcile(members, sorted_months, transactions, exceptions)

    def _member_out(record):
        return {
            "tier": record["tier"],
            "months": {
                month: _month_data_out(data)
                for month, data in record["months"].items()
            },
            "other_transactions": [
                _other_entry_out(entry)
                for entry in record.get("other_transactions", [])
            ],
            "total_balance": int(record["total_balance"]),
        }

    return {
        "members": {
            name: _member_out(record) for name, record in outcome["members"].items()
        },
        "unmatched": [_unmatched_tx_out(tx) for tx in outcome["unmatched"]],
        "credits": {name: int(value) for name, value in outcome["credits"].items()},
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dispatcher
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Maps the --func CLI value to the capture wrapper that runs it. The key
# order is also the order of the --func choices shown by argparse.
_DISPATCHERS = dict(
    normalize=capture_normalize,
    parse_month_references=capture_parse_month_references,
    calculate_fee=capture_calculate_fee,
    calculate_junior_fee=capture_calculate_junior_fee,
    parse_czk_amount=capture_parse_czk_amount,
    generate_sync_id=capture_generate_sync_id,
    build_name_variants=capture_build_name_variants,
    match_members=capture_match_members,
    infer_transaction_details=capture_infer_transaction_details,
    format_date=capture_format_date,
    reconcile=capture_reconcile,
)
|
||||
|
||||
_FUNC_MODULE = {
|
||||
"normalize": "scripts.czech_utils.normalize",
|
||||
"parse_month_references": "scripts.czech_utils.parse_month_references",
|
||||
"calculate_fee": "scripts.attendance.calculate_fee",
|
||||
"calculate_junior_fee": "scripts.attendance.calculate_junior_fee",
|
||||
"parse_czk_amount": "scripts.infer_payments.parse_czk_amount",
|
||||
"generate_sync_id": "scripts.sync_fio_to_sheets.generate_sync_id",
|
||||
"build_name_variants": "scripts.match_payments._build_name_variants",
|
||||
"match_members": "scripts.match_payments.match_members",
|
||||
"infer_transaction_details": "scripts.match_payments.infer_transaction_details",
|
||||
"format_date": "scripts.match_payments.format_date",
|
||||
"reconcile": "scripts.match_payments.reconcile",
|
||||
}
|
||||
|
||||
|
||||
def _emit(func_name: str, case_id: str, inp: dict) -> None:
    """Capture one case and print it to stdout as a single-line JSON fixture.

    The fixture records the case id, the dotted path of the captured
    function, today's date, the input seed and the captured output.
    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    captured = _DISPATCHERS[func_name](inp)
    fixture = dict(
        case=case_id,
        func=_FUNC_MODULE[func_name],
        captured_at=datetime.date.today().isoformat(),
        input=inp,
        output=captured,
    )
    print(json.dumps(fixture, ensure_ascii=False))
|
||||
|
||||
|
||||
def _all_seeds(func_name: str):
    """Yield ``(case_id, seed)`` pairs for every seed registered for *func_name*.

    Entries from ``seeds.SEEDS`` come first; for ``parse_month_references``
    and ``match_members`` the corresponding ``seeds.real_*_seeds()``
    generator is appended afterwards.
    """
    yield from (
        (case_id, seed)
        for (registered_func, case_id), seed in seeds.SEEDS.items()
        if registered_func == func_name
    )

    # Real-data seeds
    if func_name == "parse_month_references":
        yield from seeds.real_parse_month_references_seeds()
    elif func_name == "match_members":
        yield from seeds.real_match_members_seeds()
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: capture one case or all seeds of a function.

    ``--func`` selects the function. Exactly one of ``--case`` / ``--all``
    is required. With ``--case`` the input comes from stdin
    (``--input-stdin``, which wins if both are given) or from the seed
    registry (``--input-seed``); giving neither is a usage error.
    """
    parser = argparse.ArgumentParser(
        description="Capture pure-function outputs as JSON fixtures."
    )
    parser.add_argument(
        "--func",
        required=True,
        choices=list(_DISPATCHERS),
        help="Function to capture.",
    )
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--case",
        help="Case ID (file stem). Use with --input-seed or --input-stdin.",
    )
    mode.add_argument(
        "--all",
        action="store_true",
        help="Emit all seeds for the function.",
    )
    parser.add_argument(
        "--input-seed",
        metavar="SEED_ID",
        help="Seed key in _fixture_seeds.SEEDS (required unless --input-stdin or --all).",
    )
    parser.add_argument(
        "--input-stdin",
        action="store_true",
        help="Read input JSON from stdin instead of seed registry.",
    )
    opts = parser.parse_args()

    if opts.all:
        for case_id, seed in _all_seeds(opts.func):
            _emit(opts.func, case_id, seed)
        return

    # Single case: an input source is mandatory (parser.error exits).
    if not (opts.input_stdin or opts.input_seed):
        parser.error("Provide --input-seed SEED_ID or --input-stdin.")

    if opts.input_stdin:
        payload = json.load(sys.stdin)
    else:
        seed_key = (opts.func, opts.input_seed)
        if seed_key not in seeds.SEEDS:
            sys.exit(f"Seed ({opts.func!r}, {opts.input_seed!r}) not found in _fixture_seeds.SEEDS")
        payload = seeds.SEEDS[seed_key]

    _emit(opts.func, opts.case, payload)


if __name__ == "__main__":
    main()
|
||||
330
scripts/scrub_fixtures.py
Normal file
330
scripts/scrub_fixtures.py
Normal file
@@ -0,0 +1,330 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Scrub PII from fixture JSON.
|
||||
|
||||
Reads one JSON fixture from stdin (as produced by capture_fixtures.py),
|
||||
replaces PII fields with deterministic pseudonyms, writes scrubbed JSON
|
||||
to stdout.
|
||||
|
||||
Run in the two-step pipeline:
|
||||
python capture_fixtures.py ... | python scrub_fixtures.py > fixture.json
|
||||
|
||||
Or process multiple lines (--multi for newline-delimited input):
|
||||
python capture_fixtures.py --func foo --all | python scrub_fixtures.py --multi \\
|
||||
| while read line; do ...
|
||||
|
||||
PII handling:
|
||||
- Member names: replaced with Member_<8hex> (sha256(name)[:8]), deterministic.
|
||||
- Senders / account numbers / VS / bank_id / user_id: stable digit-preserving hash
  for all-digit values; non-numeric values get a Member_<8hex> or id_<8hex> pseudonym.
|
||||
- Notes (exception text): replaced with "<scrubbed>".
|
||||
- Messages: name-substring sweep applied; rest preserved.
|
||||
- All other fields (dates, amounts, months, fees): preserved verbatim.
|
||||
|
||||
Function-specific exceptions:
|
||||
- match_members / infer_transaction_details: these functions are tested with
  synthetic member names only. Their fixtures are passed through unchanged
  (no field-key scrubbing, no message sweep) so Go can perform genuine name matching.
|
||||
- generate_sync_id: after normal field-key scrubbing the output sync_id is
|
||||
recomputed from the now-scrubbed inputs so the hash remains consistent.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
_REPO = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bijection helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _sha256_hex(s: str) -> str:
|
||||
return hashlib.sha256(s.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def scrub_name(name: str) -> str:
    """Return the deterministic pseudonym ``Member_<8 hex>`` for *name*.

    The hex part is the first eight characters of the SHA-256 of the name.
    Falsy input (e.g. the empty string) is returned unchanged.
    """
    if name:
        return "Member_" + _sha256_hex(name)[:8]
    return name
|
||||
|
||||
|
||||
def scrub_id_digits(s: str) -> str:
    """Hash an identifier (VS, bank_id, user_id, ...) into a stable stand-in.

    The argument is first converted with ``str()``. An all-digit value maps
    to a zero-padded digit string of the same length; anything else maps to
    ``id_<8 hex>``. The empty string is returned unchanged.
    """
    text = str(s)
    if not text:
        return text
    digest = _sha256_hex(text)
    if re.match(r"^\d+$", text) is None:
        return f"id_{digest[:8]}"
    width = len(text)
    # Reduce the digest modulo 10**width and left-pad to keep the length.
    return str(int(digest, 16) % (10 ** width)).zfill(width)
|
||||
|
||||
|
||||
def scrub_account(s: str) -> str:
    """Scrub an account number, keeping the ``PREFIX/BANKCODE`` layout.

    When the value is ``<digits>/<4 digits>``, both parts are replaced by
    same-length digit hashes. Any other non-empty value is delegated to
    ``scrub_id_digits``. The empty string is returned unchanged.
    """
    text = str(s)
    if not text:
        return text
    parsed = re.match(r"^(\d+)/(\d{4})$", text)
    if parsed is None:
        return scrub_id_digits(text)
    prefix, bank = parsed.groups()
    width = len(prefix)
    hashed_prefix = int(_sha256_hex(prefix), 16) % (10 ** width)
    hashed_bank = int(_sha256_hex(bank), 16) % 10000
    return f"{str(hashed_prefix).zfill(width)}/{str(hashed_bank).zfill(4)}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Name roster for message sweeps
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _load_member_names() -> list[str]:
    """Best-effort load of roster names from ``tmp/attendance_regular_cache.json``.

    Returns the first cell of every list/tuple row found under the cache's
    ``"data"`` key. A missing, unreadable or unexpectedly shaped cache
    yields an empty list rather than an error.
    """
    cache_path = os.path.join(_REPO, "tmp", "attendance_regular_cache.json")
    try:
        with open(cache_path, encoding="utf-8") as handle:
            payload = json.load(handle)
        rows = payload.get("data", [])
        # NOTE(review): when the first element is itself a list, only that
        # element is used as the row list. This presumably assumes "data"
        # wraps the rows in an outer list; confirm against the cache writer.
        if rows and isinstance(rows[0], list):
            rows = rows[0]
        return [
            str(row[0])
            for row in rows
            if isinstance(row, (list, tuple)) and len(row) >= 1
        ]
    except Exception:
        # Deliberate best-effort: any failure (including a missing file)
        # means "no roster available".
        return []
|
||||
|
||||
|
||||
def _build_name_map(names: list[str]) -> dict[str, str]:
|
||||
"""Map each real name (and its normalized form) to its pseudonym."""
|
||||
mapping: dict[str, str] = {}
|
||||
for name in names:
|
||||
pseudo = scrub_name(name)
|
||||
mapping[name] = pseudo
|
||||
# Also add first+last without parenthetical nicknames
|
||||
base = re.sub(r"\s*\([^)]*\)\s*", " ", name).strip()
|
||||
if base != name:
|
||||
mapping[base] = pseudo
|
||||
return mapping
|
||||
|
||||
|
||||
def _sweep_names_in_text(text: str, name_map: dict[str, str]) -> str:
|
||||
"""Replace real-name substrings in free text, longest match first."""
|
||||
# Sort descending by length so longer names replace before their substrings
|
||||
for real in sorted(name_map, key=len, reverse=True):
|
||||
if real and real in text:
|
||||
text = text.replace(real, name_map[real])
|
||||
return text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scramble whitelist — only these keys are scrambled; everything else is kept
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_SCRAMBLE_KEYS = {
|
||||
"name",
|
||||
"member_names",
|
||||
"person",
|
||||
"sender",
|
||||
"sender_account",
|
||||
"account",
|
||||
"vs",
|
||||
"bank_id",
|
||||
"user_id",
|
||||
"note",
|
||||
}
|
||||
|
||||
# Dict keys whose *child keys* (not values) are member names and need scrubbing.
|
||||
# e.g. the reconcile output: {"members": {"Alice Dvořák": {...}}, "credits": {"Alice Dvořák": 0}}
|
||||
_MEMBER_KEY_DICTS = {"members", "credits"}
|
||||
|
||||
_MESSAGE_KEYS = {"message", "text", "search_text"}
|
||||
|
||||
|
||||
def _scrub_value(key: str, value: Any, name_map: dict[str, str]) -> Any:
|
||||
"""Scrub a single value based on its field key."""
|
||||
if isinstance(value, list):
|
||||
if key == "member_names":
|
||||
return [scrub_name(str(v)) for v in value]
|
||||
# Don't propagate parent key into list elements — each element is an
|
||||
# independent document. Propagating would incorrectly flag nested dicts
|
||||
# (e.g. the fees dict inside a member tuple) as member-name-keyed dicts.
|
||||
return [_scrub_doc(v, name_map) for v in value]
|
||||
if isinstance(value, dict):
|
||||
# Pass the current key as parent context so dicts like
|
||||
# {"members": {"Real Name": ...}} get their keys scrubbed too.
|
||||
return _scrub_doc(value, name_map, _parent_key=key)
|
||||
if key not in _SCRAMBLE_KEYS and key not in _MESSAGE_KEYS:
|
||||
return value
|
||||
if not isinstance(value, str):
|
||||
value = str(value)
|
||||
if key in _MESSAGE_KEYS:
|
||||
return _sweep_names_in_text(value, name_map)
|
||||
if key == "name":
|
||||
return scrub_name(value)
|
||||
if key in ("sender_account", "account"):
|
||||
return scrub_account(value)
|
||||
if key == "note":
|
||||
return "<scrubbed>"
|
||||
if key == "person":
|
||||
# "person" may contain comma-separated member names (e.g. "Alice, Bob").
|
||||
# Sweep with name_map so each name gets its own consistent pseudonym,
|
||||
# matching what the output.members keys will look like.
|
||||
return _sweep_names_in_text(value, name_map) if value else value
|
||||
# vs, bank_id, user_id, sender
|
||||
return scrub_id_digits(value) if re.match(r"^\d+$", value) else scrub_name(value) if value else value
|
||||
|
||||
|
||||
def _scrub_doc(doc: Any, name_map: dict[str, str], _parent_key: str = "") -> Any:
|
||||
"""Recursively scrub a JSON document."""
|
||||
if isinstance(doc, dict):
|
||||
if _parent_key in _MEMBER_KEY_DICTS:
|
||||
# Keys of this dict are member names — scrub the keys and recurse.
|
||||
return {
|
||||
scrub_name(k): _scrub_doc(v, name_map)
|
||||
for k, v in doc.items()
|
||||
}
|
||||
return {k: _scrub_value(k, v, name_map) for k, v in doc.items()}
|
||||
if isinstance(doc, list):
|
||||
return [_scrub_doc(item, name_map) for item in doc]
|
||||
return doc
|
||||
|
||||
|
||||
# Functions where field-key scrubbing would break parity (name matching tests).
|
||||
# Only real-roster message sweep is applied for these.
|
||||
_NO_FIELD_SCRUB_FUNCS = {
|
||||
"scripts.match_payments.match_members",
|
||||
"scripts.match_payments.infer_transaction_details",
|
||||
}
|
||||
|
||||
|
||||
def _scrub_messages_only(doc: Any, name_map: dict[str, str]) -> Any:
|
||||
"""Sweep only message/text/search_text fields; leave all other values unchanged."""
|
||||
if isinstance(doc, dict):
|
||||
return {
|
||||
k: (_sweep_names_in_text(v, name_map) if k in _MESSAGE_KEYS and isinstance(v, str)
|
||||
else _scrub_messages_only(v, name_map))
|
||||
for k, v in doc.items()
|
||||
}
|
||||
if isinstance(doc, list):
|
||||
return [_scrub_messages_only(item, name_map) for item in doc]
|
||||
return doc
|
||||
|
||||
|
||||
def _recompute_sync_id(tx_scrubbed: dict) -> str:
|
||||
"""Recompute generate_sync_id hash from already-scrubbed tx fields.
|
||||
|
||||
After the scrubber changes sender/vs/bank_id the original hash is invalid.
|
||||
Replicates the Python generate_sync_id formula (pipe-separated, lowercased)
|
||||
and always treats amount as float64 to match Go's formatAmount behaviour.
|
||||
"""
|
||||
envelope = tx_scrubbed.get("amount", {})
|
||||
if isinstance(envelope, dict):
|
||||
t = envelope.get("type", "")
|
||||
v = envelope.get("value")
|
||||
if t in ("int", "float"):
|
||||
amount = float(v) # always float — matches Go's formatAmount
|
||||
else:
|
||||
amount = ""
|
||||
else:
|
||||
amount = float(envelope) if envelope not in (None, "") else ""
|
||||
|
||||
currency = tx_scrubbed.get("currency", "") or "CZK"
|
||||
components = [
|
||||
str(tx_scrubbed.get("date", "")),
|
||||
str(amount),
|
||||
currency,
|
||||
str(tx_scrubbed.get("sender", "")),
|
||||
str(tx_scrubbed.get("vs", "")),
|
||||
str(tx_scrubbed.get("message", "")),
|
||||
str(tx_scrubbed.get("bank_id", "")),
|
||||
]
|
||||
raw_str = "|".join(components).lower()
|
||||
return hashlib.sha256(raw_str.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _extract_inline_names(doc: Any) -> list[str]:
|
||||
"""Extract names from member_names and 'name' fields in the fixture itself."""
|
||||
names: list[str] = []
|
||||
if isinstance(doc, dict):
|
||||
for k, v in doc.items():
|
||||
if k == "member_names" and isinstance(v, list):
|
||||
names.extend(str(n) for n in v)
|
||||
elif k == "name" and isinstance(v, str):
|
||||
names.append(v)
|
||||
else:
|
||||
names.extend(_extract_inline_names(v))
|
||||
elif isinstance(doc, list):
|
||||
for item in doc:
|
||||
names.extend(_extract_inline_names(item))
|
||||
return names
|
||||
|
||||
|
||||
def scrub_fixture(doc: dict) -> dict:
    """Return a scrubbed copy of one fixture document (the input is not mutated).

    Fixtures whose ``func`` is in ``_NO_FIELD_SCRUB_FUNCS`` are copied
    through unchanged. Every other fixture is scrubbed with a name map built
    from the roster cache plus the names declared inside the fixture. For
    ``generate_sync_id`` fixtures the output ``sync_id`` is then recomputed
    from the scrubbed input so the fixture stays self-consistent.

    Args:
        doc: A fixture as emitted by capture_fixtures.py.

    Returns:
        A new dict holding the scrubbed fixture.
    """
    func = doc.get("func", "")

    # match_members / infer_transaction_details are captured with synthetic
    # names only. Scrubbing fields, or sweeping messages, would make the
    # text inconsistent with member_names / sender and break the Go
    # name-matching assertions. The empty name map turns the sweep into a
    # plain structural copy. This check runs before the roster is loaded,
    # so these fixtures do not trigger a pointless file read.
    if func in _NO_FIELD_SCRUB_FUNCS:
        return _scrub_messages_only(doc, {})

    roster_names = _load_member_names()
    inline_names = _extract_inline_names(doc)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    all_names = list(dict.fromkeys(roster_names + inline_names))
    name_map = _build_name_map(all_names)

    result = _scrub_doc(doc, name_map)

    # generate_sync_id: recompute the hash from the now-scrubbed inputs so
    # that Go, hashing the scrubbed values, arrives at the stored sync_id.
    if func.endswith("generate_sync_id"):
        result["output"]["sync_id"] = _recompute_sync_id(result["input"].get("tx", {}))

    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """Command-line entry point: scrub fixture JSON from stdin to stdout.

    Default mode reads a single JSON document and pretty-prints the result
    (``indent=2``). With ``--multi`` each non-blank input line is parsed as
    its own JSON document and written back as one compact line.
    """
    parser = argparse.ArgumentParser(description="Scrub PII from fixture JSON.")
    parser.add_argument(
        "--multi",
        action="store_true",
        help="Process newline-delimited JSON (one object per line) from stdin.",
    )
    opts = parser.parse_args()

    if not opts.multi:
        scrubbed = scrub_fixture(json.load(sys.stdin))
        print(json.dumps(scrubbed, ensure_ascii=False, indent=2))
        return

    # Lazily strip lines so stdin is still processed as a stream.
    stripped_lines = (raw.strip() for raw in sys.stdin)
    for record in filter(None, stripped_lines):
        print(json.dumps(scrub_fixture(json.loads(record)), ensure_ascii=False))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user