feat: implement automated payment inference and sync to Google Sheets

2026-03-02 14:29:45 +01:00
parent 65e40d116b
commit d719383c9c
10 changed files with 1520 additions and 264 deletions
--- a/scripts/match_payments.py
+++ b/scripts/match_payments.py
@@ -11,205 +11,7 @@ from html.parser import HTMLParser

 from attendance import get_members_with_fees
 from czech_utils import normalize, parse_month_references
-
-
-# ---------------------------------------------------------------------------
-# Transaction fetching
-# ---------------------------------------------------------------------------
-
-class _FioTableParser(HTMLParser):
-    """Parse the second <table class="table"> on the Fio transparent page.
-
-    Columns: Datum | Částka | Typ | Název protiúčtu | Zpráva pro příjemce | KS | VS | SS | Poznámka
-    Indices:   0       1       2          3                   4              5    6    7      8
-    """
-
-    def __init__(self):
-        super().__init__()
-        self._table_count = 0
-        self._in_target_table = False
-        self._in_thead = False
-        self._in_row = False
-        self._in_cell = False
-        self._current_row: list[str] = []
-        self._rows: list[list[str]] = []
-        self._cell_text = ""
-
-    def handle_starttag(self, tag, attrs):
-        cls = dict(attrs).get("class", "")
-        if tag == "table" and "table" in cls.split():
-            self._table_count += 1
-            if self._table_count == 2:
-                self._in_target_table = True
-        if self._in_target_table:
-            if tag == "thead":
-                self._in_thead = True
-            if tag == "tr" and not self._in_thead:
-                self._in_row = True
-                self._current_row = []
-            if self._in_row and tag in ("td", "th"):
-                self._in_cell = True
-                self._cell_text = ""
-
-    def handle_endtag(self, tag):
-        if self._in_cell and tag in ("td", "th"):
-            self._in_cell = False
-            self._current_row.append(self._cell_text.strip())
-        if tag == "thead":
-            self._in_thead = False
-        if self._in_row and tag == "tr":
-            self._in_row = False
-            if self._current_row:
-                self._rows.append(self._current_row)
-        if tag == "table" and self._in_target_table:
-            self._in_target_table = False
-
-    def handle_data(self, data):
-        if self._in_cell:
-            self._cell_text += data
-
-    def get_rows(self) -> list[list[str]]:
-        return self._rows
-
-
-# Fio transparent table column indices
-_COL_DATE = 0
-_COL_AMOUNT = 1
-_COL_SENDER = 3
-_COL_MESSAGE = 4
-_COL_KS = 5
-_COL_VS = 6
-_COL_SS = 7
-_COL_NOTE = 8
-
-
-def _parse_czech_amount(s: str) -> float | None:
-    """Parse '1 500,00 CZK' to float."""
-    s = s.replace("\xa0", "").replace(" ", "").replace(",", ".")
-    s = re.sub(r"[A-Za-z]+", "", s).strip()
-    try:
-        return float(s)
-    except ValueError:
-        return None
-
-
-def _parse_czech_date(s: str) -> str | None:
-    """Parse 'DD.MM.YYYY' to 'YYYY-MM-DD'."""
-    s = s.strip()
-    for fmt in ("%d.%m.%Y", "%d/%m/%Y"):
-        try:
-            return datetime.strptime(s, fmt).strftime("%Y-%m-%d")
-        except ValueError:
-            continue
-    return None
-
-
-def fetch_transactions_transparent(
-    date_from: str, date_to: str
-) -> list[dict]:
-    """Fetch transactions from Fio transparent account HTML page.
-
-    Args:
-        date_from: D.M.YYYY format
-        date_to: D.M.YYYY format
-    """
-    url = (
-        f"https://ib.fio.cz/ib/transparent?a=2800359168"
-        f"&f={date_from}&t={date_to}"
-    )
-    req = urllib.request.Request(url)
-    with urllib.request.urlopen(req) as resp:
-        html = resp.read().decode("utf-8")
-
-    parser = _FioTableParser()
-    parser.feed(html)
-    rows = parser.get_rows()
-
-    transactions = []
-    for row in rows:
-        if len(row) < 5:
-            continue
-
-        def col(i):
-            return row[i].strip() if i < len(row) else ""
-
-        date_str = _parse_czech_date(col(_COL_DATE))
-        amount = _parse_czech_amount(col(_COL_AMOUNT))
-
-        if date_str is None or amount is None or amount <= 0:
-            continue
-
-        transactions.append({
-            "date": date_str,
-            "amount": amount,
-            "sender": col(_COL_SENDER),
-            "message": col(_COL_MESSAGE),
-            "vs": col(_COL_VS),
-        })
-
-    return transactions
-
-
-def fetch_transactions_api(
-    token: str, date_from: str, date_to: str
-) -> list[dict]:
-    """Fetch transactions via Fio REST API (JSON).
-
-    Args:
-        token: Fio API token
-        date_from: YYYY-MM-DD format
-        date_to: YYYY-MM-DD format
-    """
-    url = (
-        f"https://fioapi.fio.cz/v1/rest/periods/{token}"
-        f"/{date_from}/{date_to}/transactions.json"
-    )
-    req = urllib.request.Request(url)
-    with urllib.request.urlopen(req) as resp:
-        data = json.loads(resp.read().decode("utf-8"))
-
-    transactions = []
-    tx_list = data.get("accountStatement", {}).get("transactionList", {})
-    for tx in (tx_list.get("transaction") or []):
-        # Each field is {"value": ..., "name": ..., "id": ...} or null
-        def val(col_id):
-            col = tx.get(f"column{col_id}")
-            return col["value"] if col else ""
-
-        amount = float(val(1) or 0)
-        if amount <= 0:
-            continue  # Skip outgoing
-
-        date_raw = val(0) or ""
-        # API returns date as "YYYY-MM-DD+HHMM" or ISO format
-        date_str = date_raw[:10] if date_raw else ""
-
-        transactions.append({
-            "date": date_str,
-            "amount": amount,
-            "sender": str(val(10) or ""),     # column10 = sender name
-            "message": str(val(16) or ""),     # column16 = message for recipient
-            "vs": str(val(5) or ""),           # column5 = VS
-            "user_id": str(val(7) or ""),      # column7 = user identification
-            "sender_account": str(val(2) or ""),  # column2 = sender account
-        })
-
-    return transactions
-
-
-def fetch_transactions(date_from: str, date_to: str) -> list[dict]:
-    """Fetch transactions, using API if token available, else transparent page."""
-    token = os.environ.get("FIO_API_TOKEN", "").strip()
-    if token:
-        return fetch_transactions_api(token, date_from, date_to)
-
-    # Convert YYYY-MM-DD to DD.MM.YYYY for the transparent page URL
-    from_dt = datetime.strptime(date_from, "%Y-%m-%d")
-    to_dt = datetime.strptime(date_to, "%Y-%m-%d")
-    return fetch_transactions_transparent(
-        from_dt.strftime("%-d.%-m.%Y"),
-        to_dt.strftime("%-d.%-m.%Y"),
-    )
+from sync_fio_to_sheets import get_sheets_service, DEFAULT_SPREADSHEET_ID


 # ---------------------------------------------------------------------------
@@ -255,34 +57,57 @@ def match_members(

    for name in member_names:
        variants = _build_name_variants(name)
-        # Full name match = high confidence
        full_name = variants[0] if variants else ""
+        parts = full_name.split()
+        
+        # 1. Full name match (exact sequence) = high confidence
        if full_name and full_name in normalized_text:
            matches.append((name, "auto"))
            continue

-        # Last name + first name both present = high confidence
-        parts = full_name.split()
+        # 2. Both first and last name present (any order) = high confidence
        if len(parts) >= 2:
            if parts[0] in normalized_text and parts[-1] in normalized_text:
                matches.append((name, "auto"))
                continue

-        # Nickname match = high confidence
-        if len(variants) > 1 and variants[1] in normalized_text:
-            matches.append((name, "auto"))
-            continue
+        # 3. Nickname (text in parentheses in the member name) found = high confidence
+        nickname = ""
+        nickname_match = re.search(r"\(([^)]+)\)", name)
+        if nickname_match:
+            nickname = normalize(nickname_match.group(1))
+            if nickname and nickname in normalized_text:
+                # The nickname alone is accepted here; no additional name part is checked
+                matches.append((name, "auto"))
+                continue

-        # Last name only = lower confidence, but skip very common Czech surnames
-        _COMMON_SURNAMES = {"novak", "novakova", "prach"}
-        if (
-            len(parts) >= 2
-            and len(parts[-1]) >= 4
-            and parts[-1] not in _COMMON_SURNAMES
-            and parts[-1] in normalized_text
-        ):
-            matches.append((name, "review"))
-            continue
+        # 4. Partial matches = review confidence
+        if len(parts) >= 2:
+            first_name = parts[0]
+            last_name = parts[-1]
+            _COMMON_SURNAMES = {"novak", "novakova", "prach"}
+            
+            # Match last name
+            if len(last_name) >= 4 and last_name not in _COMMON_SURNAMES and last_name in normalized_text:
+                matches.append((name, "review"))
+                continue
+            
+            # Match first name (if not too short)
+            if len(first_name) >= 3 and first_name in normalized_text:
+                matches.append((name, "review"))
+                continue
+        elif len(parts) == 1:
+            # Single name member
+            if len(parts[0]) >= 4 and parts[0] in normalized_text:
+                matches.append((name, "review"))
+                continue
+
+    # --- Filtering ---
+    # If we have any "auto" matches, discard all "review" matches
+    auto_matches = [m for m in matches if m[1] == "auto"]
+    if auto_matches:
+        # If multiple auto matches, keep them (ambiguous but high priority)
+        return auto_matches

    return matches

@@ -291,6 +116,102 @@ def match_members(
 # Reconciliation
 # ---------------------------------------------------------------------------

+def infer_transaction_details(tx: dict, member_names: list[str]) -> dict:
+    """Infer member(s) and month(s) for a single transaction.
+    
+    Returns:
+        {
+            'members': [(name, confidence)],
+            'months': [YYYY-MM],
+            'search_text': str
+        }
+    """
+    # Combine sender + message for searching
+    search_text = f"{tx.get('sender', '')} {tx.get('message', '')} {tx.get('user_id', '')}"
+    matched_members = match_members(search_text, member_names)
+    matched_months = parse_month_references(
+        tx.get("message", "") + " " + tx.get("user_id", "")
+    )
+
+    if not matched_members:
+        # Retry on the sender name alone (same matcher, narrower search text)
+        matched_members = match_members(tx.get("sender", ""), member_names)
+
+    if not matched_months:
+        # If no month specified, try to infer from payment date
+        tx_date = tx.get("date")
+        if tx_date:
+            try:
+                if isinstance(tx_date, (int, float)):
+                    # Handle Google Sheets serial date
+                    dt = datetime(1899, 12, 30) + timedelta(days=tx_date)
+                else:
+                    dt = datetime.strptime(str(tx_date), "%Y-%m-%d")
+                # Assume the payment is for the month in which it was made
+                matched_months = [dt.strftime("%Y-%m")]
+            except (ValueError, TypeError):
+                pass
+
+    return {
+        "members": matched_members,
+        "months": matched_months,
+        "search_text": search_text
+    }
+
+
+def fetch_sheet_data(spreadsheet_id: str, credentials_path: str) -> list[dict]:
+    """Fetch all rows from the Google Sheet and convert to a list of dicts."""
+    service = get_sheets_service(credentials_path)
+    sheet = service.spreadsheets()
+    
+    result = sheet.values().get(
+        spreadsheetId=spreadsheet_id,
+        range="A1:Z",
+        valueRenderOption="UNFORMATTED_VALUE"
+    ).execute()
+    rows = result.get("values", [])
+    if not rows:
+        return []
+    
+    header = rows[0]
+    def get_col_index(label):
+        normalized_label = label.lower().strip()
+        for i, h in enumerate(header):
+            if h.lower().strip() == normalized_label:
+                return i
+        return -1
+
+    idx_date = get_col_index("Date")
+    idx_amount = get_col_index("Amount") 
+    idx_manual = get_col_index("manual fix")
+    idx_person = get_col_index("Person")
+    idx_purpose = get_col_index("Purpose")
+    idx_inferred_amount = get_col_index("Inferred Amount")
+    idx_sender = get_col_index("Sender")
+    idx_message = get_col_index("Message")
+    idx_bank_id = get_col_index("Bank ID")
+
+    transactions = []
+    for row in rows[1:]:
+        def get_val(idx):
+            return row[idx] if idx != -1 and idx < len(row) else ""
+
+        tx = {
+            "date": get_val(idx_date),
+            "amount": get_val(idx_amount),
+            "manual_fix": get_val(idx_manual),
+            "person": get_val(idx_person),
+            "purpose": get_val(idx_purpose),
+            "inferred_amount": get_val(idx_inferred_amount),
+            "sender": get_val(idx_sender),
+            "message": get_val(idx_message),
+            "bank_id": get_val(idx_bank_id),
+        }
+        transactions.append(tx)
+    
+    return transactions
+
+
 def reconcile(
    members: list[tuple[str, str, dict[str, int]]],
    sorted_months: list[str],
@@ -322,41 +243,54 @@ def reconcile(
    credits: dict[str, int] = {}

    for tx in transactions:
-        # Combine sender + message for searching
-        search_text = f"{tx['sender']} {tx['message']} {tx.get('user_id', '')}"
-        matched_members = match_members(search_text, member_names)
-        matched_months = parse_month_references(
-            tx["message"] + " " + tx.get("user_id", "")
-        )
+        # Use sheet columns if they exist, otherwise fallback to inference
+        person_str = str(tx.get("person", "")).strip()
+        purpose_str = str(tx.get("purpose", "")).strip()
+        
+        # Strip markers like [?]
+        person_str = re.sub(r"\[\?\]\s*", "", person_str)
+        
+        if person_str and purpose_str:
+            # We have pre-matched data (either from script or manual)
+            # Support multiple people/months in the comma-separated string
+            matched_members = [(p.strip(), "auto") for p in person_str.split(",") if p.strip()]
+            matched_months = [m.strip() for m in purpose_str.split(",") if m.strip()]
+            
+            # Use Inferred Amount if available, otherwise bank Amount
+            amount = tx.get("inferred_amount")
+            if amount is None or amount == "":
+                amount = tx.get("amount", 0)
+            try:
+                amount = float(amount)
+            except (ValueError, TypeError):
+                amount = 0
+        else:
+            # Fallback to inference (for rows not yet processed by infer_payments.py)
+            inference = infer_transaction_details(tx, member_names)
+            matched_members = inference["members"]
+            matched_months = inference["months"]
+            amount = tx.get("amount", 0)
+            try:
+                amount = float(amount)
+            except (ValueError, TypeError):
+                amount = 0

-        if not matched_members:
-            # Try matching sender name alone with more lenient matching
-            matched_members = match_members(tx["sender"], member_names)
-
-        if not matched_members:
-            unmatched.append(tx)
-            continue
-
-        if not matched_months:
-            # If no month specified, try to infer from payment date
-            tx_date = tx["date"]
-            if tx_date:
-                try:
-                    dt = datetime.strptime(tx_date, "%Y-%m-%d")
-                    # Assume payment is for the current month
-                    matched_months = [dt.strftime("%Y-%m")]
-                except ValueError:
-                    pass
-
-        if not matched_months:
+        if not matched_members or not matched_months:
            unmatched.append(tx)
            continue

        # Allocate payment across matched members and months
        num_allocations = len(matched_members) * len(matched_months)
-        per_allocation = tx["amount"] / num_allocations if num_allocations > 0 else 0
+        per_allocation = amount / num_allocations if num_allocations > 0 else 0

        for member_name, confidence in matched_members:
+            # If we matched via the sheet's 'Person' column, the name might be partial or
+            # carry markers, though usually it is the exact member name from get_members_with_fees.
+            # NOTE: the check below only detects a name missing from the ledger; it does not fix it.
+            if member_name not in ledger:
+                # TODO: try matching by base name, e.g. for "Jan Novak (Kačerr)".
+                pass
+
            for month_key in matched_months:
                entry = {
                    "amount": per_allocation,
@@ -372,16 +306,26 @@ def reconcile(
                    # Future month — track as credit
                    credits[member_name] = credits.get(member_name, 0) + int(per_allocation)

+    # Calculate final total balances (window + off-window credits)
+    final_balances: dict[str, int] = {}
+    for name in member_names:
+        window_balance = sum(
+            int(mdata["paid"]) - mdata["expected"] 
+            for mdata in ledger[name].values()
+        )
+        final_balances[name] = window_balance + credits.get(name, 0)
+
    return {
        "members": {
            name: {
                "tier": member_tiers[name],
                "months": ledger[name],
+                "total_balance": final_balances[name]
            }
            for name in member_names
        },
        "unmatched": unmatched,
-        "credits": credits,
+        "credits": final_balances, # NOTE: now every member's total balance (positive, zero or negative)
    }


@@ -452,12 +396,30 @@ def print_report(result: dict, sorted_months: list[str]):
    balance = total_paid - total_expected
    print(f" | {f'Expected: {total_expected}, Paid: {int(total_paid)}, Balance: {balance:+d}'}")

-    # --- Credits ---
-    if result["credits"]:
-        print(f"\n{'CREDITS (advance payments for future months)':}")
-        for name, amount in sorted(result["credits"].items()):
+    # --- Credits (Total Surplus) ---
+    all_credits = {
+        name: data["total_balance"]
+        for name, data in result["members"].items()
+        if data["total_balance"] > 0
+    }
+    
+    if all_credits:
+        print(f"\n{'TOTAL CREDITS (advance payments or surplus):'}")
+        for name, amount in sorted(all_credits.items()):
            print(f"  {name}: {amount} CZK")

+    # --- Debts (Missing Payments) ---
+    all_debts = {
+        name: data["total_balance"]
+        for name, data in result["members"].items()
+        if data["total_balance"] < 0
+    }
+
+    if all_debts:
+        print(f"\n{'TOTAL DEBTS (missing payments):'}")
+        for name, amount in sorted(all_debts.items()):
+            print(f"  {name}: {abs(amount)} CZK")
+
    # --- Unmatched transactions ---
    if result["unmatched"]:
        print(f"\n{'UNMATCHED TRANSACTIONS (need manual review)':}")
@@ -499,13 +461,14 @@ def main():
        description="Match bank payments against expected attendance fees."
    )
    parser.add_argument(
-        "--from", dest="date_from", default="2025-12-01",
-        help="Start date YYYY-MM-DD (default: 2025-12-01)",
+        "--sheet-id", default=DEFAULT_SPREADSHEET_ID, help="Google Sheet ID"
    )
    parser.add_argument(
-        "--to", dest="date_to",
-        default=datetime.now().strftime("%Y-%m-%d"),
-        help="End date YYYY-MM-DD (default: today)",
+        "--credentials", default=".secret/fuj-management-bot-credentials.json",
+        help="Path to Google API credentials JSON"
+    )
+    parser.add_argument(
+        "--bank", action="store_true", help="Scrape bank instead of using Sheet data"
    )
    args = parser.parse_args()

@@ -515,9 +478,15 @@ def main():
        print("No attendance data found.")
        return

-    print(f"Fetching transactions from {args.date_from} to {args.date_to}...")
-    transactions = fetch_transactions(args.date_from, args.date_to)
-    print(f"Found {len(transactions)} incoming transactions.\n")
+    if args.bank:
+        print(f"Fetching transactions from Fio bank ({args.date_from} to {args.date_to})...")
+        from fio_utils import fetch_transactions
+        transactions = fetch_transactions(args.date_from, args.date_to)
+    else:
+        print(f"Fetching transactions from Google Sheet ({args.sheet_id})...")
+        transactions = fetch_sheet_data(args.sheet_id, args.credentials)
+    
+    print(f"Processing {len(transactions)} transactions.\n")

    result = reconcile(members, sorted_months, transactions)
    print_report(result, sorted_months)