From 8662cb459269edb28e6ef9dd34fb36fb19e73f63 Mon Sep 17 00:00:00 2001 From: Jan Novak Date: Wed, 11 Mar 2026 01:16:00 +0100 Subject: [PATCH] feat: implement caching for google sheets data - Add cache_utils.py with JSON caching for Google Sheets - Authenticate and cache Drive/Sheets API services globally to reuse tokens - Use CACHE_SHEET_MAP dict to resolve cache names securely to Sheet IDs - Change app.py data fetching to skip downloads if modifiedTime matches cache - Replace global socket timeout with httplib2 to fix Werkzeug timeouts - Add VS Code attach debugpy configurations to launch.json and Makefile --- .vscode/launch.json | 33 +++++++ Makefile | 10 +- app.py | 63 ++++++++++--- scripts/cache_utils.py | 172 ++++++++++++++++++++++++++++++++++ scripts/match_payments.py | 4 +- scripts/sync_fio_to_sheets.py | 9 +- 6 files changed, 270 insertions(+), 21 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 scripts/cache_utils.py diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..8c6d99d --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,33 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Flask", + "type": "debugpy", + "request": "launch", + "module": "flask", + "python": "${workspaceFolder}/.venv/bin/python", + "env": { + "FLASK_APP": "app.py", + "FLASK_DEBUG": "1" + }, + "args": [ + "run", + "--no-debugger", + "--no-reload", + "--host", "0.0.0.0", + "--port", "5001" + ], + "jinja": true + }, + { + "name": "Python Debugger: Attach", + "type": "debugpy", + "request": "attach", + "connect": { + "host": "localhost", + "port": 5678 + } + } + ] +} \ No newline at end of file diff --git a/Makefile b/Makefile index a337296..5190434 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help fees match web image run sync sync-2026 test test-v docs +.PHONY: help fees match web web-debug image run sync sync-2026 test test-v docs export PYTHONPATH := scripts:$(PYTHONPATH) VENV 
:= .venv @@ -15,8 +15,9 @@ help: @echo "Available targets:" @echo " make fees - Calculate monthly fees from the attendance sheet" @echo " make match - Match Fio bank payments against expected attendance fees" - @echo " make web - Start a dynamic web dashboard locally" - @echo " make image - Build an OCI container image" + @echo " make web - Start a dynamic web dashboard locally" + @echo " make web-debug - Start a dynamic web dashboard locally in debug mode" + @echo " make image - Build an OCI container image" @echo " make run - Run the built Docker image locally" @echo " make sync - Sync Fio transactions to Google Sheets" @echo " make sync-2025 - Sync Fio transactions for Q4 2025 (Oct-Dec)" @@ -40,6 +41,9 @@ match: $(PYTHON) web: $(PYTHON) $(PYTHON) app.py +web-debug: $(PYTHON) + FLASK_DEBUG=1 $(PYTHON) app.py + image: docker build -t fuj-management:latest -f build/Dockerfile . diff --git a/app.py b/app.py index 63b6de1..f7915b0 100644 --- a/app.py +++ b/app.py @@ -6,14 +6,43 @@ import time import os import io import qrcode +import logging from flask import Flask, render_template, g, send_file, request +# Configure logging, allowing override via LOG_LEVEL environment variable +log_level = os.environ.get("LOG_LEVEL", "INFO").upper() +logging.basicConfig(level=getattr(logging, log_level, logging.INFO), format='%(asctime)s - %(name)s:%(filename)s:%(lineno)d [%(funcName)s] - %(levelname)s - %(message)s') + # Add scripts directory to path to allow importing from it scripts_dir = Path(__file__).parent / "scripts" sys.path.append(str(scripts_dir)) from attendance import get_members_with_fees, get_junior_members_with_fees, SHEET_ID as ATTENDANCE_SHEET_ID, JUNIOR_SHEET_GID, ADULT_MERGED_MONTHS, JUNIOR_MERGED_MONTHS from match_payments import reconcile, fetch_sheet_data, fetch_exceptions, normalize, DEFAULT_SPREADSHEET_ID as PAYMENTS_SHEET_ID +from cache_utils import get_sheet_modified_time, read_cache, write_cache + +def get_cached_data(cache_key, sheet_id, fetch_func, 
*args, **kwargs): + mod_time = get_sheet_modified_time(cache_key) + if mod_time: + cached = read_cache(cache_key, mod_time) + if cached is not None: + return cached + data = fetch_func(*args, **kwargs) + if mod_time: + write_cache(cache_key, mod_time, data) + return data + +def get_cached_exceptions(sheet_id, creds_path): + cache_key = "exceptions_dict" + mod_time = get_sheet_modified_time(cache_key) + if mod_time: + cached = read_cache(cache_key, mod_time) + if cached is not None: + return {tuple(k): v for k, v in cached} + data = fetch_exceptions(sheet_id, creds_path) + if mod_time: + write_cache(cache_key, mod_time, [[list(k), v] for k, v in data.items()]) + return data def get_month_labels(sorted_months, merged_months): labels = {} @@ -78,10 +107,11 @@ def fees(): attendance_url = f"https://docs.google.com/spreadsheets/d/{ATTENDANCE_SHEET_ID}/edit" payments_url = f"https://docs.google.com/spreadsheets/d/{PAYMENTS_SHEET_ID}/edit" - members, sorted_months = get_members_with_fees() + members_data = get_cached_data("attendance_regular", ATTENDANCE_SHEET_ID, get_members_with_fees) record_step("fetch_members") - if not members: + if not members_data: return "No data." 
+ members, sorted_months = members_data # Filter to adults only for display results = [(name, fees) for name, tier, fees in members if tier == "A"] @@ -93,7 +123,7 @@ def fees(): # Get exceptions for formatting credentials_path = ".secret/fuj-management-bot-credentials.json" - exceptions = fetch_exceptions(PAYMENTS_SHEET_ID, credentials_path) + exceptions = get_cached_exceptions(PAYMENTS_SHEET_ID, credentials_path) record_step("fetch_exceptions") formatted_results = [] @@ -135,10 +165,11 @@ def fees_juniors(): attendance_url = f"https://docs.google.com/spreadsheets/d/{ATTENDANCE_SHEET_ID}/edit#gid={JUNIOR_SHEET_GID}" payments_url = f"https://docs.google.com/spreadsheets/d/{PAYMENTS_SHEET_ID}/edit" - members, sorted_months = get_junior_members_with_fees() + members_data = get_cached_data("attendance_juniors", ATTENDANCE_SHEET_ID, get_junior_members_with_fees) record_step("fetch_junior_members") - if not members: + if not members_data: return "No data." + members, sorted_months = members_data # Sort members by name results = sorted([(name, fees) for name, tier, fees in members], key=lambda x: x[0]) @@ -150,7 +181,7 @@ def fees_juniors(): # Get exceptions for formatting (reusing payments sheet) credentials_path = ".secret/fuj-management-bot-credentials.json" - exceptions = fetch_exceptions(PAYMENTS_SHEET_ID, credentials_path) + exceptions = get_cached_exceptions(PAYMENTS_SHEET_ID, credentials_path) record_step("fetch_exceptions") formatted_results = [] @@ -214,14 +245,15 @@ def reconcile_view(): # Use hardcoded credentials path for now, consistent with other scripts credentials_path = ".secret/fuj-management-bot-credentials.json" - members, sorted_months = get_members_with_fees() + members_data = get_cached_data("attendance_regular", ATTENDANCE_SHEET_ID, get_members_with_fees) record_step("fetch_members") - if not members: + if not members_data: return "No data." 
+ members, sorted_months = members_data - transactions = fetch_sheet_data(PAYMENTS_SHEET_ID, credentials_path) + transactions = get_cached_data("payments_transactions", PAYMENTS_SHEET_ID, fetch_sheet_data, PAYMENTS_SHEET_ID, credentials_path) record_step("fetch_payments") - exceptions = fetch_exceptions(PAYMENTS_SHEET_ID, credentials_path) + exceptions = get_cached_exceptions(PAYMENTS_SHEET_ID, credentials_path) record_step("fetch_exceptions") result = reconcile(members, sorted_months, transactions, exceptions) record_step("reconcile") @@ -306,14 +338,15 @@ def reconcile_juniors_view(): credentials_path = ".secret/fuj-management-bot-credentials.json" - junior_members, sorted_months = get_junior_members_with_fees() + junior_members_data = get_cached_data("attendance_juniors", ATTENDANCE_SHEET_ID, get_junior_members_with_fees) record_step("fetch_junior_members") - if not junior_members: + if not junior_members_data: return "No data." + junior_members, sorted_months = junior_members_data - transactions = fetch_sheet_data(PAYMENTS_SHEET_ID, credentials_path) + transactions = get_cached_data("payments_transactions", PAYMENTS_SHEET_ID, fetch_sheet_data, PAYMENTS_SHEET_ID, credentials_path) record_step("fetch_payments") - exceptions = fetch_exceptions(PAYMENTS_SHEET_ID, credentials_path) + exceptions = get_cached_exceptions(PAYMENTS_SHEET_ID, credentials_path) record_step("fetch_exceptions") # Adapt junior tuple format (name, tier, {month: (fee, total_count, adult_count, junior_count)}) @@ -414,7 +447,7 @@ def payments(): payments_url = f"https://docs.google.com/spreadsheets/d/{PAYMENTS_SHEET_ID}/edit" credentials_path = ".secret/fuj-management-bot-credentials.json" - transactions = fetch_sheet_data(PAYMENTS_SHEET_ID, credentials_path) + transactions = get_cached_data("payments_transactions", PAYMENTS_SHEET_ID, fetch_sheet_data, PAYMENTS_SHEET_ID, credentials_path) record_step("fetch_payments") # Group transactions by person diff --git a/scripts/cache_utils.py 
b/scripts/cache_utils.py new file mode 100644 index 0000000..2fa1bec --- /dev/null +++ b/scripts/cache_utils.py @@ -0,0 +1,172 @@ +import json +import os +import socket +import logging +from datetime import datetime +from pathlib import Path +from google.oauth2 import service_account +from googleapiclient.discovery import build + +logger = logging.getLogger(__name__) + +# Constants +CACHE_DIR = Path(__file__).parent.parent / "tmp" +CREDS_PATH = Path(__file__).parent.parent / ".secret" / "fuj-management-bot-credentials.json" +DRIVE_TIMEOUT = 10 # seconds +CACHE_TTL_SECONDS = int(os.environ.get("CACHE_TTL_SECONDS", 1800)) # 30 min default for max cache age +CACHE_API_CHECK_TTL_SECONDS = int(os.environ.get("CACHE_API_CHECK_TTL_SECONDS", 300)) # 5 min default + +# Known mappings from "cache name" to Google Sheet ID +CACHE_SHEET_MAP = { + "attendance_regular": "1E2e_gT_K5AwSRCDLDTa2UetZTkHmBOcz0kFbBUNUNBA", + "attendance_juniors": "1wXm4gB0rW_LCHgLhCqg0Rk-pGkP5xKIf14dO3D3Z_g4", + "exceptions_dict": "1Om0YPoDVCH5cV8BrNz5LG5eR5MMU05ypQC7UMN1xn_Y", + "payments_transactions": "1Om0YPoDVCH5cV8BrNz5LG5eR5MMU05ypQC7UMN1xn_Y" +} + +# Global state to track last Drive API check time per sheet +_LAST_CHECKED = {} +_DRIVE_SERVICE = None + +def _get_drive_service(): + global _DRIVE_SERVICE + if _DRIVE_SERVICE is not None: + return _DRIVE_SERVICE + + if not CREDS_PATH.exists(): + logger.warning(f"Credentials not found at {CREDS_PATH}. 
Cannot check Google Drive API.") + return None + + try: + creds = service_account.Credentials.from_service_account_file( + str(CREDS_PATH), + scopes=["https://www.googleapis.com/auth/drive.readonly"] + ) + + # Apply timeout safely to the httplib2 connection without mutating global socket + import httplib2 + import google_auth_httplib2 + http = httplib2.Http(timeout=DRIVE_TIMEOUT) + http = google_auth_httplib2.AuthorizedHttp(creds, http=http) + + _DRIVE_SERVICE = build("drive", "v3", http=http, cache_discovery=False) + return _DRIVE_SERVICE + except Exception as e: + logger.error(f"Failed to build Drive API service: {e}") + return None + +import time + +def get_sheet_modified_time(cache_key: str) -> str | None: + """Gets the modifiedTime from Google Drive API for a given cache_key. + Returns the ISO timestamp string if successful. + If the Drive API fails (e.g., lack of permissions for public sheets), + it generates a virtual time bucket string to provide a 5-minute TTL cache. + """ + sheet_id = CACHE_SHEET_MAP.get(cache_key, cache_key) + + cache_file = CACHE_DIR / f"{cache_key}_cache.json" + + # 1. Check if we should skip the Drive API check entirely (global memory TTL) + now = time.time() + last_check = _LAST_CHECKED.get(sheet_id, 0) + + if CACHE_API_CHECK_TTL_SECONDS > 0 and (now - last_check) < CACHE_API_CHECK_TTL_SECONDS: + # We checked recently. Do we have a valid cache file? 
+ if cache_file.exists(): + try: + # Still respect the older, broader CACHE_TTL_SECONDS + file_mtime = os.path.getmtime(cache_file) + if CACHE_TTL_SECONDS <= 0 or (now - file_mtime) < CACHE_TTL_SECONDS: + with open(cache_file, "r", encoding="utf-8") as f: + cache_data = json.load(f) + cached_time = cache_data.get("modifiedTime") + if cached_time: + logger.info(f"Skipping Drive API check for {sheet_id} due to {CACHE_API_CHECK_TTL_SECONDS}s API check TTL") + return cached_time + except Exception as e: + logger.warning(f"Error reading existing cache during API skip for {sheet_id}: {e}") + + # 2. Check if the cache file is simply too new (legacy check) + if CACHE_TTL_SECONDS > 0 and cache_file.exists(): + try: + file_mtime = os.path.getmtime(cache_file) + if time.time() - file_mtime < CACHE_TTL_SECONDS: + with open(cache_file, "r", encoding="utf-8") as f: + cache_data = json.load(f) + cached_time = cache_data.get("modifiedTime") + if cached_time: + logger.info(f"Skipping Drive API check for {sheet_id} due to {CACHE_TTL_SECONDS}s max CACHE_TTL") + # We consider this a valid check, update the global state + _LAST_CHECKED[sheet_id] = now + return cached_time + except Exception as e: + logger.warning(f"Error checking cache TTL for {sheet_id}: {e}") + + def _fallback_ttl(): + bucket = int(time.time() // 300) + return f"ttl-5m-{bucket}" + + logger.info(f"Checking Drive API for {sheet_id}") + drive_service = _get_drive_service() + if not drive_service: + return _fallback_ttl() + + try: + file_meta = drive_service.files().get(fileId=sheet_id, fields="modifiedTime", supportsAllDrives=True).execute() + # Successfully checked API, update the global state + _LAST_CHECKED[sheet_id] = time.time() + return file_meta.get("modifiedTime") + except Exception as e: + logger.warning(f"Could not get modifiedTime for sheet {sheet_id}: {e}. 
Falling back to 5-minute TTL.") + return _fallback_ttl() + +def read_cache(sheet_id: str, current_modified_time: str) -> list | dict | None: + """Reads the JSON cache for the given sheet_id. + Returns the cached data if it exists AND the cached modifiedTime matches + current_modified_time. + Otherwise, returns None. + """ + if not current_modified_time: + return None + + cache_file = CACHE_DIR / f"{sheet_id}_cache.json" + if not cache_file.exists(): + return None + + try: + with open(cache_file, "r", encoding="utf-8") as f: + cache_data = json.load(f) + + cached_time = cache_data.get("modifiedTime") + if cached_time == current_modified_time: + logger.info(f"Cache hit for {sheet_id} ({current_modified_time})") + return cache_data.get("data") + else: + logger.info(f"Cache miss for {sheet_id}. Cached: {cached_time}, Current: {current_modified_time}") + return None + except Exception as e: + logger.error(f"Failed to read cache {cache_file}: {e}") + return None + +def write_cache(sheet_id: str, modified_time: str, data: list | dict) -> None: + """Writes the data to a JSON cache file with the given modified_time.""" + if not modified_time: + return + + try: + CACHE_DIR.mkdir(parents=True, exist_ok=True) + cache_file = CACHE_DIR / f"{sheet_id}_cache.json" + + cache_data = { + "modifiedTime": modified_time, + "data": data, + "cachedAt": datetime.now().isoformat() + } + + with open(cache_file, "w", encoding="utf-8") as f: + json.dump(cache_data, f, ensure_ascii=False) + + logger.info(f"Wrote cache for {sheet_id}") + except Exception as e: + logger.error(f"Failed to write cache {sheet_id}: {e}") diff --git a/scripts/match_payments.py b/scripts/match_payments.py index c400a65..5d426d9 100644 --- a/scripts/match_payments.py +++ b/scripts/match_payments.py @@ -300,8 +300,8 @@ def reconcile( norm_name = normalize(name) norm_period = normalize(m) fee_data = member_fees[name].get(m, (0, 0)) - original_expected = fee_data[0] if isinstance(fee_data, tuple) else fee_data - 
attendance_count = fee_data[1] if isinstance(fee_data, tuple) else 0 + original_expected = fee_data[0] if isinstance(fee_data, (tuple, list)) else fee_data + attendance_count = fee_data[1] if isinstance(fee_data, (tuple, list)) else 0 ex_data = exceptions.get((norm_name, norm_period)) if ex_data is not None: diff --git a/scripts/sync_fio_to_sheets.py b/scripts/sync_fio_to_sheets.py index 42ec9d7..652f315 100644 --- a/scripts/sync_fio_to_sheets.py +++ b/scripts/sync_fio_to_sheets.py @@ -19,8 +19,14 @@ DEFAULT_SPREADSHEET_ID = "1Om0YPoDVCH5cV8BrNz5LG5eR5MMU05ypQC7UMN1xn_Y" SCOPES = ["https://www.googleapis.com/auth/spreadsheets"] TOKEN_FILE = "token.pickle" COLUMN_LABELS = ["Date", "Amount", "manual fix", "Person", "Purpose", "Inferred Amount", "Sender", "VS", "Message", "Bank ID", "Sync ID"] +_SHEETS_SERVICE = None + def get_sheets_service(credentials_path: str): """Authenticate and return the Google Sheets API service.""" + global _SHEETS_SERVICE + if _SHEETS_SERVICE is not None: + return _SHEETS_SERVICE + if not os.path.exists(credentials_path): raise FileNotFoundError(f"Credentials file not found: {credentials_path}") @@ -50,7 +56,8 @@ def get_sheets_service(credentials_path: str): with open(TOKEN_FILE, "wb") as token: pickle.dump(creds, token) - return build("sheets", "v4", credentials=creds) + _SHEETS_SERVICE = build("sheets", "v4", credentials=creds) + return _SHEETS_SERVICE def generate_sync_id(tx: dict) -> str: