google-documents-read-caching #2

Merged
kacerr merged 4 commits from google-documents-read-caching into main 2026-03-11 10:13:18 +00:00
8 changed files with 307 additions and 21 deletions

3
.gitignore vendored
View File

@@ -1,3 +1,6 @@
# python cache
**/*.pyc
.secret
# local tmp folder
tmp/

33
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,33 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Flask",
"type": "debugpy",
"request": "launch",
"module": "flask",
"python": "${workspaceFolder}/.venv/bin/python",
"env": {
"FLASK_APP": "app.py",
"FLASK_DEBUG": "1"
},
"args": [
"run",
"--no-debugger",
"--no-reload",
"--host", "0.0.0.0",
"--port", "5001"
],
"jinja": true
},
{
"name": "Python Debugger: Attach",
"type": "debugpy",
"request": "attach",
"connect": {
"host": "localhost",
"port": 5678
}
}
]
}

View File

@@ -1,4 +1,4 @@
.PHONY: help fees match web image run sync sync-2026 test test-v docs
.PHONY: help fees match web web-debug image run sync sync-2026 test test-v docs
export PYTHONPATH := scripts:$(PYTHONPATH)
VENV := .venv
@@ -15,8 +15,9 @@ help:
@echo "Available targets:"
@echo " make fees - Calculate monthly fees from the attendance sheet"
@echo " make match - Match Fio bank payments against expected attendance fees"
@echo " make web - Start a dynamic web dashboard locally"
@echo " make image - Build an OCI container image"
@echo " make web - Start a dynamic web dashboard locally"
@echo " make web-debug - Start a dynamic web dashboard locally in debug mode"
@echo " make image - Build an OCI container image"
@echo " make run - Run the built Docker image locally"
@echo " make sync - Sync Fio transactions to Google Sheets"
@echo " make sync-2025 - Sync Fio transactions for Q4 2025 (Oct-Dec)"
@@ -40,6 +41,9 @@ match: $(PYTHON)
web: $(PYTHON)
$(PYTHON) app.py
web-debug: $(PYTHON)
FLASK_DEBUG=1 $(PYTHON) app.py
image:
docker build -t fuj-management:latest -f build/Dockerfile .

71
app.py
View File

@@ -6,14 +6,31 @@ import time
import os
import io
import qrcode
import logging
from flask import Flask, render_template, g, send_file, request
# Configure logging, allowing override via LOG_LEVEL environment variable
log_level = os.environ.get("LOG_LEVEL", "INFO").upper()
logging.basicConfig(level=getattr(logging, log_level, logging.INFO), format='%(asctime)s - %(name)s:%(filename)s:%(lineno)d [%(funcName)s] - %(levelname)s - %(message)s')
# Add scripts directory to path to allow importing from it
scripts_dir = Path(__file__).parent / "scripts"
sys.path.append(str(scripts_dir))
from attendance import get_members_with_fees, get_junior_members_with_fees, SHEET_ID as ATTENDANCE_SHEET_ID, JUNIOR_SHEET_GID, ADULT_MERGED_MONTHS, JUNIOR_MERGED_MONTHS
from match_payments import reconcile, fetch_sheet_data, fetch_exceptions, normalize, DEFAULT_SPREADSHEET_ID as PAYMENTS_SHEET_ID
from cache_utils import get_sheet_modified_time, read_cache, write_cache, _LAST_CHECKED
def get_cached_data(cache_key, sheet_id, fetch_func, *args, serialize=None, deserialize=None, **kwargs):
mod_time = get_sheet_modified_time(cache_key)
if mod_time:
cached = read_cache(cache_key, mod_time)
if cached is not None:
return deserialize(cached) if deserialize else cached
data = fetch_func(*args, **kwargs)
if mod_time:
write_cache(cache_key, mod_time, serialize(data) if serialize else data)
return data
def get_month_labels(sorted_months, merged_months):
labels = {}
@@ -78,10 +95,11 @@ def fees():
attendance_url = f"https://docs.google.com/spreadsheets/d/{ATTENDANCE_SHEET_ID}/edit"
payments_url = f"https://docs.google.com/spreadsheets/d/{PAYMENTS_SHEET_ID}/edit"
members, sorted_months = get_members_with_fees()
members_data = get_cached_data("attendance_regular", ATTENDANCE_SHEET_ID, get_members_with_fees)
record_step("fetch_members")
if not members:
if not members_data:
return "No data."
members, sorted_months = members_data
# Filter to adults only for display
results = [(name, fees) for name, tier, fees in members if tier == "A"]
@@ -93,7 +111,12 @@ def fees():
# Get exceptions for formatting
credentials_path = ".secret/fuj-management-bot-credentials.json"
exceptions = fetch_exceptions(PAYMENTS_SHEET_ID, credentials_path)
exceptions = get_cached_data(
"exceptions_dict", PAYMENTS_SHEET_ID, fetch_exceptions,
PAYMENTS_SHEET_ID, credentials_path,
serialize=lambda d: [[list(k), v] for k, v in d.items()],
deserialize=lambda c: {tuple(k): v for k, v in c},
)
record_step("fetch_exceptions")
formatted_results = []
@@ -135,10 +158,11 @@ def fees_juniors():
attendance_url = f"https://docs.google.com/spreadsheets/d/{ATTENDANCE_SHEET_ID}/edit#gid={JUNIOR_SHEET_GID}"
payments_url = f"https://docs.google.com/spreadsheets/d/{PAYMENTS_SHEET_ID}/edit"
members, sorted_months = get_junior_members_with_fees()
members_data = get_cached_data("attendance_juniors", ATTENDANCE_SHEET_ID, get_junior_members_with_fees)
record_step("fetch_junior_members")
if not members:
if not members_data:
return "No data."
members, sorted_months = members_data
# Sort members by name
results = sorted([(name, fees) for name, tier, fees in members], key=lambda x: x[0])
@@ -150,7 +174,12 @@ def fees_juniors():
# Get exceptions for formatting (reusing payments sheet)
credentials_path = ".secret/fuj-management-bot-credentials.json"
exceptions = fetch_exceptions(PAYMENTS_SHEET_ID, credentials_path)
exceptions = get_cached_data(
"exceptions_dict", PAYMENTS_SHEET_ID, fetch_exceptions,
PAYMENTS_SHEET_ID, credentials_path,
serialize=lambda d: [[list(k), v] for k, v in d.items()],
deserialize=lambda c: {tuple(k): v for k, v in c},
)
record_step("fetch_exceptions")
formatted_results = []
@@ -214,14 +243,20 @@ def reconcile_view():
# Use hardcoded credentials path for now, consistent with other scripts
credentials_path = ".secret/fuj-management-bot-credentials.json"
members, sorted_months = get_members_with_fees()
members_data = get_cached_data("attendance_regular", ATTENDANCE_SHEET_ID, get_members_with_fees)
record_step("fetch_members")
if not members:
if not members_data:
return "No data."
members, sorted_months = members_data
transactions = fetch_sheet_data(PAYMENTS_SHEET_ID, credentials_path)
transactions = get_cached_data("payments_transactions", PAYMENTS_SHEET_ID, fetch_sheet_data, PAYMENTS_SHEET_ID, credentials_path)
record_step("fetch_payments")
exceptions = fetch_exceptions(PAYMENTS_SHEET_ID, credentials_path)
exceptions = get_cached_data(
"exceptions_dict", PAYMENTS_SHEET_ID, fetch_exceptions,
PAYMENTS_SHEET_ID, credentials_path,
serialize=lambda d: [[list(k), v] for k, v in d.items()],
deserialize=lambda c: {tuple(k): v for k, v in c},
)
record_step("fetch_exceptions")
result = reconcile(members, sorted_months, transactions, exceptions)
record_step("reconcile")
@@ -306,14 +341,20 @@ def reconcile_juniors_view():
credentials_path = ".secret/fuj-management-bot-credentials.json"
junior_members, sorted_months = get_junior_members_with_fees()
junior_members_data = get_cached_data("attendance_juniors", ATTENDANCE_SHEET_ID, get_junior_members_with_fees)
record_step("fetch_junior_members")
if not junior_members:
if not junior_members_data:
return "No data."
junior_members, sorted_months = junior_members_data
transactions = fetch_sheet_data(PAYMENTS_SHEET_ID, credentials_path)
transactions = get_cached_data("payments_transactions", PAYMENTS_SHEET_ID, fetch_sheet_data, PAYMENTS_SHEET_ID, credentials_path)
record_step("fetch_payments")
exceptions = fetch_exceptions(PAYMENTS_SHEET_ID, credentials_path)
exceptions = get_cached_data(
"exceptions_dict", PAYMENTS_SHEET_ID, fetch_exceptions,
PAYMENTS_SHEET_ID, credentials_path,
serialize=lambda d: [[list(k), v] for k, v in d.items()],
deserialize=lambda c: {tuple(k): v for k, v in c},
)
record_step("fetch_exceptions")
# Adapt junior tuple format (name, tier, {month: (fee, total_count, adult_count, junior_count)})
@@ -414,7 +455,7 @@ def payments():
payments_url = f"https://docs.google.com/spreadsheets/d/{PAYMENTS_SHEET_ID}/edit"
credentials_path = ".secret/fuj-management-bot-credentials.json"
transactions = fetch_sheet_data(PAYMENTS_SHEET_ID, credentials_path)
transactions = get_cached_data("payments_transactions", PAYMENTS_SHEET_ID, fetch_sheet_data, PAYMENTS_SHEET_ID, credentials_path)
record_step("fetch_payments")
# Group transactions by person

View File

@@ -0,0 +1,29 @@
# Google Sheets Data Caching Implementation
**Date:** 2026-03-11
**Objective:** Optimize Flask application performance by heavily caching expensive Google Sheets data processing, avoiding redundant HTTP roundtrips to Google APIs, and ensuring rate limits are not exhausted during simple web app reloads.
## Implemented Features
### 1. File-Based JSON Caching (`cache_utils.py`)
- **Mechanism:** Implemented a new generic caching system that saves API responses and heavily calculated datasets as `.json` files directly to the local `/tmp/` directory.
- **Drive Metadata Checks:** The cache is validated by asking the Google Drive API (`drive.files().get`) for the remote `modifiedTime` of the target Sheet.
- **Cache Hit logic:** If the cached version on disk matches the remote `modifiedTime`, the application skips downloading the full CSV payload and computing tuples—instead serving the instant static cache via `json.load`.
### 2. Global API Auth Object Reuse
- **The Problem:** The `_get_drive_service()` and `get_sheets_service()` implementations were completely rebuilding `googleapiclient.discovery` objects for *every single file check*—re-seeking and exchanging Google Service Account tokens constantly.
- **The Fix:** Service objects (`_DRIVE_SERVICE`, `_SHEETS_SERVICE`) are now globally cached in application memory. The server authenticates exactly *once* when it wakes up, dramatically saving milliseconds and network resources across every web request. The underlying `httplib2` and `google-auth` intelligently handle silent token refreshes natively.
### 3. Graceful Configurable Rate Limiting
- **In-Memory Debouncing:** Implemented an internal memory state (`_LAST_CHECKED`) inside `cache_utils` that forcefully prevents checking the Drive API `modifiedTime` for a specific file if we already explicitly checked it within the last 5 minutes. This prevents flooding the Google Drive API while clicking wildly around the app GUI.
- **Semantic Mappings:** Created a `CACHE_SHEET_MAP` that maps friendly internal cache keys (e.g. `attendance_regular`) back to their raw 44-character Google Sheet IDs.
### 4. HTTP / Socket Timeout Safety Fix
- **The Bug:** Originally, `socket.setdefaulttimeout(10)` was used to prevent Google Drive metadata checks from locking up the worker pool. However, this brutally mutated the underlying Werkzeug/Flask default sockets globally. If fetching thousands of lines from Google *Sheets* (the payload logic) took longer than 10 seconds, Flask would just kill the request with a random `TimeoutError('timed out')`.
- **The Fix:** Removed the global mutation. Instantiated a targeted, isolated `httplib2.Http(timeout=10)` injected *specifically* into only the Google Drive API build. The rest of the app can now download massive files without randomly timing out.
### 5. Developer Experience (DX) Enhancements
- **Logging Line Origins:** Enriched the console logging format strings (`logging.basicConfig`) to output `[%(funcName)s]` and `%(filename)s:%(lineno)d` to easily trace exactly which exact file and function is executing on complex stack traces.
- **Improved VS Code Local Debugging:**
- Integrated `debugpy` launch profiles in `.vscode/launch.json` for "Python Debugger: Flask" (Launching) and "Python Debugger: Attach" (Connecting).
- Implemented a standard `make web-attach` target inside the Makefile via `uv run python -m debugpy --listen ...` to allow the background web app to automatically halt and wait for external debuggers before bootstrapping caching layers.

169
scripts/cache_utils.py Normal file
View File

@@ -0,0 +1,169 @@
import json
import os
import socket
import logging
from datetime import datetime
from pathlib import Path
from google.oauth2 import service_account
from googleapiclient.discovery import build
logger = logging.getLogger(__name__)
# Constants
CACHE_DIR = Path(__file__).parent.parent / "tmp"
CREDS_PATH = Path(__file__).parent.parent / ".secret" / "fuj-management-bot-credentials.json"
DRIVE_TIMEOUT = 10 # seconds
CACHE_TTL_SECONDS = int(os.environ.get("CACHE_TTL_SECONDS", 300)) # 30 min default for max cache age
CACHE_API_CHECK_TTL_SECONDS = int(os.environ.get("CACHE_API_CHECK_TTL_SECONDS", 300)) # 5 min default
# Known mappings mapping "cache name" to Google Sheet ID
CACHE_SHEET_MAP = {
"attendance_regular": "1E2e_gT_K5AwSRCDLDTa2UetZTkHmBOcz0kFbBUNUNBA",
"attendance_juniors": "1E2e_gT_K5AwSRCDLDTa2UetZTkHmBOcz0kFbBUNUNBA",
"exceptions_dict": "1Om0YPoDVCH5cV8BrNz5LG5eR5MMU05ypQC7UMN1xn_Y",
"payments_transactions": "1Om0YPoDVCH5cV8BrNz5LG5eR5MMU05ypQC7UMN1xn_Y"
}
# Global state to track last Drive API check time per sheet
_LAST_CHECKED = {}
_DRIVE_SERVICE = None
def _get_drive_service():
global _DRIVE_SERVICE
if _DRIVE_SERVICE is not None:
return _DRIVE_SERVICE
if not CREDS_PATH.exists():
logger.warning(f"Credentials not found at {CREDS_PATH}. Cannot check Google Drive API.")
return None
try:
creds = service_account.Credentials.from_service_account_file(
str(CREDS_PATH),
scopes=["https://www.googleapis.com/auth/drive.readonly"]
)
# Apply timeout safely to the httplib2 connection without mutating global socket
import httplib2
import google_auth_httplib2
http = httplib2.Http(timeout=DRIVE_TIMEOUT)
http = google_auth_httplib2.AuthorizedHttp(creds, http=http)
_DRIVE_SERVICE = build("drive", "v3", http=http, cache_discovery=False)
return _DRIVE_SERVICE
except Exception as e:
logger.error(f"Failed to build Drive API service: {e}")
return None
import time
def get_sheet_modified_time(cache_key: str) -> str | None:
"""Gets the modifiedTime from Google Drive API for a given cache_key.
Returns the ISO timestamp string if successful.
If the Drive API fails (e.g., lack of permissions for public sheets),
it generates a virtual time bucket string to provide a 5-minute TTL cache.
"""
sheet_id = CACHE_SHEET_MAP.get(cache_key, cache_key)
cache_file = CACHE_DIR / f"{cache_key}_cache.json"
# 1. Check if we should skip the Drive API check entirely (global memory TTL)
now = time.time()
last_check = _LAST_CHECKED.get(sheet_id, 0)
if CACHE_API_CHECK_TTL_SECONDS > 0 and (now - last_check) < CACHE_API_CHECK_TTL_SECONDS:
# We checked recently. Return cached modifiedTime if cache file exists.
if cache_file.exists():
try:
with open(cache_file, "r", encoding="utf-8") as f:
cache_data = json.load(f)
cached_time = cache_data.get("modifiedTime")
if cached_time:
logger.info(f"Skipping Drive API check for {sheet_id} due to {CACHE_API_CHECK_TTL_SECONDS}s API check TTL")
return cached_time
except Exception as e:
logger.warning(f"Error reading existing cache during API skip for {sheet_id}: {e}")
# 2. Check if the cache file is simply too new (legacy check)
if CACHE_TTL_SECONDS > 0 and cache_file.exists():
try:
file_mtime = os.path.getmtime(cache_file)
if time.time() - file_mtime < CACHE_TTL_SECONDS:
with open(cache_file, "r", encoding="utf-8") as f:
cache_data = json.load(f)
cached_time = cache_data.get("modifiedTime")
if cached_time:
logger.info(f"Skipping Drive API check for {sheet_id} due to {CACHE_TTL_SECONDS}s max CACHE_TTL")
# We consider this a valid check, update the global state
_LAST_CHECKED[sheet_id] = now
return cached_time
except Exception as e:
logger.warning(f"Error checking cache TTL for {sheet_id}: {e}")
def _fallback_ttl():
bucket = int(time.time() // 300)
return f"ttl-5m-{bucket}"
logger.info(f"Checking Drive API for {sheet_id}")
drive_service = _get_drive_service()
if not drive_service:
return _fallback_ttl()
try:
file_meta = drive_service.files().get(fileId=sheet_id, fields="modifiedTime", supportsAllDrives=True).execute()
# Successfully checked API, update the global state
_LAST_CHECKED[sheet_id] = time.time()
return file_meta.get("modifiedTime")
except Exception as e:
logger.warning(f"Could not get modifiedTime for sheet {sheet_id}: {e}. Falling back to 5-minute TTL.")
return _fallback_ttl()
def read_cache(sheet_id: str, current_modified_time: str) -> list | dict | None:
"""Reads the JSON cache for the given sheet_id.
Returns the cached data if it exists AND the cached modifiedTime matches
current_modified_time.
Otherwise, returns None.
"""
if not current_modified_time:
return None
cache_file = CACHE_DIR / f"{sheet_id}_cache.json"
if not cache_file.exists():
return None
try:
with open(cache_file, "r", encoding="utf-8") as f:
cache_data = json.load(f)
cached_time = cache_data.get("modifiedTime")
if cached_time == current_modified_time:
logger.info(f"Cache hit for {sheet_id} ({current_modified_time})")
return cache_data.get("data")
else:
logger.info(f"Cache miss for {sheet_id}. Cached: {cached_time}, Current: {current_modified_time}")
return None
except Exception as e:
logger.error(f"Failed to read cache {cache_file}: {e}")
return None
def write_cache(sheet_id: str, modified_time: str, data: list | dict) -> None:
"""Writes the data to a JSON cache file with the given modified_time."""
if not modified_time:
return
try:
CACHE_DIR.mkdir(parents=True, exist_ok=True)
cache_file = CACHE_DIR / f"{sheet_id}_cache.json"
cache_data = {
"modifiedTime": modified_time,
"data": data,
"cachedAt": datetime.now().isoformat()
}
with open(cache_file, "w", encoding="utf-8") as f:
json.dump(cache_data, f, ensure_ascii=False)
logger.info(f"Wrote cache for {sheet_id}")
except Exception as e:
logger.error(f"Failed to write cache {sheet_id}: {e}")

View File

@@ -300,8 +300,8 @@ def reconcile(
norm_name = normalize(name)
norm_period = normalize(m)
fee_data = member_fees[name].get(m, (0, 0))
original_expected = fee_data[0] if isinstance(fee_data, tuple) else fee_data
attendance_count = fee_data[1] if isinstance(fee_data, tuple) else 0
original_expected = fee_data[0] if isinstance(fee_data, (tuple, list)) else fee_data
attendance_count = fee_data[1] if isinstance(fee_data, (tuple, list)) else 0
ex_data = exceptions.get((norm_name, norm_period))
if ex_data is not None:

View File

@@ -19,8 +19,14 @@ DEFAULT_SPREADSHEET_ID = "1Om0YPoDVCH5cV8BrNz5LG5eR5MMU05ypQC7UMN1xn_Y"
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
TOKEN_FILE = "token.pickle"
COLUMN_LABELS = ["Date", "Amount", "manual fix", "Person", "Purpose", "Inferred Amount", "Sender", "VS", "Message", "Bank ID", "Sync ID"]
_SHEETS_SERVICE = None
def get_sheets_service(credentials_path: str):
"""Authenticate and return the Google Sheets API service."""
global _SHEETS_SERVICE
if _SHEETS_SERVICE is not None:
return _SHEETS_SERVICE
if not os.path.exists(credentials_path):
raise FileNotFoundError(f"Credentials file not found: {credentials_path}")
@@ -50,7 +56,8 @@ def get_sheets_service(credentials_path: str):
with open(TOKEN_FILE, "wb") as token:
pickle.dump(creds, token)
return build("sheets", "v4", credentials=creds)
_SHEETS_SERVICE = build("sheets", "v4", credentials=creds)
return _SHEETS_SERVICE
def generate_sync_id(tx: dict) -> str: