Add status dashboard, server, scraper stats, and DATA_DIR support

Key changes:
- Replace ratings_server.py + status.html with a unified server.py that serves the map, scraper status dashboard, and ratings API in one process
- Add scraper_stats.py utility: each scraper writes per-run stats (fetched, accepted, excluded, duration) to stats_<source>.json for the status page
- generate_status.py: respect DATA_DIR env var so status.json lands in the configured data directory instead of always the project root
- run_all.sh: replace the {"status":"running"} overwrite of status.json with a dedicated scraper_running.json lock file; trap on EXIT ensures cleanup even on kill/error, preventing the previous run's results from being wiped
- server.py: detect running state via scraper_running.json existence instead of status["status"] field, eliminating the dual-use race condition
- Makefile: add serve (local dev) and debug (Docker debug container) targets; add SERVER_PORT variable
- build/Dockerfile + entrypoint.sh: switch to server.py, set DATA_DIR, adjust volume mounts
- .gitignore: add *.json and *.log to keep runtime data files out of VCS
- mapa_bytu.html: price-per-m² colouring, status link, UX tweaks

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-26 00:30:25 +01:00
parent 6f49533c94
commit 5fb3b984b6
17 changed files with 929 additions and 1122 deletions
--- a/scrape_cityhome.py
+++ b/scrape_cityhome.py
@@ -14,6 +14,9 @@ import time
 import urllib.request
 from datetime import datetime
 from pathlib import Path
+from scraper_stats import write_stats
+
+STATS_FILE = "stats_cityhome.json"

 logger = logging.getLogger(__name__)

@@ -203,6 +206,8 @@ def extract_project_gps(html: str) -> tuple[float, float] | None:


 def scrape(max_pages: int | None = None, max_properties: int | None = None):
+    _run_start = time.time()
+    _run_ts = datetime.now().isoformat(timespec="seconds")
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
@@ -374,6 +379,23 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
    logger.info(f"  ✓ Vyhovující byty:    {len(results)}")
    logger.info(f"{'=' * 60}")

+    write_stats(STATS_FILE, {
+        "source": "CityHome",
+        "timestamp": _run_ts,
+        "duration_sec": round(time.time() - _run_start, 1),
+        "success": True,
+        "accepted": len(results),
+        "fetched": len(all_listings),
+        "excluded": {
+            "prodáno": excluded_sold,
+            "typ": excluded_type,
+            "dispozice": excluded_disp,
+            "cena": excluded_price,
+            "plocha": excluded_area,
+            "patro": excluded_floor,
+            "bez GPS": excluded_no_gps,
+        },
+    })
    return results


@@ -394,8 +416,22 @@ if __name__ == "__main__":
        handlers=[logging.StreamHandler()]
    )

+    _run_ts = datetime.now().isoformat(timespec="seconds")
    start = time.time()
-    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
+    try:
+        estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
+    except Exception as e:
+        logger.error(f"Scraper failed: {e}", exc_info=True)
+        write_stats(STATS_FILE, {
+            "source": "CityHome",
+            "timestamp": _run_ts,
+            "duration_sec": round(time.time() - start, 1),
+            "success": False,
+            "accepted": 0,
+            "fetched": 0,
+            "error": str(e),
+        })
+        raise

    if estates:
        json_path = Path("byty_cityhome.json")