Add status dashboard, server, scraper stats, and DATA_DIR support
All checks were successful
Build and Push / build (push) Successful in 7s
Key changes:
- Replace ratings_server.py + status.html with a unified server.py that
serves the map, scraper status dashboard, and ratings API in one process
- Add scraper_stats.py utility: each scraper writes per-run stats (fetched,
accepted, excluded, duration) to stats_<source>.json for the status page
- generate_status.py: respect DATA_DIR env var so status.json lands in the
configured data directory instead of always the project root
- run_all.sh: replace the {"status":"running"} overwrite of status.json with
a dedicated scraper_running.json lock file; trap on EXIT ensures cleanup
even on kill/error, preventing the previous run's results from being wiped
- server.py: detect running state via scraper_running.json existence instead
of status["status"] field, eliminating the dual-use race condition
- Makefile: add serve (local dev), debug (Docker debug container) targets;
add SERVER_PORT variable
- build/Dockerfile + entrypoint.sh: switch to server.py, set DATA_DIR,
adjust volume mounts
- .gitignore: add *.json and *.log to keep runtime data files out of VCS
- mapa_bytu.html: price-per-m² colouring, status link, UX tweaks
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -17,6 +17,9 @@ import urllib.request
|
||||
import urllib.parse
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
from scraper_stats import write_stats
|
||||
|
||||
STATS_FILE = "stats_idnes.json"
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -279,6 +282,8 @@ def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
|
||||
|
||||
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
_run_start = time.time()
|
||||
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||
cache = load_cache()
|
||||
|
||||
logger.info("=" * 60)
|
||||
@@ -478,6 +483,25 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||
logger.info(f"{'=' * 60}")
|
||||
|
||||
write_stats(STATS_FILE, {
|
||||
"source": "iDNES",
|
||||
"timestamp": _run_ts,
|
||||
"duration_sec": round(time.time() - _run_start, 1),
|
||||
"success": True,
|
||||
"accepted": len(results),
|
||||
"fetched": len(all_listings),
|
||||
"pages": page,
|
||||
"cache_hits": cache_hits,
|
||||
"excluded": {
|
||||
"cena": excluded_price,
|
||||
"plocha": excluded_area,
|
||||
"dispozice": excluded_disp,
|
||||
"panel/síd": excluded_panel,
|
||||
"patro": excluded_floor,
|
||||
"bez GPS": excluded_no_gps,
|
||||
"bez detailu": excluded_detail,
|
||||
},
|
||||
})
|
||||
return results
|
||||
|
||||
|
||||
@@ -498,8 +522,22 @@ if __name__ == "__main__":
|
||||
handlers=[logging.StreamHandler()]
|
||||
)
|
||||
|
||||
_run_ts = datetime.now().isoformat(timespec="seconds")
|
||||
start = time.time()
|
||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||
try:
|
||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||
except Exception as e:
|
||||
logger.error(f"Scraper failed: {e}", exc_info=True)
|
||||
write_stats(STATS_FILE, {
|
||||
"source": "iDNES",
|
||||
"timestamp": _run_ts,
|
||||
"duration_sec": round(time.time() - start, 1),
|
||||
"success": False,
|
||||
"accepted": 0,
|
||||
"fetched": 0,
|
||||
"error": str(e),
|
||||
})
|
||||
raise
|
||||
|
||||
if estates:
|
||||
json_path = Path("byty_idnes.json")
|
||||
|
||||
Reference in New Issue
Block a user