Add status dashboard, server, scraper stats, and DATA_DIR support
All checks were successful
Build and Push / build (push) Successful in 7s

Key changes:
- Replace ratings_server.py + status.html with a unified server.py that
  serves the map, scraper status dashboard, and ratings API in one process
- Add scraper_stats.py utility: each scraper writes per-run stats (fetched,
  accepted, excluded, duration) to stats_<source>.json for the status page
- generate_status.py: respect DATA_DIR env var so status.json lands in the
  configured data directory instead of always the project root
- run_all.sh: replace the {"status":"running"} overwrite of status.json with
  a dedicated scraper_running.json lock file; trap on EXIT ensures cleanup
  even on kill/error, preventing the previous run's results from being wiped
- server.py: detect running state via scraper_running.json existence instead
  of status["status"] field, eliminating the dual-use race condition
- Makefile: add serve (local dev), debug (Docker debug container) targets;
  add SERVER_PORT variable
- build/Dockerfile + entrypoint.sh: switch to server.py, set DATA_DIR,
  adjust volume mounts
- .gitignore: add *.json and *.log to keep runtime data files out of VCS
- mapa_bytu.html: price-per-m² colouring, status link, UX tweaks

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Jan Novak
2026-02-26 00:30:25 +01:00
parent 6f49533c94
commit 5fb3b984b6
17 changed files with 929 additions and 1122 deletions

View File

@@ -15,6 +15,9 @@ import re
import time
import urllib.request
from pathlib import Path
from scraper_stats import write_stats
STATS_FILE = "stats_bezrealitky.json"
logger = logging.getLogger(__name__)
@@ -171,6 +174,8 @@ def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
def scrape(max_pages: int | None = None, max_properties: int | None = None):
_run_start = time.time()
_run_ts = datetime.now().isoformat(timespec="seconds")
cache = load_cache()
logger.info("=" * 60)
@@ -374,6 +379,25 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}")
write_stats(STATS_FILE, {
"source": "Bezrealitky",
"timestamp": _run_ts,
"duration_sec": round(time.time() - _run_start, 1),
"success": True,
"accepted": len(results),
"fetched": len(all_adverts),
"pages": page - 1,
"cache_hits": cache_hits,
"excluded": {
"dispozice": excluded_disp,
"cena": excluded_price,
"plocha": excluded_area,
"bez GPS": excluded_no_gps,
"panel/síd": excluded_panel,
"patro": excluded_floor,
"bez detailu": excluded_detail,
},
})
return results
@@ -394,8 +418,22 @@ if __name__ == "__main__":
handlers=[logging.StreamHandler()]
)
_run_ts = datetime.now().isoformat(timespec="seconds")
start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
try:
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
except Exception as e:
logger.error(f"Scraper failed: {e}", exc_info=True)
write_stats(STATS_FILE, {
"source": "Bezrealitky",
"timestamp": _run_ts,
"duration_sec": round(time.time() - start, 1),
"success": False,
"accepted": 0,
"fetched": 0,
"error": str(e),
})
raise
if estates:
json_path = Path("byty_bezrealitky.json")