All checks were successful
Build and Push / build (push) Successful in 7s
Key changes:
- Replace ratings_server.py + status.html with a unified server.py that
serves the map, scraper status dashboard, and ratings API in one process
- Add scraper_stats.py utility: each scraper writes per-run stats (fetched,
accepted, excluded, duration) to stats_<source>.json for the status page
- generate_status.py: respect DATA_DIR env var so status.json lands in the
configured data directory instead of always the project root
- run_all.sh: replace the {"status":"running"} overwrite of status.json with
a dedicated scraper_running.json lock file; trap on EXIT ensures cleanup
even on kill/error, preventing the previous run's results from being wiped
- server.py: detect running state via scraper_running.json existence instead
of status["status"] field, eliminating the dual-use race condition
- Makefile: add serve (local dev), debug (Docker debug container) targets;
add SERVER_PORT variable
- build/Dockerfile + entrypoint.sh: switch to server.py, set DATA_DIR,
adjust volume mounts
- .gitignore: add *.json and *.log to keep runtime data files out of VCS
- mapa_bytu.html: price-per-m² colouring, status link, UX tweaks
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
162 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate status.json from scraper JSON outputs and per-scraper stats files."""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
HERE = Path(__file__).parent
|
|
DATA_DIR = Path(os.environ.get("DATA_DIR", HERE))
|
|
|
|
SOURCE_FILES = {
|
|
"Sreality": "byty_sreality.json",
|
|
"Realingo": "byty_realingo.json",
|
|
"Bezrealitky": "byty_bezrealitky.json",
|
|
"iDNES": "byty_idnes.json",
|
|
"PSN": "byty_psn.json",
|
|
"CityHome": "byty_cityhome.json",
|
|
}
|
|
|
|
STATS_FILES = {
|
|
"Sreality": "stats_sreality.json",
|
|
"Realingo": "stats_realingo.json",
|
|
"Bezrealitky": "stats_bezrealitky.json",
|
|
"iDNES": "stats_idnes.json",
|
|
"PSN": "stats_psn.json",
|
|
"CityHome": "stats_cityhome.json",
|
|
}
|
|
|
|
MERGED_FILE = "byty_merged.json"
|
|
HISTORY_FILE = "scraper_history.json"
|
|
|
|
|
|
def count_source(path: Path) -> dict:
|
|
"""Read a scraper JSON and return accepted count + file mtime."""
|
|
if not path.exists():
|
|
return {"accepted": 0, "error": "soubor nenalezen"}
|
|
try:
|
|
data = json.loads(path.read_text(encoding="utf-8"))
|
|
mtime = datetime.fromtimestamp(path.stat().st_mtime).isoformat(timespec="seconds")
|
|
return {"accepted": len(data), "updated_at": mtime}
|
|
except Exception as e:
|
|
return {"accepted": 0, "error": str(e)}
|
|
|
|
|
|
def read_scraper_stats(path: Path) -> dict:
|
|
"""Load a per-scraper stats JSON. Returns {} on missing or corrupt file."""
|
|
if not path.exists():
|
|
return {}
|
|
try:
|
|
data = json.loads(path.read_text(encoding="utf-8"))
|
|
return data if isinstance(data, dict) else {}
|
|
except Exception:
|
|
return {}
|
|
|
|
|
|
def append_to_history(status: dict, keep: int) -> None:
|
|
"""Append the current status entry to scraper_history.json, keeping only `keep` latest."""
|
|
history_path = HERE / HISTORY_FILE
|
|
history: list = []
|
|
if history_path.exists():
|
|
try:
|
|
history = json.loads(history_path.read_text(encoding="utf-8"))
|
|
if not isinstance(history, list):
|
|
history = []
|
|
except Exception:
|
|
history = []
|
|
|
|
history.append(status)
|
|
|
|
# Keep only the N most recent entries
|
|
if keep > 0 and len(history) > keep:
|
|
history = history[-keep:]
|
|
|
|
history_path.write_text(json.dumps(history, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
print(f"Historie uložena: {history_path} ({len(history)} záznamů)")
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Generate status.json from scraper outputs.")
|
|
parser.add_argument("--start-time", dest="start_time", default=None,
|
|
help="ISO timestamp of scrape start (default: now)")
|
|
parser.add_argument("--duration", dest="duration", type=int, default=None,
|
|
help="Run duration in seconds")
|
|
parser.add_argument("--keep", dest="keep", type=int, default=5,
|
|
help="Number of history entries to keep (default: 5, 0=unlimited)")
|
|
args = parser.parse_args()
|
|
|
|
start_time = args.start_time or datetime.now().isoformat(timespec="seconds")
|
|
duration_sec = args.duration
|
|
|
|
sources = []
|
|
for name, filename in SOURCE_FILES.items():
|
|
path = HERE / filename
|
|
info = count_source(path)
|
|
info["name"] = name
|
|
|
|
# Merge in stats from the per-scraper stats file (authoritative for run data)
|
|
stats = read_scraper_stats(HERE / STATS_FILES[name])
|
|
for key in ("accepted", "fetched", "pages", "cache_hits", "excluded", "excluded_total",
|
|
"success", "duration_sec", "error"):
|
|
if key in stats:
|
|
info[key] = stats[key]
|
|
|
|
sources.append(info)
|
|
|
|
# Total accepted before dedup
|
|
total_accepted = sum(s.get("accepted", 0) for s in sources)
|
|
|
|
# Merged / deduplicated count
|
|
merged_path = HERE / MERGED_FILE
|
|
deduplicated = 0
|
|
if merged_path.exists():
|
|
try:
|
|
merged = json.loads(merged_path.read_text(encoding="utf-8"))
|
|
deduplicated = len(merged)
|
|
except Exception:
|
|
pass
|
|
|
|
duplicates_removed = total_accepted - deduplicated if deduplicated else 0
|
|
|
|
# Top-level success: True if no source has an error
|
|
success = not any("error" in s for s in sources)
|
|
|
|
status = {
|
|
"status": "done",
|
|
"timestamp": start_time,
|
|
"duration_sec": duration_sec,
|
|
"success": success,
|
|
"total_accepted": total_accepted,
|
|
"deduplicated": deduplicated,
|
|
"duplicates_removed": duplicates_removed,
|
|
"sources": sources,
|
|
}
|
|
|
|
out = DATA_DIR / "status.json"
|
|
out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
print(f"Status uložen: {out}")
|
|
print(f" Celkem bytů (před dedup): {total_accepted}")
|
|
print(f" Po deduplikaci: {deduplicated}")
|
|
if duplicates_removed:
|
|
print(f" Odstraněno duplikátů: {duplicates_removed}")
|
|
for s in sources:
|
|
acc = s.get("accepted", 0)
|
|
err = s.get("error", "")
|
|
exc = s.get("excluded", {})
|
|
exc_total = sum(exc.values()) if exc else s.get("excluded_total", 0)
|
|
parts = [f"{s['name']:12s}: {acc} bytů"]
|
|
if exc_total:
|
|
parts.append(f"({exc_total} vyloučeno)")
|
|
if err:
|
|
parts.append(f"[CHYBA: {err}]")
|
|
print(" " + " ".join(parts))
|
|
|
|
append_to_history(status, args.keep)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|