Add status dashboard, server, scraper stats, and DATA_DIR support
All checks were successful
Build and Push / build (push) Successful in 7s

Key changes:
- Replace ratings_server.py + status.html with a unified server.py that
  serves the map, scraper status dashboard, and ratings API in one process
- Add scraper_stats.py utility: each scraper writes per-run stats (fetched,
  accepted, excluded, duration) to stats_<source>.json for the status page
- generate_status.py: respect DATA_DIR env var so status.json lands in the
  configured data directory instead of always the project root
- run_all.sh: replace the {"status":"running"} overwrite of status.json with
  a dedicated scraper_running.json lock file; trap on EXIT ensures cleanup
  even on kill/error, preventing the previous run's results from being wiped
- server.py: detect running state via scraper_running.json existence instead
  of status["status"] field, eliminating the dual-use race condition
- Makefile: add serve (local dev), debug (Docker debug container) targets;
  add SERVER_PORT variable
- build/Dockerfile + entrypoint.sh: switch to server.py, set DATA_DIR,
  adjust volume mounts
- .gitignore: add *.json and *.log to keep runtime data files out of VCS
- mapa_bytu.html: price-per-m² colouring, status link, UX tweaks

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Jan Novak
2026-02-26 00:30:25 +01:00
parent 6f49533c94
commit 5fb3b984b6
17 changed files with 929 additions and 1122 deletions

477
server.py Normal file
View File

@@ -0,0 +1,477 @@
#!/usr/bin/env python3
"""
General-purpose HTTP server for maru-hleda-byt.
Serves static files from DATA_DIR and additionally handles:
GET /scrapers-status → SSR scraper status page
GET /api/ratings → ratings.json contents
POST /api/ratings → save entire ratings object
GET /api/ratings/export → same as GET, with download header
GET /api/status → status.json contents (JSON)
GET /api/status/history → scraper_history.json contents (JSON)
"""
from __future__ import annotations

import functools
import html
import json
import logging
import os
import sys
from datetime import datetime
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path
# TCP port the server binds to; taken from the SERVER_PORT env var, 8080 if unset.
PORT = int(os.environ.get("SERVER_PORT", 8080))
# Directory used for the JSON data files and as the static-file root;
# taken from the DATA_DIR env var, the current working directory if unset.
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
# Ratings store read by load_ratings() and written by save_ratings().
RATINGS_FILE = DATA_DIR / "ratings.json"
# LOG_LEVEL is upper-cased and looked up as an attribute of the logging module;
# when no such attribute exists the level falls back to INFO.
_LOG_LEVEL = getattr(logging, os.environ.get("LOG_LEVEL", "INFO").upper(), logging.INFO)
# Configures the root logger at import time.
logging.basicConfig(
    level=_LOG_LEVEL,
    format="%(asctime)s [server] %(levelname)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
# Module-level logger used by every helper and by the request handler.
log = logging.getLogger(__name__)
# ── Helpers ──────────────────────────────────────────────────────────────────
# Display colour (hex) per scraper source, keyed by the lower-cased source name.
# Lookups elsewhere in this file fall back to "#999" for unknown sources.
COLORS: dict[str, str] = {
    "sreality": "#1976D2",
    "realingo": "#7B1FA2",
    "bezrealitky": "#E65100",
    "idnes": "#C62828",
    "psn": "#2E7D32",
    "cityhome": "#00838F",
}
# Czech month names as written in dates; indexed with ``month - 1`` by _fmt_date().
MONTHS_CZ: list[str] = [
    "ledna", "února", "března", "dubna", "května", "června",
    "července", "srpna", "září", "října", "listopadu", "prosince",
]
def _load_json(path: Path, default=None):
    """Return the parsed JSON content of *path*.

    *default* is returned when the file does not exist, and also (after a
    warning is logged) when reading or parsing it fails for any reason.
    """
    log.debug("_load_json: %s", path.resolve())
    try:
        if not path.exists():
            return default
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    except Exception as exc:
        log.warning("Failed to load %s: %s", path, exc)
        return default
def _fmt_date(iso_str: str) -> str:
    """Render an ISO timestamp as a Czech date such as ``26. února 2026, 00:30``.

    Input that cannot be parsed is handed back unchanged.
    """
    try:
        moment = datetime.fromisoformat(iso_str)
        month_name = MONTHS_CZ[moment.month - 1]
        return "{}. {} {}, {:02d}:{:02d}".format(
            moment.day, month_name, moment.year, moment.hour, moment.minute
        )
    except Exception:
        # Best effort: show whatever was stored rather than fail the page.
        return iso_str
def load_ratings() -> dict:
    """Return the stored ratings as a dict.

    An empty dict is returned when the ratings file is missing or unreadable,
    and also when it parses to something other than a JSON object (for
    example ``null`` or a list), so callers can rely on the declared type.
    """
    ratings = _load_json(RATINGS_FILE, default={})
    if not isinstance(ratings, dict):
        log.warning(
            "Ignoring %s: expected a JSON object, got %s",
            RATINGS_FILE,
            type(ratings).__name__,
        )
        return {}
    return ratings
def save_ratings(data: dict) -> None:
    """Persist *data* as pretty-printed UTF-8 JSON in the ratings file.

    The content is first written to a sibling ``<name>.tmp`` file and then
    renamed over the target, so an interrupted write cannot leave a truncated
    ratings file behind.

    Raises:
        OSError: if the temporary file cannot be written or renamed; the
            temporary file is removed (best effort) before re-raising.
        TypeError, ValueError: if *data* is not JSON-serialisable.
    """
    # Serialise before touching the disk so a failure here changes nothing.
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    tmp_path = RATINGS_FILE.with_name(RATINGS_FILE.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(RATINGS_FILE)
    except OSError:
        try:
            tmp_path.unlink()
        except OSError:
            # Nothing to clean up (or cleanup impossible); report the original error.
            pass
        raise
# ── SSR status page ──────────────────────────────────────────────────────────
# Stylesheet inlined into the <style> element of the /scrapers-status page
# by _render_status_html(); covers the cards, badges, bars, history table
# and the run-detail modal.
_CSS = """\
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: system-ui, -apple-system, sans-serif;
background: #f5f5f5; color: #333;
padding: 24px; max-width: 640px; margin: 0 auto;
}
h1 { font-size: 22px; margin-bottom: 4px; }
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
.card {
background: white; border-radius: 12px; padding: 20px;
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
}
.card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
.timestamp { font-size: 28px; font-weight: 700; color: #1976D2; }
.timestamp-sub { font-size: 13px; color: #999; margin-top: 2px; }
.summary-row {
display: flex; justify-content: space-between; align-items: center;
padding: 10px 0; border-bottom: 1px solid #f0f0f0;
}
.summary-row:last-child { border-bottom: none; }
.summary-label { font-size: 13px; color: #666; }
.summary-value { font-size: 18px; font-weight: 700; }
.badge {
display: inline-block; padding: 2px 8px; border-radius: 4px;
font-size: 11px; font-weight: 600; color: white;
}
.badge-ok { background: #4CAF50; }
.badge-err { background: #F44336; }
.badge-skip { background: #FF9800; }
.bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
.bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; }
.bar-fill { height: 100%; border-radius: 4px; }
.bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
.loader-wrap {
display: flex; flex-direction: column; align-items: center;
justify-content: center; padding: 60px 0;
}
.spinner {
width: 40px; height: 40px; border: 4px solid #e0e0e0;
border-top-color: #1976D2; border-radius: 50%;
animation: spin 0.8s linear infinite;
}
@keyframes spin { to { transform: rotate(360deg); } }
.loader-text { margin-top: 16px; color: #999; font-size: 14px; }
.link-row { text-align: center; margin-top: 8px; }
.link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
.history-table { width: 100%; border-collapse: collapse; font-size: 12px; }
.history-table th {
text-align: left; font-weight: 600; color: #999; font-size: 11px;
padding: 4px 6px 8px 6px; border-bottom: 2px solid #f0f0f0;
}
.history-table td { padding: 7px 6px; border-bottom: 1px solid #f5f5f5; vertical-align: middle; }
.history-table tr:last-child td { border-bottom: none; }
.history-table tr.latest td { background: #f8fbff; font-weight: 600; }
.src-nums { display: flex; gap: 4px; flex-wrap: wrap; }
.src-chip {
display: inline-block; padding: 1px 5px; border-radius: 3px;
font-size: 10px; color: white; font-variant-numeric: tabular-nums;
}
.clickable-row { cursor: pointer; }
.clickable-row:hover td { background: #f0f7ff !important; }
/* Modal */
#md-overlay {
position: fixed; inset: 0; background: rgba(0,0,0,0.45);
display: flex; align-items: flex-start; justify-content: center;
z-index: 1000; padding: 40px 16px; overflow-y: auto;
}
#md-box {
background: white; border-radius: 12px; padding: 24px;
width: 100%; max-width: 620px; position: relative;
box-shadow: 0 8px 32px rgba(0,0,0,0.24); margin: auto;
}
#md-close {
position: absolute; top: 10px; right: 14px;
background: none; border: none; font-size: 26px; cursor: pointer;
color: #aaa; line-height: 1;
}
#md-close:hover { color: #333; }
#md-box h3 { font-size: 15px; margin-bottom: 14px; padding-right: 24px; }
.md-summary { display: flex; gap: 20px; flex-wrap: wrap; font-size: 13px; margin-bottom: 16px; color: #555; }
.md-summary b { color: #333; }
.detail-table { width: 100%; border-collapse: collapse; font-size: 12px; }
.detail-table th {
text-align: left; color: #999; font-size: 11px; font-weight: 600;
padding: 4px 8px 6px 0; border-bottom: 2px solid #f0f0f0; white-space: nowrap;
}
.detail-table td { padding: 6px 8px 6px 0; border-bottom: 1px solid #f5f5f5; vertical-align: top; }
.detail-table tr:last-child td { border-bottom: none; }
"""
# Source names in the fixed order their chips appear in each history row.
_SOURCE_ORDER: list[str] = ["Sreality", "Realingo", "Bezrealitky", "iDNES", "PSN", "CityHome"]
# Chip labels, parallel to _SOURCE_ORDER (the two lists are zipped in
# _history_html, so they must stay the same length and order).
_SOURCE_ABBR: list[str] = ["Sre", "Rea", "Bez", "iDN", "PSN", "CH"]
def _sources_html(sources: list) -> str:
    """Render the "Zdroje" card: one labelled bar per scraper source.

    Args:
        sources: per-source stat dicts as stored in status.json. The keys
            read are ``name``, ``accepted``, ``error``, ``excluded`` (dict of
            reason -> count) and, when ``excluded`` is not a dict,
            ``excluded_total``. Entries that are not dicts are ignored.

    Returns:
        The card's HTML, or an empty string when there is nothing to show.
        Bar widths are relative to the source with the most accepted items.
    """
    entries = [s for s in (sources or []) if isinstance(s, dict)]
    if not entries:
        return ""

    # ``or 0`` also covers an explicit JSON null, which .get(..., 0) would pass through.
    max_count = max((s.get("accepted") or 0 for s in entries), default=0) or 1

    parts = ['<div class="card"><h2>Zdroje</h2>']
    for s in entries:
        name = str(s.get("name") or "?")
        accepted = s.get("accepted") or 0
        exc = s.get("excluded", {})
        excluded_total = sum(exc.values()) if isinstance(exc, dict) else s.get("excluded_total", 0)
        color = COLORS.get(name.lower(), "#999")
        pct = round(accepted / max_count * 100)

        if s.get("error"):
            badge = '<span class="badge badge-err">chyba</span>'
        elif accepted == 0:
            badge = '<span class="badge badge-skip">0</span>'
        else:
            badge = '<span class="badge badge-ok">OK</span>'

        # Values that originate from the stats files are escaped before they
        # are placed into markup.
        parts.append(
            f'<div style="margin-bottom:12px;">'
            f'<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">'
            f'<span style="font-weight:600;font-size:14px;">{html.escape(name)} {badge}</span>'
            f'<span style="font-size:12px;color:#999;">{html.escape(str(excluded_total))} vyloučených</span>'
            f'</div>'
            f'<div class="bar-row">'
            f'<div class="bar-track"><div class="bar-fill" style="width:{pct}%;background:{color};"></div></div>'
            f'<span class="bar-count">{html.escape(str(accepted))}</span>'
            f'</div></div>'
        )
    parts.append("</div>")
    return "".join(parts)
def _history_html(history: list) -> str:
    """Render the "Historie běhů" card: a table of past runs, newest first.

    Args:
        history: run entries in chronological order (oldest first). The keys
            read per entry are ``timestamp``, ``duration_sec``,
            ``total_accepted``, ``deduplicated``, ``success`` and ``sources``.

    Returns:
        The card's HTML, or an empty string for an empty history. Each row
        carries ``data-idx`` equal to its position in the reversed history;
        the modal script uses that index, so no entry is ever dropped here.
    """
    if not history:
        return ""
    rows = list(reversed(history))
    parts = [
        '<div class="card">'
        '<h2>Historie běhů <span style="font-size:11px;font-weight:400;color:#bbb;"> klikni pro detaily</span></h2>',
        '<table class="history-table"><thead><tr>',
        '<th>Datum</th><th>Trvání</th><th>Přijato&nbsp;/&nbsp;Dedup</th><th>Zdroje</th><th>OK</th>',
        '</tr></thead><tbody>',
    ]
    for i, entry in enumerate(rows):
        if not isinstance(entry, dict):
            # Keep the row (and thus the index alignment) but show placeholders.
            entry = {}
        row_class = ' class="latest clickable-row"' if i == 0 else ' class="clickable-row"'

        # ``or []`` tolerates an explicit JSON null for "sources".
        src_map = {
            s["name"]: s
            for s in entry.get("sources") or []
            if isinstance(s, dict) and isinstance(s.get("name"), str)
        }

        chip_parts = []
        for name, abbr in zip(_SOURCE_ORDER, _SOURCE_ABBR):
            src = src_map.get(name)
            if src is not None and src.get("error"):
                color = "#F44336"
            else:
                color = COLORS.get(name.lower(), "#999")
            count = src.get("accepted", 0) if src is not None else "-"
            chip_parts.append(
                f'<span class="src-chip" style="background:{color}" title="{html.escape(name)}">'
                f'{html.escape(abbr)}&nbsp;{html.escape(str(count))}</span>'
            )
        chips = "".join(chip_parts)

        ok_badge = (
            '<span class="badge badge-err">chyba</span>'
            if entry.get("success") is False
            else '<span class="badge badge-ok">OK</span>'
        )
        duration = entry.get("duration_sec")
        dur = f"{duration}s" if duration is not None else "-"

        # _fmt_date returns unparseable input unchanged, so its result is
        # escaped like every other value taken from the history file.
        parts.append(
            f'<tr{row_class} data-idx="{i}">'
            f'<td>{html.escape(str(_fmt_date(entry.get("timestamp", ""))))}</td>'
            f'<td>{html.escape(dur)}</td>'
            f'<td>{html.escape(str(entry.get("total_accepted", "-")))}'
            f'&nbsp;/&nbsp;{html.escape(str(entry.get("deduplicated", "-")))}</td>'
            f'<td><div class="src-nums">{chips}</div></td>'
            f'<td>{ok_badge}</td>'
            f'</tr>'
        )
    parts.append("</tbody></table></div>")
    return "".join(parts)
def _modal_script(rows_json: str) -> str:
    """Return the modal overlay HTML + JS for the history detail popup.

    Args:
        rows_json: JSON text of the history entries, newest first, i.e. in the
            same order as the ``data-idx`` attributes of the history rows.

    Returns:
        Markup for the (initially hidden) overlay followed by an inline
        ``<script>`` that opens the modal when a history row is clicked.

    The JSON is embedded inside ``<script>``, so ``<``, ``>`` and ``&`` are
    rewritten as ``\\uXXXX`` escapes; a value such as ``</script>`` in an
    error message can therefore not terminate the script element. The
    generated JS passes every value from the data through ``esc()`` before
    it is concatenated into the HTML assigned to ``innerHTML``.
    """
    # These characters can only occur inside JSON string values, where the
    # \uXXXX form is an equivalent spelling, so the data is unchanged.
    safe_json = (
        rows_json.replace("&", "\\u0026")
        .replace("<", "\\u003c")
        .replace(">", "\\u003e")
    )
    return (
        '<div id="md-overlay" style="display:none">'
        '<div id="md-box"><button id="md-close">&times;</button>'
        '<div id="md-body"></div></div></div>\n'
        '<script>\n(function(){\n'
        f'var H={safe_json};\n'
        'var C={"sreality":"#1976D2","realingo":"#7B1FA2","bezrealitky":"#E65100","idnes":"#C62828","psn":"#2E7D32","cityhome":"#00838F"};\n'
        'var MN=["ledna","února","března","dubna","května","června","července","srpna","září","října","listopadu","prosince"];\n'
        'function esc(v){return String(v).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;");}\n'
        'function val(v){return v!=null?esc(v):"\u2013";}\n'
        'function fd(s){var d=new Date(s);return d.getDate()+". "+MN[d.getMonth()]+" "+d.getFullYear()+", "+String(d.getHours()).padStart(2,"0")+":"+String(d.getMinutes()).padStart(2,"0");}\n'
        'function openModal(idx){\n'
        '  var e=H[idx]||{},src=Array.isArray(e.sources)?e.sources:[];\n'
        '  var h="<h3>Detaily b\u011bhu \u2013 "+esc(fd(e.timestamp))+"</h3>";\n'
        '  h+="<div class=\\"md-summary\\">";\n'
        '  if(e.duration_sec!=null) h+="<span><b>Trvání:</b> "+esc(e.duration_sec)+"s</span>";\n'
        '  if(e.total_accepted!=null) h+="<span><b>Přijato:</b> "+esc(e.total_accepted)+"</span>";\n'
        '  if(e.deduplicated!=null) h+="<span><b>Po dedup:</b> "+esc(e.deduplicated)+"</span>";\n'
        '  h+="</div>";\n'
        '  h+="<table class=\\"detail-table\\"><thead><tr>";\n'
        '  h+="<th>Zdroj</th><th>Přijato</th><th>Staženo</th><th>Stránky</th><th>Cache</th><th>Vyloučeno</th><th>Čas</th><th>OK</th>";\n'
        '  h+="</tr></thead><tbody>";\n'
        '  src.forEach(function(s){\n'
        '    s=s||{};\n'
        '    var nm=String(s.name||"?"),col=C[nm.toLowerCase()]||"#999";\n'
        '    var exc=s.excluded||{};\n'
        '    var excStr=Object.entries(exc).filter(function(kv){return kv[1]>0;}).map(function(kv){return esc(kv[0])+":&nbsp;"+esc(kv[1]);}).join(", ")||"\u2013";\n'
        '    var ok=s.error?"<span class=\\"badge badge-err\\" title=\\""+esc(s.error)+"\\">chyba</span>":"<span class=\\"badge badge-ok\\">OK</span>";\n'
        '    var dot="<span style=\\"display:inline-block;width:8px;height:8px;border-radius:50%;background:"+col+";margin-right:5px;\\"></span>";\n'
        '    h+="<tr>";\n'
        '    h+="<td>"+dot+esc(nm)+"</td>";\n'
        '    h+="<td>"+val(s.accepted)+"</td>";\n'
        '    h+="<td>"+val(s.fetched)+"</td>";\n'
        '    h+="<td>"+val(s.pages)+"</td>";\n'
        '    h+="<td>"+val(s.cache_hits)+"</td>";\n'
        '    h+="<td style=\\"font-size:11px;color:#666;\\">"+excStr+"</td>";\n'
        '    h+="<td>"+(s.duration_sec!=null?esc(s.duration_sec)+"s":"\u2013")+"</td>";\n'
        '    h+="<td>"+ok+"</td></tr>";\n'
        '  });\n'
        '  h+="</tbody></table>";\n'
        '  document.getElementById("md-body").innerHTML=h;\n'
        '  document.getElementById("md-overlay").style.display="flex";\n'
        '}\n'
        'function closeModal(){document.getElementById("md-overlay").style.display="none";}\n'
        'var tb=document.querySelector(".history-table tbody");\n'
        'if(tb)tb.addEventListener("click",function(e){var tr=e.target.closest("tr[data-idx]");if(tr)openModal(parseInt(tr.dataset.idx,10));});\n'
        'document.getElementById("md-close").addEventListener("click",closeModal);\n'
        'document.getElementById("md-overlay").addEventListener("click",function(e){if(e.target===this)closeModal();});\n'
        'document.addEventListener("keydown",function(e){if(e.key==="Escape")closeModal();});\n'
        '})();\n</script>'
    )
def _render_status_html(status: dict | None, history: list, is_running: bool = False) -> str:
"""Generate the complete HTML page for /scrapers-status."""
head_open = (
'<!DOCTYPE html>\n<html lang="cs">\n<head>\n'
'<meta charset="UTF-8">\n'
'<meta name="viewport" content="width=device-width, initial-scale=1.0">\n'
f'<title>Scraper status</title>\n<style>{_CSS}</style>\n'
)
page_header = '<h1>Scraper status</h1>\n<div class="subtitle">maru-hleda-byt</div>\n'
footer = '<div class="link-row"><a href="/">Otevřít mapu</a></div>'
if status is None:
return (
head_open + '</head>\n<body>\n' + page_header
+ '<div class="card"><p style="color:#F44336">Status není k dispozici.</p></div>\n'
+ footer + '\n</body>\n</html>'
)
if is_running:
return (
head_open
+ '<meta http-equiv="refresh" content="30">\n'
+ '</head>\n<body>\n' + page_header
+ '<div class="loader-wrap"><div class="spinner"></div>'
+ '<div class="loader-text">Scraper právě běží…</div></div>\n'
+ footer + '\n</body>\n</html>'
)
# ── Done state ────────────────────────────────────────────────────────────
ts = status.get("timestamp", "")
duration = status.get("duration_sec")
total_accepted = status.get("total_accepted", 0)
deduplicated = status.get("deduplicated")
ts_card = (
'<div class="card"><h2>Poslední scrape</h2>'
f'<div class="timestamp">{_fmt_date(ts)}</div>'
+ (f'<div class="timestamp-sub">Trvání: {round(duration)}s</div>' if duration is not None else "")
+ '</div>'
)
sum_card = (
'<div class="card"><h2>Souhrn</h2>'
f'<div class="summary-row"><span class="summary-label">Vyhovujících bytů</span>'
f'<span class="summary-value" style="color:#4CAF50">{total_accepted}</span></div>'
+ (
f'<div class="summary-row"><span class="summary-label">Po deduplikaci (v mapě)</span>'
f'<span class="summary-value" style="color:#1976D2">{deduplicated}</span></div>'
if deduplicated is not None else ""
)
+ '</div>'
)
rows_for_js = list(reversed(history))
body = (
page_header
+ ts_card + "\n"
+ sum_card + "\n"
+ _sources_html(status.get("sources", [])) + "\n"
+ _history_html(history) + "\n"
+ footer
)
modal = _modal_script(json.dumps(rows_for_js, ensure_ascii=False))
return head_open + '</head>\n<body>\n' + body + '\n' + modal + '\n</body>\n</html>'
# ── HTTP handler ──────────────────────────────────────────────────────────────
class Handler(SimpleHTTPRequestHandler):
    """Request handler: static files plus the JSON API and the status page.

    Routing is done on the request path with any query string or fragment
    removed, so e.g. ``/api/ratings?t=123`` reaches the ratings endpoint.
    Everything that is not ``/api/...`` or ``/scrapers-status`` is delegated
    to SimpleHTTPRequestHandler's static-file handling.
    """

    # Largest POST body accepted; bigger requests are rejected with 413
    # before any of the body is read.
    MAX_BODY_BYTES = 16 * 1024 * 1024

    def log_message(self, format, *args):
        pass  # suppress default access log; use our own where needed

    def _route(self) -> str:
        """Return the request path without query string and fragment."""
        return self.path.split("?", 1)[0].split("#", 1)[0]

    def _send_cors_headers(self):
        """Emit the CORS headers shared by JSON responses and preflights."""
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")

    def _send_json(self, status: int, body, extra_headers=None):
        """Send *body* serialised as UTF-8 JSON with the given HTTP status.

        *extra_headers*, when given, is a mapping of additional header
        names to values.
        """
        payload = json.dumps(body, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self._send_cors_headers()
        if extra_headers:
            for k, v in extra_headers.items():
                self.send_header(k, v)
        self.end_headers()
        self.wfile.write(payload)

    def do_OPTIONS(self):
        """Answer CORS preflight requests with 204 and the CORS headers."""
        self.send_response(204)
        self._send_cors_headers()
        self.end_headers()

    def do_GET(self):
        """Dispatch GET requests to the API, the status page or static files."""
        route = self._route()
        if route.startswith("/api/"):
            self._handle_api_get()
        elif route.rstrip("/") == "/scrapers-status":
            self._serve_status_page()
        else:
            log.debug("GET %s → static file: %s", self.path, self.translate_path(self.path))
            super().do_GET()

    def _handle_api_get(self):
        """Serve the read-only JSON endpoints under /api/."""
        route = self._route()
        if route in ("/api/ratings", "/api/ratings/export"):
            ratings = load_ratings()
            extra = None
            if route == "/api/ratings/export":
                # Same payload, but offered to the browser as a file download.
                extra = {"Content-Disposition": 'attachment; filename="ratings.json"'}
            log.info("GET %s → %d ratings", route, len(ratings))
            self._send_json(200, ratings, extra)
        elif route == "/api/status":
            data = _load_json(DATA_DIR / "status.json")
            if data is None:
                self._send_json(404, {"error": "status not available"})
                return
            log.info("GET /api/status → ok")
            self._send_json(200, data)
        elif route == "/api/status/history":
            data = _load_json(DATA_DIR / "scraper_history.json", default=[])
            if not isinstance(data, list):
                data = []
            log.info("GET /api/status/history → %d entries", len(data))
            self._send_json(200, data)
        else:
            self._send_json(404, {"error": "not found"})

    def _serve_status_page(self):
        """Render and send the server-side status page."""
        status = _load_json(DATA_DIR / "status.json")
        history = _load_json(DATA_DIR / "scraper_history.json", default=[])
        if not isinstance(history, list):
            history = []
        # A run is considered in progress while the lock file exists.
        is_running = (DATA_DIR / "scraper_running.json").exists()
        page = _render_status_html(status, history, is_running)
        payload = page.encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def do_POST(self):
        """Handle ``POST /api/ratings``: replace the stored ratings object.

        Responds 400 for a missing, malformed or negative Content-Length,
        an empty body, invalid JSON or a non-object payload; 413 when the
        body exceeds MAX_BODY_BYTES; 500 when the ratings cannot be written;
        404 for any other path.
        """
        if self._route() != "/api/ratings":
            self._send_json(404, {"error": "not found"})
            return

        raw_length = self.headers.get("Content-Length", 0)
        try:
            length = int(raw_length)
        except (TypeError, ValueError):
            length = -1
        if length < 0:
            # A negative length would make rfile.read() wait for EOF.
            log.warning("Invalid Content-Length: %r", raw_length)
            self._send_json(400, {"error": "invalid Content-Length"})
            return
        if length == 0:
            self._send_json(400, {"error": "empty body"})
            return
        if length > self.MAX_BODY_BYTES:
            log.warning("Request body too large: %d bytes", length)
            self._send_json(413, {"error": "body too large"})
            return

        try:
            raw = self.rfile.read(length)
            data = json.loads(raw.decode("utf-8"))
        except Exception as e:
            # Request boundary: any failure to read or parse client input
            # (including RecursionError on deeply nested JSON) is a 400.
            log.warning("Bad request body: %s", e)
            self._send_json(400, {"error": "invalid JSON"})
            return
        if not isinstance(data, dict):
            self._send_json(400, {"error": "expected JSON object"})
            return

        try:
            save_ratings(data)
        except OSError:
            log.exception("POST /api/ratings → failed to save ratings")
            self._send_json(500, {"error": "failed to save ratings"})
            return
        log.info("POST /api/ratings → saved %d ratings", len(data))
        self._send_json(200, {"ok": True, "count": len(data)})
if __name__ == "__main__":
    # Script entry point: serve DATA_DIR on all interfaces until interrupted.
    log.info("Server starting on port %d, data dir: %s", PORT, DATA_DIR)
    # SimpleHTTPRequestHandler takes its static root via the ``directory`` kwarg.
    handler = functools.partial(Handler, directory=str(DATA_DIR))
    # The context manager closes the listening socket on every exit path,
    # including the sys.exit() below.
    with HTTPServer(("0.0.0.0", PORT), handler) as server:
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            log.info("Stopped.")
            sys.exit(0)