Rewrite PSN + CityHome scrapers, add price/m² map coloring, ratings system, and status dashboard

- Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing)
- Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries
- Color map markers by price/m² instead of disposition (blue→green→orange→red scale)
- Add persistent rating system (favorite/reject) with a minimal stdlib http.server ratings API and localStorage fallback
- Rejected markers show original color at reduced opacity with 🚫 SVG overlay
- Favorite markers shown as star icons with gold pulse animation
- Add "new today" marker logic (scraped_at == today) with larger pulsing green outline
- Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons
- Add generate_status.py for scraper run statistics and status.html dashboard
- Add scraped_at field to all scrapers for freshness tracking
- Update run_all.sh with log capture and status generation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-18 15:15:25 +01:00
parent c6089f0da9
commit b8d4d44164
13 changed files with 1922 additions and 395 deletions

View File

@@ -10,7 +10,7 @@ WORKDIR /app
COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \
scrape_idnes.py scrape_psn.py scrape_cityhome.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \
merge_and_map.py regen_map.py run_all.sh ./ merge_and_map.py regen_map.py run_all.sh ratings_server.py ./
COPY build/crontab /etc/crontabs/root COPY build/crontab /etc/crontabs/root
COPY build/entrypoint.sh /entrypoint.sh COPY build/entrypoint.sh /entrypoint.sh
@@ -18,7 +18,7 @@ RUN chmod +x /entrypoint.sh run_all.sh
RUN mkdir -p /app/data RUN mkdir -p /app/data
EXPOSE 8080 EXPOSE 8080 8081
HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \ HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \
CMD wget -q -O /dev/null http://localhost:8080/ || exit 1 CMD wget -q -O /dev/null http://localhost:8080/ || exit 1

View File

@@ -6,7 +6,7 @@ DATA_DIR="/app/data"
# Create symlinks so scripts (which write to /app/) persist data to the volume # Create symlinks so scripts (which write to /app/) persist data to the volume
for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \ for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \
byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \ byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \
mapa_bytu.html; do mapa_bytu.html ratings.json; do
# Remove real file if it exists (e.g. baked into image) # Remove real file if it exists (e.g. baked into image)
[ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f" [ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f"
ln -sf "$DATA_DIR/$f" "/app/$f" ln -sf "$DATA_DIR/$f" "/app/$f"
@@ -18,5 +18,8 @@ crond -b -l 2
echo "[entrypoint] Starting initial scrape in background..." echo "[entrypoint] Starting initial scrape in background..."
bash /app/run_all.sh & bash /app/run_all.sh &
echo "[entrypoint] Starting ratings API server on port 8081..."
DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py &
echo "[entrypoint] Starting HTTP server on port 8080..." echo "[entrypoint] Starting HTTP server on port 8080..."
exec python3 -m http.server 8080 --directory "$DATA_DIR" exec python3 -m http.server 8080 --directory "$DATA_DIR"

202
generate_status.py Normal file
View File

@@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""Generate status.json from scraper JSON outputs and run log."""
from __future__ import annotations
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
# Directory containing this script; every data file below is resolved relative to it.
HERE = Path(__file__).parent
# Display name of each scraper -> the JSON file it writes.
# The display names are also matched (case-insensitively) against the section
# headers of the run log, so they must appear in those headers to pick up log stats.
SOURCE_FILES = {
"Sreality": "byty_sreality.json",
"Realingo": "byty_realingo.json",
"Bezrealitky": "byty_bezrealitky.json",
"iDNES": "byty_idnes.json",
"PSN": "byty_psn.json",
"CityHome": "byty_cityhome.json",
}
# Merged output file; its item count is reported as the "deduplicated" total.
MERGED_FILE = "byty_merged.json"
def count_source(path: Path) -> dict:
    """Summarise a single scraper output file.

    Args:
        path: Location of the scraper's JSON output.

    Returns:
        ``{"accepted": <number of top-level JSON items>, "updated_at": <mtime>}``
        on success, where the mtime is an ISO-8601 string with second
        precision; ``{"accepted": 0, "error": <message>}`` when the file is
        missing or cannot be read/parsed.
    """
    if not path.exists():
        return {"accepted": 0, "error": "soubor nenalezen"}

    try:
        records = json.loads(path.read_text(encoding="utf-8"))
        modified = datetime.fromtimestamp(path.stat().st_mtime)
        summary = {
            "accepted": len(records),
            "updated_at": modified.isoformat(timespec="seconds"),
        }
    except Exception as exc:
        # Any failure is reported in the status instead of aborting the run.
        summary = {"accepted": 0, "error": str(exc)}
    return summary
def _first_int(pattern: str, text: str) -> Optional[int]:
    """Return capture group 1 of the first match of *pattern* in *text* as an int, or None."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found else None


def parse_log(log_path: str) -> dict[str, dict]:
    """Parse scraper run log and extract per-source statistics.

    Scrapers log summary lines like:
        ✓ Vyhovující byty: 12
        Vyloučeno (prodáno): 5
        Staženo stránek: 3
        Staženo inzerátů: 48
        Celkem bytů v cache: 120
    and section headers like:
        [2/6] Realingo
    followed by a line of dashes.

    Args:
        log_path: Path to the run log; may be empty/None.

    Returns:
        Mapping of source name (a key of ``SOURCE_FILES``) to a dict holding
        whichever of ``accepted``, ``fetched``, ``pages``, ``cached``,
        ``cache_hits`` were found, plus either ``excluded`` (reason -> count)
        or ``excluded_total``. Empty when the log is missing, unreadable or
        contains no section headers. When a source appears in several
        sections, the last section wins.

    NOTE: a header naming several sources (e.g. "PSN + CityHome") yields the
    same numbers for each of them, because the section text cannot be
    attributed to the individual scrapers. Each source gets its own copy of
    the entry so callers can modify one without affecting the other.
    """
    if not log_path:
        return {}
    try:
        # errors="replace": a stray non-UTF-8 byte in scraper output must not
        # abort status generation; the summary lines themselves stay intact.
        with open(log_path, encoding="utf-8", errors="replace") as f:
            content = f.read()
    except OSError:
        return {}

    # Split into per-source sections by the [N/6] step header.
    # Each section header looks like "[2/6] Realingo\n----..."
    section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE)
    excluded_pattern = re.compile(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)')
    sections_found = list(section_pattern.finditer(content))

    stats: dict[str, dict] = {}
    for i, match in enumerate(sections_found):
        step_name = match.group(2).strip().lower()
        # Identify which sources this section covers ("PSN + CityHome" covers both).
        source_names = [name for name in SOURCE_FILES if name.lower() in step_name]
        if not source_names:
            continue

        start = match.end()
        end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content)
        section_text = content[start:end]

        # Numeric summary lines present in all/most scrapers (insertion order
        # below defines the key order of the resulting entry).
        numeric_fields = (
            ("accepted", r'Vyhovující byty[:\s]+(\d+)'),
            ("fetched", r'Staženo inzerátů[:\s]+(\d+)'),
            ("pages", r'Staženo stránek[:\s]+(\d+)'),
            ("cached", r'Celkem bytů v cache[:\s]+(\d+)'),
            ("cache_hits", r'Cache hit[:\s]+(\d+)'),
        )
        entry = {}
        for key, pattern in numeric_fields:
            value = _first_int(pattern, section_text)
            if value is not None:
                entry[key] = value

        # Rejection reasons, e.g. "Vyloučeno (prodáno): 5" -> {"prodáno": 5}.
        excluded = {
            m.group(1): int(m.group(2))
            for m in excluded_pattern.finditer(section_text)
        }
        if excluded:
            entry["excluded"] = excluded
        else:
            # Fall back to a single un-categorised "Vyloučen…: N" total.
            total_excluded = _first_int(r'Vyloučen\w*[:\s]+(\d+)', section_text)
            if total_excluded is not None:
                entry["excluded_total"] = total_excluded

        for name in source_names:
            own = dict(entry)
            if "excluded" in own:
                own["excluded"] = dict(own["excluded"])
            stats[name] = own
    return stats
def main():
    """Build status.json next to this script and print a short summary.

    Command line: ``generate_status.py [START_TIME DURATION_SEC [LOG_PATH]]``

    * START_TIME / DURATION_SEC are only read when both are given; a
      non-integer duration is stored as null. Without them the timestamp
      defaults to the current local time.
    * LOG_PATH, when given, is parsed with ``parse_log`` and its numbers are
      merged into the per-source entries; an ``accepted`` count from the log
      overrides the count derived from the JSON file.
    """
    start_time = None
    duration_sec = None
    if len(sys.argv) >= 3:
        start_time = sys.argv[1]
        try:
            duration_sec = int(sys.argv[2])
        except ValueError:
            # Deliberate best effort: keep the duration as null in status.json.
            pass
    if not start_time:
        start_time = datetime.now().isoformat(timespec="seconds")
    log_path = sys.argv[3] if len(sys.argv) >= 4 else None
    log_stats = parse_log(log_path)

    sources = []
    for name, filename in SOURCE_FILES.items():
        info = count_source(HERE / filename)
        info["name"] = name
        # Merge log stats
        ls = log_stats.get(name, {})
        for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"):
            if k in ls:
                info[k] = ls[k]
        # Override accepted from log if available (log is authoritative for latest run)
        if "accepted" in ls:
            info["accepted"] = ls["accepted"]
        sources.append(info)

    # Total accepted before dedup
    total_accepted = sum(s.get("accepted", 0) for s in sources)

    # Merged / deduplicated count
    merged_path = HERE / MERGED_FILE
    deduplicated = 0
    if merged_path.exists():
        try:
            merged = json.loads(merged_path.read_text(encoding="utf-8"))
            deduplicated = len(merged)
        except Exception as e:
            # Still produce a status file, but do not hide the problem.
            print(f"Varování: nelze načíst {merged_path}: {e}", file=sys.stderr)

    # Clamp at zero: a stale merged file can be larger than the current
    # per-source total, and a negative "duplicates removed" is meaningless.
    duplicates_removed = max(0, total_accepted - deduplicated) if deduplicated else 0

    status = {
        "status": "done",
        "timestamp": start_time,
        "duration_sec": duration_sec,
        "total_accepted": total_accepted,
        "deduplicated": deduplicated,
        "duplicates_removed": duplicates_removed,
        "sources": sources,
    }
    out = HERE / "status.json"
    out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Status uložen: {out}")
    print(f" Celkem bytů (před dedup): {total_accepted}")
    print(f" Po deduplikaci: {deduplicated}")
    if duplicates_removed:
        print(f" Odstraněno duplikátů: {duplicates_removed}")
    for s in sources:
        acc = s.get("accepted", 0)
        err = s.get("error", "")
        exc = s.get("excluded", {})
        exc_total = sum(exc.values()) if exc else s.get("excluded_total", 0)
        parts = [f"{s['name']:12s}: {acc} bytů"]
        if exc_total:
            parts.append(f"({exc_total} vyloučeno)")
        if err:
            parts.append(f"[CHYBA: {err}]")
        print(" " + " ".join(parts))


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

116
ratings_server.py Normal file
View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
Minimal HTTP API server for persisting apartment ratings.
GET /api/ratings → returns ratings.json contents
POST /api/ratings → saves entire ratings object
GET /api/ratings/export → same as GET, but with download header
Ratings file: /app/data/ratings.json (or ./ratings.json locally)
"""
import json
import logging
import os
import sys
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
# TCP port the ratings API listens on; override with the RATINGS_PORT environment variable.
PORT = int(os.environ.get("RATINGS_PORT", 8081))
# Directory holding ratings.json; override with DATA_DIR (defaults to the current working directory).
DATA_DIR = Path(os.environ.get("DATA_DIR", "."))
RATINGS_FILE = DATA_DIR / "ratings.json"
# Log lines look like "<ISO timestamp> [ratings] <LEVEL> <message>".
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [ratings] %(levelname)s %(message)s",
datefmt="%Y-%m-%dT%H:%M:%S",
)
log = logging.getLogger(__name__)
def load_ratings() -> dict:
    """Read the ratings file and return its parsed JSON content.

    Returns an empty dict when the file does not exist yet or when reading or
    parsing fails; failures are logged rather than raised.
    """
    ratings = {}
    try:
        if RATINGS_FILE.exists():
            raw = RATINGS_FILE.read_text(encoding="utf-8")
            ratings = json.loads(raw)
    except Exception as exc:
        log.error("Failed to load ratings: %s", exc)
    return ratings
def save_ratings(data: dict) -> None:
    """Persist *data* as pretty-printed UTF-8 JSON in the ratings file.

    The payload is written to a temporary sibling file and then moved over the
    target with ``os.replace``, so an interrupted write can never leave a
    truncated ratings file behind (which would otherwise be read back as "no
    ratings").

    Raises:
        OSError: if the temporary file cannot be written or moved into place.
    """
    # Resolve symlinks first so the replace swaps the real file, not a link to it.
    target = os.path.realpath(RATINGS_FILE)
    tmp_path = target + ".tmp"
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(payload)
        os.replace(tmp_path, target)
    except BaseException:
        # Do not leave a partial temp file behind; the original error matters more.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
class RatingsHandler(BaseHTTPRequestHandler):
    """Request handler exposing the ratings store as a small JSON API.

    Routes (a query string, if any, is ignored when matching):
        GET     /api/ratings          current ratings object
        GET     /api/ratings/export   same payload with a download header
        POST    /api/ratings          replace the whole ratings object
        OPTIONS (any path)            CORS preflight, answered with 204

    Every response carries permissive CORS headers. Unknown paths get a JSON
    404.
    """

    # Largest POST body accepted; bigger requests are refused with 413 before
    # anything is read into memory.
    MAX_BODY_BYTES = 4 * 1024 * 1024

    def log_message(self, format, *args):
        """Suppress the default HTTP access log (requests are logged via ``log``)."""
        pass

    def _route(self) -> str:
        """Return the request path without its query string."""
        return self.path.split("?", 1)[0]

    def _send_cors_headers(self):
        """Emit the CORS headers shared by every response."""
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")

    def _send_json(self, status: int, body: dict, extra_headers=None):
        """Send *body* as a UTF-8 JSON response with the given HTTP *status*.

        Args:
            status: HTTP status code.
            body: JSON-serialisable payload.
            extra_headers: Optional mapping of additional header name -> value.
        """
        payload = json.dumps(body, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self._send_cors_headers()
        if extra_headers:
            for k, v in extra_headers.items():
                self.send_header(k, v)
        self.end_headers()
        self.wfile.write(payload)

    def do_OPTIONS(self):
        """Answer a CORS preflight request with an empty 204."""
        self.send_response(204)
        self._send_cors_headers()
        self.end_headers()

    def do_GET(self):
        """Serve the stored ratings, optionally as a file download."""
        route = self._route()
        if route not in ("/api/ratings", "/api/ratings/export"):
            self._send_json(404, {"error": "not found"})
            return
        ratings = load_ratings()
        extra = None
        if route == "/api/ratings/export":
            extra = {"Content-Disposition": 'attachment; filename="ratings.json"'}
        log.info("GET %s → %d ratings", self.path, len(ratings))
        self._send_json(200, ratings, extra)

    def do_POST(self):
        """Replace the stored ratings with the JSON object in the request body.

        Responds 400 for a missing/invalid Content-Length, an empty body,
        malformed JSON or a non-object payload; 413 for a body larger than
        ``MAX_BODY_BYTES``; 500 when the ratings cannot be written.
        """
        if self._route() != "/api/ratings":
            self._send_json(404, {"error": "not found"})
            return

        try:
            length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            self._send_json(400, {"error": "invalid Content-Length"})
            return
        if length < 0:
            # A negative size would make rfile.read() block until EOF.
            self._send_json(400, {"error": "invalid Content-Length"})
            return
        if length == 0:
            self._send_json(400, {"error": "empty body"})
            return
        if length > self.MAX_BODY_BYTES:
            self._send_json(413, {"error": "body too large"})
            return

        try:
            raw = self.rfile.read(length)
            data = json.loads(raw.decode("utf-8"))
        except (OSError, ValueError, RecursionError) as e:
            log.warning("Bad request body: %s", e)
            self._send_json(400, {"error": "invalid JSON"})
            return
        if not isinstance(data, dict):
            self._send_json(400, {"error": "expected JSON object"})
            return

        try:
            save_ratings(data)
        except OSError as e:
            # Tell the client the save failed instead of dropping the connection.
            log.error("Failed to save ratings: %s", e)
            self._send_json(500, {"error": "failed to save ratings"})
            return
        log.info("POST /api/ratings → saved %d ratings", len(data))
        self._send_json(200, {"ok": True, "count": len(data)})
if __name__ == "__main__":
    # Script entry point: announce the configuration, then serve until Ctrl-C.
    log.info("Ratings server starting on port %d, data dir: %s", PORT, DATA_DIR)
    log.info("Ratings file: %s", RATINGS_FILE)
    bind_address = ("0.0.0.0", PORT)  # listen on all interfaces
    server = HTTPServer(bind_address, RatingsHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        log.info("Stopped.")
        sys.exit(0)

View File

@@ -16,6 +16,12 @@ NC='\033[0m'
TOTAL=6 TOTAL=6
CURRENT=0 CURRENT=0
FAILED=0 FAILED=0
START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S")
START_EPOCH=$(date +%s)
LOG_FILE="$(pwd)/scrape_run.log"
# Mark status as running
echo '{"status":"running"}' > status.json
show_help() { show_help() {
echo "Usage: ./run_all.sh [OPTIONS]" echo "Usage: ./run_all.sh [OPTIONS]"
@@ -63,6 +69,8 @@ step() {
} }
# ── Scrapery (paralelně kde to jde) ───────────────────────── # ── Scrapery (paralelně kde to jde) ─────────────────────────
# Tee all output to log file for status generation
exec > >(tee -a "$LOG_FILE") 2>&1
step "Sreality" step "Sreality"
python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); } python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); }
@@ -91,6 +99,12 @@ python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((F
# ── Otevření mapy ──────────────────────────────────────────── # ── Otevření mapy ────────────────────────────────────────────
# ── Generování statusu ─────────────────────────────────────
END_EPOCH=$(date +%s)
DURATION=$((END_EPOCH - START_EPOCH))
python3 generate_status.py "$START_TIME" "$DURATION" "$LOG_FILE"
echo "" echo ""
echo "============================================================" echo "============================================================"
if [ $FAILED -eq 0 ]; then if [ $FAILED -eq 0 ]; then

View File

@@ -347,6 +347,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"ownership": ownership, "ownership": ownership,
"url": sreality_url(hash_id, seo), "url": sreality_url(hash_id, seo),
"image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""), "image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""),
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
details_fetched += 1 details_fetched += 1
@@ -373,20 +374,58 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
"""Generate an interactive Leaflet.js HTML map.""" """Generate an interactive Leaflet.js HTML map."""
# Color by disposition # Color by price per m² — cool blue→warm red scale, no yellow
color_map = { # Thresholds based on Prague market distribution (p25=120k, p50=144k, p75=162k)
"3+kk": "#2196F3", # blue price_color_scale = [
"3+1": "#4CAF50", # green (110_000, "#1565C0"), # < 110k/m² → deep blue (levné)
"4+kk": "#FF9800", # orange (130_000, "#42A5F5"), # 110130k → light blue
"4+1": "#F44336", # red (150_000, "#66BB6A"), # 130150k → green (střed)
"5+kk": "#9C27B0", # purple (165_000, "#EF6C00"), # 150165k → dark orange
"5+1": "#795548", # brown (float("inf"), "#C62828"), # > 165k → dark red (drahé)
"6+": "#607D8B", # grey-blue ]
}
def price_color(estate: dict) -> str:
price = estate.get("price") or 0
area = estate.get("area") or 0
if not area:
return "#9E9E9E"
ppm2 = price / area
for threshold, color in price_color_scale:
if ppm2 < threshold:
return color
return "#E53935"
# Legend bands for info panel (built once)
price_legend_items = (
'<div style="margin-bottom:4px;font-size:12px;color:#555;font-weight:600;">Cena / m²:</div>'
)
bands = [
("#1565C0", "< 110 000 Kč/m²"),
("#42A5F5", "110 130 000 Kč/m²"),
("#66BB6A", "130 150 000 Kč/m²"),
("#EF6C00", "150 165 000 Kč/m²"),
("#C62828", "> 165 000 Kč/m²"),
("#9E9E9E", "cena/plocha neuvedena"),
]
for bcolor, blabel in bands:
price_legend_items += (
f'<div style="display:flex;align-items:center;gap:6px;margin:2px 0;">'
f'<span style="width:14px;height:14px;border-radius:50%;background:{bcolor};'
f'display:inline-block;border:2px solid white;box-shadow:0 1px 3px rgba(0,0,0,0.3);flex-shrink:0;"></span>'
f'<span>{blabel}</span></div>'
)
# New marker indicator — bigger dot, no extra border
price_legend_items += (
'<div style="display:flex;align-items:center;gap:6px;margin:6px 0 0 0;'
'padding-top:6px;border-top:1px solid #eee;">'
'<span style="width:18px;height:18px;border-radius:50%;background:#66BB6A;'
'display:inline-block;box-shadow:0 1px 4px rgba(0,0,0,0.35);flex-shrink:0;"></span>'
'<span>Nové (z dnešního scrapu) — větší</span></div>'
)
markers_js = "" markers_js = ""
for e in estates: for e in estates:
color = color_map.get(e["disposition"], "#999999") color = price_color(e)
floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno" floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno"
area_text = f'{e["area"]}' if e["area"] else "neuvedeno" area_text = f'{e["area"]}' if e["area"] else "neuvedeno"
building_text = e["building_type"] or "neuvedeno" building_text = e["building_type"] or "neuvedeno"
@@ -405,11 +444,19 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
hash_id = e.get("hash_id", "") hash_id = e.get("hash_id", "")
scraped_at = e.get("scraped_at", "")
is_new = scraped_at == datetime.now().strftime("%Y-%m-%d")
new_badge = (
'<span style="margin-left:6px;font-size:11px;background:#FFD600;color:#333;'
'padding:1px 6px;border-radius:3px;font-weight:bold;">NOVÉ</span>'
if is_new else ""
)
popup = ( popup = (
f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">' f'<div style="min-width:280px;font-family:system-ui,sans-serif;" data-hashid="{hash_id}">'
f'<b style="font-size:14px;">{format_price(e["price"])}</b>' f'<b style="font-size:14px;">{format_price(e["price"])}</b>'
f'<span style="margin-left:8px;font-size:11px;background:{source_color};color:white;' f'<span style="margin-left:8px;font-size:11px;background:{source_color};color:white;'
f'padding:1px 6px;border-radius:3px;">{source_label}</span><br>' f'padding:1px 6px;border-radius:3px;">{source_label}</span>{new_badge}<br>'
f'<span style="color:#666;">{e["disposition"]} | {area_text} | {floor_text}</span>' f'<span style="color:#666;">{e["disposition"]} | {area_text} | {floor_text}</span>'
f'{floor_note}<br><br>' f'{floor_note}<br><br>'
f'<b>{e["locality"]}</b><br>' f'<b>{e["locality"]}</b><br>'
@@ -438,27 +485,33 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
popup = popup.replace("'", "\\'").replace("\n", "") popup = popup.replace("'", "\\'").replace("\n", "")
is_fav = source in ("psn", "cityhome") is_fav = source in ("psn", "cityhome")
marker_fn = "addHeartMarker" if is_fav else "addMarker"
if is_fav:
marker_fn = "addHeartMarker"
elif is_new:
marker_fn = "addNewMarker"
else:
marker_fn = "addMarker"
markers_js += ( markers_js += (
f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n" f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n"
) )
# Build legend # Build legend — price per m² bands + disposition counts
legend_items = "" legend_items = price_legend_items
# Disposition counts below the color legend
disp_counts = {} disp_counts = {}
for e in estates: for e in estates:
d = e["disposition"] d = e["disposition"]
disp_counts[d] = disp_counts.get(d, 0) + 1 disp_counts[d] = disp_counts.get(d, 0) + 1
for disp, color in color_map.items(): disp_order = ["3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+"]
count = disp_counts.get(disp, 0) disp_summary = ", ".join(
if count > 0: f"{d} ({disp_counts[d]})" for d in disp_order if d in disp_counts
legend_items += ( )
f'<div style="display:flex;align-items:center;gap:6px;margin:3px 0;">' legend_items += (
f'<span style="width:14px;height:14px;border-radius:50%;' f'<div style="margin-top:8px;padding-top:6px;border-top:1px solid #eee;'
f'background:{color};display:inline-block;border:2px solid white;' f'font-size:12px;color:#666;">{disp_summary}</div>'
f'box-shadow:0 1px 3px rgba(0,0,0,0.3);"></span>' )
f'<span>{disp} ({count})</span></div>'
)
# Heart marker legend for PSN/CityHome # Heart marker legend for PSN/CityHome
fav_count = sum(1 for e in estates if e.get("source") in ("psn", "cityhome")) fav_count = sum(1 for e in estates if e.get("source") in ("psn", "cityhome"))
@@ -493,6 +546,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
body {{ font-family: system-ui, -apple-system, sans-serif; }} body {{ font-family: system-ui, -apple-system, sans-serif; }}
#map {{ width: 100%; height: 100vh; }} #map {{ width: 100%; height: 100vh; }}
.heart-icon {{ background: none !important; border: none !important; }} .heart-icon {{ background: none !important; border: none !important; }}
.star-icon {{ background: none !important; border: none !important; }}
.rate-btn:hover {{ background: #f0f0f0 !important; }} .rate-btn:hover {{ background: #f0f0f0 !important; }}
.rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }} .rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }}
.rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }} .rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }}
@@ -503,13 +557,42 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
}} }}
.marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }} .marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }}
.heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }} .heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }}
.heart-icon-rej {{ opacity: 0.2 !important; }} .heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }}
.reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }}
@keyframes pulse-new {{
0% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
50% {{ stroke-opacity: 0.4; stroke-width: 6px; r: 12; }}
100% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }}
}}
.marker-new {{ animation: pulse-new 2s ease-in-out infinite; }}
.info-panel {{ .info-panel {{
position: absolute; top: 10px; right: 10px; z-index: 1000; position: absolute; top: 10px; right: 10px; z-index: 1000;
background: white; padding: 16px; border-radius: 10px; background: white; padding: 16px; border-radius: 10px;
box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px; box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px;
font-size: 13px; line-height: 1.5; font-size: 13px; line-height: 1.5;
transition: transform 0.3s ease, opacity 0.3s ease;
}} }}
.info-panel.collapsed {{
transform: translateX(calc(100% + 20px));
opacity: 0; pointer-events: none;
}}
.panel-open-btn {{
position: absolute; top: 10px; right: 10px; z-index: 1001;
width: 40px; height: 40px; border-radius: 8px;
background: white; border: none; cursor: pointer;
box-shadow: 0 2px 12px rgba(0,0,0,0.15);
font-size: 20px; display: flex; align-items: center; justify-content: center;
transition: opacity 0.3s ease;
}}
.panel-open-btn.hidden {{ opacity: 0; pointer-events: none; }}
.panel-close-btn {{
position: absolute; top: 8px; right: 8px;
width: 28px; height: 28px; border-radius: 6px;
background: none; border: 1px solid #ddd; cursor: pointer;
font-size: 16px; display: flex; align-items: center; justify-content: center;
color: #888;
}}
.panel-close-btn:hover {{ background: #f0f0f0; color: #333; }}
.info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }} .info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }}
.info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }} .info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }}
.filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }} .filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }}
@@ -517,18 +600,26 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
.filter-section input[type="checkbox"] {{ accent-color: #1976D2; }} .filter-section input[type="checkbox"] {{ accent-color: #1976D2; }}
#floor-filter {{ margin-top: 8px; }} #floor-filter {{ margin-top: 8px; }}
#floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }} #floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }}
.status-link {{ display: block; margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; text-align: center; }}
.status-link a {{ color: #1976D2; text-decoration: none; font-size: 12px; }}
@media (max-width: 600px) {{
.info-panel {{ max-width: calc(100vw - 60px); right: 10px; }}
.info-panel.collapsed {{ transform: translateX(calc(100% + 20px)); }}
.panel-close-btn {{ top: 6px; right: 6px; }}
}}
</style> </style>
</head> </head>
<body> <body>
<div id="map"></div> <div id="map"></div>
<div class="info-panel"> <button class="panel-open-btn hidden" id="panel-open-btn" onclick="togglePanel()">☰</button>
<div class="info-panel" id="info-panel">
<button class="panel-close-btn" id="panel-close-btn" onclick="togglePanel()">✕</button>
<h2>Byty v Praze</h2> <h2>Byty v Praze</h2>
<div class="stats"> <div class="stats">
<div>Celkem: <b id="visible-count">{len(estates)}</b> bytů</div> <div>Celkem: <b id="visible-count">{len(estates)}</b> bytů</div>
<div>Cena: {min_price}{max_price}</div> <div>Cena: {min_price}{max_price}</div>
<div>Průměr: {avg_price}</div> <div>Průměr: {avg_price}</div>
</div> </div>
<div><b>Dispozice:</b></div>
{legend_items} {legend_items}
<div class="filter-section"> <div class="filter-section">
<b>Filtry:</b> <b>Filtry:</b>
@@ -562,6 +653,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
Skrýt zamítnuté Skrýt zamítnuté
</label> </label>
</div> </div>
<div class="status-link"><a href="status.html">Scraper status</a></div>
</div> </div>
<script> <script>
@@ -597,6 +689,23 @@ function addMarker(lat, lon, color, popup, hashId) {{
marker.addTo(map); marker.addTo(map);
}} }}
function addNewMarker(lat, lon, color, popup, hashId) {{
var marker = L.circleMarker([lat, lon], {{
radius: 12,
fillColor: color,
color: color,
weight: 4,
opacity: 0.35,
fillOpacity: 0.95,
}}).bindPopup(popup);
marker._data = {{ lat: lat, lon: lon, color: color, hashId: hashId, isNew: true }};
allMarkers.push(marker);
marker.addTo(map);
marker.on('add', function() {{
if (marker._path) marker._path.classList.add('marker-new');
}});
}}
function heartIcon(color) {{ function heartIcon(color) {{
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">' var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">'
+ '<path d="M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 ' + '<path d="M12 21.35l-1.45-1.32C5.4 15.36 2 12.28 2 8.5 '
@@ -612,6 +721,21 @@ function heartIcon(color) {{
}}); }});
}} }}
function starIcon() {{
var svg = '<svg xmlns="http://www.w3.org/2000/svg" width="28" height="28" viewBox="0 0 24 24">'
+ '<path d="M12 2l3.09 6.26L22 9.27l-5 4.87L18.18 22 12 18.27 '
+ '5.82 22 7 14.14 2 9.27l6.91-1.01L12 2z" '
+ 'fill="#FFC107" stroke="#F57F17" stroke-width="1" '
+ 'filter="drop-shadow(0 1px 3px rgba(0,0,0,0.3))"/></svg>';
return L.divIcon({{
html: svg,
className: 'star-icon',
iconSize: [28, 28],
iconAnchor: [14, 14],
popupAnchor: [0, -14],
}});
}}
function addHeartMarker(lat, lon, color, popup, hashId) {{ function addHeartMarker(lat, lon, color, popup, hashId) {{
var marker = L.marker([lat, lon], {{ var marker = L.marker([lat, lon], {{
icon: heartIcon(color), icon: heartIcon(color),
@@ -637,6 +761,36 @@ function saveRatings(ratings) {{
localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings)); localStorage.setItem(RATINGS_KEY, JSON.stringify(ratings));
}} }}
function addRejectStrike(marker) {{
removeRejectStrike(marker);
var color = marker._data.color || '#999';
// SVG "no entry" icon — circle with diagonal line, colored to match marker
var svg = '<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20">'
+ '<circle cx="12" cy="12" r="10" fill="none" stroke="' + color + '" stroke-width="2.5" opacity="0.85"/>'
+ '<line x1="5.5" y1="5.5" x2="18.5" y2="18.5" stroke="' + color + '" stroke-width="2.5" stroke-linecap="round" opacity="0.85"/>'
+ '</svg>';
var icon = L.divIcon({{
className: 'reject-overlay',
html: svg,
iconSize: [20, 20],
iconAnchor: [10, 10],
}});
var m = L.marker([marker._data.lat, marker._data.lon], {{
icon: icon,
interactive: false,
pane: 'markerPane',
}});
m.addTo(map);
marker._rejectStrike = m;
}}
function removeRejectStrike(marker) {{
if (marker._rejectStrike) {{
map.removeLayer(marker._rejectStrike);
marker._rejectStrike = null;
}}
}}
function applyMarkerStyle(marker, status) {{ function applyMarkerStyle(marker, status) {{
if (marker._data.isHeart) {{ if (marker._data.isHeart) {{
var el = marker._icon; var el = marker._icon;
@@ -651,26 +805,59 @@ function applyMarkerStyle(marker, status) {{
}} }}
}} else {{ }} else {{
if (status === 'fav') {{ if (status === 'fav') {{
marker.setStyle({{ removeRejectStrike(marker);
radius: 12, fillOpacity: 1, weight: 3, if (!marker._data._origCircle) marker._data._origCircle = true;
fillColor: marker._data.color, color: '#fff', var popup = marker.getPopup();
}}); var popupContent = popup ? popup.getContent() : '';
if (marker._path) marker._path.classList.add('marker-favorite'); var wasOnMap = map.hasLayer(marker);
if (wasOnMap) map.removeLayer(marker);
var starMarker = L.marker([marker._data.lat, marker._data.lon], {{
icon: starIcon(),
}}).bindPopup(popupContent);
starMarker._data = marker._data;
var idx = allMarkers.indexOf(marker);
if (idx !== -1) allMarkers[idx] = starMarker;
if (wasOnMap) starMarker.addTo(map);
}} else if (status === 'reject') {{ }} else if (status === 'reject') {{
marker.setStyle({{ if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
radius: 6, fillOpacity: 0.15, fillColor: '#999', color: '#bbb', weight: 1, revertToCircle(marker, {{ radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1 }});
}}); }} else {{
if (marker._path) marker._path.classList.remove('marker-favorite'); marker.setStyle({{
radius: 6, fillOpacity: 0.35, fillColor: marker._data.color, color: '#fff', weight: 1,
}});
if (marker._path) marker._path.classList.remove('marker-favorite');
}}
// Add strikethrough line over the marker
addRejectStrike(marker);
}} else {{ }} else {{
marker.setStyle({{ if (marker._data._origCircle && !(marker instanceof L.CircleMarker)) {{
radius: 8, fillColor: marker._data.color, color: '#fff', revertToCircle(marker, {{ radius: 8, fillColor: marker._data.color, color: '#fff', weight: 2, fillOpacity: 0.85 }});
weight: 2, fillOpacity: 0.85, }} else {{
}}); marker.setStyle({{
if (marker._path) marker._path.classList.remove('marker-favorite'); radius: 8, fillColor: marker._data.color, color: '#fff',
weight: 2, fillOpacity: 0.85,
}});
if (marker._path) marker._path.classList.remove('marker-favorite');
}}
if (marker._path) marker._path.classList.remove('marker-rejected');
removeRejectStrike(marker);
}} }}
}} }}
}} }}
function revertToCircle(marker, style) {{
    // Swap a non-circle marker (e.g. the star icon) for a plain circle marker
    // at the same coordinates, carrying over its popup content and listing data.
    // NOTE: braces are doubled because this script lives inside a Python string template.
    var data = marker._data;
    var oldPopup = marker.getPopup();
    var html = oldPopup ? oldPopup.getContent() : '';
    var visible = map.hasLayer(marker);
    if (visible) {{
        map.removeLayer(marker);
    }}
    var circle = L.circleMarker([data.lat, data.lon], style);
    circle.bindPopup(html);
    // The data object is shared with the old marker; drop the stale star reference.
    circle._data = data;
    delete data._starRef;
    // Keep the global marker list pointing at the live layer.
    var slot = allMarkers.indexOf(marker);
    if (slot !== -1) {{
        allMarkers[slot] = circle;
    }}
    if (visible) {{
        circle.addTo(map);
    }}
}}
function rateMarker(marker, action) {{ function rateMarker(marker, action) {{
var hashId = marker._data.hashId; var hashId = marker._data.hashId;
var ratings = loadRatings(); var ratings = loadRatings();
@@ -832,8 +1019,12 @@ function applyFilters() {{
if (show) {{ if (show) {{
if (!map.hasLayer(m)) m.addTo(map); if (!map.hasLayer(m)) m.addTo(map);
visible++; visible++;
// Show strike line if rejected and visible
if (m._rejectStrike && !map.hasLayer(m._rejectStrike)) m._rejectStrike.addTo(map);
}} else {{ }} else {{
if (map.hasLayer(m)) map.removeLayer(m); if (map.hasLayer(m)) map.removeLayer(m);
// Hide strike line when marker hidden
if (m._rejectStrike && map.hasLayer(m._rejectStrike)) map.removeLayer(m._rejectStrike);
}} }}
}}); }});
@@ -851,6 +1042,26 @@ function applyFilters() {{
// Initialize ratings on load // Initialize ratings on load
restoreRatings(); restoreRatings();
// ── Panel toggle ──────────────────────────────────────────────
function togglePanel() {{
    // Flip the info panel between open and collapsed. The open button is
    // un-hidden only while the panel is collapsed.
    var infoPanel = document.getElementById('info-panel');
    var opener = document.getElementById('panel-open-btn');
    if (infoPanel.classList.contains('collapsed')) {{
        // Currently collapsed: expand the panel and hide the open button.
        infoPanel.classList.remove('collapsed');
        opener.classList.add('hidden');
        return;
    }}
    // Currently open: collapse the panel and reveal the open button.
    infoPanel.classList.add('collapsed');
    opener.classList.remove('hidden');
}}
// On mobile, start with panel collapsed
if (window.innerWidth <= 600) {{
document.getElementById('info-panel').classList.add('collapsed');
document.getElementById('panel-open-btn').classList.remove('hidden');
}}
</script> </script>
</body> </body>
</html>""" </html>"""

View File

@@ -7,6 +7,7 @@ Výstup: byty_bezrealitky.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
@@ -355,6 +356,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}", "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
"source": "bezrealitky", "source": "bezrealitky",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -12,6 +12,7 @@ import logging
import re import re
import time import time
import urllib.request import urllib.request
from datetime import datetime
from pathlib import Path from pathlib import Path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -33,24 +34,26 @@ HEADERS = {
BASE_URL = "https://www.city-home.cz" BASE_URL = "https://www.city-home.cz"
def fetch_url(url: str) -> str: def fetch_url(url: str, retries: int = 3) -> str:
"""Fetch URL and return HTML string.""" """Fetch URL and return HTML string. Raises HTTPError on 4xx/5xx."""
for attempt in range(3): for attempt in range(retries):
try: try:
logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}") logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS) req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30) resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8") html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
return html return html
except urllib.error.HTTPError:
# Don't retry on HTTP errors (404, 403, etc.) — re-raise immediately
raise
except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e: except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
if attempt < 2: if attempt < retries - 1:
wait = (attempt + 1) * 2 wait = (attempt + 1) * 2
logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}") logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
time.sleep(wait) time.sleep(wait)
else: else:
logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True) logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
raise raise
@@ -124,31 +127,21 @@ def parse_filter_page(html: str) -> list[dict]:
if detail_url and not detail_url.startswith("http"): if detail_url and not detail_url.startswith("http"):
detail_url = BASE_URL + detail_url detail_url = BASE_URL + detail_url
# Extract floor from cells — look for pattern like "3.NP" or "2.PP" # Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price]
cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL) cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
floor = None cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
floor_text = ""
project_name = ""
for cell in cells: # Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. "3.NP")
cell_text = re.sub(r'<[^>]+>', '', cell).strip() project_address = cell_texts[2] if len(cell_texts) > 2 else ""
# Floor pattern
np_match = re.search(r'(\d+)\.\s*NP', cell_text) floor = None
pp_match = re.search(r'(\d+)\.\s*PP', cell_text) if len(cell_texts) > 3:
np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3])
pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3])
if np_match: if np_match:
floor = int(np_match.group(1)) floor = int(np_match.group(1))
floor_text = cell_text
elif pp_match: elif pp_match:
floor = -int(pp_match.group(1)) # Underground floor = -int(pp_match.group(1))
floor_text = cell_text
# Extract project name — usually in a cell that's not a number/price/floor
for cell in cells:
cell_text = re.sub(r'<[^>]+>', '', cell).strip()
if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "" not in cell_text and "" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text:
if len(cell_text) > 3 and cell_text != unit_name:
project_name = cell_text
break
listing = { listing = {
"price": int(cena.group(1)), "price": int(cena.group(1)),
@@ -158,27 +151,55 @@ def parse_filter_page(html: str) -> list[dict]:
"project_id": project.group(1) if project else "", "project_id": project.group(1) if project else "",
"transaction": transaction.group(1) if transaction else "", "transaction": transaction.group(1) if transaction else "",
"disposition": dispozition.group(1) if dispozition else "", "disposition": dispozition.group(1) if dispozition else "",
"location": location.group(1) if location else "",
"url": detail_url, "url": detail_url,
"unit_name": unit_name, "unit_name": unit_name,
"floor": floor, "floor": floor,
"project_name": project_name, "project_address": project_address,
} }
listings.append(listing) listings.append(listing)
return listings return listings
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]: def get_lokalita_urls(slug: str) -> list[str]:
"""Extract GPS coordinates for projects from locality pages.""" """Return candidate lokalita URLs to try in order."""
# Pattern in JS: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name'] return [
gps_data = {} f"{BASE_URL}/projekty/{slug}/lokalita",
for match in re.finditer(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html): f"{BASE_URL}/bytove-domy/{slug}/lokalita",
name = match.group(1).strip() f"{BASE_URL}/bytove-domy/{slug}/lokalita1",
lat = float(match.group(2)) ]
lon = float(match.group(3))
gps_data[name] = (lat, lon)
def extract_project_gps(html: str) -> tuple[float, float] | None:
    """Extract the project's own GPS position from a lokalita page.

    The page embeds a JS array of the form::

        var locations = [['<h4>Name</h4>...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...]

    Only entries whose CATEGORY is ``'1'`` are considered. When several such
    entries exist, the first one whose name contains a digit and does not look
    like a transit landmark is preferred; otherwise the first entry is used.

    Args:
        html: Raw HTML of the lokalita page.

    Returns:
        ``(lat, lng)`` as floats, or ``None`` when the ``locations`` variable
        or a category-1 entry cannot be found.
    """
    block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL)
    if not block:
        return None

    # The pattern is tempered so that a single match can never run past the
    # start of the next entry ("'<h4>"). Without that guard, a non-category-1
    # entry could lend its <h4> name to the coordinates of a later category-1
    # entry, which corrupts the name-based selection below.
    entries = re.findall(
        r"'<h4>((?:(?!</h4>).)*)</h4>(?:(?!'<h4>).)*?',"
        r"\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'",
        block.group(0),
        re.DOTALL,
    )
    if not entries:
        return None

    # Prefer the entry that looks like a street address (has a house number)
    # and is not a transit/park landmark.
    transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE)
    for name, lat, lng in entries:
        if re.search(r'\d', name) and not transit_re.search(name):
            return float(lat), float(lng)

    # Fallback (also covers the single-entry case): first category-1 entry.
    return float(entries[0][1]), float(entries[0][2])
def scrape(max_pages: int | None = None, max_properties: int | None = None): def scrape(max_pages: int | None = None, max_properties: int | None = None):
@@ -210,22 +231,24 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
# Fetch GPS for each project from locality pages # Fetch GPS for each project from locality pages
project_gps = {} project_gps = {}
for slug in sorted(project_slugs): for slug in sorted(project_slugs):
time.sleep(0.5) time.sleep(0.3)
try: gps = None
locality_url = f"{BASE_URL}/projekty/{slug}/lokalita" for url in get_lokalita_urls(slug):
logger.debug(f"Fetching project GPS: {locality_url}") try:
loc_html = fetch_url(locality_url) logger.debug(f"Fetching project GPS: {url}")
gps = extract_project_gps(loc_html) loc_html = fetch_url(url)
if gps: gps = extract_project_gps(loc_html)
# Take first entry (the project itself) if gps:
first_name, (lat, lon) = next(iter(gps.items())) break
project_gps[slug] = (lat, lon) except Exception as e:
logger.info(f"{slug}: {lat}, {lon}") logger.debug(f"GPS fetch failed for {url}: {e}")
else: continue
logger.info(f"{slug}: GPS nenalezeno")
except Exception as e: if gps:
logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True) project_gps[slug] = gps
logger.info(f" {slug}: chyba ({e})") logger.info(f" {slug}: {gps[0]}, {gps[1]}")
else:
logger.info(f"{slug}: GPS nenalezeno")
# Step 3: Filter listings # Step 3: Filter listings
logger.info(f"\nFáze 3: Filtrování...") logger.info(f"\nFáze 3: Filtrování...")
@@ -303,22 +326,37 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
lat, lon = gps lat, lon = gps
# locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup
project_address = listing.get("project_address", "")
# derive city from slug (GPS lookup key)
city_map = {
"karlinske-namesti-5": "Praha 8",
"melnicka-12": "Praha 7",
"na-vaclavce-34": "Praha 5",
"nad-kajetankou-12": "Praha 6",
"vosmikovych-3": "Praha 9",
"zateckych-14": "Praha 2",
}
city_str = city_map.get(slug, "Praha")
locality_str = f"{project_address}, {city_str}" if project_address else city_str
result = { result = {
"hash_id": f"cityhome_{slug}_{listing['unit_name']}", "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
"name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}", "name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}",
"price": price, "price": price,
"price_formatted": format_price(price), "price_formatted": format_price(price),
"locality": f"{listing['project_name']}, Praha", "locality": locality_str,
"lat": lat, "lat": lat,
"lon": lon, "lon": lon,
"disposition": disp, "disposition": disp,
"floor": floor, "floor": floor,
"area": area, "area": float(area),
"building_type": "Cihlová", # CityHome renovuje cihlové domy "building_type": "Cihlová", # CityHome renovuje cihlové domy
"ownership": "neuvedeno", "ownership": "neuvedeno",
"url": url, "url": url,
"source": "cityhome", "source": "cityhome",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -7,6 +7,7 @@ Výstup: byty_idnes.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
@@ -458,6 +459,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": item["url"], "url": item["url"],
"source": "idnes", "source": "idnes",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
PSN.cz scraper. PSN.cz scraper.
Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií. Stáhne byty na prodej z API /api/units-list — jeden požadavek, žádné stránkování.
Výstup: byty_psn.json Výstup: byty_psn.json
""" """
from __future__ import annotations from __future__ import annotations
@@ -12,7 +12,9 @@ import logging
import re import re
import subprocess import subprocess
import time import time
from datetime import datetime
from pathlib import Path from pathlib import Path
from urllib.parse import urlencode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -22,82 +24,37 @@ MAX_PRICE = 14_000_000
MIN_AREA = 69 MIN_AREA = 69
MIN_FLOOR = 2 MIN_FLOOR = 2
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"} WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "5+kk a větší"}
# Pouze Praha — ostatní města (Brno, Pardubice, Špindlerův Mlýn) přeskočit
WANTED_CITIES = {"Praha"}
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
BASE_URL = "https://psn.cz" BASE_URL = "https://psn.cz"
UNITS_API = f"{BASE_URL}/api/units-list"
# Known Prague project slugs with GPS (from research)
PRAGUE_PROJECTS = [
{"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
{"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
{"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
{"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
{"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
{"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
{"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
{"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
{"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
{"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
{"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
{"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
{"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
{"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
{"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
{"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]
def fetch_url(url: str) -> str: def fetch_json(url: str) -> dict:
"""Fetch URL via curl (urllib SSL too old for Cloudflare).""" """Fetch JSON via curl (urllib SSL may fail on Cloudflare)."""
logger.debug(f"HTTP GET request (via curl): {url}") logger.debug(f"HTTP GET: {url}")
logger.debug(f"User-Agent: {UA}")
result = subprocess.run( result = subprocess.run(
["curl", "-s", "-L", "--max-time", "30", ["curl", "-s", "-L", "--max-time", "30",
"-H", f"User-Agent: {UA}", "-H", f"User-Agent: {UA}",
"-H", "Accept: text/html", "-H", "Accept: application/json",
url], url],
capture_output=True, text=True, timeout=60 capture_output=True, text=True, timeout=60
) )
if result.returncode != 0: if result.returncode != 0:
logger.error(f"curl failed (return code {result.returncode}): {result.stderr[:200]}")
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}") raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
logger.debug(f"HTTP response: size={len(result.stdout)} bytes") return json.loads(result.stdout)
return result.stdout
def extract_units_from_html(html: str) -> list[dict]: def fix_gps(lat, lng):
"""Extract unit JSON objects from raw HTML with escaped quotes.""" """PSN má u některých projektů prohozené lat/lng — opravíme."""
# The HTML contains RSC data with escaped JSON: \\"key\\":\\"value\\" if lat is not None and lng is not None and lat < 20 and lng > 20:
# Step 1: Unescape the double-backslash-quotes to regular quotes return lng, lat
cleaned = html.replace('\\"', '"') return lat, lng
# Step 2: Find each unit by looking for "title":"Byt and walking back to {
units = []
decoder = json.JSONDecoder()
for m in re.finditer(r'"title":"Byt', cleaned):
pos = m.start()
# Walk backwards to find the opening brace
depth = 0
found = False
for i in range(pos - 1, max(pos - 3000, 0), -1):
if cleaned[i] == '}':
depth += 1
elif cleaned[i] == '{':
if depth == 0:
try:
obj, end = decoder.raw_decode(cleaned, i)
if isinstance(obj, dict) and 'price_czk' in obj:
units.append(obj)
found = True
except (json.JSONDecodeError, ValueError):
pass
break
depth -= 1
return units
def format_price(price: int) -> str: def format_price(price: int) -> str:
@@ -109,209 +66,178 @@ def format_price(price: int) -> str:
return " ".join(reversed(parts)) + "" return " ".join(reversed(parts)) + ""
def scrape(max_pages: int | None = None, max_properties: int | None = None): def scrape(max_properties: int | None = None):
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Stahuji inzeráty z PSN.cz") logger.info("Stahuji inzeráty z PSN.cz")
logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Cena: do {format_price(MAX_PRICE)}")
logger.info(f"Min. plocha: {MIN_AREA}") logger.info(f"Min. plocha: {MIN_AREA}")
logger.info(f"Patro: od {MIN_FLOOR}. NP") logger.info(f"Patro: od {MIN_FLOOR}. NP")
logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)") logger.info(f"Region: Praha")
if max_pages:
logger.info(f"Max. stran: {max_pages}")
if max_properties: if max_properties:
logger.info(f"Max. bytů: {max_properties}") logger.info(f"Max. bytů: {max_properties}")
logger.info("=" * 60) logger.info("=" * 60)
# Fetch units from each Prague project # Jediný API požadavek — vrátí všechny jednotky (cca 236)
all_units = [] params = urlencode({
"locale": "cs",
"filters": "{}",
"type": "list",
"order": "price-asc",
"offset": 0,
"limit": 500,
})
url = f"{UNITS_API}?{params}"
logger.info("Stahuji jednotky z API ...")
for proj in PRAGUE_PROJECTS: try:
page = 1 data = fetch_json(url)
project_units = [] except Exception as e:
logger.error(f"Chyba při stahování: {e}", exc_info=True)
return []
while True: all_units = data.get("units", {}).get("data", [])
if max_pages and page > max_pages: logger.info(f"Staženo jednotek celkem: {len(all_units)}")
logger.debug(f"Max pages limit reached: {max_pages}")
break
url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
logger.info(f"{proj['name']} — strana {page} ...")
time.sleep(0.5)
try: # Filtrování
html = fetch_url(url)
except Exception as e:
logger.error(f"Fetch error for {proj['name']}: {e}", exc_info=True)
break
units = extract_units_from_html(html)
logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units")
if not units:
if page == 1:
logger.info(f"→ 0 jednotek")
break
# Add project info to each unit
for unit in units:
if not unit.get("latitude") or not unit.get("longitude"):
unit["latitude"] = proj["lat"]
unit["longitude"] = proj["lon"]
unit["_project_name"] = proj["name"]
unit["_project_slug"] = proj["slug"]
project_units.extend(units)
if page == 1:
logger.info(f"{len(units)} jednotek na stránce")
# Check if there might be more pages
# If we got fewer than expected or same units, stop
if len(units) < 10:
break
page += 1
if page > 10: # Safety limit
break
all_units.extend(project_units)
# Deduplicate by slug
seen_slugs = set()
unique_units = []
for u in all_units:
slug = u.get("slug", "")
if slug and slug not in seen_slugs:
seen_slugs.add(slug)
unique_units.append(u)
elif not slug:
unique_units.append(u)
logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek")
# Filter
logger.info(f"\nFiltrování...")
results = [] results = []
excluded_sold = 0 excluded = {
excluded_type = 0 "prodáno": 0,
excluded_disp = 0 "typ": 0,
excluded_price = 0 "město": 0,
excluded_area = 0 "dispozice": 0,
excluded_floor = 0 "cena": 0,
excluded_panel = 0 "plocha": 0,
"patro": 0,
}
properties_fetched = 0 properties_fetched = 0
for unit in unique_units: for unit in all_units:
if max_properties and properties_fetched >= max_properties: if max_properties and properties_fetched >= max_properties:
logger.debug(f"Max properties limit reached: {max_properties}")
break break
unit_id = unit.get("id", unit.get("slug", "unknown"))
# Only free units unit_id = unit.get("id", "?")
# Pouze prodej bytů (type_id=0)
if unit.get("type_id") != 0:
excluded["typ"] += 1
logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)")
continue
# Pouze volné (ne rezervované, prodané, v přípravě)
sale_status = unit.get("sale_status", "")
is_free = unit.get("is_free", False) is_free = unit.get("is_free", False)
is_sold = unit.get("is_sold", False) is_sold = unit.get("is_sold", False)
if is_sold or not is_free: if is_sold or not is_free:
excluded_sold += 1 excluded["prodáno"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)") logger.debug(f"id={unit_id}: přeskočen (status={sale_status})")
continue continue
# Only apartments # Pouze Praha
category = str(unit.get("category", "")).lower() city = (unit.get("location") or unit.get("address", {}).get("city") or "").strip()
if "byt" not in category and "ateliér" not in category: # location field je typicky "Praha 4", "Praha 7" atd.
excluded_type += 1 city_base = city.split(" ")[0] if city else ""
logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})") if city_base not in WANTED_CITIES:
excluded["město"] += 1
logger.debug(f"id={unit_id}: přeskočen (město={city})")
continue continue
# Disposition # Dispozice
disp = unit.get("disposition", "") disp = unit.get("disposition", "")
if disp not in WANTED_DISPOSITIONS: if disp not in WANTED_DISPOSITIONS:
excluded_disp += 1 excluded["dispozice"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})") logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})")
continue continue
# Price # Cena
price = unit.get("price_czk") or unit.get("action_price_czk") or 0 price = unit.get("action_price_czk") or unit.get("price_czk") or 0
if price <= 0 or price > MAX_PRICE: if not price or price <= 0 or price > MAX_PRICE:
excluded_price += 1 excluded["cena"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (price {price})") logger.debug(f"id={unit_id}: přeskočen (cena={price})")
continue continue
# Area # Plocha
area = unit.get("total_area") or unit.get("floor_area") or 0 area = unit.get("total_area") or unit.get("floor_area") or 0
if area < MIN_AREA: if area < MIN_AREA:
excluded_area += 1 excluded["plocha"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)") logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)")
continue continue
# Floor # Patro
floor_str = str(unit.get("floor", "")) floor_str = str(unit.get("floor", ""))
floor = None floor = None
if floor_str: if floor_str:
try: try:
floor = int(floor_str) floor = int(floor_str)
except ValueError: except ValueError:
floor_match = re.search(r'(-?\d+)', floor_str) m = re.search(r'(-?\d+)', floor_str)
if floor_match: if m:
floor = int(floor_match.group(1)) floor = int(m.group(1))
if floor is not None and floor < MIN_FLOOR: if floor is not None and floor < MIN_FLOOR:
excluded_floor += 1 excluded["patro"] += 1
logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})") logger.debug(f"id={unit_id}: přeskočen (patro={floor})")
continue continue
# Construction — check for panel # GPS — opravit prohozené souřadnice
build_type = str(unit.get("build_type", "")).lower() lat_raw = unit.get("latitude")
if "panel" in build_type: lng_raw = unit.get("longitude")
excluded_panel += 1 lat, lng = fix_gps(lat_raw, lng_raw)
logger.debug(f"Filter: id={unit_id} - excluded (panel construction)") if not lat or not lng:
logger.info(f"✗ Vyloučen: panel ({build_type})") logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji")
continue continue
# Build construction label # Sestavit adresu pro locality
building_type = "neuvedeno" addr = unit.get("address") or {}
if build_type and build_type != "nevybráno": street = addr.get("street", "")
if "cihlo" in build_type or "cihla" in build_type: street_no = addr.get("street_no", "")
building_type = "Cihlová" if street and street_no:
elif "skelet" in build_type: locality_str = f"{street} {street_no}, {city}"
building_type = "Skeletová" elif street:
else: locality_str = f"{street}, {city}"
building_type = build_type.capitalize() else:
project_name = unit.get("project", "")
locality_str = f"{project_name}, {city}" if project_name else city
lat = unit.get("latitude", 0) # URL na detail jednotky
lon = unit.get("longitude", 0) unit_slug = unit.get("slug", "")
project_slug = ""
slug = unit.get("slug", "") # project_slug lze odvodit z projektu nebo z reference_no
project_slug = unit.get("_project_slug", "") # API nevrací project_slug přímo — použijeme reference_no nebo jen ID
detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}" reference_no = unit.get("reference_no", "")
if unit_slug:
detail_url = f"{BASE_URL}/prodej/{unit_slug}"
elif reference_no:
detail_url = f"{BASE_URL}/prodej/{reference_no}"
else:
detail_url = BASE_URL
result = { result = {
"hash_id": unit.get("id", slug), "hash_id": str(unit_id),
"name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}", "name": f"Prodej bytu {disp}, {int(area)} m² — {unit.get('project', locality_str)}",
"price": int(price), "price": int(price),
"price_formatted": format_price(int(price)), "price_formatted": format_price(int(price)),
"locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha", "locality": locality_str,
"lat": lat, "lat": lat,
"lon": lon, "lon": lng,
"disposition": disp, "disposition": disp,
"floor": floor, "floor": floor,
"area": area, "area": float(area),
"building_type": building_type, "building_type": "neuvedeno",
"ownership": unit.get("ownership", "neuvedeno") or "neuvedeno", "ownership": "osobní",
"url": detail_url, "url": detail_url,
"source": "psn", "source": "psn",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1
logger.info(f"\n{'=' * 60}") logger.info(f"\n{'=' * 60}")
logger.info(f"Výsledky PSN:") logger.info(f"Výsledky PSN:")
logger.info(f" Celkem jednotek: {len(unique_units)}") logger.info(f" Staženo jednotek: {len(all_units)}")
logger.info(f" Vyloučeno (prodáno): {excluded_sold}") for reason, count in excluded.items():
logger.info(f" Vyloučeno (typ): {excluded_type}") if count:
logger.info(f" Vyloučeno (dispozice): {excluded_disp}") logger.info(f" Vyloučeno ({reason}): {count}")
logger.info(f" Vyloučeno (cena): {excluded_price}")
logger.info(f" Vyloučeno (plocha): {excluded_area}")
logger.info(f" Vyloučeno (patro): {excluded_floor}")
logger.info(f" Vyloučeno (panel): {excluded_panel}")
logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}") logger.info(f"{'=' * 60}")
@@ -320,15 +246,13 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz") parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
parser.add_argument("--max-pages", type=int, default=None,
help="Maximum number of listing pages per project to scrape")
parser.add_argument("--max-properties", type=int, default=None, parser.add_argument("--max-properties", type=int, default=None,
help="Maximum number of properties to include in results") help="Maximum number of properties to include in results")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], parser.add_argument("--log-level", type=str, default="INFO",
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)") help="Logging level (default: INFO)")
args = parser.parse_args() args = parser.parse_args()
# Configure logging
logging.basicConfig( logging.basicConfig(
level=getattr(logging, args.log_level), level=getattr(logging, args.log_level),
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s", format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
@@ -336,7 +260,7 @@ if __name__ == "__main__":
) )
start = time.time() start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) estates = scrape(max_properties=args.max_properties)
if estates: if estates:
json_path = Path("byty_psn.json") json_path = Path("byty_psn.json")
@@ -346,6 +270,6 @@ if __name__ == "__main__":
) )
elapsed = time.time() - start elapsed = time.time() - start
logger.info(f"\n✓ Data uložena: {json_path.resolve()}") logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s") logger.info(f"⏱ Celkový čas: {elapsed:.1f} s")
else: else:
logger.info("\nŽádné byty z PSN neodpovídají kritériím :(") logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")

View File

@@ -7,6 +7,7 @@ Výstup: byty_realingo.json
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from datetime import datetime
import json import json
import logging import logging
import math import math
@@ -314,6 +315,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": f"{BASE_URL}{item['url']}", "url": f"{BASE_URL}{item['url']}",
"source": "realingo", "source": "realingo",
"image": "", "image": "",
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
} }
results.append(result) results.append(result)
properties_fetched += 1 properties_fetched += 1

204
status.html Normal file
View File

@@ -0,0 +1,204 @@
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Scraper status</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: system-ui, -apple-system, sans-serif;
background: #f5f5f5; color: #333;
padding: 24px; max-width: 640px; margin: 0 auto;
}
h1 { font-size: 22px; margin-bottom: 4px; }
.subtitle { color: #888; font-size: 13px; margin-bottom: 24px; }
.card {
background: white; border-radius: 12px; padding: 20px;
box-shadow: 0 1px 4px rgba(0,0,0,0.08); margin-bottom: 16px;
}
/* Card heading (render() emits one <h2> per card) */
.card h2 { font-size: 15px; margin-bottom: 12px; color: #555; }
/* Last-scrape timestamp: large formatted date, with smaller grey lines below
   it for the relative age and the run duration (see render()) */
.timestamp {
font-size: 28px; font-weight: 700; color: #1976D2;
}
.timestamp-ago { font-size: 13px; color: #999; margin-top: 2px; }
/* Source table */
/* NOTE(review): render() in this page does not emit any .source-table markup;
   these rules look unused here - confirm before removing. */
.source-table { width: 100%; border-collapse: collapse; }
.source-table td { padding: 8px 0; border-bottom: 1px solid #f0f0f0; font-size: 14px; }
.source-table tr:last-child td { border-bottom: none; }
.source-table .name { font-weight: 600; }
.source-table .count { text-align: right; font-variant-numeric: tabular-nums; }
.source-table .rejected { text-align: right; color: #999; font-size: 12px; }
/* Per-source status badge. render() picks the variant: badge-err when the
   source reports an error, badge-skip when it accepted 0 listings, badge-ok
   otherwise. */
.badge {
display: inline-block; padding: 2px 8px; border-radius: 4px;
font-size: 11px; font-weight: 600; color: white;
}
.badge-ok { background: #4CAF50; }
.badge-err { background: #F44336; }
.badge-skip { background: #FF9800; }
/* Summary bar */
/* One label/value row per total in the summary card */
.summary-row {
display: flex; justify-content: space-between; align-items: center;
padding: 10px 0; border-bottom: 1px solid #f0f0f0;
}
.summary-row:last-child { border-bottom: none; }
.summary-label { font-size: 13px; color: #666; }
.summary-value { font-size: 18px; font-weight: 700; }
/* Source bar chart */
/* render() sets .bar-fill's width (percent of the largest source) and its
   background color inline. */
.bar-row { display: flex; align-items: center; gap: 8px; margin: 4px 0; }
/* NOTE(review): render() does not emit .bar-label - possibly unused; confirm. */
.bar-label { width: 90px; font-size: 12px; text-align: right; color: #666; }
.bar-track { flex: 1; height: 20px; background: #f0f0f0; border-radius: 4px; overflow: hidden; position: relative; }
.bar-fill { height: 100%; border-radius: 4px; transition: width 0.5s ease; }
.bar-count { font-size: 12px; width: 36px; font-variant-numeric: tabular-nums; }
/* Loader */
/* Centered spinner + caption shown on initial load and while a scrape runs */
.loader-wrap {
display: flex; flex-direction: column; align-items: center;
justify-content: center; padding: 60px 0;
}
/* Grey ring with a blue top edge; the spin animation turns it once per 0.8s */
.spinner {
width: 40px; height: 40px; border: 4px solid #e0e0e0;
border-top-color: #1976D2; border-radius: 50%;
animation: spin 0.8s linear infinite;
}
@keyframes spin { to { transform: rotate(360deg); } }
.loader-text { margin-top: 16px; color: #999; font-size: 14px; }
/* Message shown by loadStatus() when status.json cannot be fetched or parsed */
.error-msg { color: #F44336; padding: 40px 0; text-align: center; }
/* Footer link back to the map page */
.link-row { text-align: center; margin-top: 8px; }
.link-row a { color: #1976D2; text-decoration: none; font-size: 14px; }
</style>
</head>
<body>
<h1>Scraper status</h1>
<div class="subtitle">maru-hleda-byt</div>
<div id="content">
<div class="loader-wrap">
<div class="spinner"></div>
<div class="loader-text">Nacitam status...</div>
</div>
</div>
<div class="link-row"><a href="mapa_bytu.html">Otevrit mapu</a></div>
<script>
// Bar color for each scraper source, keyed by the lower-cased source name.
// render() falls back to grey for any source not listed here.
var COLORS = {
    'sreality':    '#1976D2',
    'realingo':    '#7B1FA2',
    'bezrealitky': '#E65100',
    'idnes':       '#C62828',
    'psn':         '#2E7D32',
    'cityhome':    '#00838F'
};
/**
 * Describe how long ago a timestamp was, as a short Czech phrase.
 *
 * @param {string} dateStr  Value parseable by `new Date()` (render() passes
 *                          the `timestamp` field of status.json).
 * @returns {string} 'prave ted' for anything under a minute (including
 *   timestamps in the future), then minutes, hours, or days ago. Returns ''
 *   when the value is missing or cannot be parsed, so the page never shows
 *   "NaN dni zpet".
 */
function timeAgo(dateStr) {
    // new Date(null) is the 1970 epoch, not an invalid date, so reject
    // null/undefined explicitly before parsing.
    if (dateStr === null || dateStr === undefined) return '';
    var then = new Date(dateStr).getTime();
    if (isNaN(then)) return '';

    var diff = Math.floor((Date.now() - then) / 1000);
    if (diff < 60) return 'prave ted';
    if (diff < 3600) return Math.floor(diff / 60) + ' min zpet';
    if (diff < 86400) return Math.floor(diff / 3600) + ' hod zpet';

    var days = Math.floor(diff / 86400);
    // Czech plural forms: 1 den, 2-4 dny, 5+ dni.
    var unit = days === 1 ? 'den' : (days < 5 ? 'dny' : 'dni');
    return days + ' ' + unit + ' zpet';
}
/**
 * Format a timestamp as a Czech long date in the browser's local time zone,
 * e.g. "18. unora 2026, 15:15".
 *
 * @param {string} dateStr  Value parseable by `new Date()`.
 * @returns {string} The formatted date, or '-' when the value is missing or
 *   cannot be parsed (instead of "NaN. undefined NaN, NaN:NaN").
 */
function formatDate(dateStr) {
    // new Date(null) would silently yield 1 Jan 1970, so treat null/undefined
    // as "no timestamp" before parsing.
    if (dateStr === null || dateStr === undefined) return '-';
    var d = new Date(dateStr);
    if (isNaN(d.getTime())) return '-';

    // Genitive month names (ASCII, no diacritics), indexed by Date#getMonth().
    var months = ['ledna', 'unora', 'brezna', 'dubna', 'kvetna', 'cervna',
                  'cervence', 'srpna', 'zari', 'rijna', 'listopadu', 'prosince'];
    var hh = String(d.getHours()).padStart(2, '0');
    var mm = String(d.getMinutes()).padStart(2, '0');
    return d.getDate() + '. ' + months[d.getMonth()] + ' ' + d.getFullYear() + ', ' + hh + ':' + mm;
}
// Escape a value for safe interpolation into an HTML string (element content
// or a quoted attribute value).
function escapeHtml(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}

/**
 * Render the parsed status.json into the #content element.
 *
 * Fields read from `data`: `status`, `timestamp`, `duration_sec`,
 * `deduplicated`, and `sources` (array of objects with `name`, `accepted`,
 * `rejected`, `error`).
 *
 * When `data.status === 'running'` only a spinner is shown and loadStatus()
 * is scheduled again in 30 s. Otherwise three cards are produced: last-scrape
 * timestamp, totals, and one bar per source scaled against the largest
 * accepted count.
 *
 * All values taken from the JSON are either coerced to numbers or
 * HTML-escaped before being placed into the markup, and missing fields fall
 * back to neutral defaults instead of throwing or printing NaN.
 *
 * @param {Object} data  Parsed contents of status.json.
 */
function render(data) {
    var content = document.getElementById('content');
    data = data || {};

    // Scrape in progress: no final numbers yet, so show the spinner and poll.
    if (data.status === 'running') {
        content.innerHTML =
            '<div class="loader-wrap">' +
            '<div class="spinner"></div>' +
            '<div class="loader-text">Scraper prave bezi...</div>' +
            '</div>';
        setTimeout(loadStatus, 30000);
        return;
    }

    // Normalize every source once so that missing or non-numeric fields
    // cannot turn into NaN widths, string-concatenated totals or a TypeError.
    var rawSources = Array.isArray(data.sources) ? data.sources : [];
    var sources = rawSources.map(function(s) {
        s = s || {};
        return {
            name: String(s.name === undefined || s.name === null ? '?' : s.name),
            accepted: Number(s.accepted) || 0,
            rejected: Number(s.rejected) || 0,
            error: s.error
        };
    });

    var totalOk = 0, totalRej = 0, maxCount = 0;
    sources.forEach(function(s) {
        totalOk += s.accepted;
        totalRej += s.rejected;
        if (s.accepted > maxCount) maxCount = s.accepted;
    });

    var html = '';

    // Timestamp card
    html += '<div class="card">';
    html += '<h2>Posledni scrape</h2>';
    html += '<div class="timestamp">' + escapeHtml(formatDate(data.timestamp)) + '</div>';
    html += '<div class="timestamp-ago">' + escapeHtml(timeAgo(data.timestamp)) + '</div>';
    if (data.duration_sec) {
        html += '<div class="timestamp-ago">Trvani: ' + escapeHtml(Math.round(data.duration_sec)) + 's</div>';
    }
    html += '</div>';

    // Summary card
    html += '<div class="card">';
    html += '<h2>Souhrn</h2>';
    html += '<div class="summary-row"><span class="summary-label">Vyhovujicich bytu</span><span class="summary-value" style="color:#4CAF50">' + totalOk + '</span></div>';
    html += '<div class="summary-row"><span class="summary-label">Vyloucenych</span><span class="summary-value" style="color:#999">' + totalRej + '</span></div>';
    if (data.deduplicated !== undefined) {
        html += '<div class="summary-row"><span class="summary-label">Po deduplikaci (v mape)</span><span class="summary-value" style="color:#1976D2">' + escapeHtml(data.deduplicated) + '</span></div>';
    }
    html += '</div>';

    // Sources card
    html += '<div class="card">';
    html += '<h2>Zdroje</h2>';
    sources.forEach(function(s) {
        // Own-property check so names like "constructor" cannot pick up
        // inherited Object.prototype members as a "color".
        var key = s.name.toLowerCase();
        var color = Object.prototype.hasOwnProperty.call(COLORS, key) ? COLORS[key] : '#999';
        var pct = maxCount > 0 ? Math.round((s.accepted / maxCount) * 100) : 0;
        var badge = s.error
            ? '<span class="badge badge-err">chyba</span>'
            : (s.accepted === 0 ? '<span class="badge badge-skip">0</span>' : '<span class="badge badge-ok">OK</span>');
        html += '<div style="margin-bottom:12px;">';
        html += '<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">';
        html += '<span style="font-weight:600;font-size:14px;">' + escapeHtml(s.name) + ' ' + badge + '</span>';
        html += '<span style="font-size:12px;color:#999;">' + s.rejected + ' vyloucenych</span>';
        html += '</div>';
        html += '<div class="bar-row">';
        html += '<div class="bar-track"><div class="bar-fill" style="width:' + pct + '%;background:' + color + ';"></div></div>';
        html += '<span class="bar-count">' + s.accepted + '</span>';
        html += '</div>';
        html += '</div>';
    });
    html += '</div>';

    content.innerHTML = html;
}
/**
 * Fetch status.json (cache-busted with the current time) and hand the parsed
 * object to render(). On any failure - network error, non-2xx response,
 * invalid JSON, or an exception thrown by render() - replace #content with an
 * error message that includes the error text.
 *
 * The error text is inserted via textContent rather than innerHTML: a JSON
 * parse error message may quote the start of the response body (for example
 * an HTML error page), which must not be interpreted as markup.
 */
function loadStatus() {
    fetch('status.json?t=' + Date.now())
        .then(function(r) {
            if (!r.ok) throw new Error(r.status);
            return r.json();
        })
        .then(render)
        .catch(function(err) {
            var detail = document.createElement('small');
            detail.textContent = '(' + (err && err.message !== undefined ? err.message : err) + ')';

            var box = document.createElement('div');
            box.className = 'error-msg';
            box.appendChild(document.createTextNode('Status zatim neni k dispozici.'));
            box.appendChild(document.createElement('br'));
            box.appendChild(detail);

            var content = document.getElementById('content');
            content.innerHTML = '';
            content.appendChild(box);
        });
}
// Initial fetch when the page loads. Further polling is scheduled by render()
// only while status.json reports a scrape in progress.
loadStatus();
</script>
</body>
</html>