From b8d4d44164411c804bf03c0f134112f34b399602 Mon Sep 17 00:00:00 2001 From: Marie Michalova Date: Wed, 18 Feb 2026 15:15:25 +0100 Subject: [PATCH] =?UTF-8?q?Rewrite=20PSN=20+=20CityHome=20scrapers,=20add?= =?UTF-8?q?=20price/m=C2=B2=20map=20coloring,=20ratings=20system,=20and=20?= =?UTF-8?q?status=20dashboard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing) - Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries - Color map markers by price/m² instead of disposition (blue→green→orange→red scale) - Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback - Rejected markers show original color at reduced opacity with 🚫 SVG overlay - Favorite markers shown as ⭐ star icons with gold pulse animation - Add "new today" marker logic (scraped_at == today) with larger pulsing green outline - Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons - Add generate_status.py for scraper run statistics and status.html dashboard - Add scraped_at field to all scrapers for freshness tracking - Update run_all.sh with log capture and status generation Co-Authored-By: Claude Opus 4.6 --- build/Dockerfile | 4 +- build/entrypoint.sh | 5 +- generate_status.py | 202 +++++++++ mapa_bytu.html | 977 ++++++++++++++++++++++++++++++++++++++---- ratings_server.py | 116 +++++ run_all.sh | 14 + scrape_and_map.py | 295 +++++++++++-- scrape_bezrealitky.py | 2 + scrape_cityhome.py | 156 ++++--- scrape_idnes.py | 2 + scrape_psn.py | 338 ++++++--------- scrape_realingo.py | 2 + status.html | 204 +++++++++ 13 files changed, 1922 insertions(+), 395 deletions(-) create mode 100644 generate_status.py create mode 100644 ratings_server.py create mode 100644 status.html diff --git a/build/Dockerfile b/build/Dockerfile index f672cee..a9bc15f 100644 --- a/build/Dockerfile 
+++ b/build/Dockerfile @@ -10,7 +10,7 @@ WORKDIR /app COPY scrape_and_map.py scrape_realingo.py scrape_bezrealitky.py \ scrape_idnes.py scrape_psn.py scrape_cityhome.py \ - merge_and_map.py regen_map.py run_all.sh ./ + merge_and_map.py regen_map.py run_all.sh ratings_server.py ./ COPY build/crontab /etc/crontabs/root COPY build/entrypoint.sh /entrypoint.sh @@ -18,7 +18,7 @@ RUN chmod +x /entrypoint.sh run_all.sh RUN mkdir -p /app/data -EXPOSE 8080 +EXPOSE 8080 8081 HEALTHCHECK --interval=60s --timeout=5s --start-period=300s \ CMD wget -q -O /dev/null http://localhost:8080/ || exit 1 diff --git a/build/entrypoint.sh b/build/entrypoint.sh index 032afe5..da84e83 100644 --- a/build/entrypoint.sh +++ b/build/entrypoint.sh @@ -6,7 +6,7 @@ DATA_DIR="/app/data" # Create symlinks so scripts (which write to /app/) persist data to the volume for f in byty_sreality.json byty_realingo.json byty_bezrealitky.json \ byty_idnes.json byty_psn.json byty_cityhome.json byty_merged.json \ - mapa_bytu.html; do + mapa_bytu.html ratings.json; do # Remove real file if it exists (e.g. baked into image) [ -f "/app/$f" ] && [ ! -L "/app/$f" ] && rm -f "/app/$f" ln -sf "$DATA_DIR/$f" "/app/$f" @@ -18,5 +18,8 @@ crond -b -l 2 echo "[entrypoint] Starting initial scrape in background..." bash /app/run_all.sh & +echo "[entrypoint] Starting ratings API server on port 8081..." +DATA_DIR="$DATA_DIR" python3 /app/ratings_server.py & + echo "[entrypoint] Starting HTTP server on port 8080..." 
exec python3 -m http.server 8080 --directory "$DATA_DIR" diff --git a/generate_status.py b/generate_status.py new file mode 100644 index 0000000..8eaf1f5 --- /dev/null +++ b/generate_status.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +"""Generate status.json from scraper JSON outputs and run log.""" +from __future__ import annotations + +import json +import os +import re +import sys +from datetime import datetime +from pathlib import Path +from typing import Optional + +HERE = Path(__file__).parent + +SOURCE_FILES = { + "Sreality": "byty_sreality.json", + "Realingo": "byty_realingo.json", + "Bezrealitky": "byty_bezrealitky.json", + "iDNES": "byty_idnes.json", + "PSN": "byty_psn.json", + "CityHome": "byty_cityhome.json", +} + +MERGED_FILE = "byty_merged.json" + + +def count_source(path: Path) -> dict: + """Read a scraper JSON and return accepted count + file mtime.""" + if not path.exists(): + return {"accepted": 0, "error": "soubor nenalezen"} + try: + data = json.loads(path.read_text(encoding="utf-8")) + mtime = datetime.fromtimestamp(path.stat().st_mtime).isoformat(timespec="seconds") + return {"accepted": len(data), "updated_at": mtime} + except Exception as e: + return {"accepted": 0, "error": str(e)} + + +def parse_log(log_path: str) -> dict[str, dict]: + """Parse scraper run log and extract per-source statistics. + + Scrapers log summary lines like: + ✓ Vyhovující byty: 12 + Vyloučeno (prodáno): 5 + Staženo stránek: 3 + Staženo inzerátů: 48 + Celkem bytů v cache: 120 + and section headers like: + [2/6] Realingo + """ + if not log_path or not os.path.exists(log_path): + return {} + + with open(log_path, encoding="utf-8") as f: + content = f.read() + + # Split into per-source sections by the [N/6] Step header + # Each section header looks like "[2/6] Realingo\n----..." 
+ section_pattern = re.compile(r'\[(\d+)/\d+\]\s+(.+)\n-+', re.MULTILINE) + sections_found = list(section_pattern.finditer(content)) + + if not sections_found: + return {} + + stats = {} + for i, match in enumerate(sections_found): + step_name = match.group(2).strip() + start = match.end() + end = sections_found[i + 1].start() if i + 1 < len(sections_found) else len(content) + section_text = content[start:end] + + # Identify which sources this section covers + # "PSN + CityHome" covers both + source_names = [] + for name in SOURCE_FILES: + if name.lower() in step_name.lower(): + source_names.append(name) + if not source_names: + continue + + # Parse numeric summary lines + def extract(pattern: str) -> Optional[int]: + m = re.search(pattern, section_text) + return int(m.group(1)) if m else None + + # Lines present in all/most scrapers + accepted = extract(r'Vyhovující byty[:\s]+(\d+)') + fetched = extract(r'Staženo inzerátů[:\s]+(\d+)') + pages = extract(r'Staženo stránek[:\s]+(\d+)') + cached = extract(r'Celkem bytů v cache[:\s]+(\d+)') + cache_hits = extract(r'Cache hit[:\s]+(\d+)') + + # Rejection reasons — collect all into a dict + excluded = {} + for m in re.finditer(r'Vyloučeno\s+\(([^)]+)\)[:\s]+(\d+)', section_text): + excluded[m.group(1)] = int(m.group(2)) + # Also PSN-style "Vyloučeno (prodáno): N" + total_excluded = sum(excluded.values()) if excluded else extract(r'Vyloučen\w*[:\s]+(\d+)') + + entry = {} + if accepted is not None: + entry["accepted"] = accepted + if fetched is not None: + entry["fetched"] = fetched + if pages is not None: + entry["pages"] = pages + if cached is not None: + entry["cached"] = cached + if cache_hits is not None: + entry["cache_hits"] = cache_hits + if excluded: + entry["excluded"] = excluded + elif total_excluded is not None: + entry["excluded_total"] = total_excluded + + for name in source_names: + stats[name] = entry + + return stats + + +def main(): + start_time = None + duration_sec = None + + if len(sys.argv) >= 3: + 
start_time = sys.argv[1] + try: + duration_sec = int(sys.argv[2]) + except ValueError: + pass + + if not start_time: + start_time = datetime.now().isoformat(timespec="seconds") + + log_path = sys.argv[3] if len(sys.argv) >= 4 else None + log_stats = parse_log(log_path) + + sources = [] + for name, filename in SOURCE_FILES.items(): + path = HERE / filename + info = count_source(path) + info["name"] = name + + # Merge log stats + ls = log_stats.get(name, {}) + for k in ("fetched", "pages", "cached", "cache_hits", "excluded", "excluded_total"): + if k in ls: + info[k] = ls[k] + # Override accepted from log if available (log is authoritative for latest run) + if "accepted" in ls: + info["accepted"] = ls["accepted"] + + sources.append(info) + + # Total accepted before dedup + total_accepted = sum(s.get("accepted", 0) for s in sources) + + # Merged / deduplicated count + merged_path = HERE / MERGED_FILE + deduplicated = 0 + if merged_path.exists(): + try: + merged = json.loads(merged_path.read_text(encoding="utf-8")) + deduplicated = len(merged) + except Exception: + pass + + duplicates_removed = total_accepted - deduplicated if deduplicated else 0 + + status = { + "status": "done", + "timestamp": start_time, + "duration_sec": duration_sec, + "total_accepted": total_accepted, + "deduplicated": deduplicated, + "duplicates_removed": duplicates_removed, + "sources": sources, + } + + out = HERE / "status.json" + out.write_text(json.dumps(status, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"Status uložen: {out}") + print(f" Celkem bytů (před dedup): {total_accepted}") + print(f" Po deduplikaci: {deduplicated}") + if duplicates_removed: + print(f" Odstraněno duplikátů: {duplicates_removed}") + for s in sources: + acc = s.get("accepted", 0) + err = s.get("error", "") + exc = s.get("excluded", {}) + exc_total = sum(exc.values()) if exc else s.get("excluded_total", 0) + parts = [f"{s['name']:12s}: {acc} bytů"] + if exc_total: + parts.append(f"({exc_total} 
vyloučeno)") + if err: + parts.append(f"[CHYBA: {err}]") + print(" " + " ".join(parts)) + + +if __name__ == "__main__": + main() diff --git a/mapa_bytu.html b/mapa_bytu.html index 2bb58eb..55b4bec 100644 --- a/mapa_bytu.html +++ b/mapa_bytu.html @@ -3,7 +3,7 @@ -Byty v Praze — mapa (62 bytů) +Byty v Praze — mapa (713 bytů)
-
+ +
+

Byty v Praze

-
Celkem: 62 bytů
-
Cena: 1 000 000 Kč — 13 500 000 Kč
-
Průměr: 11 130 515 Kč
+
Celkem: 713 bytů
+
Cena: 380 000 Kč — 13 994 000 Kč
+
Průměr: 10 387 776 Kč
-
Dispozice:
-
3+kk (37)
3+1 (13)
4+kk (1)
4+1 (1)
+
Cena / m²:
< 110 000 Kč/m²
110 – 130 000 Kč/m²
130 – 150 000 Kč/m²
150 – 165 000 Kč/m²
> 165 000 Kč/m²
cena/plocha neuvedena
Nové (z dnešního scrapu) — větší
3+kk (363), 3+1 (94), 4+kk (53), 4+1 (11), 5+kk (4)
PSN / CityHome (3)
Filtry:
@@ -80,6 +118,7 @@ Skrýt zamítnuté
+
\ No newline at end of file diff --git a/ratings_server.py b/ratings_server.py new file mode 100644 index 0000000..6d53a17 --- /dev/null +++ b/ratings_server.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +""" +Minimal HTTP API server for persisting apartment ratings. + +GET /api/ratings → returns ratings.json contents +POST /api/ratings → saves entire ratings object +GET /api/ratings/export → same as GET, but with download header + +Ratings file: /app/data/ratings.json (or ./ratings.json locally) +""" + +import json +import logging +import os +import sys +from http.server import BaseHTTPRequestHandler, HTTPServer +from pathlib import Path + +PORT = int(os.environ.get("RATINGS_PORT", 8081)) +DATA_DIR = Path(os.environ.get("DATA_DIR", ".")) +RATINGS_FILE = DATA_DIR / "ratings.json" + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [ratings] %(levelname)s %(message)s", + datefmt="%Y-%m-%dT%H:%M:%S", +) +log = logging.getLogger(__name__) + + +def load_ratings() -> dict: + try: + if RATINGS_FILE.exists(): + return json.loads(RATINGS_FILE.read_text(encoding="utf-8")) + except Exception as e: + log.error("Failed to load ratings: %s", e) + return {} + + +def save_ratings(data: dict) -> None: + RATINGS_FILE.write_text( + json.dumps(data, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + +class RatingsHandler(BaseHTTPRequestHandler): + def log_message(self, format, *args): + # Suppress default HTTP access log (we use our own) + pass + + def _send_json(self, status: int, body: dict, extra_headers=None): + payload = json.dumps(body, ensure_ascii=False).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json; charset=utf-8") + self.send_header("Content-Length", str(len(payload))) + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + if extra_headers: + for k, v in 
extra_headers.items(): + self.send_header(k, v) + self.end_headers() + self.wfile.write(payload) + + def do_OPTIONS(self): + # CORS preflight + self.send_response(204) + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + self.send_header("Access-Control-Allow-Headers", "Content-Type") + self.end_headers() + + def do_GET(self): + if self.path in ("/api/ratings", "/api/ratings/export"): + ratings = load_ratings() + extra = None + if self.path == "/api/ratings/export": + extra = {"Content-Disposition": 'attachment; filename="ratings.json"'} + log.info("GET %s → %d ratings", self.path, len(ratings)) + self._send_json(200, ratings, extra) + else: + self._send_json(404, {"error": "not found"}) + + def do_POST(self): + if self.path == "/api/ratings": + length = int(self.headers.get("Content-Length", 0)) + if length == 0: + self._send_json(400, {"error": "empty body"}) + return + try: + raw = self.rfile.read(length) + data = json.loads(raw.decode("utf-8")) + except Exception as e: + log.warning("Bad request body: %s", e) + self._send_json(400, {"error": "invalid JSON"}) + return + if not isinstance(data, dict): + self._send_json(400, {"error": "expected JSON object"}) + return + save_ratings(data) + log.info("POST /api/ratings → saved %d ratings", len(data)) + self._send_json(200, {"ok": True, "count": len(data)}) + else: + self._send_json(404, {"error": "not found"}) + + +if __name__ == "__main__": + log.info("Ratings server starting on port %d, data dir: %s", PORT, DATA_DIR) + log.info("Ratings file: %s", RATINGS_FILE) + server = HTTPServer(("0.0.0.0", PORT), RatingsHandler) + try: + server.serve_forever() + except KeyboardInterrupt: + log.info("Stopped.") + sys.exit(0) diff --git a/run_all.sh b/run_all.sh index 1e347ec..7b7fa45 100755 --- a/run_all.sh +++ b/run_all.sh @@ -16,6 +16,12 @@ NC='\033[0m' TOTAL=6 CURRENT=0 FAILED=0 +START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S") +START_EPOCH=$(date +%s) 
+LOG_FILE="$(pwd)/scrape_run.log" + +# Mark status as running +echo '{"status":"running"}' > status.json show_help() { echo "Usage: ./run_all.sh [OPTIONS]" @@ -63,6 +69,8 @@ step() { } # ── Scrapery (paralelně kde to jde) ───────────────────────── +# Tee all output to log file for status generation +exec > >(tee -a "$LOG_FILE") 2>&1 step "Sreality" python3 scrape_and_map.py $SCRAPER_ARGS || { echo -e "${RED}✗ Sreality selhalo${NC}"; FAILED=$((FAILED + 1)); } @@ -91,6 +99,12 @@ python3 merge_and_map.py || { echo -e "${RED}✗ Merge selhal${NC}"; FAILED=$((F # ── Otevření mapy ──────────────────────────────────────────── +# ── Generování statusu ───────────────────────────────────── + +END_EPOCH=$(date +%s) +DURATION=$((END_EPOCH - START_EPOCH)) +python3 generate_status.py "$START_TIME" "$DURATION" "$LOG_FILE" + echo "" echo "============================================================" if [ $FAILED -eq 0 ]; then diff --git a/scrape_and_map.py b/scrape_and_map.py index 6436a19..a983d56 100644 --- a/scrape_and_map.py +++ b/scrape_and_map.py @@ -347,6 +347,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): "ownership": ownership, "url": sreality_url(hash_id, seo), "image": (estate.get("_links", {}).get("images", [{}])[0].get("href", "") if estate.get("_links", {}).get("images") else ""), + "scraped_at": datetime.now().strftime("%Y-%m-%d"), } results.append(result) details_fetched += 1 @@ -373,20 +374,58 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): """Generate an interactive Leaflet.js HTML map.""" - # Color by disposition - color_map = { - "3+kk": "#2196F3", # blue - "3+1": "#4CAF50", # green - "4+kk": "#FF9800", # orange - "4+1": "#F44336", # red - "5+kk": "#9C27B0", # purple - "5+1": "#795548", # brown - "6+": "#607D8B", # grey-blue - } + # Color by price per m² — cool blue→warm red scale, no yellow + # Thresholds based on Prague 
market distribution (p25=120k, p50=144k, p75=162k) + price_color_scale = [ + (110_000, "#1565C0"), # < 110k/m² → deep blue (levné) + (130_000, "#42A5F5"), # 110–130k → light blue + (150_000, "#66BB6A"), # 130–150k → green (střed) + (165_000, "#EF6C00"), # 150–165k → dark orange + (float("inf"), "#C62828"), # > 165k → dark red (drahé) + ] + + def price_color(estate: dict) -> str: + price = estate.get("price") or 0 + area = estate.get("area") or 0 + if not area: + return "#9E9E9E" + ppm2 = price / area + for threshold, color in price_color_scale: + if ppm2 < threshold: + return color + return "#E53935" + + # Legend bands for info panel (built once) + price_legend_items = ( + '
Cena / m²:
' + ) + bands = [ + ("#1565C0", "< 110 000 Kč/m²"), + ("#42A5F5", "110 – 130 000 Kč/m²"), + ("#66BB6A", "130 – 150 000 Kč/m²"), + ("#EF6C00", "150 – 165 000 Kč/m²"), + ("#C62828", "> 165 000 Kč/m²"), + ("#9E9E9E", "cena/plocha neuvedena"), + ] + for bcolor, blabel in bands: + price_legend_items += ( + f'
' + f'' + f'{blabel}
' + ) + # New marker indicator — bigger dot, no extra border + price_legend_items += ( + '
' + '' + 'Nové (z dnešního scrapu) — větší
' + ) markers_js = "" for e in estates: - color = color_map.get(e["disposition"], "#999999") + color = price_color(e) floor_text = f'{e["floor"]}. NP' if e["floor"] else "neuvedeno" area_text = f'{e["area"]} m²' if e["area"] else "neuvedeno" building_text = e["building_type"] or "neuvedeno" @@ -405,11 +444,19 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): hash_id = e.get("hash_id", "") + scraped_at = e.get("scraped_at", "") + is_new = scraped_at == datetime.now().strftime("%Y-%m-%d") + + new_badge = ( + 'NOVÉ' + if is_new else "" + ) popup = ( f'
' f'{format_price(e["price"])}' f'{source_label}
' + f'padding:1px 6px;border-radius:3px;">{source_label}{new_badge}
' f'{e["disposition"]} | {area_text} | {floor_text}' f'{floor_note}

' f'{e["locality"]}
' @@ -438,27 +485,33 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): popup = popup.replace("'", "\\'").replace("\n", "") is_fav = source in ("psn", "cityhome") - marker_fn = "addHeartMarker" if is_fav else "addMarker" + + if is_fav: + marker_fn = "addHeartMarker" + elif is_new: + marker_fn = "addNewMarker" + else: + marker_fn = "addMarker" markers_js += ( f" {marker_fn}({e['lat']}, {e['lon']}, '{color}', '{popup}', '{hash_id}');\n" ) - # Build legend - legend_items = "" + # Build legend — price per m² bands + disposition counts + legend_items = price_legend_items + + # Disposition counts below the color legend disp_counts = {} for e in estates: d = e["disposition"] disp_counts[d] = disp_counts.get(d, 0) + 1 - for disp, color in color_map.items(): - count = disp_counts.get(disp, 0) - if count > 0: - legend_items += ( - f'
' - f'' - f'{disp} ({count})
' - ) + disp_order = ["3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+"] + disp_summary = ", ".join( + f"{d} ({disp_counts[d]})" for d in disp_order if d in disp_counts + ) + legend_items += ( + f'
{disp_summary}
' + ) # Heart marker legend for PSN/CityHome fav_count = sum(1 for e in estates if e.get("source") in ("psn", "cityhome")) @@ -493,6 +546,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): body {{ font-family: system-ui, -apple-system, sans-serif; }} #map {{ width: 100%; height: 100vh; }} .heart-icon {{ background: none !important; border: none !important; }} + .star-icon {{ background: none !important; border: none !important; }} .rate-btn:hover {{ background: #f0f0f0 !important; }} .rate-btn.active-fav {{ background: #FFF9C4 !important; border-color: #FFC107 !important; }} .rate-btn.active-rej {{ background: #FFEBEE !important; border-color: #F44336 !important; }} @@ -503,13 +557,42 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): }} .marker-favorite {{ animation: pulse-glow 2s ease-in-out infinite; border-radius: 50%; }} .heart-icon-fav svg path {{ stroke: gold !important; stroke-width: 2.5 !important; filter: drop-shadow(0 0 4px rgba(255,193,7,0.7)); }} - .heart-icon-rej {{ opacity: 0.2 !important; }} + .heart-icon-rej {{ opacity: 0.4 !important; filter: grayscale(1); }} + .reject-overlay {{ background: none !important; border: none !important; pointer-events: none !important; }} + @keyframes pulse-new {{ + 0% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }} + 50% {{ stroke-opacity: 0.4; stroke-width: 6px; r: 12; }} + 100% {{ stroke-opacity: 1; stroke-width: 3px; r: 11; }} + }} + .marker-new {{ animation: pulse-new 2s ease-in-out infinite; }} .info-panel {{ position: absolute; top: 10px; right: 10px; z-index: 1000; background: white; padding: 16px; border-radius: 10px; box-shadow: 0 2px 12px rgba(0,0,0,0.15); max-width: 260px; font-size: 13px; line-height: 1.5; + transition: transform 0.3s ease, opacity 0.3s ease; }} + .info-panel.collapsed {{ + transform: translateX(calc(100% + 20px)); + opacity: 0; pointer-events: none; + }} + .panel-open-btn {{ + position: absolute; top: 10px; right: 10px; 
z-index: 1001; + width: 40px; height: 40px; border-radius: 8px; + background: white; border: none; cursor: pointer; + box-shadow: 0 2px 12px rgba(0,0,0,0.15); + font-size: 20px; display: flex; align-items: center; justify-content: center; + transition: opacity 0.3s ease; + }} + .panel-open-btn.hidden {{ opacity: 0; pointer-events: none; }} + .panel-close-btn {{ + position: absolute; top: 8px; right: 8px; + width: 28px; height: 28px; border-radius: 6px; + background: none; border: 1px solid #ddd; cursor: pointer; + font-size: 16px; display: flex; align-items: center; justify-content: center; + color: #888; + }} + .panel-close-btn:hover {{ background: #f0f0f0; color: #333; }} .info-panel h2 {{ font-size: 16px; margin-bottom: 8px; }} .info-panel .stats {{ color: #666; margin-bottom: 10px; padding-bottom: 10px; border-bottom: 1px solid #eee; }} .filter-section {{ margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; }} @@ -517,18 +600,26 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): .filter-section input[type="checkbox"] {{ accent-color: #1976D2; }} #floor-filter {{ margin-top: 8px; }} #floor-filter select {{ width: 100%; padding: 4px; border-radius: 4px; border: 1px solid #ccc; }} + .status-link {{ display: block; margin-top: 10px; padding-top: 10px; border-top: 1px solid #eee; text-align: center; }} + .status-link a {{ color: #1976D2; text-decoration: none; font-size: 12px; }} + @media (max-width: 600px) {{ + .info-panel {{ max-width: calc(100vw - 60px); right: 10px; }} + .info-panel.collapsed {{ transform: translateX(calc(100% + 20px)); }} + .panel-close-btn {{ top: 6px; right: 6px; }} + }}
-
+ +
+

Byty v Praze

Celkem: {len(estates)} bytů
Cena: {min_price} — {max_price}
Průměr: {avg_price}
-
Dispozice:
{legend_items}
Filtry: @@ -562,6 +653,7 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): Skrýt zamítnuté
+
""" diff --git a/scrape_bezrealitky.py b/scrape_bezrealitky.py index 4de8857..88a5322 100644 --- a/scrape_bezrealitky.py +++ b/scrape_bezrealitky.py @@ -7,6 +7,7 @@ Výstup: byty_bezrealitky.json from __future__ import annotations import argparse +from datetime import datetime import json import logging import math @@ -355,6 +356,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}", "source": "bezrealitky", "image": "", + "scraped_at": datetime.now().strftime("%Y-%m-%d"), } results.append(result) properties_fetched += 1 diff --git a/scrape_cityhome.py b/scrape_cityhome.py index 6769654..4e330f4 100644 --- a/scrape_cityhome.py +++ b/scrape_cityhome.py @@ -12,6 +12,7 @@ import logging import re import time import urllib.request +from datetime import datetime from pathlib import Path logger = logging.getLogger(__name__) @@ -33,24 +34,26 @@ HEADERS = { BASE_URL = "https://www.city-home.cz" -def fetch_url(url: str) -> str: - """Fetch URL and return HTML string.""" - for attempt in range(3): +def fetch_url(url: str, retries: int = 3) -> str: + """Fetch URL and return HTML string. Raises HTTPError on 4xx/5xx.""" + for attempt in range(retries): try: - logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}") - logger.debug(f"Headers: {HEADERS}") + logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}") req = urllib.request.Request(url, headers=HEADERS) resp = urllib.request.urlopen(req, timeout=30) html = resp.read().decode("utf-8") logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") return html + except urllib.error.HTTPError: + # Don't retry on HTTP errors (404, 403, etc.) 
— re-raise immediately + raise except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e: - if attempt < 2: + if attempt < retries - 1: wait = (attempt + 1) * 2 - logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}") + logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}") time.sleep(wait) else: - logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True) + logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True) raise @@ -124,31 +127,21 @@ def parse_filter_page(html: str) -> list[dict]: if detail_url and not detail_url.startswith("http"): detail_url = BASE_URL + detail_url - # Extract floor from cells — look for pattern like "3.NP" or "2.PP" + # Parse table cells: [unit_name, unit_type_label, address, floor, disposition, area, transaction, price] cells = re.findall(r']*>(.*?)', row_content, re.DOTALL) - floor = None - floor_text = "" - project_name = "" + cell_texts = [re.sub(r'<[^>]+>', '', c).strip() for c in cells] - for cell in cells: - cell_text = re.sub(r'<[^>]+>', '', cell).strip() - # Floor pattern - np_match = re.search(r'(\d+)\.\s*NP', cell_text) - pp_match = re.search(r'(\d+)\.\s*PP', cell_text) + # Cell[2] = address (e.g. "Žateckých 14"), cell[3] = floor (e.g. 
"3.NP") + project_address = cell_texts[2] if len(cell_texts) > 2 else "" + + floor = None + if len(cell_texts) > 3: + np_match = re.search(r'(\d+)\.\s*NP', cell_texts[3]) + pp_match = re.search(r'(\d+)\.\s*PP', cell_texts[3]) if np_match: floor = int(np_match.group(1)) - floor_text = cell_text elif pp_match: - floor = -int(pp_match.group(1)) # Underground - floor_text = cell_text - - # Extract project name — usually in a cell that's not a number/price/floor - for cell in cells: - cell_text = re.sub(r'<[^>]+>', '', cell).strip() - if cell_text and not re.match(r'^[\d\s.,]+$', cell_text) and "NP" not in cell_text and "PP" not in cell_text and "m²" not in cell_text and "Kč" not in cell_text and "EUR" not in cell_text and "CZK" not in cell_text: - if len(cell_text) > 3 and cell_text != unit_name: - project_name = cell_text - break + floor = -int(pp_match.group(1)) listing = { "price": int(cena.group(1)), @@ -158,27 +151,55 @@ def parse_filter_page(html: str) -> list[dict]: "project_id": project.group(1) if project else "", "transaction": transaction.group(1) if transaction else "", "disposition": dispozition.group(1) if dispozition else "", - "location": location.group(1) if location else "", "url": detail_url, "unit_name": unit_name, "floor": floor, - "project_name": project_name, + "project_address": project_address, } listings.append(listing) return listings -def extract_project_gps(html: str) -> dict[str, tuple[float, float]]: - """Extract GPS coordinates for projects from locality pages.""" - # Pattern in JS: ['

Project Name

...', 'LAT', 'LON', '1', 'Name'] - gps_data = {} - for match in re.finditer(r"\['[^']*

([^<]+)

[^']*',\s*'([\d.]+)',\s*'([\d.]+)'", html): - name = match.group(1).strip() - lat = float(match.group(2)) - lon = float(match.group(3)) - gps_data[name] = (lat, lon) - return gps_data +def get_lokalita_urls(slug: str) -> list[str]: + """Return candidate lokalita URLs to try in order.""" + return [ + f"{BASE_URL}/projekty/{slug}/lokalita", + f"{BASE_URL}/bytove-domy/{slug}/lokalita", + f"{BASE_URL}/bytove-domy/{slug}/lokalita1", + ] + + +def extract_project_gps(html: str) -> tuple[float, float] | None: + """Extract project GPS from lokalita page JS variable. + + The page contains: var locations = [['

Name

...', 'LAT', 'LNG', 'CATEGORY', 'Label'], ...] + Category '1' = the project's own marker. Some projects have two cat-1 entries (data error); + in that case we pick the one whose name contains a digit and is not a transit landmark. + """ + block = re.search(r'var locations\s*=\s*\[(.*?)\];', html, re.DOTALL) + if not block: + return None + + entries = re.findall( + r"'

(.*?)

.*?',\s*'([\d.]+)',\s*'([\d.]+)',\s*'1'", + block.group(0), + re.DOTALL, + ) + if not entries: + return None + + if len(entries) == 1: + return float(entries[0][1]), float(entries[0][2]) + + # Multiple cat-1 entries: pick the real project marker + transit_re = re.compile(r'nádraží|park|metro|tramvaj|autobus|zastávka', re.IGNORECASE) + for name, lat, lng in entries: + if re.search(r'\d', name) and not transit_re.search(name): + return float(lat), float(lng) + + # Fallback: first entry + return float(entries[0][1]), float(entries[0][2]) def scrape(max_pages: int | None = None, max_properties: int | None = None): @@ -210,22 +231,24 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): # Fetch GPS for each project from locality pages project_gps = {} for slug in sorted(project_slugs): - time.sleep(0.5) - try: - locality_url = f"{BASE_URL}/projekty/{slug}/lokalita" - logger.debug(f"Fetching project GPS: {locality_url}") - loc_html = fetch_url(locality_url) - gps = extract_project_gps(loc_html) - if gps: - # Take first entry (the project itself) - first_name, (lat, lon) = next(iter(gps.items())) - project_gps[slug] = (lat, lon) - logger.info(f"✓ {slug}: {lat}, {lon}") - else: - logger.info(f"✗ {slug}: GPS nenalezeno") - except Exception as e: - logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True) - logger.info(f"✗ {slug}: chyba ({e})") + time.sleep(0.3) + gps = None + for url in get_lokalita_urls(slug): + try: + logger.debug(f"Fetching project GPS: {url}") + loc_html = fetch_url(url) + gps = extract_project_gps(loc_html) + if gps: + break + except Exception as e: + logger.debug(f"GPS fetch failed for {url}: {e}") + continue + + if gps: + project_gps[slug] = gps + logger.info(f"✓ {slug}: {gps[0]}, {gps[1]}") + else: + logger.info(f"✗ {slug}: GPS nenalezeno") # Step 3: Filter listings logger.info(f"\nFáze 3: Filtrování...") @@ -303,22 +326,37 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): lat, lon = 
gps + # locality: use project address from cell (e.g. "Žateckých 14") + city from GPS lookup + project_address = listing.get("project_address", "") + # derive city from slug (GPS lookup key) + city_map = { + "karlinske-namesti-5": "Praha 8", + "melnicka-12": "Praha 7", + "na-vaclavce-34": "Praha 5", + "nad-kajetankou-12": "Praha 6", + "vosmikovych-3": "Praha 9", + "zateckych-14": "Praha 2", + } + city_str = city_map.get(slug, "Praha") + locality_str = f"{project_address}, {city_str}" if project_address else city_str + result = { "hash_id": f"cityhome_{slug}_{listing['unit_name']}", - "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}", + "name": f"Prodej bytu {disp}, {int(area)} m² — {project_address}", "price": price, "price_formatted": format_price(price), - "locality": f"{listing['project_name']}, Praha", + "locality": locality_str, "lat": lat, "lon": lon, "disposition": disp, "floor": floor, - "area": area, + "area": float(area), "building_type": "Cihlová", # CityHome renovuje cihlové domy "ownership": "neuvedeno", "url": url, "source": "cityhome", "image": "", + "scraped_at": datetime.now().strftime("%Y-%m-%d"), } results.append(result) properties_fetched += 1 diff --git a/scrape_idnes.py b/scrape_idnes.py index 78388cf..04f31c5 100644 --- a/scrape_idnes.py +++ b/scrape_idnes.py @@ -7,6 +7,7 @@ Výstup: byty_idnes.json from __future__ import annotations import argparse +from datetime import datetime import json import logging import math @@ -458,6 +459,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): "url": item["url"], "source": "idnes", "image": "", + "scraped_at": datetime.now().strftime("%Y-%m-%d"), } results.append(result) properties_fetched += 1 diff --git a/scrape_psn.py b/scrape_psn.py index 91273c9..444fadf 100644 --- a/scrape_psn.py +++ b/scrape_psn.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ PSN.cz scraper. -Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií. 
+Stáhne byty na prodej z API /api/units-list — jeden požadavek, žádné stránkování. Výstup: byty_psn.json """ from __future__ import annotations @@ -12,7 +12,9 @@ import logging import re import subprocess import time +from datetime import datetime from pathlib import Path +from urllib.parse import urlencode logger = logging.getLogger(__name__) @@ -22,82 +24,37 @@ MAX_PRICE = 14_000_000 MIN_AREA = 69 MIN_FLOOR = 2 -WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"} +WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "5+kk a větší"} + +# Pouze Praha — ostatní města (Brno, Pardubice, Špindlerův Mlýn) přeskočit +WANTED_CITIES = {"Praha"} UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" BASE_URL = "https://psn.cz" - -# Known Prague project slugs with GPS (from research) -PRAGUE_PROJECTS = [ - {"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125}, - {"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463}, - {"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720}, - {"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990}, - {"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768}, - {"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517}, - {"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348}, - {"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692}, - {"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036}, - {"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653}, - {"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313}, - {"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342}, - {"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", 
UNITS_API = f"{BASE_URL}/api/units-list"


def fetch_json(url: str) -> dict:
    """Fetch *url* via curl and parse the response body as JSON.

    curl is shelled out instead of using urllib because the bundled SSL
    stack may fail Cloudflare's TLS checks on psn.cz.

    Raises:
        RuntimeError: if curl exits non-zero, or if the response body is
            not valid JSON (e.g. a Cloudflare challenge/HTML error page).
    """
    logger.debug("HTTP GET: %s", url)
    result = subprocess.run(
        ["curl", "-s", "-L", "--max-time", "30",
         "-H", f"User-Agent: {UA}",
         "-H", "Accept: application/json",
         url],
        capture_output=True, text=True, timeout=60
    )
    if result.returncode != 0:
        raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as e:
        # A Cloudflare challenge page (HTML) would otherwise surface as a
        # bare JSONDecodeError — raise a readable error with a body snippet,
        # consistent with the curl-failure path above.
        raise RuntimeError(f"Response is not JSON ({e}): {result.stdout[:200]!r}") from e
units.append(obj) - found = True - except (json.JSONDecodeError, ValueError): - pass - break - depth -= 1 - - return units +def fix_gps(lat, lng): + """PSN má u některých projektů prohozené lat/lng — opravíme.""" + if lat is not None and lng is not None and lat < 20 and lng > 20: + return lng, lat + return lat, lng def format_price(price: int) -> str: @@ -109,209 +66,178 @@ def format_price(price: int) -> str: return " ".join(reversed(parts)) + " Kč" -def scrape(max_pages: int | None = None, max_properties: int | None = None): +def scrape(max_properties: int | None = None): logger.info("=" * 60) logger.info("Stahuji inzeráty z PSN.cz") logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Min. plocha: {MIN_AREA} m²") logger.info(f"Patro: od {MIN_FLOOR}. NP") - logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)") - if max_pages: - logger.info(f"Max. stran: {max_pages}") + logger.info(f"Region: Praha") if max_properties: logger.info(f"Max. bytů: {max_properties}") logger.info("=" * 60) - # Fetch units from each Prague project - all_units = [] + # Jediný API požadavek — vrátí všechny jednotky (cca 236) + params = urlencode({ + "locale": "cs", + "filters": "{}", + "type": "list", + "order": "price-asc", + "offset": 0, + "limit": 500, + }) + url = f"{UNITS_API}?{params}" + logger.info("Stahuji jednotky z API ...") - for proj in PRAGUE_PROJECTS: - page = 1 - project_units = [] + try: + data = fetch_json(url) + except Exception as e: + logger.error(f"Chyba při stahování: {e}", exc_info=True) + return [] - while True: - if max_pages and page > max_pages: - logger.debug(f"Max pages limit reached: {max_pages}") - break - url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}" - logger.info(f"{proj['name']} — strana {page} ...") - time.sleep(0.5) + all_units = data.get("units", {}).get("data", []) + logger.info(f"Staženo jednotek celkem: {len(all_units)}") - try: - html = fetch_url(url) - except Exception as e: - logger.error(f"Fetch error for 
{proj['name']}: {e}", exc_info=True) - break - - units = extract_units_from_html(html) - logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units") - - if not units: - if page == 1: - logger.info(f"→ 0 jednotek") - break - - # Add project info to each unit - for unit in units: - if not unit.get("latitude") or not unit.get("longitude"): - unit["latitude"] = proj["lat"] - unit["longitude"] = proj["lon"] - unit["_project_name"] = proj["name"] - unit["_project_slug"] = proj["slug"] - - project_units.extend(units) - - if page == 1: - logger.info(f"→ {len(units)} jednotek na stránce") - - # Check if there might be more pages - # If we got fewer than expected or same units, stop - if len(units) < 10: - break - - page += 1 - if page > 10: # Safety limit - break - - all_units.extend(project_units) - - # Deduplicate by slug - seen_slugs = set() - unique_units = [] - for u in all_units: - slug = u.get("slug", "") - if slug and slug not in seen_slugs: - seen_slugs.add(slug) - unique_units.append(u) - elif not slug: - unique_units.append(u) - - logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek") - - # Filter - logger.info(f"\nFiltrování...") + # Filtrování results = [] - excluded_sold = 0 - excluded_type = 0 - excluded_disp = 0 - excluded_price = 0 - excluded_area = 0 - excluded_floor = 0 - excluded_panel = 0 + excluded = { + "prodáno": 0, + "typ": 0, + "město": 0, + "dispozice": 0, + "cena": 0, + "plocha": 0, + "patro": 0, + } properties_fetched = 0 - for unit in unique_units: + for unit in all_units: if max_properties and properties_fetched >= max_properties: - logger.debug(f"Max properties limit reached: {max_properties}") break - unit_id = unit.get("id", unit.get("slug", "unknown")) - # Only free units + + unit_id = unit.get("id", "?") + + # Pouze prodej bytů (type_id=0) + if unit.get("type_id") != 0: + excluded["typ"] += 1 + logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)") + continue + + 
# Pouze volné (ne rezervované, prodané, v přípravě) + sale_status = unit.get("sale_status", "") is_free = unit.get("is_free", False) is_sold = unit.get("is_sold", False) if is_sold or not is_free: - excluded_sold += 1 - logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)") + excluded["prodáno"] += 1 + logger.debug(f"id={unit_id}: přeskočen (status={sale_status})") continue - # Only apartments - category = str(unit.get("category", "")).lower() - if "byt" not in category and "ateliér" not in category: - excluded_type += 1 - logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})") + # Pouze Praha + city = (unit.get("location") or unit.get("address", {}).get("city") or "").strip() + # location field je typicky "Praha 4", "Praha 7" atd. + city_base = city.split(" ")[0] if city else "" + if city_base not in WANTED_CITIES: + excluded["město"] += 1 + logger.debug(f"id={unit_id}: přeskočen (město={city})") continue - # Disposition + # Dispozice disp = unit.get("disposition", "") if disp not in WANTED_DISPOSITIONS: - excluded_disp += 1 - logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})") + excluded["dispozice"] += 1 + logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})") continue - # Price - price = unit.get("price_czk") or unit.get("action_price_czk") or 0 - if price <= 0 or price > MAX_PRICE: - excluded_price += 1 - logger.debug(f"Filter: id={unit_id} - excluded (price {price})") + # Cena + price = unit.get("action_price_czk") or unit.get("price_czk") or 0 + if not price or price <= 0 or price > MAX_PRICE: + excluded["cena"] += 1 + logger.debug(f"id={unit_id}: přeskočen (cena={price})") continue - # Area + # Plocha area = unit.get("total_area") or unit.get("floor_area") or 0 if area < MIN_AREA: - excluded_area += 1 - logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)") + excluded["plocha"] += 1 + logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)") continue - # Floor + # Patro floor_str = 
str(unit.get("floor", "")) floor = None if floor_str: try: floor = int(floor_str) except ValueError: - floor_match = re.search(r'(-?\d+)', floor_str) - if floor_match: - floor = int(floor_match.group(1)) + m = re.search(r'(-?\d+)', floor_str) + if m: + floor = int(m.group(1)) if floor is not None and floor < MIN_FLOOR: - excluded_floor += 1 - logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})") + excluded["patro"] += 1 + logger.debug(f"id={unit_id}: přeskočen (patro={floor})") continue - # Construction — check for panel - build_type = str(unit.get("build_type", "")).lower() - if "panel" in build_type: - excluded_panel += 1 - logger.debug(f"Filter: id={unit_id} - excluded (panel construction)") - logger.info(f"✗ Vyloučen: panel ({build_type})") + # GPS — opravit prohozené souřadnice + lat_raw = unit.get("latitude") + lng_raw = unit.get("longitude") + lat, lng = fix_gps(lat_raw, lng_raw) + if not lat or not lng: + logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji") continue - # Build construction label - building_type = "neuvedeno" - if build_type and build_type != "nevybráno": - if "cihlo" in build_type or "cihla" in build_type: - building_type = "Cihlová" - elif "skelet" in build_type: - building_type = "Skeletová" - else: - building_type = build_type.capitalize() + # Sestavit adresu pro locality + addr = unit.get("address") or {} + street = addr.get("street", "") + street_no = addr.get("street_no", "") + if street and street_no: + locality_str = f"{street} {street_no}, {city}" + elif street: + locality_str = f"{street}, {city}" + else: + project_name = unit.get("project", "") + locality_str = f"{project_name}, {city}" if project_name else city - lat = unit.get("latitude", 0) - lon = unit.get("longitude", 0) - - slug = unit.get("slug", "") - project_slug = unit.get("_project_slug", "") - detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}" + # URL na detail jednotky + unit_slug = 
unit.get("slug", "") + project_slug = "" + # project_slug lze odvodit z projektu nebo z reference_no + # API nevrací project_slug přímo — použijeme reference_no nebo jen ID + reference_no = unit.get("reference_no", "") + if unit_slug: + detail_url = f"{BASE_URL}/prodej/{unit_slug}" + elif reference_no: + detail_url = f"{BASE_URL}/prodej/{reference_no}" + else: + detail_url = BASE_URL result = { - "hash_id": unit.get("id", slug), - "name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}", + "hash_id": str(unit_id), + "name": f"Prodej bytu {disp}, {int(area)} m² — {unit.get('project', locality_str)}", "price": int(price), "price_formatted": format_price(int(price)), - "locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha", + "locality": locality_str, "lat": lat, - "lon": lon, + "lon": lng, "disposition": disp, "floor": floor, - "area": area, - "building_type": building_type, - "ownership": unit.get("ownership", "neuvedeno") or "neuvedeno", + "area": float(area), + "building_type": "neuvedeno", + "ownership": "osobní", "url": detail_url, "source": "psn", "image": "", + "scraped_at": datetime.now().strftime("%Y-%m-%d"), } results.append(result) properties_fetched += 1 logger.info(f"\n{'=' * 60}") logger.info(f"Výsledky PSN:") - logger.info(f" Celkem jednotek: {len(unique_units)}") - logger.info(f" Vyloučeno (prodáno): {excluded_sold}") - logger.info(f" Vyloučeno (typ): {excluded_type}") - logger.info(f" Vyloučeno (dispozice): {excluded_disp}") - logger.info(f" Vyloučeno (cena): {excluded_price}") - logger.info(f" Vyloučeno (plocha): {excluded_area}") - logger.info(f" Vyloučeno (patro): {excluded_floor}") - logger.info(f" Vyloučeno (panel): {excluded_panel}") + logger.info(f" Staženo jednotek: {len(all_units)}") + for reason, count in excluded.items(): + if count: + logger.info(f" Vyloučeno ({reason}): {count}") logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f"{'=' * 60}") @@ -320,15 +246,13 @@ def scrape(max_pages: 
int | None = None, max_properties: int | None = None): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz") - parser.add_argument("--max-pages", type=int, default=None, - help="Maximum number of listing pages per project to scrape") parser.add_argument("--max-properties", type=int, default=None, help="Maximum number of properties to include in results") - parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], + parser.add_argument("--log-level", type=str, default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging level (default: INFO)") args = parser.parse_args() - # Configure logging logging.basicConfig( level=getattr(logging, args.log_level), format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s", @@ -336,7 +260,7 @@ if __name__ == "__main__": ) start = time.time() - estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) + estates = scrape(max_properties=args.max_properties) if estates: json_path = Path("byty_psn.json") @@ -346,6 +270,6 @@ if __name__ == "__main__": ) elapsed = time.time() - start logger.info(f"\n✓ Data uložena: {json_path.resolve()}") - logger.info(f"⏱ Celkový čas: {elapsed:.0f} s") + logger.info(f"⏱ Celkový čas: {elapsed:.1f} s") else: logger.info("\nŽádné byty z PSN neodpovídají kritériím :(") diff --git a/scrape_realingo.py b/scrape_realingo.py index a6dce95..2c6c846 100644 --- a/scrape_realingo.py +++ b/scrape_realingo.py @@ -7,6 +7,7 @@ Výstup: byty_realingo.json from __future__ import annotations import argparse +from datetime import datetime import json import logging import math @@ -314,6 +315,7 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None): "url": f"{BASE_URL}{item['url']}", "source": "realingo", "image": "", + "scraped_at": datetime.now().strftime("%Y-%m-%d"), } results.append(result) properties_fetched += 1 diff --git a/status.html b/status.html new 
file mode 100644 index 0000000..3f6da1b --- /dev/null +++ b/status.html @@ -0,0 +1,204 @@ + + + + + +Scraper status + + + + +

Scraper status

+
maru-hleda-byt
+ +
+
+
+
Nacitam status...
+
+
+ + + + + +