Upload files to "/"

v1 scrapery
2026-02-13 16:11:28 +00:00
parent 82d1f94104
commit 846d0bd9f2
5 changed files with 1760 additions and 0 deletions
--- a/scrape_psn.py
+++ b/scrape_psn.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+"""
+PSN.cz scraper.
+Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
+Výstup: byty_psn.json
+"""
+from __future__ import annotations
+
+import json
+import re
+import subprocess
+import time
+from pathlib import Path
+
+# ── Configuration ───────────────────────────────────────────────────────────
+
+MAX_PRICE = 14_000_000  # upper price limit in CZK; scrape() drops units priced above it
+MIN_AREA = 69  # minimum area in m²; scrape() drops smaller units
+MIN_FLOOR = 2  # lowest accepted floor number; units with a known lower floor are dropped
+
+# Dispositions (room layouts) that pass the filter in scrape().
+WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
+
+# User-Agent header value sent by fetch_url() with every request.
+UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+
+# Site root; project pages are requested as {BASE_URL}/projekt/<slug>?page=<n>.
+BASE_URL = "https://psn.cz"
+
+# Known Prague project slugs with GPS (from research).
+# "lat"/"lon" are used as fallback coordinates for units that carry none of their own.
+PRAGUE_PROJECTS = [
+    {"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
+    {"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
+    {"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
+    {"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
+    {"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
+    {"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
+    {"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
+    {"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
+    {"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
+    {"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
+    {"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
+    {"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
+    {"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
+    {"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
+    {"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
+    {"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
+]
+
+
+def fetch_url(url: str) -> str:
+    """Fetch URL via curl (urllib SSL too old for Cloudflare)."""
+    result = subprocess.run(
+        ["curl", "-s", "-L", "--max-time", "30",
+         "-H", f"User-Agent: {UA}",
+         "-H", "Accept: text/html",
+         url],
+        capture_output=True, text=True, timeout=60
+    )
+    if result.returncode != 0:
+        raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
+    return result.stdout
+
+
+def extract_units_from_html(html: str) -> list[dict]:
+    """Extract unit JSON objects from raw HTML with escaped quotes."""
+    # The HTML contains RSC data with escaped JSON: \\"key\\":\\"value\\"
+    # Step 1: Unescape the double-backslash-quotes to regular quotes
+    cleaned = html.replace('\\"', '"')
+
+    # Step 2: Find each unit by looking for "title":"Byt and walking back to {
+    units = []
+    decoder = json.JSONDecoder()
+
+    for m in re.finditer(r'"title":"Byt', cleaned):
+        pos = m.start()
+        # Walk backwards to find the opening brace
+        depth = 0
+        found = False
+        for i in range(pos - 1, max(pos - 3000, 0), -1):
+            if cleaned[i] == '}':
+                depth += 1
+            elif cleaned[i] == '{':
+                if depth == 0:
+                    try:
+                        obj, end = decoder.raw_decode(cleaned, i)
+                        if isinstance(obj, dict) and 'price_czk' in obj:
+                            units.append(obj)
+                            found = True
+                    except (json.JSONDecodeError, ValueError):
+                        pass
+                    break
+                depth -= 1
+
+    return units
+
+
+def format_price(price: int) -> str:
+    s = str(price)
+    parts = []
+    while s:
+        parts.append(s[-3:])
+        s = s[:-3]
+    return " ".join(reversed(parts)) + " Kč"
+
+
+def scrape():
+    print("=" * 60)
+    print("Stahuji inzeráty z PSN.cz")
+    print(f"Cena: do {format_price(MAX_PRICE)}")
+    print(f"Min. plocha: {MIN_AREA} m²")
+    print(f"Patro: od {MIN_FLOOR}. NP")
+    print(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
+    print("=" * 60)
+
+    # Fetch units from each Prague project
+    all_units = []
+
+    for proj in PRAGUE_PROJECTS:
+        page = 1
+        project_units = []
+
+        while True:
+            url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
+            print(f"  {proj['name']} — strana {page} ...")
+            time.sleep(0.5)
+
+            try:
+                html = fetch_url(url)
+            except Exception as e:
+                print(f"    Chyba: {e}")
+                break
+
+            units = extract_units_from_html(html)
+
+            if not units:
+                if page == 1:
+                    print(f"    → 0 jednotek")
+                break
+
+            # Add project info to each unit
+            for unit in units:
+                if not unit.get("latitude") or not unit.get("longitude"):
+                    unit["latitude"] = proj["lat"]
+                    unit["longitude"] = proj["lon"]
+                unit["_project_name"] = proj["name"]
+                unit["_project_slug"] = proj["slug"]
+
+            project_units.extend(units)
+
+            if page == 1:
+                print(f"    → {len(units)} jednotek na stránce")
+
+            # Check if there might be more pages
+            # If we got fewer than expected or same units, stop
+            if len(units) < 10:
+                break
+
+            page += 1
+            if page > 10:  # Safety limit
+                break
+
+        all_units.extend(project_units)
+
+    # Deduplicate by slug
+    seen_slugs = set()
+    unique_units = []
+    for u in all_units:
+        slug = u.get("slug", "")
+        if slug and slug not in seen_slugs:
+            seen_slugs.add(slug)
+            unique_units.append(u)
+        elif not slug:
+            unique_units.append(u)
+
+    print(f"\n  Staženo celkem: {len(unique_units)} unikátních jednotek")
+
+    # Filter
+    print(f"\nFiltrování...")
+    results = []
+    excluded_sold = 0
+    excluded_type = 0
+    excluded_disp = 0
+    excluded_price = 0
+    excluded_area = 0
+    excluded_floor = 0
+    excluded_panel = 0
+
+    for unit in unique_units:
+        # Only free units
+        is_free = unit.get("is_free", False)
+        is_sold = unit.get("is_sold", False)
+        if is_sold or not is_free:
+            excluded_sold += 1
+            continue
+
+        # Only apartments
+        category = str(unit.get("category", "")).lower()
+        if "byt" not in category and "ateliér" not in category:
+            excluded_type += 1
+            continue
+
+        # Disposition
+        disp = unit.get("disposition", "")
+        if disp not in WANTED_DISPOSITIONS:
+            excluded_disp += 1
+            continue
+
+        # Price
+        price = unit.get("price_czk") or unit.get("action_price_czk") or 0
+        if price <= 0 or price > MAX_PRICE:
+            excluded_price += 1
+            continue
+
+        # Area
+        area = unit.get("total_area") or unit.get("floor_area") or 0
+        if area < MIN_AREA:
+            excluded_area += 1
+            continue
+
+        # Floor
+        floor_str = str(unit.get("floor", ""))
+        floor = None
+        if floor_str:
+            try:
+                floor = int(floor_str)
+            except ValueError:
+                floor_match = re.search(r'(-?\d+)', floor_str)
+                if floor_match:
+                    floor = int(floor_match.group(1))
+
+        if floor is not None and floor < MIN_FLOOR:
+            excluded_floor += 1
+            continue
+
+        # Construction — check for panel
+        build_type = str(unit.get("build_type", "")).lower()
+        if "panel" in build_type:
+            excluded_panel += 1
+            print(f"  ✗ Vyloučen: panel ({build_type})")
+            continue
+
+        # Build construction label
+        building_type = "neuvedeno"
+        if build_type and build_type != "nevybráno":
+            if "cihlo" in build_type or "cihla" in build_type:
+                building_type = "Cihlová"
+            elif "skelet" in build_type:
+                building_type = "Skeletová"
+            else:
+                building_type = build_type.capitalize()
+
+        lat = unit.get("latitude", 0)
+        lon = unit.get("longitude", 0)
+
+        slug = unit.get("slug", "")
+        project_slug = unit.get("_project_slug", "")
+        detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"
+
+        result = {
+            "hash_id": unit.get("id", slug),
+            "name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
+            "price": int(price),
+            "price_formatted": format_price(int(price)),
+            "locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha",
+            "lat": lat,
+            "lon": lon,
+            "disposition": disp,
+            "floor": floor,
+            "area": area,
+            "building_type": building_type,
+            "ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
+            "url": detail_url,
+            "source": "psn",
+            "image": "",
+        }
+        results.append(result)
+
+    print(f"\n{'=' * 60}")
+    print(f"Výsledky PSN:")
+    print(f"  Celkem jednotek:       {len(unique_units)}")
+    print(f"  Vyloučeno (prodáno):   {excluded_sold}")
+    print(f"  Vyloučeno (typ):       {excluded_type}")
+    print(f"  Vyloučeno (dispozice): {excluded_disp}")
+    print(f"  Vyloučeno (cena):      {excluded_price}")
+    print(f"  Vyloučeno (plocha):    {excluded_area}")
+    print(f"  Vyloučeno (patro):     {excluded_floor}")
+    print(f"  Vyloučeno (panel):     {excluded_panel}")
+    print(f"  ✓ Vyhovující byty:    {len(results)}")
+    print(f"{'=' * 60}")
+
+    return results
+
+
+if __name__ == "__main__":
+    start = time.time()
+    estates = scrape()
+
+    if estates:
+        json_path = Path("byty_psn.json")
+        json_path.write_text(
+            json.dumps(estates, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+        elapsed = time.time() - start
+        print(f"\n✓ Data uložena: {json_path.resolve()}")
+        print(f"⏱  Celkový čas: {elapsed:.0f} s")
+    else:
+        print("\nŽádné byty z PSN neodpovídají kritériím :(")