maru-hleda-byt/scrape_psn.py

#!/usr/bin/env python3
"""
PSN.cz scraper.
Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
Výstup: byty_psn.json
"""
from __future__ import annotations

import argparse
import json
import logging
import re
import subprocess
import time
from pathlib import Path

# Module-level logger; handlers and level are configured in the __main__ block.
logger = logging.getLogger(__name__)

# ── Configuration ───────────────────────────────────────────────────────────

# Filter thresholds applied by scrape():
#   - units priced above MAX_PRICE (read from the unit's CZK price fields) are dropped,
#   - units with an area below MIN_AREA (logged as m²) are dropped,
#   - units whose parsed floor number is below MIN_FLOOR are dropped
#     (units whose floor cannot be parsed are kept).
MAX_PRICE = 14_000_000
MIN_AREA = 69
MIN_FLOOR = 2

# Layouts to keep; scrape() compares the unit's "disposition" value against
# this set with an exact membership test.
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

# User-Agent header value sent by fetch_url() with every request.
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# Prefix for project listing URLs and for the detail URLs written to the output.
BASE_URL = "https://psn.cz"

# Known Prague project slugs with GPS (from research)
# "slug" is inserted into the project URL; "lat"/"lon" are used as a fallback
# for units that come without their own coordinates.
PRAGUE_PROJECTS = [
    {"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
    {"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
    {"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
    {"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
    {"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
    {"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
    {"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
    {"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
    {"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
    {"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
    {"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
    {"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
    {"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
    {"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
    {"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
    {"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]


def fetch_url(url: str) -> str:
    """Download *url* with the system ``curl`` binary and return the body as text.

    curl is used instead of urllib because urllib's SSL is too old for
    Cloudflare (original author's note).

    Args:
        url: Absolute URL to fetch. Redirects are followed.

    Returns:
        The response body decoded as UTF-8; undecodable bytes are replaced
        rather than raising.

    Raises:
        RuntimeError: If curl exits with a non-zero return code.
        subprocess.TimeoutExpired: If curl has not finished after 60 seconds.
        OSError: If the ``curl`` executable cannot be started.
    """
    logger.debug(f"HTTP GET request (via curl): {url}")
    logger.debug(f"User-Agent: {UA}")
    result = subprocess.run(
        [
            "curl",
            # -s silences the progress meter but also curl's error messages;
            # -S re-enables the error messages so stderr is usable below.
            "-s", "-S",
            "-L",
            "--max-time", "30",
            "-H", f"User-Agent: {UA}",
            "-H", "Accept: text/html",
            url,
        ],
        capture_output=True,
        # Decode explicitly instead of relying on the locale encoding, which
        # is not UTF-8 everywhere.
        # NOTE(review): assumes the site serves UTF-8 — confirm.
        encoding="utf-8",
        errors="replace",
        timeout=60,
    )
    if result.returncode != 0:
        logger.error(f"curl failed (return code {result.returncode}): {result.stderr[:200]}")
        raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
    logger.debug(f"HTTP response: size={len(result.stdout)} bytes")
    return result.stdout


def extract_units_from_html(html: str) -> list[dict]:
    """Extract unit JSON objects embedded in a page's HTML.

    The page carries its data as JSON text in which every double quote is
    preceded by a backslash. After removing that escaping, each occurrence
    of a ``"title"`` key whose value starts with ``Byt`` is taken as a hit,
    and the text is scanned backwards from there to the opening brace of the
    enclosing object, which is then decoded.

    Args:
        html: Raw HTML of a project page.

    Returns:
        The decoded objects that are dicts and contain a ``price_czk`` key,
        in document order. Hits whose enclosing object cannot be found within
        the look-back window, or cannot be decoded, are skipped.
    """
    # Undo the quote escaping so the embedded objects become plain JSON text.
    cleaned = html.replace('\\"', '"')

    # How far back from a hit the opening brace is searched for.
    max_lookback = 3000
    decoder = json.JSONDecoder()
    units: list[dict] = []

    for match in re.finditer(r'"title":"Byt', cleaned):
        pos = match.start()
        # range() excludes its stop value, so use -1 (not 0) as the floor;
        # otherwise an object that begins at index 0 would never be examined.
        stop = max(pos - max_lookback, -1)
        depth = 0  # closing braces seen that still need their opening brace
        for i in range(pos - 1, stop, -1):
            ch = cleaned[i]
            if ch == '}':
                depth += 1
            elif ch == '{':
                if depth:
                    # This brace opens a nested object that sits entirely
                    # before the hit; keep walking.
                    depth -= 1
                    continue
                try:
                    obj, _ = decoder.raw_decode(cleaned, i)
                except ValueError:
                    # Best effort: json.JSONDecodeError is a ValueError;
                    # a candidate that is not valid JSON is simply skipped.
                    pass
                else:
                    if isinstance(obj, dict) and 'price_czk' in obj:
                        units.append(obj)
                break

    return units


def format_price(price: int) -> str:
    """Return *price* as text grouped in threes by spaces, followed by the Kč suffix."""
    digits = str(price)
    # Size of the leading group; zero means the text splits evenly into triples.
    lead = len(digits) % 3
    groups = [digits[:lead]] if lead else []
    groups.extend(digits[i:i + 3] for i in range(lead, len(digits), 3))
    return " ".join(groups) + " Kč"


def _fetch_project_units(proj: dict, max_pages: int | None = None) -> list[dict]:
    """Fetch the listing pages of one project and return its units.

    Every returned unit is tagged with ``_project_name`` / ``_project_slug``
    and receives the project's coordinates when it has none of its own.

    Paging stops at the first of: the ``max_pages`` limit, a fetch error, a
    page without units, a page identical to the previous one, a page with
    fewer than 10 units, or after page 10.
    """
    collected: list[dict] = []
    previous: list[dict] | None = None
    page = 1

    while True:
        if max_pages and page > max_pages:
            logger.debug(f"Max pages limit reached: {max_pages}")
            break

        url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
        logger.info(f"{proj['name']} — strana {page} ...")
        time.sleep(0.5)  # pause before every request

        try:
            html = fetch_url(url)
        except Exception as e:
            # Best effort: one failing project must not abort the whole run.
            logger.error(f"Fetch error for {proj['name']}: {e}", exc_info=True)
            break

        units = extract_units_from_html(html)
        logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units")

        if not units:
            if page == 1:
                logger.info("→ 0 jednotek")
            break

        # Add project info to each unit
        for unit in units:
            if not unit.get("latitude") or not unit.get("longitude"):
                unit["latitude"] = proj["lat"]
                unit["longitude"] = proj["lon"]
            unit["_project_name"] = proj["name"]
            unit["_project_slug"] = proj["slug"]

        # A page that repeats the previous one means the page parameter had
        # no effect; stop instead of re-downloading the same data up to the
        # safety limit (and duplicating units that have no slug).
        if units == previous:
            logger.debug(f"Project {proj['slug']} page {page}: same units as previous page, stopping")
            break

        collected.extend(units)
        previous = units

        if page == 1:
            logger.info(f"→ {len(units)} jednotek na stránce")

        # A short page is taken as the last one.
        if len(units) < 10:
            break

        page += 1
        if page > 10:  # Safety limit
            break

    return collected


def _parse_floor(raw: object) -> int | None:
    """Return the floor number found in *raw*, or None when there is none."""
    text = str(raw)
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        # Not a bare integer: fall back to the first (optionally negative)
        # run of digits in the text.
        found = re.search(r'(-?\d+)', text)
        return int(found.group(1)) if found else None


def _building_type_label(build_type: str) -> str:
    """Map a lower-cased ``build_type`` value to the label written to the output."""
    if not build_type or build_type == "nevybráno":
        return "neuvedeno"
    if "cihlo" in build_type or "cihla" in build_type:
        return "Cihlová"
    if "skelet" in build_type:
        return "Skeletová"
    return build_type.capitalize()


def scrape(max_pages: int | None = None, max_properties: int | None = None) -> list[dict]:
    """Download units of all configured Prague projects and filter them.

    Units are fetched project by project, de-duplicated by ``slug`` (units
    without a slug are all kept) and then filtered in this order:
    availability, category, disposition, price, area, floor, panel
    construction. A summary of the exclusion counts is logged at the end.

    Args:
        max_pages: Maximum number of listing pages fetched per project;
            ``None`` (or 0) means no extra limit beyond the built-in one.
        max_properties: Stop filtering once this many units have been
            accepted; ``None`` (or 0) means no limit.

    Returns:
        One dict per accepted unit with the keys ``hash_id``, ``name``,
        ``price``, ``price_formatted``, ``locality``, ``lat``, ``lon``,
        ``disposition``, ``floor``, ``area``, ``building_type``,
        ``ownership``, ``url``, ``source`` and ``image``.
    """
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z PSN.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
    if max_pages:
        logger.info(f"Max. stran: {max_pages}")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Fetch units from each Prague project
    all_units: list[dict] = []
    for proj in PRAGUE_PROJECTS:
        all_units.extend(_fetch_project_units(proj, max_pages))

    # Deduplicate by slug; units without a slug cannot be compared and are kept.
    seen_slugs = set()
    unique_units = []
    for u in all_units:
        slug = u.get("slug", "")
        if not slug:
            unique_units.append(u)
        elif slug not in seen_slugs:
            seen_slugs.add(slug)
            unique_units.append(u)

    logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek")

    # Filter
    logger.info("\nFiltrování...")
    results = []
    excluded = dict.fromkeys(
        ("sold", "type", "disp", "price", "area", "floor", "panel"), 0
    )
    properties_fetched = 0

    for unit in unique_units:
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break
        unit_id = unit.get("id", unit.get("slug", "unknown"))

        # Only free units
        is_free = unit.get("is_free", False)
        is_sold = unit.get("is_sold", False)
        if is_sold or not is_free:
            excluded["sold"] += 1
            logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)")
            continue

        # Only apartments
        category = str(unit.get("category", "")).lower()
        if "byt" not in category and "ateliér" not in category:
            excluded["type"] += 1
            logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})")
            continue

        # Disposition
        disp = unit.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded["disp"] += 1
            logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})")
            continue

        # Price; a missing or zero price is treated as out of range.
        price = unit.get("price_czk") or unit.get("action_price_czk") or 0
        if price <= 0 or price > MAX_PRICE:
            excluded["price"] += 1
            logger.debug(f"Filter: id={unit_id} - excluded (price {price})")
            continue

        # Area
        area = unit.get("total_area") or unit.get("floor_area") or 0
        if area < MIN_AREA:
            excluded["area"] += 1
            logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)")
            continue

        # Floor; units whose floor cannot be determined are kept.
        floor = _parse_floor(unit.get("floor", ""))
        if floor is not None and floor < MIN_FLOOR:
            excluded["floor"] += 1
            logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})")
            continue

        # Construction — check for panel.
        # `or ""` keeps a null value from turning into the text "none".
        build_type = str(unit.get("build_type") or "").lower()
        if "panel" in build_type:
            excluded["panel"] += 1
            logger.debug(f"Filter: id={unit_id} - excluded (panel construction)")
            logger.info(f"✗ Vyloučen: panel ({build_type})")
            continue

        slug = unit.get("slug", "")
        project_slug = unit.get("_project_slug", "")
        project_name = unit.get("_project_name", "")
        if slug:
            detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}"
        else:
            detail_url = f"{BASE_URL}/projekt/{project_slug}"

        # dict.get() only applies its default when the key is absent, so a
        # present-but-null street needs `or` to fall back to the project name.
        street = unit.get("street") or project_name

        results.append({
            "hash_id": unit.get("id", slug),
            "name": f"Prodej bytu {disp} {area} m² — {project_name}",
            "price": int(price),
            "price_formatted": format_price(int(price)),
            "locality": f"{street}, Praha",
            "lat": unit.get("latitude", 0),
            "lon": unit.get("longitude", 0),
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": _building_type_label(build_type),
            "ownership": unit.get("ownership") or "neuvedeno",
            "url": detail_url,
            "source": "psn",
            "image": "",
        })
        properties_fetched += 1

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky PSN:")
    logger.info(f"  Celkem jednotek:       {len(unique_units)}")
    logger.info(f"  Vyloučeno (prodáno):   {excluded['sold']}")
    logger.info(f"  Vyloučeno (typ):       {excluded['type']}")
    logger.info(f"  Vyloučeno (dispozice): {excluded['disp']}")
    logger.info(f"  Vyloučeno (cena):      {excluded['price']}")
    logger.info(f"  Vyloučeno (plocha):    {excluded['area']}")
    logger.info(f"  Vyloučeno (patro):     {excluded['floor']}")
    logger.info(f"  Vyloučeno (panel):     {excluded['panel']}")
    logger.info(f"  ✓ Vyhovující byty:    {len(results)}")
    logger.info(f"{'=' * 60}")

    return results


if __name__ == "__main__":
    # Command-line options: optional limits for scrape() plus the log level.
    cli = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
    cli.add_argument(
        "--max-pages",
        type=int,
        default=None,
        help="Maximum number of listing pages per project to scrape",
    )
    cli.add_argument(
        "--max-properties",
        type=int,
        default=None,
        help="Maximum number of properties to include in results",
    )
    cli.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level (default: INFO)",
    )
    options = cli.parse_args()

    # Configure logging: everything goes to a single stream handler.
    log_level = getattr(logging, options.log_level)
    logging.basicConfig(
        level=log_level,
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    started_at = time.time()
    listings = scrape(
        max_pages=options.max_pages,
        max_properties=options.max_properties,
    )

    if not listings:
        # Nothing matched: no file is written.
        logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")
    else:
        # Write the accepted units as pretty-printed UTF-8 JSON into the
        # current working directory.
        output_file = Path("byty_psn.json")
        payload = json.dumps(listings, ensure_ascii=False, indent=2)
        output_file.write_text(payload, encoding="utf-8")
        elapsed = time.time() - started_at
        logger.info(f"\n✓ Data uložena: {output_file.resolve()}")
        logger.info(f"⏱  Celkový čas: {elapsed:.0f} s")