#!/usr/bin/env python3
"""
PSN.cz scraper.

Downloads apartments for sale in Prague from PSN projects and filters them
according to the criteria configured below.

Output: byty_psn.json
"""
from __future__ import annotations

import argparse
import json
import logging
import re
import subprocess
import time
from pathlib import Path

logger = logging.getLogger(__name__)

# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 14_000_000
MIN_AREA = 69
MIN_FLOOR = 2
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
BASE_URL = "https://psn.cz"

# Known Prague project slugs with GPS (from research)
PRAGUE_PROJECTS = [
    {"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
    {"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
    {"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
    {"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
    {"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
    {"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
    {"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
    {"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
    {"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
    {"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
    {"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
    {"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
    {"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
    {"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
    {"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
    {"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]

# How far back (in characters) to look for the opening brace of a unit object.
_LOOKBACK_CHARS = 3000
# A page with fewer units than this is treated as the last page of a project.
_FULL_PAGE_SIZE = 10
# Hard upper bound on pages fetched per project (safety limit).
_MAX_PAGES_PER_PROJECT = 10
# Pause before every request, in seconds.
_REQUEST_DELAY_S = 0.5

# Plain decimal number with an optional "." or "," fractional part.
_NUMBER_RE = re.compile(r"-?\d+(?:[.,]\d+)?")


def fetch_url(url: str) -> str:
    """Fetch *url* with curl and return the response body as text.

    curl is used because, per the original author's note, urllib's SSL is too
    old for Cloudflare.

    Raises:
        RuntimeError: if curl exits with a non-zero return code.
        subprocess.TimeoutExpired: if curl does not finish within 60 seconds.
    """
    logger.debug(f"HTTP GET request (via curl): {url}")
    logger.debug(f"User-Agent: {UA}")
    result = subprocess.run(
        [
            "curl",
            "-s",
            # -S keeps error messages on stderr despite -s, so the failure
            # text logged below is not empty.
            "-S",
            "-L",
            "--max-time", "30",
            "-H", f"User-Agent: {UA}",
            "-H", "Accept: text/html",
            url,
        ],
        capture_output=True,
        text=True,
        # Decode explicitly as UTF-8 instead of relying on the locale, so that
        # Czech characters survive on non-UTF-8 systems.
        encoding="utf-8",
        errors="replace",
        timeout=60,
    )
    if result.returncode != 0:
        logger.error(f"curl failed (return code {result.returncode}): {result.stderr[:200]}")
        raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
    logger.debug(f"HTTP response: size={len(result.stdout)} bytes")
    return result.stdout


def extract_units_from_html(html: str) -> list[dict]:
    """Extract unit JSON objects from raw HTML with escaped quotes.

    The HTML contains RSC data with escaped JSON (``\\"key\\":\\"value\\"``).
    Every occurrence of ``"title":"Byt`` is treated as the inside of a unit
    object; the enclosing ``{`` is located by walking backwards and the object
    is decoded from there. Only dicts containing ``price_czk`` are returned.
    """
    # Step 1: unescape backslash-quotes to regular quotes.
    cleaned = html.replace('\\"', '"')

    # Step 2: find each unit by looking for "title":"Byt and walking back to {.
    units = []
    decoder = json.JSONDecoder()
    for match in re.finditer(r'"title":"Byt', cleaned):
        pos = match.start()
        lower = max(pos - _LOOKBACK_CHARS, 0)
        depth = 0  # number of closed nested objects not yet matched by a "{"
        # The stop value is lower - 1 so that index `lower` (including index 0
        # of the document) is inspected as well.
        for i in range(pos - 1, lower - 1, -1):
            ch = cleaned[i]
            if ch == "}":
                depth += 1
            elif ch == "{":
                if depth == 0:
                    try:
                        obj, _ = decoder.raw_decode(cleaned, i)
                    except ValueError:  # includes json.JSONDecodeError
                        pass
                    else:
                        if isinstance(obj, dict) and "price_czk" in obj:
                            units.append(obj)
                    break
                depth -= 1
    return units


def format_price(price: int) -> str:
    """Format *price* with spaces as thousands separators and a " Kč" suffix."""
    return f"{int(price):,}".replace(",", " ") + " Kč"


def _to_number(value) -> int | float | None:
    """Return *value* as a number, or None when it is not a plain number.

    Numbers pass through unchanged; strings such as "12 500 000" or "75,5"
    are parsed. Booleans and anything else yield None.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        text = "".join(value.split())  # drops spaces incl. non-breaking ones
        if _NUMBER_RE.fullmatch(text):
            if "." in text or "," in text:
                return float(text.replace(",", "."))
            return int(text)
    return None


def _parse_floor(raw) -> int | None:
    """Parse a floor number from *raw*; return None when no integer is found."""
    floor_str = str(raw)
    if not floor_str:
        return None
    try:
        return int(floor_str)
    except ValueError:
        floor_match = re.search(r"(-?\d+)", floor_str)
        return int(floor_match.group(1)) if floor_match else None


def _building_label(build_type: str) -> str:
    """Map a lower-cased build type to the construction label used in output."""
    if not build_type or build_type == "nevybráno":
        return "neuvedeno"
    if "cihlo" in build_type or "cihla" in build_type:
        return "Cihlová"
    if "skelet" in build_type:
        return "Skeletová"
    return build_type.capitalize()


def _fetch_project_units(proj: dict, max_pages: int | None) -> list[dict]:
    """Download all listing pages of one project and return its raw units."""
    project_units = []
    seen_slugs = set()
    page = 1
    while True:
        if max_pages and page > max_pages:
            logger.debug(f"Max pages limit reached: {max_pages}")
            break

        url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
        logger.info(f"{proj['name']} — strana {page} ...")
        time.sleep(_REQUEST_DELAY_S)

        try:
            html = fetch_url(url)
        except Exception as e:  # boundary: one failing project must not abort the run
            logger.error(f"Fetch error for {proj['name']}: {e}", exc_info=True)
            break

        units = extract_units_from_html(html)
        logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units")

        if not units:
            if page == 1:
                logger.info("→ 0 jednotek")
            break

        # Stop when a later page only repeats units that were already seen
        # (e.g. the site ignores the page parameter).
        page_slugs = [unit.get("slug") for unit in units]
        if page > 1 and all(slug and slug in seen_slugs for slug in page_slugs):
            logger.debug(f"Project {proj['slug']} page {page}: no new units, stopping pagination")
            break
        seen_slugs.update(slug for slug in page_slugs if slug)

        # Add project info to each unit; fall back to project GPS when missing.
        for unit in units:
            if not unit.get("latitude") or not unit.get("longitude"):
                unit["latitude"] = proj["lat"]
                unit["longitude"] = proj["lon"]
            unit["_project_name"] = proj["name"]
            unit["_project_slug"] = proj["slug"]
        project_units.extend(units)

        if page == 1:
            logger.info(f"→ {len(units)} jednotek na stránce")

        # A short page is treated as the last one.
        if len(units) < _FULL_PAGE_SIZE:
            break
        page += 1
        if page > _MAX_PAGES_PER_PROJECT:  # Safety limit
            break
    return project_units


def _deduplicate_by_slug(units: list[dict]) -> list[dict]:
    """Keep the first unit per slug; units without a slug are always kept."""
    seen_slugs = set()
    unique_units = []
    for unit in units:
        slug = unit.get("slug", "")
        if slug:
            if slug in seen_slugs:
                continue
            seen_slugs.add(slug)
        unique_units.append(unit)
    return unique_units


def _evaluate_unit(unit: dict) -> tuple[str | None, dict | None]:
    """Apply all filters to one unit.

    Returns ``(reason, None)`` when the unit is excluded, where *reason* is one
    of "sold", "type", "disp", "price", "area", "floor", "panel"; otherwise
    ``(None, result)`` with the output record.
    """
    unit_id = unit.get("id", unit.get("slug", "unknown"))

    # Only free units
    is_free = unit.get("is_free", False)
    is_sold = unit.get("is_sold", False)
    if is_sold or not is_free:
        logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)")
        return "sold", None

    # Only apartments
    category = str(unit.get("category", "")).lower()
    if "byt" not in category and "ateliér" not in category:
        logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})")
        return "type", None

    # Disposition
    disp = unit.get("disposition", "")
    if disp not in WANTED_DISPOSITIONS:
        logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})")
        return "disp", None

    # Price (numeric strings are accepted as well)
    price = _to_number(unit.get("price_czk")) or _to_number(unit.get("action_price_czk")) or 0
    if price <= 0 or price > MAX_PRICE:
        logger.debug(f"Filter: id={unit_id} - excluded (price {price})")
        return "price", None

    # Area
    area = _to_number(unit.get("total_area")) or _to_number(unit.get("floor_area")) or 0
    if area < MIN_AREA:
        logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)")
        return "area", None

    # Floor — units with an unknown floor are kept
    floor = _parse_floor(unit.get("floor", ""))
    if floor is not None and floor < MIN_FLOOR:
        logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})")
        return "floor", None

    # Construction — check for panel; a null build type counts as not given
    build_type = str(unit.get("build_type") or "").lower()
    if "panel" in build_type:
        logger.debug(f"Filter: id={unit_id} - excluded (panel construction)")
        logger.info(f"✗ Vyloučen: panel ({build_type})")
        return "panel", None

    slug = unit.get("slug", "")
    project_slug = unit.get("_project_slug", "")
    project_name = unit.get("_project_name", "")
    if slug:
        detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}"
    else:
        detail_url = f"{BASE_URL}/projekt/{project_slug}"
    # Use the project name when the street is missing or null.
    street = unit.get("street") or project_name

    result = {
        "hash_id": unit.get("id", slug),
        "name": f"Prodej bytu {disp} {area} m² — {project_name}",
        "price": int(price),
        "price_formatted": format_price(int(price)),
        "locality": f"{street}, Praha",
        "lat": unit.get("latitude", 0),
        "lon": unit.get("longitude", 0),
        "disposition": disp,
        "floor": floor,
        "area": area,
        "building_type": _building_label(build_type),
        "ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
        "url": detail_url,
        "source": "psn",
        "image": "",
    }
    return None, result


def scrape(max_pages: int | None = None, max_properties: int | None = None) -> list[dict]:
    """Download units from all Prague projects and return the matching ones.

    Args:
        max_pages: optional cap on listing pages fetched per project.
        max_properties: optional cap on the number of returned records.

    Returns:
        A list of output records (dicts) for units that pass every filter.
    """
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z PSN.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
    if max_pages:
        logger.info(f"Max. stran: {max_pages}")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Fetch units from each Prague project
    all_units = []
    for proj in PRAGUE_PROJECTS:
        all_units.extend(_fetch_project_units(proj, max_pages))

    # Deduplicate by slug
    unique_units = _deduplicate_by_slug(all_units)
    logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek")

    # Filter
    logger.info("\nFiltrování...")
    results = []
    excluded = {
        "sold": 0,
        "type": 0,
        "disp": 0,
        "price": 0,
        "area": 0,
        "floor": 0,
        "panel": 0,
    }
    for unit in unique_units:
        if max_properties and len(results) >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break
        reason, result = _evaluate_unit(unit)
        if reason is not None:
            excluded[reason] += 1
        else:
            results.append(result)

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky PSN:")
    logger.info(f" Celkem jednotek: {len(unique_units)}")
    logger.info(f" Vyloučeno (prodáno): {excluded['sold']}")
    logger.info(f" Vyloučeno (typ): {excluded['type']}")
    logger.info(f" Vyloučeno (dispozice): {excluded['disp']}")
    logger.info(f" Vyloučeno (cena): {excluded['price']}")
    logger.info(f" Vyloučeno (plocha): {excluded['area']}")
    logger.info(f" Vyloučeno (patro): {excluded['floor']}")
    logger.info(f" Vyloučeno (panel): {excluded['panel']}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")

    return results


def main() -> None:
    """Command-line entry point: parse arguments, scrape, write byty_psn.json."""
    parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
    parser.add_argument("--max-pages", type=int, default=None,
                        help="Maximum number of listing pages per project to scrape")
    parser.add_argument("--max-properties", type=int, default=None,
                        help="Maximum number of properties to include in results")
    parser.add_argument("--log-level", type=str, default="INFO",
                        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level (default: INFO)")
    args = parser.parse_args()

    # Configure logging
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    start = time.time()
    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)

    if estates:
        json_path = Path("byty_psn.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
        logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")


if __name__ == "__main__":
    main()