#!/usr/bin/env python3
"""
PSN.cz scraper.

Downloads apartments for sale from the /api/units-list API — a single
request, no pagination.

Output: byty_psn.json
"""
from __future__ import annotations

import argparse
import json
import logging
import re
import subprocess
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urlencode

logger = logging.getLogger(__name__)

# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 14_000_000
MIN_AREA = 69
MIN_FLOOR = 2

WANTED_DISPOSITIONS = {
    "3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1",
    "5+kk a větší",
}

# Prague only — other cities (Brno, Pardubice, Špindlerův Mlýn) are skipped.
WANTED_CITIES = {"Praha"}

UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

BASE_URL = "https://psn.cz"
UNITS_API = f"{BASE_URL}/api/units-list"

# First (optionally negative) integer inside a free-form floor label.
_FLOOR_RE = re.compile(r"(-?\d+)")


def fetch_json(url: str) -> dict:
    """Fetch *url* with curl and return the decoded JSON body.

    curl is used instead of urllib because, per the original author's note,
    urllib's SSL handling may fail behind Cloudflare.

    Raises:
        RuntimeError: curl exited with a non-zero status.
        subprocess.TimeoutExpired: curl did not finish within 60 seconds.
        json.JSONDecodeError: the response body is not valid JSON.
    """
    logger.debug(f"HTTP GET: {url}")
    result = subprocess.run(
        [
            "curl",
            # -s hides the progress meter; -S keeps curl's error message on
            # stderr so the RuntimeError below actually says what went wrong.
            "-s", "-S", "-L", "--max-time", "30",
            "-H", f"User-Agent: {UA}",
            "-H", "Accept: application/json",
            url,
        ],
        capture_output=True,
        text=True,
        timeout=60,
    )
    if result.returncode != 0:
        raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
    return json.loads(result.stdout)


def fix_gps(lat, lng):
    """Return ``(lat, lng)``, swapping them when they look transposed.

    PSN has lat/lng swapped for some projects. The pair is swapped when
    ``lat < 20`` and ``lng > 20``; otherwise it is returned untouched.
    ``None`` values are passed through unchanged.
    """
    if lat is not None and lng is not None and lat < 20 and lng > 20:
        return lng, lat
    return lat, lng


def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a `` Kč`` suffix.

    Uses the ``,`` format option so the sign of a negative number stays
    attached to its first digit group.
    """
    return f"{price:,}".replace(",", " ") + " Kč"


def _unit_city(unit: dict) -> str:
    """Return the unit's city label, or ``""`` when none is available.

    Prefers the ``location`` field (the original author notes it is typically
    "Praha 4", "Praha 7", ...), falling back to ``address.city``. A null
    ``address`` is treated as an empty dict.
    """
    address = unit.get("address") or {}
    return (unit.get("location") or address.get("city") or "").strip()


def _parse_floor(raw) -> int | None:
    """Parse a floor value into an int; return ``None`` when it has no number."""
    floor_str = str(raw)
    if not floor_str:
        return None
    try:
        return int(floor_str)
    except ValueError:
        # Not a plain integer — take the first integer found in the text.
        match = _FLOOR_RE.search(floor_str)
        return int(match.group(1)) if match else None


def _build_locality(unit: dict, city: str) -> str:
    """Build the human-readable locality string for a unit."""
    addr = unit.get("address") or {}
    street = addr.get("street", "")
    street_no = addr.get("street_no", "")
    if street and street_no:
        return f"{street} {street_no}, {city}"
    if street:
        return f"{street}, {city}"
    project_name = unit.get("project", "")
    return f"{project_name}, {city}" if project_name else city


def _detail_url(unit: dict) -> str:
    """Return the unit's detail URL, built from ``slug`` or ``reference_no``.

    Falls back to the site root when neither field is set.
    """
    slug = unit.get("slug", "") or unit.get("reference_no", "")
    return f"{BASE_URL}/prodej/{slug}" if slug else BASE_URL


def _evaluate_unit(unit: dict) -> tuple[dict | None, str | None]:
    """Apply all filters to one API unit.

    Returns:
        ``(listing, None)`` when the unit passes every filter, otherwise
        ``(None, reason)`` where *reason* is the exclusion-counter key.
    """
    unit_id = unit.get("id", "?")

    # Apartments for sale only (type_id=0).
    if unit.get("type_id") != 0:
        logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)")
        return None, "typ"

    # Free units only (not reserved, sold, or in preparation).
    sale_status = unit.get("sale_status", "")
    if unit.get("is_sold", False) or not unit.get("is_free", False):
        logger.debug(f"id={unit_id}: přeskočen (status={sale_status})")
        return None, "prodáno"

    # Wanted cities only; the first word of the label is the city name.
    city = _unit_city(unit)
    city_base = city.split(" ")[0] if city else ""
    if city_base not in WANTED_CITIES:
        logger.debug(f"id={unit_id}: přeskočen (město={city})")
        return None, "město"

    # Disposition (room layout).
    disp = unit.get("disposition", "")
    if disp not in WANTED_DISPOSITIONS:
        logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})")
        return None, "dispozice"

    # Price — the action (discounted) price wins when present.
    price = unit.get("action_price_czk") or unit.get("price_czk") or 0
    if not 0 < price <= MAX_PRICE:
        logger.debug(f"id={unit_id}: přeskočen (cena={price})")
        return None, "cena"

    # Area.
    area = unit.get("total_area") or unit.get("floor_area") or 0
    if area < MIN_AREA:
        logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)")
        return None, "plocha"

    # Floor — units whose floor cannot be parsed are kept.
    floor = _parse_floor(unit.get("floor", ""))
    if floor is not None and floor < MIN_FLOOR:
        logger.debug(f"id={unit_id}: přeskočen (patro={floor})")
        return None, "patro"

    # GPS — repair swapped coordinates, drop units without any.
    lat, lng = fix_gps(unit.get("latitude"), unit.get("longitude"))
    if not lat or not lng:
        logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji")
        return None, "GPS"

    locality_str = _build_locality(unit, city)
    # `or` (not a .get default) so an empty/null project falls back too.
    title_place = unit.get("project") or locality_str

    listing = {
        "hash_id": str(unit_id),
        "name": f"Prodej bytu {disp}, {int(area)} m² — {title_place}",
        "price": int(price),
        "price_formatted": format_price(int(price)),
        "locality": locality_str,
        "lat": lat,
        "lon": lng,
        "disposition": disp,
        "floor": floor,
        "area": float(area),
        "building_type": "neuvedeno",
        "ownership": "osobní",
        "url": _detail_url(unit),
        "source": "psn",
        "image": "",
        "scraped_at": datetime.now().strftime("%Y-%m-%d"),
    }
    return listing, None


def scrape(max_properties: int | None = None):
    """Download PSN units and return those matching the configured filters.

    Args:
        max_properties: Stop after this many matching listings; ``None``
            (or 0) means no limit.

    Returns:
        A list of listing dicts. Empty when the download fails or nothing
        matches.
    """
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z PSN.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info("Region: Praha")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # A single API request; per the original note it returns all units
    # (about 236), so limit=500 leaves headroom.
    params = urlencode({
        "locale": "cs",
        "filters": "{}",
        "type": "list",
        "order": "price-asc",
        "offset": 0,
        "limit": 500,
    })
    url = f"{UNITS_API}?{params}"
    logger.info("Stahuji jednotky z API ...")

    try:
        data = fetch_json(url)
    except Exception as e:
        logger.error(f"Chyba při stahování: {e}", exc_info=True)
        return []

    # Tolerate a null/missing "units" section or an unexpected payload shape.
    units_section = data.get("units") if isinstance(data, dict) else None
    all_units = units_section.get("data") if isinstance(units_section, dict) else None
    all_units = all_units or []
    logger.info(f"Staženo jednotek celkem: {len(all_units)}")

    # Filtering
    results = []
    excluded = {
        "prodáno": 0,
        "typ": 0,
        "město": 0,
        "dispozice": 0,
        "cena": 0,
        "plocha": 0,
        "patro": 0,
        "GPS": 0,
    }

    for unit in all_units:
        if max_properties and len(results) >= max_properties:
            break
        listing, reason = _evaluate_unit(unit)
        if listing is None:
            excluded[reason] += 1
        else:
            results.append(listing)

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky PSN:")
    logger.info(f" Staženo jednotek: {len(all_units)}")
    for reason, count in excluded.items():
        if count:
            logger.info(f" Vyloučeno ({reason}): {count}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")

    return results


def main() -> None:
    """Parse CLI arguments, run the scraper and write ``byty_psn.json``."""
    parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
    parser.add_argument("--max-properties", type=int, default=None,
                        help="Maximum number of properties to include in results")
    parser.add_argument("--log-level", type=str, default="INFO",
                        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level (default: INFO)")
    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    start = time.time()
    estates = scrape(max_properties=args.max_properties)

    if not estates:
        logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")
        return

    json_path = Path("byty_psn.json")
    json_path.write_text(
        json.dumps(estates, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    elapsed = time.time() - start
    logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
    logger.info(f"⏱ Celkový čas: {elapsed:.1f} s")


if __name__ == "__main__":
    main()