Files
maru-hleda-byt/scrape_psn.py
Jan Novak 27a7834eb6
Some checks failed
Build and Push / build (push) Failing after 4s
Reliability improvements: retry logic, validation, ratings sync
- Add 3-attempt retry with exponential backoff to Sreality, Realingo,
  Bezrealitky, and PSN scrapers (CityHome and iDNES already had it)
- Add shared validate_listing() in scraper_stats.py; all 6 scrapers now
  validate GPS bounds, price, area, and required fields before output
- Wire ratings to server /api/ratings on page load (merge with
  localStorage) and save (async POST); ratings now persist across
  browsers and devices
- Namespace JS hash IDs as {source}_{id} to prevent rating collisions
  between listings from different portals with the same numeric ID
- Replace manual Czech diacritic table with unicodedata.normalize()
  in merge_and_map.py for correct deduplication of all edge cases
- Correct README schedule docs: every 4 hours, not twice daily

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-27 10:36:37 +01:00

333 lines
11 KiB
Python

#!/usr/bin/env python3
"""
PSN.cz scraper.
Stáhne byty na prodej z API /api/units-list — jeden požadavek, žádné stránkování.
Výstup: byty_psn.json
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import subprocess
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urlencode
from scraper_stats import write_stats, validate_listing
# File name handed to write_stats() for this scraper's per-run statistics.
STATS_FILE = "stats_psn.json"
logger = logging.getLogger(__name__)
# ── Configuration ───────────────────────────────────────────────────────────
# Upper price bound in CZK; scrape() excludes units priced above it.
MAX_PRICE = 14_000_000
# Lower area bound (m² according to scrape()'s log output); smaller units are excluded.
MIN_AREA = 69
# Lowest accepted floor number; units whose parsed floor is below it are excluded.
MIN_FLOOR = 2
# Disposition labels accepted by scrape(); units with any other value are excluded.
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "5+kk a větší"}
# Prague only — skip the other cities (Brno, Pardubice, Špindlerův Mlýn).
# scrape() compares this set against the first space-separated word of the unit's location.
WANTED_CITIES = {"Praha"}
# Browser-like User-Agent header value sent with every curl request in fetch_json().
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
BASE_URL = "https://psn.cz"
# Listing endpoint queried by scrape().
UNITS_API = f"{BASE_URL}/api/units-list"
def fetch_json(url: str, retries: int = 3) -> dict:
    """Fetch *url* with curl and return the decoded JSON body.

    curl is used instead of urllib because urllib's SSL handling may fail
    against Cloudflare. Every failure mode is retried: a non-zero curl exit
    status (including HTTP 4xx/5xx, thanks to ``--fail``), a subprocess
    timeout, and a response body that is not valid JSON (for example an HTML
    challenge page served with status 200).

    Args:
        url: Absolute URL to request.
        retries: Total number of attempts; must be at least 1.

    Returns:
        The parsed JSON document.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        RuntimeError: If every attempt failed; the message carries the last
            error that was observed.
    """
    if retries < 1:
        # Without this guard the loop body never runs and the function would
        # silently return None instead of a document.
        raise ValueError(f"retries must be >= 1, got {retries}")

    last_error = "unknown error"
    for attempt in range(retries):
        logger.debug(f"HTTP GET (attempt {attempt + 1}/{retries}): {url}")
        try:
            result = subprocess.run(
                # --fail turns HTTP errors into a non-zero exit status and
                # -S keeps the error text on stderr despite -s.
                ["curl", "-s", "-S", "-L", "--fail", "--max-time", "30",
                 "-H", f"User-Agent: {UA}",
                 "-H", "Accept: application/json",
                 url],
                capture_output=True, text=True, timeout=60,
            )
        except subprocess.TimeoutExpired:
            last_error = "curl did not finish within 60s"
        else:
            if result.returncode == 0:
                try:
                    return json.loads(result.stdout)
                except json.JSONDecodeError as exc:
                    last_error = f"response is not valid JSON: {exc}"
            else:
                last_error = f"curl exit {result.returncode}: {result.stderr[:200]}"

        if attempt < retries - 1:
            # Exponential backoff: 2s, 4s, 8s, ...
            wait = 2 ** (attempt + 1)
            logger.warning(f"curl failed (retry {attempt + 1}/{retries} after {wait}s): {last_error}")
            time.sleep(wait)

    raise RuntimeError(f"curl failed after {retries} attempts: {last_error}")
def fix_gps(lat, lng):
    """Return ``(lat, lng)``, swapping the pair when it looks transposed.

    PSN publishes some projects with latitude and longitude exchanged. A pair
    is treated as transposed when both values are present, the latitude is
    below 20 and the longitude is above 20. Missing values are passed through
    unchanged.
    """
    if lat is None or lng is None:
        return lat, lng
    looks_swapped = lat < 20 and lng > 20
    return (lng, lat) if looks_swapped else (lat, lng)
def format_price(price: int) -> str:
    """Return *price* as text with its digits grouped in threes by spaces.

    Grouping starts from the right-hand end, e.g. ``14000000`` becomes
    ``"14 000 000"``.
    """
    text = str(price)
    # Walk the end offsets from the right; each slice is one group of up to
    # three characters, collected least-significant group first.
    groups = [text[max(end - 3, 0):end] for end in range(len(text), 0, -3)]
    return " ".join(groups[::-1])
def scrape(max_properties: int | None = None):
    """Download PSN.cz units and return those matching the search criteria.

    A single request to ``UNITS_API`` (no pagination) returns all units. Each
    unit is then filtered, in this order, by type, availability, city,
    disposition, price, area and floor; units without GPS coordinates or
    rejected by ``validate_listing`` are dropped as well. ``first_seen`` and
    ``last_changed`` are carried over from an existing ``byty_psn.json`` in
    the current working directory when one can be read.

    Run statistics are written through ``write_stats`` on both the success
    and the download-failure path.

    Args:
        max_properties: Stop after this many accepted listings; ``None`` (or
            0) means no limit.

    Returns:
        A list of listing dicts, or an empty list when the download failed.
    """
    _run_start = time.time()
    _run_ts = datetime.now().isoformat(timespec="seconds")
    # One date stamp for the whole run so scraped_at / first_seen /
    # last_changed cannot disagree when the run crosses midnight.
    today = datetime.now().strftime("%Y-%m-%d")

    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z PSN.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA}")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info("Region: Praha")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Single API request — returns all units (roughly 236)
    params = urlencode({
        "locale": "cs",
        "filters": "{}",
        "type": "list",
        "order": "price-asc",
        "offset": 0,
        "limit": 500,
    })
    url = f"{UNITS_API}?{params}"
    logger.info("Stahuji jednotky z API ...")
    try:
        data = fetch_json(url)
    except Exception as e:
        logger.error(f"Chyba při stahování: {e}", exc_info=True)
        write_stats(STATS_FILE, {
            "source": "PSN",
            "timestamp": _run_ts,
            "duration_sec": round(time.time() - _run_start, 1),
            "success": False,
            "accepted": 0,
            "fetched": 0,
            "error": str(e),
        })
        return []

    # `or` guards against explicit nulls ("units": null / "data": null),
    # which dict.get() defaults do not cover.
    all_units = (data.get("units") or {}).get("data") or []
    logger.info(f"Staženo jednotek celkem: {len(all_units)}")

    # Load previous output for first_seen/last_changed tracking
    _prev_cache: dict[str, dict] = {}
    _prev_path = Path("byty_psn.json")
    if _prev_path.exists():
        try:
            for _item in json.loads(_prev_path.read_text(encoding="utf-8")):
                _prev_cache[str(_item["hash_id"])] = _item
        except Exception as e:
            # Deliberately best-effort: a broken previous file must not stop
            # the scrape, but the loss of history should be visible in logs.
            logger.warning(f"Could not load previous output {_prev_path}: {e}")

    # Filtering
    results = []
    excluded = {
        "prodáno": 0,
        "typ": 0,
        "město": 0,
        "dispozice": 0,
        "cena": 0,
        "plocha": 0,
        "patro": 0,
    }
    properties_fetched = 0
    for unit in all_units:
        if max_properties and properties_fetched >= max_properties:
            break
        unit_id = unit.get("id", "?")

        # Apartment sales only (type_id=0)
        if unit.get("type_id") != 0:
            excluded["typ"] += 1
            logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)")
            continue

        # Available units only (not reserved, sold or in preparation)
        sale_status = unit.get("sale_status", "")
        is_free = unit.get("is_free", False)
        is_sold = unit.get("is_sold", False)
        if is_sold or not is_free:
            excluded["prodáno"] += 1
            logger.debug(f"id={unit_id}: přeskočen (status={sale_status})")
            continue

        # Prague only. `or {}` keeps a null "address" from crashing the
        # fallback lookup of the city.
        addr = unit.get("address") or {}
        city = (unit.get("location") or addr.get("city") or "").strip()
        # The location field is typically "Praha 4", "Praha 7", etc.
        city_base = city.split(" ")[0] if city else ""
        if city_base not in WANTED_CITIES:
            excluded["město"] += 1
            logger.debug(f"id={unit_id}: přeskočen (město={city})")
            continue

        # Disposition
        disp = unit.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded["dispozice"] += 1
            logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})")
            continue

        # Price — a discounted (action) price takes precedence
        price = unit.get("action_price_czk") or unit.get("price_czk") or 0
        if not price or price <= 0 or price > MAX_PRICE:
            excluded["cena"] += 1
            logger.debug(f"id={unit_id}: přeskočen (cena={price})")
            continue

        # Area
        area = unit.get("total_area") or unit.get("floor_area") or 0
        if area < MIN_AREA:
            excluded["plocha"] += 1
            logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)")
            continue

        # Floor — accept a plain integer or the first integer found in the text
        floor_str = str(unit.get("floor", ""))
        floor = None
        if floor_str:
            try:
                floor = int(floor_str)
            except ValueError:
                m = re.search(r'(-?\d+)', floor_str)
                if m:
                    floor = int(m.group(1))
        # Units whose floor could not be parsed are kept, not excluded.
        if floor is not None and floor < MIN_FLOOR:
            excluded["patro"] += 1
            logger.debug(f"id={unit_id}: přeskočen (patro={floor})")
            continue

        # GPS — repair swapped coordinates
        lat_raw = unit.get("latitude")
        lng_raw = unit.get("longitude")
        lat, lng = fix_gps(lat_raw, lng_raw)
        if not lat or not lng:
            logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji")
            continue

        # Build the address used as the listing's locality
        project_name = unit.get("project") or ""
        street = addr.get("street", "")
        street_no = addr.get("street_no", "")
        if street and street_no:
            locality_str = f"{street} {street_no}, {city}"
        elif street:
            locality_str = f"{street}, {city}"
        else:
            locality_str = f"{project_name}, {city}" if project_name else city

        # URL of the unit detail page. The API does not return a project
        # slug directly, so fall back to reference_no or the site root.
        unit_slug = unit.get("slug", "")
        reference_no = unit.get("reference_no", "")
        if unit_slug:
            detail_url = f"{BASE_URL}/prodej/{unit_slug}"
        elif reference_no:
            detail_url = f"{BASE_URL}/prodej/{reference_no}"
        else:
            detail_url = BASE_URL

        # History: keep first_seen from the previous run; last_changed is
        # reset to today whenever the price differs (or the unit is new).
        previous = _prev_cache.get(str(unit_id), {})
        if previous.get("price") != int(price):
            last_changed = today
        else:
            last_changed = previous.get("last_changed", today)

        result = {
            "hash_id": str(unit_id),
            # `or` so that a null/empty project falls back to the locality
            # instead of rendering "None" in the title.
            "name": f"Prodej bytu {disp}, {int(area)} m² — {project_name or locality_str}",
            "price": int(price),
            "price_formatted": format_price(int(price)),
            "locality": locality_str,
            "lat": lat,
            "lon": lng,
            "disposition": disp,
            "floor": floor,
            "area": float(area),
            "building_type": "neuvedeno",
            "ownership": "osobní",
            "url": detail_url,
            "source": "psn",
            "image": "",
            "scraped_at": today,
            "first_seen": previous.get("first_seen", today),
            "last_changed": last_changed,
        }
        if not validate_listing(result, "psn"):
            continue
        results.append(result)
        properties_fetched += 1

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky PSN:")
    logger.info(f" Staženo jednotek: {len(all_units)}")
    for reason, count in excluded.items():
        if count:
            logger.info(f" Vyloučeno ({reason}): {count}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")
    write_stats(STATS_FILE, {
        "source": "PSN",
        "timestamp": _run_ts,
        "duration_sec": round(time.time() - _run_start, 1),
        "success": True,
        "accepted": len(results),
        "fetched": len(all_units),
        "excluded": excluded,
    })
    return results
if __name__ == "__main__":
    # Script entry point: parse the command line, configure logging, run the
    # scraper and store accepted listings in byty_psn.json.
    cli = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
    cli.add_argument(
        "--max-properties",
        type=int,
        default=None,
        help="Maximum number of properties to include in results",
    )
    cli.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level (default: INFO)",
    )
    options = cli.parse_args()

    logging.basicConfig(
        level=getattr(logging, options.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    launched_at = datetime.now().isoformat(timespec="seconds")
    t0 = time.time()
    try:
        listings = scrape(max_properties=options.max_properties)
    except Exception as exc:
        # Record the failure in the stats file, then let the error propagate.
        logger.error(f"Scraper failed: {exc}", exc_info=True)
        failure_report = {
            "source": "PSN",
            "timestamp": launched_at,
            "duration_sec": round(time.time() - t0, 1),
            "success": False,
            "accepted": 0,
            "fetched": 0,
            "error": str(exc),
        }
        write_stats(STATS_FILE, failure_report)
        raise

    if not listings:
        # Nothing accepted: the output file is left as it is.
        logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")
    else:
        output_file = Path("byty_psn.json")
        serialized = json.dumps(listings, ensure_ascii=False, indent=2)
        output_file.write_text(serialized, encoding="utf-8")
        took = time.time() - t0
        logger.info(f"\n✓ Data uložena: {output_file.resolve()}")
        logger.info(f"⏱ Celkový čas: {took:.1f} s")