Files
maru-hleda-byt/scrape_psn.py
Jan Novak c2bc3f452f
All checks were successful
Build and Push / build (push) Successful in 13s
Unify server, persist ratings via API, refresh scraper data
- Replace split setup (ratings_server.py on :8081 + http.server on :8080)
  with a single combined Flask server (server.py) on :8080 that serves
  static files and the /api/ratings GET/POST endpoints
- Ratings are now persisted server-side: mapa_bytu.html loads ratings
  from GET /api/ratings on startup (API as source of truth) and POSTs
  on every change — enables cross-browser and cross-device state sharing
  while keeping localStorage as a synchronous read cache
- Dockerfile: install flask, copy server.py instead of ratings_server.py,
  expose only port 8080
- entrypoint.sh: start single server process instead of two
- Makefile: add serve / serve-debug targets for local development
- scrape_psn.py: fix log label, add --max-pages stub arg for CLI parity
- Refresh all scraped property data

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-25 13:51:02 +01:00

278 lines
9.2 KiB
Python

#!/usr/bin/env python3
"""
PSN.cz scraper.
Downloads apartments for sale from the /api/units-list API — a single request, no pagination.
Output: byty_psn.json
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import subprocess
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urlencode
# Module-level logger; handlers and level are configured by the script entry point.
logger = logging.getLogger(__name__)
# ── Configuration ───────────────────────────────────────────────────────────
# Upper price limit (CZK); units priced above it are excluded.
MAX_PRICE = 14_000_000
# Minimum area (m²); units with a smaller area are excluded.
MIN_AREA = 69
# Lowest accepted floor number; units with a known lower floor are excluded.
MIN_FLOOR = 2
# Accepted dispositions; compared verbatim with the unit's "disposition" field.
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "5+kk a větší"}
# Prague only — skip the other cities (Brno, Pardubice, Špindlerův Mlýn)
WANTED_CITIES = {"Praha"}
# Browser-like User-Agent header sent with every curl request.
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# Site root; also used as the prefix of unit detail URLs.
BASE_URL = "https://psn.cz"
# Endpoint that returns the list of units as JSON.
UNITS_API = f"{BASE_URL}/api/units-list"
def fetch_json(url: str) -> dict:
    """Fetch *url* with the ``curl`` binary and return the decoded JSON body.

    curl is used instead of urllib because urllib's SSL may fail on
    Cloudflare.

    Args:
        url: Absolute URL to GET.

    Returns:
        The parsed JSON document.

    Raises:
        RuntimeError: curl exited with a non-zero status (network failure,
            curl-level timeout, or an HTTP status of 400 or greater).
        subprocess.TimeoutExpired: curl did not finish within 60 seconds.
        json.JSONDecodeError: the response body is not valid JSON.
    """
    logger.debug(f"HTTP GET: {url}")
    result = subprocess.run(
        [
            "curl",
            # -s hides the progress meter; -S keeps the error text on stderr
            # so the RuntimeError below actually carries a reason.
            "-sS",
            # Turn HTTP errors (>= 400) into a non-zero exit code instead of
            # handing an HTML error page to json.loads().
            "--fail",
            "-L", "--max-time", "30",
            "-H", f"User-Agent: {UA}",
            "-H", "Accept: application/json",
            url,
        ],
        capture_output=True,
        text=True,
        # JSON is UTF-8; do not depend on the process locale for decoding.
        encoding="utf-8",
        timeout=60,
    )
    if result.returncode != 0:
        raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr.strip()[:200]}")
    return json.loads(result.stdout)
def fix_gps(lat, lng):
    """Return ``(lat, lng)``, swapping the pair when it looks reversed.

    PSN delivers some projects with latitude and longitude exchanged. A pair
    is treated as reversed when the latitude is below 20 and the longitude is
    above 20. If either value is ``None`` the pair is returned unchanged.
    """
    if lat is None or lng is None:
        return lat, lng
    looks_swapped = lat < 20 and lng > 20
    if looks_swapped:
        return lng, lat
    return lat, lng
def format_price(price: int) -> str:
    """Format *price* with a space between each group of three digits.

    Example: ``14000000`` becomes ``"14 000 000"``. A leading minus sign stays
    attached to the first digit group (``-123456`` becomes ``"-123 456"``).

    Args:
        price: Integer amount to format.

    Returns:
        The digits of *price* grouped in thousands, separated by spaces.
    """
    # The "," format option groups thousands; swap its separator for a space.
    return f"{price:,}".replace(",", " ")
def _parse_floor(raw) -> int | None:
    """Return the floor number found in *raw*, or None when it holds no digits."""
    text = str(raw)
    try:
        return int(text)
    except ValueError:
        pass
    # Not a plain integer — take the first (optionally negative) number in it.
    match = re.search(r'(-?\d+)', text)
    return int(match.group(1)) if match else None


def _build_locality(unit: dict, city: str) -> str:
    """Build the locality label from the unit's address, else its project name."""
    addr = unit.get("address") or {}
    street = addr.get("street", "")
    street_no = addr.get("street_no", "")
    if street and street_no:
        return f"{street} {street_no}, {city}"
    if street:
        return f"{street}, {city}"
    project_name = unit.get("project", "")
    return f"{project_name}, {city}" if project_name else city


def _build_detail_url(unit: dict) -> str:
    """Return the unit's detail URL: slug first, then reference number, else the site root."""
    unit_slug = unit.get("slug", "")
    if unit_slug:
        return f"{BASE_URL}/prodej/{unit_slug}"
    reference_no = unit.get("reference_no", "")
    if reference_no:
        return f"{BASE_URL}/prodej/{reference_no}"
    return BASE_URL


def scrape(max_properties: int | None = None):
    """Download all PSN units in one API request and return those that pass the filters.

    A unit is kept when it has type_id 0, is free and not sold, lies in one of
    WANTED_CITIES, has a disposition in WANTED_DISPOSITIONS, a positive price
    of at most MAX_PRICE, an area of at least MIN_AREA, a floor that is either
    unknown or at least MIN_FLOOR, and usable GPS coordinates.

    Args:
        max_properties: Stop after this many matching units. ``None`` (or 0)
            means no limit.

    Returns:
        A list of dicts, one per matching unit, with the keys hash_id, name,
        price, price_formatted, locality, lat, lon, disposition, floor, area,
        building_type, ownership, url, source, image and scraped_at. The list
        is empty when the download fails or nothing matches.
    """
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z PSN.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA}")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info("Region: Praha")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Single API request — per the original author's note it returns all
    # units (about 236), so the limit of 500 leaves headroom.
    params = urlencode({
        "locale": "cs",
        "filters": "{}",
        "type": "list",
        "order": "price-asc",
        "offset": 0,
        "limit": 500,
    })
    url = f"{UNITS_API}?{params}"
    logger.info("Stahuji jednotky z API ...")
    try:
        data = fetch_json(url)
    except Exception as e:
        # Best effort: a failed download yields an empty result, not a crash.
        logger.error(f"Chyba při stahování: {e}", exc_info=True)
        return []

    # Walk the payload defensively: a null or non-dict "units" section must
    # not raise AttributeError outside the try block above.
    units_block = data.get("units") if isinstance(data, dict) else None
    all_units = units_block.get("data") if isinstance(units_block, dict) else None
    if not isinstance(all_units, list):
        logger.warning("Odpověď API neobsahuje seznam units.data")
        all_units = []
    logger.info(f"Staženo jednotek celkem: {len(all_units)}")

    # Filtering
    results = []
    excluded = {
        "prodáno": 0,
        "typ": 0,
        "město": 0,
        "dispozice": 0,
        "cena": 0,
        "plocha": 0,
        "patro": 0,
        "GPS": 0,
    }
    # The date stamp is the same for every record, so compute it once.
    scraped_at = datetime.now().strftime("%Y-%m-%d")
    properties_fetched = 0
    for unit in all_units:
        if max_properties and properties_fetched >= max_properties:
            break
        unit_id = unit.get("id", "?")

        # Apartments for sale only (type_id=0)
        if unit.get("type_id") != 0:
            excluded["typ"] += 1
            logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)")
            continue

        # Available units only (not reserved, sold, or in preparation)
        sale_status = unit.get("sale_status", "")
        is_free = unit.get("is_free", False)
        is_sold = unit.get("is_sold", False)
        if is_sold or not is_free:
            excluded["prodáno"] += 1
            logger.debug(f"id={unit_id}: přeskočen (status={sale_status})")
            continue

        # Prague only. `or {}` also covers an "address" key that is present
        # but null, which a .get() default alone would not.
        city = (unit.get("location") or (unit.get("address") or {}).get("city") or "").strip()
        # The location field is typically "Praha 4", "Praha 7", etc.
        city_base = city.split(" ")[0] if city else ""
        if city_base not in WANTED_CITIES:
            excluded["město"] += 1
            logger.debug(f"id={unit_id}: přeskočen (město={city})")
            continue

        # Disposition
        disp = unit.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded["dispozice"] += 1
            logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})")
            continue

        # Price — the action price takes precedence over the regular price
        price = unit.get("action_price_czk") or unit.get("price_czk") or 0
        if not price or price <= 0 or price > MAX_PRICE:
            excluded["cena"] += 1
            logger.debug(f"id={unit_id}: přeskočen (cena={price})")
            continue

        # Area
        area = unit.get("total_area") or unit.get("floor_area") or 0
        if area < MIN_AREA:
            excluded["plocha"] += 1
            logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)")
            continue

        # Floor — units whose floor cannot be parsed are kept
        floor = _parse_floor(unit.get("floor", ""))
        if floor is not None and floor < MIN_FLOOR:
            excluded["patro"] += 1
            logger.debug(f"id={unit_id}: přeskočen (patro={floor})")
            continue

        # GPS — fix swapped coordinates
        lat, lng = fix_gps(unit.get("latitude"), unit.get("longitude"))
        if not lat or not lng:
            excluded["GPS"] += 1
            logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji")
            continue

        locality_str = _build_locality(unit, city)
        # A null or empty project falls back to the locality instead of
        # rendering as "None" in the listing name.
        title_place = unit.get("project") or locality_str
        results.append({
            "hash_id": str(unit_id),
            "name": f"Prodej bytu {disp}, {int(area)} m² — {title_place}",
            "price": int(price),
            "price_formatted": format_price(int(price)),
            "locality": locality_str,
            "lat": lat,
            "lon": lng,
            "disposition": disp,
            "floor": floor,
            "area": float(area),
            "building_type": "neuvedeno",
            "ownership": "osobní",
            "url": _build_detail_url(unit),
            "source": "psn",
            "image": "",
            "scraped_at": scraped_at,
        })
        properties_fetched += 1

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky PSN:")
    logger.info(f" Staženo inzerátů: {len(all_units)}")
    for reason, count in excluded.items():
        if count:
            logger.info(f" Vyloučeno ({reason}): {count}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # Command-line entry point: parse options, configure logging, run the
    # scraper and write the matching apartments to byty_psn.json.
    parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
    parser.add_argument(
        "--max-pages",
        type=int,
        default=None,
        help="Ignored — PSN uses a single API call, no pagination",
    )
    parser.add_argument(
        "--max-properties",
        type=int,
        default=None,
        help="Maximum number of properties to include in results",
    )
    parser.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level (default: INFO)",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    start = time.time()
    estates = scrape(max_properties=args.max_properties)
    if not estates:
        # Nothing matched (or the download failed): leave any existing
        # output file untouched.
        logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")
    else:
        json_path = Path("byty_psn.json")
        payload = json.dumps(estates, ensure_ascii=False, indent=2)
        json_path.write_text(payload, encoding="utf-8")
        elapsed = time.time() - start
        logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
        logger.info(f"⏱ Celkový čas: {elapsed:.1f} s")