- Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing)
- Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries
- Color map markers by price/m² instead of disposition (blue→green→orange→red scale)
- Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback
- Rejected markers show original color at reduced opacity with 🚫 SVG overlay
- Favorite markers shown as ⭐ star icons with gold pulse animation
- Add "new today" marker logic (scraped_at == today) with larger pulsing green outline
- Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons
- Add generate_status.py for scraper run statistics and status.html dashboard
- Add scraped_at field to all scrapers for freshness tracking
- Update run_all.sh with log capture and status generation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
276 lines
9.0 KiB
Python
276 lines
9.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
PSN.cz scraper.
|
|
Stáhne byty na prodej z API /api/units-list — jeden požadavek, žádné stránkování.
|
|
Výstup: byty_psn.json
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import subprocess
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from urllib.parse import urlencode
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# ── Configuration ───────────────────────────────────────────────────────────

# Thresholds used by the filters in scrape().
MAX_PRICE = 14_000_000  # highest accepted price
MIN_AREA = 69           # smallest accepted area (logged as m²)
MIN_FLOOR = 2           # lowest accepted floor number

# Layouts that pass the disposition filter (compared by exact string match).
WANTED_DISPOSITIONS = {
    "3+kk",
    "3+1",
    "4+kk",
    "4+1",
    "5+kk",
    "5+1",
    "6+kk",
    "6+1",
    "5+kk a větší",
}

# Prague only — other cities (Brno, Pardubice, Špindlerův Mlýn) are skipped.
WANTED_CITIES = {"Praha"}

# Browser-like User-Agent header sent with every request.
UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Site root and the unit-listing endpoint derived from it.
BASE_URL = "https://psn.cz"
UNITS_API = BASE_URL + "/api/units-list"
|
|
|
|
|
|
def fetch_json(url: str) -> dict:
    """Download *url* with the ``curl`` binary and return the decoded JSON body.

    curl is used instead of urllib because urllib's SSL handling may fail on
    Cloudflare.

    Args:
        url: Absolute URL to request.

    Returns:
        The parsed JSON document.

    Raises:
        RuntimeError: If curl exits with a non-zero status. Because of
            ``--fail`` this includes HTTP responses with status >= 400.
        subprocess.TimeoutExpired: If curl does not finish within 60 seconds.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    logger.debug(f"HTTP GET: {url}")
    result = subprocess.run(
        [
            "curl",
            # -s hides the progress meter, -S still prints the error message
            # to stderr so the RuntimeError below carries a useful reason.
            "-sS",
            "-L",
            # Turn HTTP error statuses into a non-zero exit code instead of
            # handing an error page to json.loads().
            "--fail",
            "--max-time", "30",
            "-H", f"User-Agent: {UA}",
            "-H", "Accept: application/json",
            url,
        ],
        capture_output=True,
        text=True,
        timeout=60,  # hard stop in case curl's own --max-time is not honoured
    )
    if result.returncode != 0:
        raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
    return json.loads(result.stdout)
|
|
|
|
|
|
def fix_gps(lat, lng):
    """Return ``(lat, lng)``, swapping the pair back when it looks transposed.

    PSN delivers some projects with latitude and longitude exchanged. A pair
    counts as swapped when both values are present, the first is below 20 and
    the second is above 20. Any other input is returned unchanged.
    """
    # Missing coordinates cannot be compared; hand them back untouched.
    if lat is None or lng is None:
        return lat, lng

    looks_swapped = lat < 20 and lng > 20
    return (lng, lat) if looks_swapped else (lat, lng)
|
|
|
|
|
|
def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a " Kč" suffix.

    Uses the format-spec thousands separator rather than manual 3-character
    slicing, so the minus sign of a negative amount stays attached to the
    leading digit group.

    Args:
        price: Amount as an integer.

    Returns:
        For example ``"14 000 000 Kč"`` for ``14_000_000``.
    """
    # "," groups digits by three; swap it for the space used in the output.
    return f"{price:,}".replace(",", " ") + " Kč"
|
|
|
|
|
|
def _parse_floor(raw) -> int | None:
    """Convert the unit's raw floor value to an int, or None if it has no number."""
    text = str(raw)
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        # Not a plain integer: take the first (optionally negative) number.
        match = re.search(r"(-?\d+)", text)
        return int(match.group(1)) if match else None


def _build_locality(unit: dict, city: str) -> str:
    """Build the locality label from street data, else project name, else city."""
    addr = unit.get("address") or {}
    street = addr.get("street", "")
    street_no = addr.get("street_no", "")
    if street and street_no:
        return f"{street} {street_no}, {city}"
    if street:
        return f"{street}, {city}"
    project_name = unit.get("project", "")
    return f"{project_name}, {city}" if project_name else city


def _build_detail_url(unit: dict) -> str:
    """Return the unit's detail URL from its slug, else reference number, else the site root."""
    # No project slug is read from the unit, so the link is built from the
    # unit's own slug or its reference number.
    unit_slug = unit.get("slug", "")
    if unit_slug:
        return f"{BASE_URL}/prodej/{unit_slug}"
    reference_no = unit.get("reference_no", "")
    if reference_no:
        return f"{BASE_URL}/prodej/{reference_no}"
    return BASE_URL


def scrape(max_properties: int | None = None) -> list[dict]:
    """Fetch all PSN units in one API request and return those passing the filters.

    A unit is kept when it is an apartment for sale (``type_id == 0``), is
    free and not sold, lies in one of WANTED_CITIES, has a disposition in
    WANTED_DISPOSITIONS, a positive price not above MAX_PRICE, an area of at
    least MIN_AREA, a floor that is unknown or at least MIN_FLOOR, and GPS
    coordinates.

    Args:
        max_properties: Stop after this many matching units. ``None`` (or 0)
            means no limit.

    Returns:
        A list of dicts with the keys ``hash_id``, ``name``, ``price``,
        ``price_formatted``, ``locality``, ``lat``, ``lon``, ``disposition``,
        ``floor``, ``area``, ``building_type``, ``ownership``, ``url``,
        ``source``, ``image`` and ``scraped_at``. The list is empty when the
        download fails; the error is logged, not raised.
    """
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z PSN.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info("Region: Praha")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # A single API request; the limit is meant to exceed the total unit
    # count, so no pagination is done.
    params = urlencode({
        "locale": "cs",
        "filters": "{}",
        "type": "list",
        "order": "price-asc",
        "offset": 0,
        "limit": 500,
    })
    url = f"{UNITS_API}?{params}"
    logger.info("Stahuji jednotky z API ...")

    try:
        data = fetch_json(url)
    except Exception as e:
        # Top-level boundary: any download/parse failure yields "no results".
        logger.error(f"Chyba při stahování: {e}", exc_info=True)
        return []

    # "or" fallbacks also cover keys that are present but null.
    all_units = (data.get("units") or {}).get("data") or []
    logger.info(f"Staženo jednotek celkem: {len(all_units)}")

    # Filtering
    results = []
    excluded = {
        "prodáno": 0,
        "typ": 0,
        "město": 0,
        "dispozice": 0,
        "cena": 0,
        "plocha": 0,
        "patro": 0,
    }
    properties_fetched = 0
    # Computed once so every record of this run carries the same date.
    scraped_at = datetime.now().strftime("%Y-%m-%d")

    for unit in all_units:
        if max_properties and properties_fetched >= max_properties:
            break

        unit_id = unit.get("id", "?")

        # Apartments for sale only (type_id=0)
        if unit.get("type_id") != 0:
            excluded["typ"] += 1
            logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)")
            continue

        # Free units only (not reserved, sold, or in preparation)
        sale_status = unit.get("sale_status", "")
        is_free = unit.get("is_free", False)
        is_sold = unit.get("is_sold", False)
        if is_sold or not is_free:
            excluded["prodáno"] += 1
            logger.debug(f"id={unit_id}: přeskočen (status={sale_status})")
            continue

        # Prague only. A null "address" must not crash the run, hence "or {}".
        addr = unit.get("address") or {}
        city = (unit.get("location") or addr.get("city") or "").strip()
        # The location field is typically "Praha 4", "Praha 7", etc.
        city_base = city.split(" ")[0]
        if city_base not in WANTED_CITIES:
            excluded["město"] += 1
            logger.debug(f"id={unit_id}: přeskočen (město={city})")
            continue

        # Disposition
        disp = unit.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded["dispozice"] += 1
            logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})")
            continue

        # Price — the action price takes precedence over the regular one
        price = unit.get("action_price_czk") or unit.get("price_czk") or 0
        if not price or price <= 0 or price > MAX_PRICE:
            excluded["cena"] += 1
            logger.debug(f"id={unit_id}: přeskočen (cena={price})")
            continue

        # Area
        area = unit.get("total_area") or unit.get("floor_area") or 0
        if area < MIN_AREA:
            excluded["plocha"] += 1
            logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)")
            continue

        # Floor — units whose floor cannot be parsed are kept
        floor = _parse_floor(unit.get("floor", ""))
        if floor is not None and floor < MIN_FLOOR:
            excluded["patro"] += 1
            logger.debug(f"id={unit_id}: přeskočen (patro={floor})")
            continue

        # GPS — fix swapped coordinates
        lat, lng = fix_gps(unit.get("latitude"), unit.get("longitude"))
        if not lat or not lng:
            logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji")
            continue

        locality_str = _build_locality(unit, city)
        detail_url = _build_detail_url(unit)
        # Fall back to the locality when "project" is missing, null or empty.
        title_place = unit.get("project") or locality_str

        results.append({
            "hash_id": str(unit_id),
            "name": f"Prodej bytu {disp}, {int(area)} m² — {title_place}",
            "price": int(price),
            "price_formatted": format_price(int(price)),
            "locality": locality_str,
            "lat": lat,
            "lon": lng,
            "disposition": disp,
            "floor": floor,
            "area": float(area),
            "building_type": "neuvedeno",
            "ownership": "osobní",
            "url": detail_url,
            "source": "psn",
            "image": "",
            "scraped_at": scraped_at,
        })
        properties_fetched += 1

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky PSN:")
    logger.info(f" Staženo jednotek: {len(all_units)}")
    for reason, count in excluded.items():
        if count:
            logger.info(f" Vyloučeno ({reason}): {count}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")

    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line options: optional cap on results and log verbosity.
    cli = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
    cli.add_argument(
        "--max-properties",
        type=int,
        default=None,
        help="Maximum number of properties to include in results",
    )
    cli.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level (default: INFO)",
    )
    options = cli.parse_args()

    # Send log records to a stream handler at the requested level.
    logging.basicConfig(
        level=getattr(logging, options.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    started_at = time.time()
    found = scrape(max_properties=options.max_properties)

    if not found:
        # Nothing matched (or the download failed): no file is written.
        logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")
    else:
        output_file = Path("byty_psn.json")
        payload = json.dumps(found, ensure_ascii=False, indent=2)
        output_file.write_text(payload, encoding="utf-8")
        duration = time.time() - started_at
        logger.info(f"\n✓ Data uložena: {output_file.resolve()}")
        logger.info(f"⏱ Celkový čas: {duration:.1f} s")
|