Rewrite PSN + CityHome scrapers, add price/m² map coloring, ratings system, and status dashboard
- Rewrite PSN scraper to use /api/units-list endpoint (single API call, no HTML parsing)
- Fix CityHome scraper: GPS from multiple URL patterns, address from table cells, no 404 retries
- Color map markers by price/m² instead of disposition (blue→green→orange→red scale)
- Add persistent rating system (favorite/reject) with Flask ratings server and localStorage fallback
- Rejected markers show original color at reduced opacity with 🚫 SVG overlay
- Favorite markers shown as ⭐ star icons with gold pulse animation
- Add "new today" marker logic (scraped_at == today) with larger pulsing green outline
- Add filter panel with floor, price, hide-rejected controls and ☰/✕ toggle buttons
- Add generate_status.py for scraper run statistics and status.html dashboard
- Add scraped_at field to all scrapers for freshness tracking
- Update run_all.sh with log capture and status generation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
338
scrape_psn.py
338
scrape_psn.py
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PSN.cz scraper.
|
||||
Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
|
||||
Stáhne byty na prodej z API /api/units-list — jeden požadavek, žádné stránkování.
|
||||
Výstup: byty_psn.json
|
||||
"""
|
||||
from __future__ import annotations
|
||||
@@ -12,7 +12,9 @@ import logging
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlencode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -22,82 +24,37 @@ MAX_PRICE = 14_000_000
|
||||
MIN_AREA = 69
|
||||
MIN_FLOOR = 2
|
||||
|
||||
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
|
||||
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "5+kk a větší"}
|
||||
|
||||
# Pouze Praha — ostatní města (Brno, Pardubice, Špindlerův Mlýn) přeskočit
|
||||
WANTED_CITIES = {"Praha"}
|
||||
|
||||
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
|
||||
BASE_URL = "https://psn.cz"
|
||||
|
||||
# Known Prague project slugs with GPS (from research)
|
||||
PRAGUE_PROJECTS = [
|
||||
{"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
|
||||
{"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
|
||||
{"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
|
||||
{"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
|
||||
{"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
|
||||
{"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
|
||||
{"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
|
||||
{"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
|
||||
{"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
|
||||
{"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
|
||||
{"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
|
||||
{"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
|
||||
{"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
|
||||
{"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
|
||||
{"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
|
||||
{"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
|
||||
]
|
||||
UNITS_API = f"{BASE_URL}/api/units-list"
|
||||
|
||||
|
||||
def fetch_url(url: str) -> str:
|
||||
"""Fetch URL via curl (urllib SSL too old for Cloudflare)."""
|
||||
logger.debug(f"HTTP GET request (via curl): {url}")
|
||||
logger.debug(f"User-Agent: {UA}")
|
||||
def fetch_json(url: str) -> dict:
|
||||
"""Fetch JSON via curl (urllib SSL may fail on Cloudflare)."""
|
||||
logger.debug(f"HTTP GET: {url}")
|
||||
result = subprocess.run(
|
||||
["curl", "-s", "-L", "--max-time", "30",
|
||||
"-H", f"User-Agent: {UA}",
|
||||
"-H", "Accept: text/html",
|
||||
"-H", "Accept: application/json",
|
||||
url],
|
||||
capture_output=True, text=True, timeout=60
|
||||
)
|
||||
if result.returncode != 0:
|
||||
logger.error(f"curl failed (return code {result.returncode}): {result.stderr[:200]}")
|
||||
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
|
||||
logger.debug(f"HTTP response: size={len(result.stdout)} bytes")
|
||||
return result.stdout
|
||||
return json.loads(result.stdout)
|
||||
|
||||
|
||||
def extract_units_from_html(html: str, max_lookback: int = 3000) -> list[dict]:
    """Extract unit JSON objects from raw HTML that embeds escaped JSON.

    The page carries RSC data in which JSON quotes are backslash-escaped
    (``\\"key\\":\\"value\\"``). After unescaping, every occurrence of
    ``"title":"Byt`` is treated as a marker inside a unit object; the text is
    scanned backwards from the marker to the opening brace of the enclosing
    object, which is then decoded.

    Args:
        html: Raw page source.
        max_lookback: Maximum number of characters to scan backwards from a
            marker while searching for the enclosing ``{``.

    Returns:
        Decoded dicts that contain a ``price_czk`` key, in document order.
        Candidates that fail to decode or lack ``price_czk`` are dropped.
    """
    # Step 1: turn the backslash-escaped quotes into regular quotes.
    cleaned = html.replace('\\"', '"')

    units: list[dict] = []
    decoder = json.JSONDecoder()

    # Step 2: locate each unit by its title marker and walk back to its "{".
    for match in re.finditer(r'"title":"Byt', cleaned):
        pos = match.start()
        lower = max(pos - max_lookback, 0)
        depth = 0
        # range() excludes its stop value, so stop at lower - 1; otherwise an
        # object opening at index 0 of the payload would never be inspected.
        for i in range(pos - 1, lower - 1, -1):
            ch = cleaned[i]
            if ch == '}':
                # A nested object closed before the marker; skip its opener.
                depth += 1
            elif ch == '{':
                if depth > 0:
                    depth -= 1
                    continue
                # Unmatched "{" — this opens the object containing the marker.
                try:
                    obj, _end = decoder.raw_decode(cleaned, i)
                except ValueError:
                    # Not valid JSON from here (JSONDecodeError is a ValueError).
                    pass
                else:
                    if isinstance(obj, dict) and 'price_czk' in obj:
                        units.append(obj)
                break

    return units
|
||||
def fix_gps(lat: float | None, lng: float | None) -> tuple[float | None, float | None]:
    """Return ``(lat, lng)``, swapping the pair when it looks transposed.

    PSN reports latitude and longitude swapped for some projects. The
    Prague coordinates used in this scraper have latitude around 50 and
    longitude around 14, so a pair with ``lat < 20`` and ``lng > 20`` is
    treated as transposed and returned in corrected order.

    Args:
        lat: Latitude as reported by the API, or ``None`` when missing.
        lng: Longitude as reported by the API, or ``None`` when missing.

    Returns:
        The ``(lat, lng)`` tuple, swapped only when both values are present
        and match the transposed pattern; otherwise the inputs unchanged.
    """
    if lat is None or lng is None:
        # Nothing to compare; the caller decides what to do with missing GPS.
        return lat, lng
    if lat < 20 and lng > 20:
        return lng, lat
    return lat, lng
|
||||
|
||||
|
||||
def format_price(price: int) -> str:
|
||||
@@ -109,209 +66,178 @@ def format_price(price: int) -> str:
|
||||
return " ".join(reversed(parts)) + " Kč"
|
||||
|
||||
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
def scrape(max_properties: int | None = None):
|
||||
logger.info("=" * 60)
|
||||
logger.info("Stahuji inzeráty z PSN.cz")
|
||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||
logger.info(f"Min. plocha: {MIN_AREA} m²")
|
||||
logger.info(f"Patro: od {MIN_FLOOR}. NP")
|
||||
logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
|
||||
if max_pages:
|
||||
logger.info(f"Max. stran: {max_pages}")
|
||||
logger.info(f"Region: Praha")
|
||||
if max_properties:
|
||||
logger.info(f"Max. bytů: {max_properties}")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Fetch units from each Prague project
|
||||
all_units = []
|
||||
# Jediný API požadavek — vrátí všechny jednotky (cca 236)
|
||||
params = urlencode({
|
||||
"locale": "cs",
|
||||
"filters": "{}",
|
||||
"type": "list",
|
||||
"order": "price-asc",
|
||||
"offset": 0,
|
||||
"limit": 500,
|
||||
})
|
||||
url = f"{UNITS_API}?{params}"
|
||||
logger.info("Stahuji jednotky z API ...")
|
||||
|
||||
for proj in PRAGUE_PROJECTS:
|
||||
page = 1
|
||||
project_units = []
|
||||
try:
|
||||
data = fetch_json(url)
|
||||
except Exception as e:
|
||||
logger.error(f"Chyba při stahování: {e}", exc_info=True)
|
||||
return []
|
||||
|
||||
while True:
|
||||
if max_pages and page > max_pages:
|
||||
logger.debug(f"Max pages limit reached: {max_pages}")
|
||||
break
|
||||
url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
|
||||
logger.info(f"{proj['name']} — strana {page} ...")
|
||||
time.sleep(0.5)
|
||||
all_units = data.get("units", {}).get("data", [])
|
||||
logger.info(f"Staženo jednotek celkem: {len(all_units)}")
|
||||
|
||||
try:
|
||||
html = fetch_url(url)
|
||||
except Exception as e:
|
||||
logger.error(f"Fetch error for {proj['name']}: {e}", exc_info=True)
|
||||
break
|
||||
|
||||
units = extract_units_from_html(html)
|
||||
logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units")
|
||||
|
||||
if not units:
|
||||
if page == 1:
|
||||
logger.info(f"→ 0 jednotek")
|
||||
break
|
||||
|
||||
# Add project info to each unit
|
||||
for unit in units:
|
||||
if not unit.get("latitude") or not unit.get("longitude"):
|
||||
unit["latitude"] = proj["lat"]
|
||||
unit["longitude"] = proj["lon"]
|
||||
unit["_project_name"] = proj["name"]
|
||||
unit["_project_slug"] = proj["slug"]
|
||||
|
||||
project_units.extend(units)
|
||||
|
||||
if page == 1:
|
||||
logger.info(f"→ {len(units)} jednotek na stránce")
|
||||
|
||||
# Check if there might be more pages
|
||||
# If we got fewer than expected or same units, stop
|
||||
if len(units) < 10:
|
||||
break
|
||||
|
||||
page += 1
|
||||
if page > 10: # Safety limit
|
||||
break
|
||||
|
||||
all_units.extend(project_units)
|
||||
|
||||
# Deduplicate by slug
|
||||
seen_slugs = set()
|
||||
unique_units = []
|
||||
for u in all_units:
|
||||
slug = u.get("slug", "")
|
||||
if slug and slug not in seen_slugs:
|
||||
seen_slugs.add(slug)
|
||||
unique_units.append(u)
|
||||
elif not slug:
|
||||
unique_units.append(u)
|
||||
|
||||
logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek")
|
||||
|
||||
# Filter
|
||||
logger.info(f"\nFiltrování...")
|
||||
# Filtrování
|
||||
results = []
|
||||
excluded_sold = 0
|
||||
excluded_type = 0
|
||||
excluded_disp = 0
|
||||
excluded_price = 0
|
||||
excluded_area = 0
|
||||
excluded_floor = 0
|
||||
excluded_panel = 0
|
||||
excluded = {
|
||||
"prodáno": 0,
|
||||
"typ": 0,
|
||||
"město": 0,
|
||||
"dispozice": 0,
|
||||
"cena": 0,
|
||||
"plocha": 0,
|
||||
"patro": 0,
|
||||
}
|
||||
properties_fetched = 0
|
||||
|
||||
for unit in unique_units:
|
||||
for unit in all_units:
|
||||
if max_properties and properties_fetched >= max_properties:
|
||||
logger.debug(f"Max properties limit reached: {max_properties}")
|
||||
break
|
||||
unit_id = unit.get("id", unit.get("slug", "unknown"))
|
||||
# Only free units
|
||||
|
||||
unit_id = unit.get("id", "?")
|
||||
|
||||
# Pouze prodej bytů (type_id=0)
|
||||
if unit.get("type_id") != 0:
|
||||
excluded["typ"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (type_id={unit.get('type_id')}, není prodej bytu)")
|
||||
continue
|
||||
|
||||
# Pouze volné (ne rezervované, prodané, v přípravě)
|
||||
sale_status = unit.get("sale_status", "")
|
||||
is_free = unit.get("is_free", False)
|
||||
is_sold = unit.get("is_sold", False)
|
||||
if is_sold or not is_free:
|
||||
excluded_sold += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)")
|
||||
excluded["prodáno"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (status={sale_status})")
|
||||
continue
|
||||
|
||||
# Only apartments
|
||||
category = str(unit.get("category", "")).lower()
|
||||
if "byt" not in category and "ateliér" not in category:
|
||||
excluded_type += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})")
|
||||
# Pouze Praha
|
||||
city = (unit.get("location") or unit.get("address", {}).get("city") or "").strip()
|
||||
# location field je typicky "Praha 4", "Praha 7" atd.
|
||||
city_base = city.split(" ")[0] if city else ""
|
||||
if city_base not in WANTED_CITIES:
|
||||
excluded["město"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (město={city})")
|
||||
continue
|
||||
|
||||
# Disposition
|
||||
# Dispozice
|
||||
disp = unit.get("disposition", "")
|
||||
if disp not in WANTED_DISPOSITIONS:
|
||||
excluded_disp += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})")
|
||||
excluded["dispozice"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (dispozice={disp})")
|
||||
continue
|
||||
|
||||
# Price
|
||||
price = unit.get("price_czk") or unit.get("action_price_czk") or 0
|
||||
if price <= 0 or price > MAX_PRICE:
|
||||
excluded_price += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (price {price})")
|
||||
# Cena
|
||||
price = unit.get("action_price_czk") or unit.get("price_czk") or 0
|
||||
if not price or price <= 0 or price > MAX_PRICE:
|
||||
excluded["cena"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (cena={price})")
|
||||
continue
|
||||
|
||||
# Area
|
||||
# Plocha
|
||||
area = unit.get("total_area") or unit.get("floor_area") or 0
|
||||
if area < MIN_AREA:
|
||||
excluded_area += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)")
|
||||
excluded["plocha"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (plocha={area} m²)")
|
||||
continue
|
||||
|
||||
# Floor
|
||||
# Patro
|
||||
floor_str = str(unit.get("floor", ""))
|
||||
floor = None
|
||||
if floor_str:
|
||||
try:
|
||||
floor = int(floor_str)
|
||||
except ValueError:
|
||||
floor_match = re.search(r'(-?\d+)', floor_str)
|
||||
if floor_match:
|
||||
floor = int(floor_match.group(1))
|
||||
m = re.search(r'(-?\d+)', floor_str)
|
||||
if m:
|
||||
floor = int(m.group(1))
|
||||
|
||||
if floor is not None and floor < MIN_FLOOR:
|
||||
excluded_floor += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})")
|
||||
excluded["patro"] += 1
|
||||
logger.debug(f"id={unit_id}: přeskočen (patro={floor})")
|
||||
continue
|
||||
|
||||
# Construction — check for panel
|
||||
build_type = str(unit.get("build_type", "")).lower()
|
||||
if "panel" in build_type:
|
||||
excluded_panel += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (panel construction)")
|
||||
logger.info(f"✗ Vyloučen: panel ({build_type})")
|
||||
# GPS — opravit prohozené souřadnice
|
||||
lat_raw = unit.get("latitude")
|
||||
lng_raw = unit.get("longitude")
|
||||
lat, lng = fix_gps(lat_raw, lng_raw)
|
||||
if not lat or not lng:
|
||||
logger.warning(f"id={unit_id}: chybí GPS souřadnice, přeskakuji")
|
||||
continue
|
||||
|
||||
# Build construction label
|
||||
building_type = "neuvedeno"
|
||||
if build_type and build_type != "nevybráno":
|
||||
if "cihlo" in build_type or "cihla" in build_type:
|
||||
building_type = "Cihlová"
|
||||
elif "skelet" in build_type:
|
||||
building_type = "Skeletová"
|
||||
else:
|
||||
building_type = build_type.capitalize()
|
||||
# Sestavit adresu pro locality
|
||||
addr = unit.get("address") or {}
|
||||
street = addr.get("street", "")
|
||||
street_no = addr.get("street_no", "")
|
||||
if street and street_no:
|
||||
locality_str = f"{street} {street_no}, {city}"
|
||||
elif street:
|
||||
locality_str = f"{street}, {city}"
|
||||
else:
|
||||
project_name = unit.get("project", "")
|
||||
locality_str = f"{project_name}, {city}" if project_name else city
|
||||
|
||||
lat = unit.get("latitude", 0)
|
||||
lon = unit.get("longitude", 0)
|
||||
|
||||
slug = unit.get("slug", "")
|
||||
project_slug = unit.get("_project_slug", "")
|
||||
detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"
|
||||
# URL na detail jednotky
|
||||
unit_slug = unit.get("slug", "")
|
||||
project_slug = ""
|
||||
# project_slug lze odvodit z projektu nebo z reference_no
|
||||
# API nevrací project_slug přímo — použijeme reference_no nebo jen ID
|
||||
reference_no = unit.get("reference_no", "")
|
||||
if unit_slug:
|
||||
detail_url = f"{BASE_URL}/prodej/{unit_slug}"
|
||||
elif reference_no:
|
||||
detail_url = f"{BASE_URL}/prodej/{reference_no}"
|
||||
else:
|
||||
detail_url = BASE_URL
|
||||
|
||||
result = {
|
||||
"hash_id": unit.get("id", slug),
|
||||
"name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
|
||||
"hash_id": str(unit_id),
|
||||
"name": f"Prodej bytu {disp}, {int(area)} m² — {unit.get('project', locality_str)}",
|
||||
"price": int(price),
|
||||
"price_formatted": format_price(int(price)),
|
||||
"locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha",
|
||||
"locality": locality_str,
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"lon": lng,
|
||||
"disposition": disp,
|
||||
"floor": floor,
|
||||
"area": area,
|
||||
"building_type": building_type,
|
||||
"ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
|
||||
"area": float(area),
|
||||
"building_type": "neuvedeno",
|
||||
"ownership": "osobní",
|
||||
"url": detail_url,
|
||||
"source": "psn",
|
||||
"image": "",
|
||||
"scraped_at": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
logger.info(f"\n{'=' * 60}")
|
||||
logger.info(f"Výsledky PSN:")
|
||||
logger.info(f" Celkem jednotek: {len(unique_units)}")
|
||||
logger.info(f" Vyloučeno (prodáno): {excluded_sold}")
|
||||
logger.info(f" Vyloučeno (typ): {excluded_type}")
|
||||
logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
|
||||
logger.info(f" Vyloučeno (cena): {excluded_price}")
|
||||
logger.info(f" Vyloučeno (plocha): {excluded_area}")
|
||||
logger.info(f" Vyloučeno (patro): {excluded_floor}")
|
||||
logger.info(f" Vyloučeno (panel): {excluded_panel}")
|
||||
logger.info(f" Staženo jednotek: {len(all_units)}")
|
||||
for reason, count in excluded.items():
|
||||
if count:
|
||||
logger.info(f" Vyloučeno ({reason}): {count}")
|
||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||
logger.info(f"{'=' * 60}")
|
||||
|
||||
@@ -320,15 +246,13 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
|
||||
parser.add_argument("--max-pages", type=int, default=None,
|
||||
help="Maximum number of listing pages per project to scrape")
|
||||
parser.add_argument("--max-properties", type=int, default=None,
|
||||
help="Maximum number of properties to include in results")
|
||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
parser.add_argument("--log-level", type=str, default="INFO",
|
||||
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
help="Logging level (default: INFO)")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, args.log_level),
|
||||
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
|
||||
@@ -336,7 +260,7 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
start = time.time()
|
||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||
estates = scrape(max_properties=args.max_properties)
|
||||
|
||||
if estates:
|
||||
json_path = Path("byty_psn.json")
|
||||
@@ -346,6 +270,6 @@ if __name__ == "__main__":
|
||||
)
|
||||
elapsed = time.time() - start
|
||||
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
|
||||
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
|
||||
logger.info(f"⏱ Celkový čas: {elapsed:.1f} s")
|
||||
else:
|
||||
logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")
|
||||
|
||||
Reference in New Issue
Block a user