Upload files to "/"

v1 scrapery
This commit is contained in:
2026-02-13 16:11:28 +00:00
parent 82d1f94104
commit 846d0bd9f2
5 changed files with 1760 additions and 0 deletions

306
scrape_psn.py Normal file
View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
PSN.cz scraper.
Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
Výstup: byty_psn.json
"""
from __future__ import annotations
import json
import re
import subprocess
import time
from pathlib import Path
# ── Konfigurace ─────────────────────────────────────────────────────────────
MAX_PRICE = 14_000_000
MIN_AREA = 69
MIN_FLOOR = 2
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
BASE_URL = "https://psn.cz"
# Known Prague project slugs with GPS (from research)
PRAGUE_PROJECTS = [
{"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
{"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
{"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
{"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
{"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
{"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
{"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
{"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
{"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
{"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
{"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
{"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
{"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
{"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
{"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
{"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]
def fetch_url(url: str) -> str:
"""Fetch URL via curl (urllib SSL too old for Cloudflare)."""
result = subprocess.run(
["curl", "-s", "-L", "--max-time", "30",
"-H", f"User-Agent: {UA}",
"-H", "Accept: text/html",
url],
capture_output=True, text=True, timeout=60
)
if result.returncode != 0:
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
return result.stdout
def extract_units_from_html(html: str) -> list[dict]:
"""Extract unit JSON objects from raw HTML with escaped quotes."""
# The HTML contains RSC data with escaped JSON: \\"key\\":\\"value\\"
# Step 1: Unescape the double-backslash-quotes to regular quotes
cleaned = html.replace('\\"', '"')
# Step 2: Find each unit by looking for "title":"Byt and walking back to {
units = []
decoder = json.JSONDecoder()
for m in re.finditer(r'"title":"Byt', cleaned):
pos = m.start()
# Walk backwards to find the opening brace
depth = 0
found = False
for i in range(pos - 1, max(pos - 3000, 0), -1):
if cleaned[i] == '}':
depth += 1
elif cleaned[i] == '{':
if depth == 0:
try:
obj, end = decoder.raw_decode(cleaned, i)
if isinstance(obj, dict) and 'price_czk' in obj:
units.append(obj)
found = True
except (json.JSONDecodeError, ValueError):
pass
break
depth -= 1
return units
def format_price(price: int) -> str:
s = str(price)
parts = []
while s:
parts.append(s[-3:])
s = s[:-3]
return " ".join(reversed(parts)) + ""
def scrape():
print("=" * 60)
print("Stahuji inzeráty z PSN.cz")
print(f"Cena: do {format_price(MAX_PRICE)}")
print(f"Min. plocha: {MIN_AREA}")
print(f"Patro: od {MIN_FLOOR}. NP")
print(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
print("=" * 60)
# Fetch units from each Prague project
all_units = []
for proj in PRAGUE_PROJECTS:
page = 1
project_units = []
while True:
url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
print(f" {proj['name']} — strana {page} ...")
time.sleep(0.5)
try:
html = fetch_url(url)
except Exception as e:
print(f" Chyba: {e}")
break
units = extract_units_from_html(html)
if not units:
if page == 1:
print(f" → 0 jednotek")
break
# Add project info to each unit
for unit in units:
if not unit.get("latitude") or not unit.get("longitude"):
unit["latitude"] = proj["lat"]
unit["longitude"] = proj["lon"]
unit["_project_name"] = proj["name"]
unit["_project_slug"] = proj["slug"]
project_units.extend(units)
if page == 1:
print(f"{len(units)} jednotek na stránce")
# Check if there might be more pages
# If we got fewer than expected or same units, stop
if len(units) < 10:
break
page += 1
if page > 10: # Safety limit
break
all_units.extend(project_units)
# Deduplicate by slug
seen_slugs = set()
unique_units = []
for u in all_units:
slug = u.get("slug", "")
if slug and slug not in seen_slugs:
seen_slugs.add(slug)
unique_units.append(u)
elif not slug:
unique_units.append(u)
print(f"\n Staženo celkem: {len(unique_units)} unikátních jednotek")
# Filter
print(f"\nFiltrování...")
results = []
excluded_sold = 0
excluded_type = 0
excluded_disp = 0
excluded_price = 0
excluded_area = 0
excluded_floor = 0
excluded_panel = 0
for unit in unique_units:
# Only free units
is_free = unit.get("is_free", False)
is_sold = unit.get("is_sold", False)
if is_sold or not is_free:
excluded_sold += 1
continue
# Only apartments
category = str(unit.get("category", "")).lower()
if "byt" not in category and "ateliér" not in category:
excluded_type += 1
continue
# Disposition
disp = unit.get("disposition", "")
if disp not in WANTED_DISPOSITIONS:
excluded_disp += 1
continue
# Price
price = unit.get("price_czk") or unit.get("action_price_czk") or 0
if price <= 0 or price > MAX_PRICE:
excluded_price += 1
continue
# Area
area = unit.get("total_area") or unit.get("floor_area") or 0
if area < MIN_AREA:
excluded_area += 1
continue
# Floor
floor_str = str(unit.get("floor", ""))
floor = None
if floor_str:
try:
floor = int(floor_str)
except ValueError:
floor_match = re.search(r'(-?\d+)', floor_str)
if floor_match:
floor = int(floor_match.group(1))
if floor is not None and floor < MIN_FLOOR:
excluded_floor += 1
continue
# Construction — check for panel
build_type = str(unit.get("build_type", "")).lower()
if "panel" in build_type:
excluded_panel += 1
print(f" ✗ Vyloučen: panel ({build_type})")
continue
# Build construction label
building_type = "neuvedeno"
if build_type and build_type != "nevybráno":
if "cihlo" in build_type or "cihla" in build_type:
building_type = "Cihlová"
elif "skelet" in build_type:
building_type = "Skeletová"
else:
building_type = build_type.capitalize()
lat = unit.get("latitude", 0)
lon = unit.get("longitude", 0)
slug = unit.get("slug", "")
project_slug = unit.get("_project_slug", "")
detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"
result = {
"hash_id": unit.get("id", slug),
"name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
"price": int(price),
"price_formatted": format_price(int(price)),
"locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha",
"lat": lat,
"lon": lon,
"disposition": disp,
"floor": floor,
"area": area,
"building_type": building_type,
"ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
"url": detail_url,
"source": "psn",
"image": "",
}
results.append(result)
print(f"\n{'=' * 60}")
print(f"Výsledky PSN:")
print(f" Celkem jednotek: {len(unique_units)}")
print(f" Vyloučeno (prodáno): {excluded_sold}")
print(f" Vyloučeno (typ): {excluded_type}")
print(f" Vyloučeno (dispozice): {excluded_disp}")
print(f" Vyloučeno (cena): {excluded_price}")
print(f" Vyloučeno (plocha): {excluded_area}")
print(f" Vyloučeno (patro): {excluded_floor}")
print(f" Vyloučeno (panel): {excluded_panel}")
print(f" ✓ Vyhovující byty: {len(results)}")
print(f"{'=' * 60}")
return results
if __name__ == "__main__":
start = time.time()
estates = scrape()
if estates:
json_path = Path("byty_psn.json")
json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8",
)
elapsed = time.time() - start
print(f"\n✓ Data uložena: {json_path.resolve()}")
print(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
print("\nŽádné byty z PSN neodpovídají kritériím :(")