Files
maru-hleda-byt/scrape_bezrealitky.py
2026-02-13 16:11:28 +00:00

352 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Bezrealitky.cz scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_bezrealitky.json
"""
from __future__ import annotations
import json
import math
import re
import time
import urllib.request
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000  # CZK — maximum asking price
MIN_AREA = 69  # m² — minimum floor area
MIN_FLOOR = 2  # minimum storey ("NP" = above-ground floor)
PER_PAGE = 15 # Bezrealitky returns 15 listings per page
# Layouts (dispositions) we want to keep
WANTED_DISPOSITIONS = {
    "DISP_3_KK", "DISP_3_1",
    "DISP_4_KK", "DISP_4_1",
    "DISP_5_KK", "DISP_5_1",
    "DISP_6",
    "DISP_OTHER", # atypical layouts
}
# API disposition codes → human-readable Czech labels used in the output JSON
DISPOSITION_LABELS = {
    "DISP_1_KK": "1+kk", "DISP_1_1": "1+1",
    "DISP_2_KK": "2+kk", "DISP_2_1": "2+1",
    "DISP_3_KK": "3+kk", "DISP_3_1": "3+1",
    "DISP_4_KK": "4+kk", "DISP_4_1": "4+1",
    "DISP_5_KK": "5+kk", "DISP_5_1": "5+1",
    "DISP_6": "6+",
    "DISP_OTHER": "Atypický",
}
# Building construction codes → Czech labels
CONSTRUCTION_MAP = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
    "STEEL": "Ocelová",
}
# Ownership codes → Czech labels
OWNERSHIP_MAP = {
    "OSOBNI": "Osobní",
    "DRUZSTEVNI": "Družstevní",
    "STATNI": "Státní/obecní",
}
# Browser-like headers so the site serves the regular HTML page
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}
BASE_URL = "https://www.bezrealitky.cz"
def fetch_page(page: int) -> tuple[list[dict], int]:
    """
    Fetch one listing page from Bezrealitky.

    Parses the Next.js ``__NEXT_DATA__`` JSON payload embedded in the HTML
    and pulls advert records out of the normalized Apollo GraphQL cache.

    Returns:
        (list of advert dicts from the Apollo cache, total result count);
        ``([], 0)`` when the payload cannot be located in the HTML.
    """
    url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
    req = urllib.request.Request(url, headers=HEADERS)
    # Close the HTTP response deterministically (the original leaked it).
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html, re.DOTALL
    )
    if not match:
        return [], 0
    data = json.loads(match.group(1))
    cache = data["props"]["pageProps"]["apolloCache"]
    # Collect advert entries from the cache; keys look like "Advert:<id>".
    adverts = [
        val
        for key, val in cache.items()
        if key.startswith("Advert:")
        and isinstance(val, dict)
        and val.get("__typename") == "Advert"
    ]
    # The total count lives under a listAdverts(...) key inside ROOT_QUERY;
    # several query variants may be cached, so keep the largest count seen.
    total = 0
    root = cache.get("ROOT_QUERY", {})
    for key, val in root.items():
        if "listAdverts" in key and isinstance(val, dict):
            tc = val.get("totalCount")
            if tc and tc > total:
                total = tc
    return adverts, total
def fetch_detail(uri: str) -> dict | None:
    """
    Fetch the detail page for a listing and return its full Advert dict.

    Detail pages expose extra fields (construction, etage, ownership) that
    list pages do not; the first cache entry carrying any of them is taken
    as the full record.

    Returns:
        The detail-level advert dict, or ``None`` when the page cannot be
        fetched or no such record is found. Failures are printed as a
        warning rather than raised, so one bad listing cannot abort a run.
    """
    try:
        url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
        req = urllib.request.Request(url, headers=HEADERS)
        # Close the HTTP response deterministically (the original leaked it).
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            return None
        data = json.loads(match.group(1))
        cache = data["props"]["pageProps"]["apolloCache"]
        # Find the full advert in the cache — detail pages have many more fields.
        for key, val in cache.items():
            if key.startswith("Advert:") and isinstance(val, dict):
                if "construction" in val or "etage" in val or "ownership" in val:
                    return val
    except Exception as e:
        # Best-effort scraping: log and fall through to None.
        print(f" Warning: detail failed for {uri}: {e}")
    return None
def format_price(price: int) -> str:
    """
    Format an integer price with spaces as thousands separators.

    E.g. ``13500000`` -> ``"13 500 000"``. Uses the format-spec ``,``
    grouping instead of the original hand-rolled slicing loop, which also
    ended in a dead ``+ ""`` (apparently a lost currency suffix).
    """
    return f"{price:,}".replace(",", " ")
def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
    """Load listings from a previous run, keyed by their ``hash_id``.

    Returns an empty dict when the file is absent or unreadable, so a
    fresh run simply starts without a cache.
    """
    cache_file = Path(json_path)
    if not cache_file.exists():
        return {}
    try:
        entries = json.loads(cache_file.read_text(encoding="utf-8"))
        return {
            entry["hash_id"]: entry
            for entry in entries
            if "hash_id" in entry
        }
    except (json.JSONDecodeError, KeyError):
        return {}
def scrape():
    """
    Scrape Prague apartment sale listings from Bezrealitky and filter them.

    Three phases: (1) page through the list endpoint and dedupe adverts,
    (2) pre-filter on disposition/price/area/GPS from list data alone,
    (3) fetch each remaining detail page and apply construction/floor
    filters, reusing cached results when the price is unchanged.

    Returns a list of result dicts ready to be written as JSON.
    """
    cache = load_cache()
    print("=" * 60)
    print("Stahuji inzeráty z Bezrealitky.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)
    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_adverts = {} # id -> advert dict (dedup)
    page = 1
    total = None
    while True:
        print(f" Strana {page} ...")
        adverts, total_count = fetch_page(page)
        # Remember the total once, to compute the page count for the loop bound.
        if total is None and total_count > 0:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")
        if not adverts:
            break
        for adv in adverts:
            adv_id = adv.get("id")
            if adv_id and adv_id not in all_adverts:
                all_adverts[adv_id] = adv
        page += 1
        # Stop once we are past the last expected page.
        if total and page > math.ceil(total / PER_PAGE):
            break
        time.sleep(0.5)
    print(f"\n Staženo: {len(all_adverts)} unikátních inzerátů")
    # Step 2: Pre-filter by disposition, price, area from list data
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0
    for adv in all_adverts.values():
        disp = adv.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue
        # Treat missing/zero price as unusable (likely "price on request").
        price = adv.get("price", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue
        # A missing surface is NOT excluded — only a known-too-small one is.
        surface = adv.get("surface")
        if surface is not None and surface < MIN_AREA:
            excluded_area += 1
            continue
        # GPS is required for the downstream map output.
        gps = adv.get("gps", {})
        if not gps or not gps.get("lat") or not gps.get("lng"):
            excluded_no_gps += 1
            continue
        pre_filtered.append(adv)
    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Zbývá: {len(pre_filtered)}")
    # Step 3: Fetch details
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0
    for i, adv in enumerate(pre_filtered):
        uri = adv.get("uri", "")
        if not uri:
            excluded_detail += 1
            continue
        # Check cache — if hash_id exists and price unchanged, reuse the
        # previous result and skip the (slow) detail fetch entirely.
        adv_id = int(adv["id"])
        adv_price = adv.get("price", 0) or 0
        cached = cache.get(adv_id)
        if cached and cached.get("price") == adv_price:
            cache_hits += 1
            results.append(cached)
            continue
        time.sleep(0.4)
        detail = fetch_detail(uri)
        if not detail:
            excluded_detail += 1
            continue
        # Check construction — exclude panel buildings.
        construction = detail.get("construction", "")
        if construction == "PANEL":
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: panel")
            continue
        # Check situation — exclude housing estates ("sídliště");
        # counted in the same bucket as panel exclusions.
        situation = detail.get("situation", "")
        if situation and "HOUSING_ESTATE" in str(situation).upper():
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: sídliště")
            continue
        # Check floor (etage); an unknown floor is kept.
        etage = detail.get("etage")
        if etage is not None and etage < MIN_FLOOR:
            excluded_floor += 1
            continue
        gps = adv.get("gps", {})
        disp = adv.get("disposition", "")
        # Get address — Apollo cache keys embed the GraphQL arguments, e.g.
        # 'address({"locale":"CS"})'; prefer a variant without house number.
        address = ""
        for key in detail:
            if key.startswith("address(") and "withHouseNumber" not in key:
                address = detail[key]
                break
        if not address:
            for key in detail:
                if key.startswith("address("):
                    address = detail[key]
                    break
        if not address:
            address = adv.get('address({"locale":"CS"})', "Praha")
        result = {
            "hash_id": int(adv["id"]),
            "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')}",
            "price": adv.get("price", 0),
            "price_formatted": format_price(adv.get("price", 0)),
            "locality": address,
            "lat": gps["lat"],
            "lon": gps["lng"],
            "disposition": DISPOSITION_LABELS.get(disp, "?"),
            "floor": etage,
            "area": adv.get("surface"),
            "building_type": CONSTRUCTION_MAP.get(construction, construction or "neuvedeno"),
            "ownership": OWNERSHIP_MAP.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
            "source": "bezrealitky",
            "image": "",
        }
        results.append(result)
        # Progress report every 20 processed listings.
        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")
    print(f"\n{'=' * 60}")
    print(f"Výsledky Bezrealitky:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # Run the scraper and persist the results as pretty-printed UTF-8 JSON.
    t0 = time.time()
    listings = scrape()
    if not listings:
        print("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
    else:
        json_path = Path("byty_bezrealitky.json")
        payload = json.dumps(listings, ensure_ascii=False, indent=2)
        json_path.write_text(payload, encoding="utf-8")
        elapsed = time.time() - t0
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")