Each property record now carries two date fields:
- first_seen: date the listing first appeared (preserved across runs)
- last_updated: date of the most recent scrape that included it
All 6 scrapers (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
set these fields during scraping. Cached results preserve first_seen and
refresh last_updated. PSN and CityHome gain a load_previous() helper to
track first_seen across runs (they lacked caching before).
The merge script keeps the earliest first_seen and latest last_updated
when deduplicating listings across sources.
The HTML map now shows dates in popups ("Přidáno: DD.MM.YYYY"), displays
a green "NOVÉ" badge on newly discovered listings, and adds a "Přidáno"
dropdown filter (24h / 3 days / 7 days / 14 days) for spotting new ones.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
421 lines · 14 KiB · Python
#!/usr/bin/env python3
"""
Bezrealitky.cz scraper.

Downloads flats for sale in Prague and filters them by the configured
criteria (price, area, floor, disposition, construction type).

Output: byty_bezrealitky.json
"""
|
|
from __future__ import annotations

import argparse
import json
import logging
import math
import re
import time
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 13_500_000  # CZK — listings above this price are excluded
MIN_AREA = 69           # m² — minimum usable area
MIN_FLOOR = 2           # minimum floor number (NP); ground-floor flats excluded
PER_PAGE = 15  # Bezrealitky returns 15 listings per page

# Dispositions (layouts) we want to keep; keys are Bezrealitky API codes.
WANTED_DISPOSITIONS = {
    "DISP_3_KK", "DISP_3_1",
    "DISP_4_KK", "DISP_4_1",
    "DISP_5_KK", "DISP_5_1",
    "DISP_6",
    "DISP_OTHER",  # atypical layouts
}

# API disposition code -> human-readable Czech label used in output records.
DISPOSITION_LABELS = {
    "DISP_1_KK": "1+kk", "DISP_1_1": "1+1",
    "DISP_2_KK": "2+kk", "DISP_2_1": "2+1",
    "DISP_3_KK": "3+kk", "DISP_3_1": "3+1",
    "DISP_4_KK": "4+kk", "DISP_4_1": "4+1",
    "DISP_5_KK": "5+kk", "DISP_5_1": "5+1",
    "DISP_6": "6+",
    "DISP_OTHER": "Atypický",
}

# API construction code -> Czech label (brick/panel/wood/mixed/prefab/steel).
CONSTRUCTION_MAP = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
    "STEEL": "Ocelová",
}

# API ownership code -> Czech label (personal/cooperative/state-municipal).
OWNERSHIP_MAP = {
    "OSOBNI": "Osobní",
    "DRUZSTEVNI": "Družstevní",
    "STATNI": "Státní/obecní",
}

# Browser-like headers so the site serves the regular HTML (with __NEXT_DATA__).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

BASE_URL = "https://www.bezrealitky.cz"
|
|
|
def fetch_page(page: int) -> tuple[list[dict], int]:
    """Fetch one listing page from Bezrealitky.

    Parses the Next.js ``__NEXT_DATA__`` JSON blob embedded in the HTML
    and extracts advert records from the Apollo client cache.

    Args:
        page: 1-based listing page number.

    Returns:
        Tuple of (advert dicts found in the Apollo cache, total listing
        count reported by the server — 0 when not found).

    Raises:
        urllib.error.URLError / ConnectionError / OSError: on HTTP failure
        (logged, then re-raised to the caller).
    """
    url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
    logger.debug(f"HTTP GET request: {url}")
    logger.debug(f"Headers: {HEADERS}")
    req = urllib.request.Request(url, headers=HEADERS)
    try:
        # Use a context manager so the connection is closed deterministically
        # instead of leaking the socket until garbage collection.
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
            status = resp.status
        logger.debug(f"HTTP response: status={status}, size={len(html)} bytes")

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            logger.debug("No __NEXT_DATA__ script found in HTML")
            return [], 0

        data = json.loads(match.group(1))
        cache = data["props"]["pageProps"]["apolloCache"]

        # Apollo normalizes entities under "Advert:<id>" cache keys.
        adverts = [
            val for key, val in cache.items()
            if key.startswith("Advert:")
            and isinstance(val, dict)
            and val.get("__typename") == "Advert"
        ]

        # The total result count lives under ROOT_QUERY's listAdverts entries;
        # take the largest value seen (several variants may be cached).
        total = 0
        root = cache.get("ROOT_QUERY", {})
        for key, val in root.items():
            if "listAdverts" in key and isinstance(val, dict):
                tc = val.get("totalCount")
                if tc and tc > total:
                    total = tc

        logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
        return adverts, total
    except (urllib.error.URLError, ConnectionError, OSError) as e:
        logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
        raise
|
|
|
|
|
|
def fetch_detail(uri: str) -> dict | None:
    """Fetch and parse the detail page for one listing.

    Detail pages carry fields the list view lacks (construction, etage,
    ownership); the first Apollo-cache advert containing any of them is
    returned.

    Args:
        uri: listing URI slug appended to the detail-page base path.

    Returns:
        The detailed advert dict, or None when the fetch fails or no
        detailed record is present (best-effort: errors are logged, not
        raised, so one bad listing doesn't abort the whole run).
    """
    url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
    try:
        logger.debug(f"HTTP GET request: {url}")
        req = urllib.request.Request(url, headers=HEADERS)
        # Context manager closes the connection deterministically (the
        # original relied on GC to release the socket).
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
            logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            logger.debug("No __NEXT_DATA__ script found in detail page")
            return None

        data = json.loads(match.group(1))
        cache = data["props"]["pageProps"]["apolloCache"]

        # Find the full advert in the cache — detail pages have many more
        # fields than list pages; their presence identifies the record.
        for key, val in cache.items():
            if key.startswith("Advert:") and isinstance(val, dict):
                if "construction" in val or "etage" in val or "ownership" in val:
                    logger.debug(f"Detail found for {uri}: construction={val.get('construction')}, etage={val.get('etage')}")
                    return val

    except Exception as e:
        # Deliberately broad: a failed detail only excludes this listing.
        logger.warning(f"Detail failed for {uri}: {e}", exc_info=True)
    return None
|
|
|
|
|
|
def format_price(price: int) -> str:
    """Format a price with space-separated thousands, e.g. 13500000 -> '13 500 000 Kč'."""
    digits = str(price)
    groups: list[str] = []
    # Peel off three digits at a time from the right, building the groups
    # left-to-right as we go.
    while digits:
        groups.insert(0, digits[-3:])
        digits = digits[:-3]
    return " ".join(groups) + " Kč"
|
|
|
|
|
|
def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
    """Load previously scraped data as a cache keyed by hash_id.

    The cache lets the scraper skip detail fetches for listings whose
    price is unchanged and preserve ``first_seen`` across runs.

    Args:
        json_path: path to the JSON output of a previous run.

    Returns:
        Mapping of hash_id -> entry dict; empty dict when the file is
        missing, unreadable, or malformed.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        return {e["hash_id"]: e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError, OSError):
        # TypeError: non-dict entries would crash `"hash_id" in e`;
        # OSError: file vanished/unreadable between exists() and read_text().
        # A bad cache must degrade to "no cache", never abort the scrape.
        return {}
|
|
|
|
|
|
def scrape(max_pages: int | None = None, max_properties: int | None = None):
    """Scrape Prague apartment listings from Bezrealitky.cz.

    Runs in three phases:
      1. Page through the listing index and deduplicate adverts by id.
      2. Pre-filter on data already present in the list payload
         (disposition, price, area, GPS) to avoid needless detail fetches.
      3. Fetch detail pages for the survivors and filter on construction
         type, housing-estate location and floor. Listings whose price is
         unchanged since the last run are reused from cache (preserving
         first_seen, refreshing last_updated) without a detail fetch.

    Args:
        max_pages: stop after this many listing pages (None = all).
        max_properties: stop after this many fresh detail fetches
            (cache hits do not count toward the limit; None = unlimited).

    Returns:
        List of result dicts ready to be serialized to byty_bezrealitky.json.
    """
    cache = load_cache()

    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z Bezrealitky.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info(f"Region: Praha")
    if cache:
        logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
    if max_pages:
        logger.info(f"Max. stran: {max_pages}")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Phase 1: fetch all listing pages, deduplicating adverts by id.
    logger.info("\nFáze 1: Stahování seznamu inzerátů...")
    all_adverts = {}  # id -> advert dict (dedup)
    page = 1
    total = None  # server-reported total, learned from the first page

    while True:
        if max_pages and page > max_pages:
            logger.debug(f"Max pages limit reached: {max_pages}")
            break
        logger.info(f"Strana {page} ...")
        adverts, total_count = fetch_page(page)

        # Latch the total once (first page that reports a positive count).
        if total is None and total_count > 0:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran")

        if not adverts:
            logger.debug(f"No adverts found on page {page}, stopping")
            break

        for adv in adverts:
            adv_id = adv.get("id")
            if adv_id and adv_id not in all_adverts:
                all_adverts[adv_id] = adv

        page += 1
        # Stop once we've walked past the last computed page.
        if total and page > math.ceil(total / PER_PAGE):
            break
        time.sleep(0.5)  # be polite to the server

    logger.info(f"\nStaženo: {len(all_adverts)} unikátních inzerátů")

    # Phase 2: pre-filter by disposition, price, area, GPS from list data —
    # cheap checks that avoid a detail-page fetch per rejected listing.
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0

    for adv in all_adverts.values():
        adv_id = adv.get("id")
        disp = adv.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            logger.debug(f"Filter: id={adv_id} - excluded (disposition {disp})")
            continue

        # Zero price means "price on request" — treated as excluded.
        price = adv.get("price", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            logger.debug(f"Filter: id={adv_id} - excluded (price {price})")
            continue

        # Missing surface passes here; only a known-too-small area excludes.
        surface = adv.get("surface")
        if surface is not None and surface < MIN_AREA:
            excluded_area += 1
            logger.debug(f"Filter: id={adv_id} - excluded (area {surface} m²)")
            continue

        # GPS is mandatory — the downstream map needs coordinates.
        gps = adv.get("gps", {})
        if not gps or not gps.get("lat") or not gps.get("lng"):
            excluded_no_gps += 1
            logger.debug(f"Filter: id={adv_id} - excluded (no GPS)")
            continue

        pre_filtered.append(adv)

    logger.info(f"\nPo předfiltraci:")
    logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
    logger.info(f" Vyloučeno (cena): {excluded_price}")
    logger.info(f" Vyloučeno (plocha): {excluded_area}")
    logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    logger.info(f" Zbývá: {len(pre_filtered)}")

    # Phase 3: fetch detail pages and apply the detail-only filters.
    logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0
    properties_fetched = 0  # counts fresh detail fetches only, not cache hits

    for i, adv in enumerate(pre_filtered):
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break
        uri = adv.get("uri", "")
        if not uri:
            excluded_detail += 1
            logger.debug(f"Filter: id={adv.get('id')} - excluded (no URI)")
            continue

        # Check cache — if hash_id exists and price unchanged, reuse the
        # cached record: refresh last_updated, backfill first_seen for
        # records written before these fields existed.
        adv_id = int(adv["id"])
        adv_price = adv.get("price", 0) or 0
        today = datetime.now().strftime("%Y-%m-%d")
        cached = cache.get(adv_id)
        if cached and cached.get("price") == adv_price:
            cache_hits += 1
            logger.debug(f"Cache hit for id={adv_id}")
            cached["last_updated"] = today
            if "first_seen" not in cached:
                cached["first_seen"] = today
            results.append(cached)
            continue

        time.sleep(0.4)  # throttle detail requests
        detail = fetch_detail(uri)

        if not detail:
            excluded_detail += 1
            logger.debug(f"Filter: id={adv_id} - excluded (detail fetch failed)")
            continue

        # Check construction — exclude panel buildings.
        construction = detail.get("construction", "")
        if construction == "PANEL":
            excluded_panel += 1
            logger.debug(f"Filter: id={adv['id']} - excluded (panel construction)")
            logger.info(f"✗ Vyloučen #{adv['id']}: panel")
            continue

        # Check situation — exclude housing estates (sídliště).
        situation = detail.get("situation", "")
        if situation and "HOUSING_ESTATE" in str(situation).upper():
            excluded_panel += 1
            logger.debug(f"Filter: id={adv['id']} - excluded (housing estate)")
            logger.info(f"✗ Vyloučen #{adv['id']}: sídliště")
            continue

        # Check floor (etage); missing etage passes.
        etage = detail.get("etage")
        if etage is not None and etage < MIN_FLOOR:
            excluded_floor += 1
            logger.debug(f"Filter: id={adv_id} - excluded (floor {etage})")
            continue

        gps = adv.get("gps", {})
        disp = adv.get("disposition", "")

        # Get address — the Apollo cache key embeds a locale parameter,
        # e.g. 'address({"locale":"CS"})'; prefer the variant without a
        # house number, fall back to any address key, then to the list
        # payload, then to a plain "Praha".
        address = ""
        for key in detail:
            if key.startswith("address(") and "withHouseNumber" not in key:
                address = detail[key]
                break
        if not address:
            for key in detail:
                if key.startswith("address("):
                    address = detail[key]
                    break
        if not address:
            address = adv.get('address({"locale":"CS"})', "Praha")

        # Preserve first_seen from cache if this is a price-changed re-fetch.
        first_seen = today
        if cached and "first_seen" in cached:
            first_seen = cached["first_seen"]

        result = {
            "hash_id": int(adv["id"]),
            "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')} m²",
            "price": adv.get("price", 0),
            "price_formatted": format_price(adv.get("price", 0)),
            "locality": address,
            "lat": gps["lat"],
            "lon": gps["lng"],
            "disposition": DISPOSITION_LABELS.get(disp, "?"),
            "floor": etage,
            "area": adv.get("surface"),
            "building_type": CONSTRUCTION_MAP.get(construction, construction or "neuvedeno"),
            "ownership": OWNERSHIP_MAP.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
            "source": "bezrealitky",
            "image": "",
            "first_seen": first_seen,
            "last_updated": today,
        }
        results.append(result)
        properties_fetched += 1

        # Progress heartbeat every 20 listings.
        if (i + 1) % 20 == 0:
            logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")

    logger.info(f"\n{'=' * 60}")
    logger.info(f"Výsledky Bezrealitky:")
    logger.info(f" Předfiltrováno: {len(pre_filtered)}")
    logger.info(f" Z cache (přeskočeno): {cache_hits}")
    logger.info(f" Vyloučeno (panel/síd): {excluded_panel}")
    logger.info(f" Vyloučeno (patro): {excluded_floor}")
    logger.info(f" Vyloučeno (bez detailu): {excluded_detail}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")

    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI: both limits default to unlimited; --log-level controls verbosity.
    parser = argparse.ArgumentParser(description="Scrape apartments from Bezrealitky.cz")
    parser.add_argument("--max-pages", type=int, default=None,
                        help="Maximum number of listing pages to scrape")
    parser.add_argument("--max-properties", type=int, default=None,
                        help="Maximum number of properties to fetch details for")
    parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level (default: INFO)")
    args = parser.parse_args()

    # Configure logging (stderr stream handler).
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()]
    )

    start = time.time()
    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)

    # Only write the output file when something matched; an empty run
    # leaves any previous byty_bezrealitky.json untouched.
    if estates:
        json_path = Path("byty_bezrealitky.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
        logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        logger.info("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
|