Files
maru-hleda-byt/scrape_realingo.py
Jan Novak 0b95c847c4 Add first_seen/last_updated timestamps to track property freshness
Each property record now carries two date fields:
- first_seen: date the listing first appeared (preserved across runs)
- last_updated: date of the most recent scrape that included it

All 6 scrapers (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
set these fields during scraping. Cached results preserve first_seen and
refresh last_updated. PSN and CityHome gain a load_previous() helper to
track first_seen across runs (they lacked caching before).

The merge script keeps the earliest first_seen and latest last_updated
when deduplicating listings across sources.

The HTML map now shows dates in popups ("Přidáno: DD.MM.YYYY"), displays
a green "NOVÉ" badge on newly discovered listings, and adds a "Přidáno"
dropdown filter (24h / 3 days / 7 days / 14 days) for spotting new ones.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 21:03:08 +01:00

380 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Realingo.cz scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_realingo.json
"""
from __future__ import annotations

import argparse
import json
import logging
import math
import re
import time
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
logger = logging.getLogger(__name__)
# ── Configuration (shared with the Sreality scraper) ────────────────────────
MAX_PRICE = 13_500_000  # maximum total price (presumably CZK — matches Czech log output)
MIN_AREA = 69  # minimum usable area in m² (logged as "Min. plocha")
MIN_FLOOR = 2  # minimum floor, Czech "NP" numbering
PER_PAGE = 40  # Realingo returns 40 listings per page
# Categories we want (layouts 3+kk and larger)
WANTED_CATEGORIES = {
    "FLAT3_KK", "FLAT31",  # 3+kk, 3+1
    "FLAT4_KK", "FLAT41",  # 4+kk, 4+1
    "FLAT5_KK", "FLAT51",  # 5+kk, 5+1
    "FLAT6",               # 6+
    "OTHERS_FLAT",         # atypical — we check the area instead
}
# Mapping: category code → human-readable layout label
CATEGORY_LABELS = {
    "FLAT1_KK": "1+kk", "FLAT11": "1+1",
    "FLAT2_KK": "2+kk", "FLAT21": "2+1",
    "FLAT3_KK": "3+kk", "FLAT31": "3+1",
    "FLAT4_KK": "4+kk", "FLAT41": "4+1",
    "FLAT5_KK": "5+kk", "FLAT51": "5+1",
    "FLAT6": "6+",
    "OTHERS_FLAT": "Atypický",
}
# Browser-like headers — presumably to avoid bot blocking of urllib's default UA
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml",
}
BASE_URL = "https://www.realingo.cz"
def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
    """Fetch a page of Prague listings. Returns (items, total_count).

    Realingo is a Next.js app, so the listing data lives in the embedded
    ``__NEXT_DATA__`` JSON blob of the server-rendered HTML.

    Returns ([], 0) when the blob is missing; network failures are logged
    and re-raised so the caller can abort the run.
    """
    # Page 1 has a different URL shape than the paginated pages.
    if page == 1:
        url = f"{BASE_URL}/prodej_byty/praha/"
    else:
        url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"
    logger.debug(f"HTTP GET request: {url}")
    logger.debug(f"Headers: {HEADERS}")
    req = urllib.request.Request(url, headers=HEADERS)
    try:
        # Context manager closes the connection even if decoding fails
        # (the original leaked the response object).
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
            logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            logger.debug("No __NEXT_DATA__ script found in HTML")
            return [], 0
        data = json.loads(match.group(1))
        offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
        logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}")
        return offer_list["data"], offer_list["total"]
    except (urllib.error.URLError, ConnectionError, OSError) as e:
        logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
        raise
def fetch_detail(listing_url: str) -> dict | None:
    """Fetch detail page for a listing to get floor, building type, etc.

    Best-effort: any failure (network, parse, missing keys) is logged as a
    warning and yields None so the caller can simply skip the listing.
    """
    try:
        url = f"{BASE_URL}{listing_url}"
        logger.debug(f"HTTP GET request: {url}")
        req = urllib.request.Request(url, headers=HEADERS)
        # Context manager closes the connection deterministically
        # (the original leaked the response object).
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
            logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            logger.debug("No __NEXT_DATA__ script found in detail page")
            return None
        data = json.loads(match.group(1))
        details = data["props"]["pageProps"]["store"]["offer"]["details"]
        # `details` is a dict keyed by listing id with (apparently) a single
        # entry; take the first value, or None when it is empty.
        detail_data = next(iter(details.values()), None)
        if detail_data is not None:
            logger.debug(f"Detail fetched for {listing_url}")
        return detail_data
    except Exception as e:
        # Deliberately broad: one bad detail must not abort the whole run.
        logger.warning(f"Detail fetch failed for {listing_url}: {e}", exc_info=True)
        return None
def format_price(price: int) -> str:
    """Format an integer price with spaces as thousands separators.

    E.g. 13500000 -> "13 500 000" (Czech convention).
    """
    # The format-spec "," groups by thousands; swap commas for spaces.
    # Replaces the hand-rolled 3-char chunking loop, which also carried a
    # dead `+ ""` at the end.
    return f"{price:,}".replace(",", " ")
def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
    """Load previously scraped data as cache keyed by hash_id.

    Returns {} when the file is missing or unreadable — a broken cache is
    never fatal, it just costs extra detail fetches on this run.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        return {e["hash_id"]: e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers JSON that is not a list of dicts (e.g. a bare
        # number or nested scalars) — previously this would crash the run.
        return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None):
    """Scrape Realingo flats for sale in Prague, applying the shared filters.

    Runs in two phases: (1) page through the listing index, pre-filtering on
    category/price/area/GPS that are already present there; (2) fetch each
    remaining listing's detail page to check floor and building type.
    Entries cached from a previous run with an unchanged price are reused,
    preserving ``first_seen`` and refreshing ``last_updated``.

    Args:
        max_pages: optional cap on listing pages fetched in phase 1.
        max_properties: optional cap on detail fetches in phase 2
            (cache hits do not count against this limit).

    Returns:
        List of property dicts ready to be written to byty_realingo.json.
    """
    cache = load_cache()
    # One timestamp for the entire run (first_seen / last_updated fields);
    # previously recomputed on every loop iteration.
    today = datetime.now().strftime("%Y-%m-%d")
    # Loop-invariant translation tables, hoisted out of the detail loop.
    bt_map = {
        "BRICK": "Cihlová",
        "PANEL": "Panelová",
        "WOOD": "Dřevostavba",
        "STEEL": "Ocelová",
        "MIXED": "Smíšená",
        "MONTAGE": "Montovaná",
    }
    ownership_map = {
        "PRIVATE": "Osobní",
        "COOPERATIVE": "Družstevní",
        "STATE": "Státní/obecní",
    }
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z Realingo.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA}")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info("Region: Praha")
    if cache:
        logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
    if max_pages:
        logger.info(f"Max. stran: {max_pages}")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # ── Phase 1: fetch all listing index pages ─────────────────────────
    logger.info("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = []
    page = 1
    total = None
    while True:
        if max_pages and page > max_pages:
            logger.debug(f"Max pages limit reached: {max_pages}")
            break
        logger.info(f"Strana {page} ...")
        items, total_count = fetch_listing_page(page)
        if total is None:
            # First page tells us the grand total → compute page count once.
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            logger.info(f"→ Celkem {total} inzerátů, {total_pages} stran")
        if not items:
            logger.debug(f"No items found on page {page}, stopping")
            break
        all_listings.extend(items)
        page += 1
        if page > total_pages:
            break
        time.sleep(0.5)  # throttle between index pages
    logger.info(f"\nStaženo: {len(all_listings)} inzerátů")

    # ── Pre-filter on category, price, area, GPS from listing data ─────
    pre_filtered = []
    excluded_category = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0
    for item in all_listings:
        item_id = item.get("id")
        cat = item.get("category", "")
        if cat not in WANTED_CATEGORIES:
            excluded_category += 1
            logger.debug(f"Filter: id={item_id} - excluded (category {cat})")
            continue
        price = item.get("price", {}).get("total", 0) or 0
        # Zero means "price on request" — exclude along with too expensive.
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            logger.debug(f"Filter: id={item_id} - excluded (price {price})")
            continue
        area = item.get("area", {}).get("main")
        # Area may be missing in the index; only reject when known too small.
        if area is not None and area < MIN_AREA:
            excluded_area += 1
            logger.debug(f"Filter: id={item_id} - excluded (area {area} m²)")
            continue
        loc = item.get("location", {})
        if not loc.get("latitude") or not loc.get("longitude"):
            excluded_no_gps += 1
            logger.debug(f"Filter: id={item_id} - excluded (no GPS)")
            continue
        pre_filtered.append(item)
    logger.info(f"\nPo předfiltraci:")
    logger.info(f" Vyloučeno (dispozice): {excluded_category}")
    logger.info(f" Vyloučeno (cena): {excluded_price}")
    logger.info(f" Vyloučeno (plocha): {excluded_area}")
    logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    logger.info(f" Zbývá: {len(pre_filtered)}")

    # ── Phase 2: fetch details (floor, building type) ──────────────────
    logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0
    properties_fetched = 0
    for i, item in enumerate(pre_filtered):
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break
        # Cache check — when hash_id exists and the price is unchanged,
        # reuse the cached record and skip the detail HTTP request.
        item_id = int(item["id"])
        item_price = item.get("price", {}).get("total", 0) or 0
        cached = cache.get(item_id)
        if cached and cached.get("price") == item_price:
            cache_hits += 1
            logger.debug(f"Cache hit for id={item_id}")
            cached["last_updated"] = today
            if "first_seen" not in cached:
                # Records written before the timestamp feature lack it.
                cached["first_seen"] = today
            results.append(cached)
            continue
        time.sleep(0.3)  # throttle detail requests
        detail_data = fetch_detail(item["url"])
        if not detail_data:
            excluded_detail += 1
            logger.debug(f"Filter: id={item_id} - excluded (detail fetch failed)")
            continue
        detail = detail_data.get("offer", {}).get("detail", {})
        if not detail and "detail" in detail_data:
            # Some payloads nest the detail one level shallower.
            detail = detail_data["detail"]
        # Exclude panel (prefab concrete) buildings.
        building_type = detail.get("buildingType", "")
        if building_type == "PANEL":
            excluded_panel += 1
            logger.debug(f"Filter: id={item['id']} - excluded (panel construction)")
            logger.info(f"✗ Vyloučen #{item['id']}: panel")
            continue
        # Exclude housing-estate locations ("sídliště").
        building_position = detail.get("buildingPosition", "")
        if building_position and "ESTATE" in str(building_position).upper():
            excluded_panel += 1
            logger.debug(f"Filter: id={item['id']} - excluded (building estate)")
            logger.info(f"✗ Vyloučen #{item['id']}: sídliště")
            continue
        # Floor check — unknown floor passes through.
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            logger.debug(f"Filter: id={item_id} - excluded (floor {floor})")
            continue
        cat = item.get("category", "")
        loc = item.get("location", {})
        # Preserve first_seen from cache when re-fetching after a price change.
        first_seen = today
        if cached and "first_seen" in cached:
            first_seen = cached["first_seen"]
        result = {
            "hash_id": int(item["id"]),
            "name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')}",
            "price": item.get("price", {}).get("total", 0),
            "price_formatted": format_price(item.get("price", {}).get("total", 0)),
            "locality": loc.get("address", "Praha"),
            "lat": loc["latitude"],
            "lon": loc["longitude"],
            "disposition": CATEGORY_LABELS.get(cat, "?"),
            "floor": floor,
            "area": item.get("area", {}).get("main"),
            "building_type": bt_map.get(building_type, building_type or "neuvedeno"),
            "ownership": ownership_map.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}{item['url']}",
            "source": "realingo",
            "image": "",
            "first_seen": first_seen,
            "last_updated": today,
        }
        results.append(result)
        properties_fetched += 1
        if (i + 1) % 20 == 0:
            logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")

    logger.info(f"\n{'=' * 60}")
    logger.info(f"Výsledky Realingo:")
    logger.info(f" Předfiltrováno: {len(pre_filtered)}")
    logger.info(f" Z cache (přeskočeno): {cache_hits}")
    logger.info(f" Vyloučeno (panel/síd): {excluded_panel}")
    logger.info(f" Vyloučeno (patro): {excluded_floor}")
    logger.info(f" Vyloučeno (bez detailu): {excluded_detail}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # Command-line interface for running the scraper standalone.
    cli = argparse.ArgumentParser(description="Scrape apartments from Realingo.cz")
    cli.add_argument("--max-pages", type=int, default=None,
                     help="Maximum number of listing pages to scrape")
    cli.add_argument("--max-properties", type=int, default=None,
                     help="Maximum number of properties to fetch details for")
    cli.add_argument("--log-level", type=str, default="INFO",
                     choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                     help="Logging level (default: INFO)")
    opts = cli.parse_args()

    # Root-logger setup: level from the CLI, timestamped messages to stderr.
    logging.basicConfig(
        level=getattr(logging, opts.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    started_at = time.time()
    estates = scrape(max_pages=opts.max_pages, max_properties=opts.max_properties)
    if not estates:
        logger.info("\nŽádné byty z Realinga neodpovídají kritériím :(")
    else:
        # Persist results; UTF-8 with ensure_ascii=False keeps Czech text readable.
        out_file = Path("byty_realingo.json")
        out_file.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        duration = time.time() - started_at
        logger.info(f"\n✓ Data uložena: {out_file.resolve()}")
        logger.info(f"⏱ Celkový čas: {duration:.0f} s")