Files
maru-hleda-byt/scrape_idnes.py
Jan Novak 0b95c847c4 Add first_seen/last_updated timestamps to track property freshness
Each property record now carries two date fields:
- first_seen: date the listing first appeared (preserved across runs)
- last_updated: date of the most recent scrape that included it

All 6 scrapers (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
set these fields during scraping. Cached results preserve first_seen and
refresh last_updated. PSN and CityHome gain a load_previous() helper to
track first_seen across runs (they lacked caching before).

The merge script keeps the earliest first_seen and latest last_updated
when deduplicating listings across sources.

The HTML map now shows dates in popups ("Přidáno: DD.MM.YYYY"), displays
a green "NOVÉ" badge on newly discovered listings, and adds a "Přidáno"
dropdown filter (24h / 3 days / 7 days / 14 days) for spotting new ones.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 21:03:08 +01:00

525 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Reality iDNES scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_idnes.json
"""
from __future__ import annotations

import argparse
import json
import logging
import math
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime
from html.parser import HTMLParser
from pathlib import Path
logger = logging.getLogger(__name__)
# ── Konfigurace ─────────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000
MIN_AREA = 69
MIN_FLOOR = 2
PER_PAGE = 26 # iDNES vrací 26 na stránku
# Dispozice — kódy pro s-qc[subtypeFlat]
DISPOSITION_CODES = "3k|31|4k|41|5k|51|6k"
# Mapování dispozice z titulku na label
DISPOSITION_MAP = {
"3+kk": "3+kk", "3+1": "3+1",
"4+kk": "4+kk", "4+1": "4+1",
"5+kk": "5+kk", "5+1": "5+1",
"6+kk": "6+", "6+1": "6+",
"6 a více": "6+",
}
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "cs,en;q=0.9",
"Accept-Encoding": "identity",
"Connection": "keep-alive",
}
BASE_URL = "https://reality.idnes.cz"
MAX_RETRIES = 5
def fetch_url(url: str) -> str:
"""Fetch URL and return HTML string with retry logic."""
for attempt in range(MAX_RETRIES):
try:
logger.debug(f"HTTP GET request (attempt {attempt + 1}/{MAX_RETRIES}): {url}")
logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30)
data = resp.read()
logger.debug(f"HTTP response: status={resp.status}, size={len(data)} bytes")
return data.decode("utf-8")
except (ConnectionResetError, ConnectionError, urllib.error.URLError,
OSError) as e:
if attempt < MAX_RETRIES - 1:
wait = (attempt + 1) * 3 # 3, 6, 9, 12s
logger.warning(f"Connection error (retry {attempt + 1}/{MAX_RETRIES} after {wait}s): {e}")
time.sleep(wait)
else:
logger.error(f"HTTP request failed after {MAX_RETRIES} attempts: {e}", exc_info=True)
raise
def build_list_url(page: int = 0) -> str:
"""Build listing URL with all filters."""
base = f"{BASE_URL}/s/prodej/byty/cena-do-{MAX_PRICE}/praha/"
params = {
"s-qc[subtypeFlat]": DISPOSITION_CODES,
"s-qc[usableAreaMin]": str(MIN_AREA),
}
url = f"{base}?{urllib.parse.urlencode(params)}"
if page > 0:
url += f"&page={page}"
return url
def parse_total_count(html: str) -> int:
"""Extract total listing count from page."""
# Look for "720 inzerátů" or similar
match = re.search(r'(\d[\d\s]*)\s*inzerát', html)
if match:
return int(match.group(1).replace(" ", "").replace("\xa0", ""))
return 0
def parse_listings(html: str) -> list[dict]:
"""Parse listing cards from HTML using regex."""
results = []
# Find each listing block — look for c-products__link with detail URL
# Pattern: <a ... class="c-products__link" href="/detail/..."> ... block ... </a>
# Each listing card contains: title (h2), price (strong), info (p.c-products__info)
# Split by listing items, skip ads
items = re.findall(
r'<div[^>]*class="c-products__item(?:(?!advertisment)[^"]*)"[^>]*>(.*?)</div>\s*</div>\s*</div>',
html, re.DOTALL
)
# Alternative: find all detail links and extract surrounding context
# More robust approach: find each detail link and parse nearby elements
link_pattern = re.compile(
r'<a[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*class="c-products__link"[^>]*>',
re.DOTALL
)
# Also match when class comes before href
link_pattern2 = re.compile(
r'<a[^>]*class="c-products__link"[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*>',
re.DOTALL
)
# Find all c-products__link anchors
all_links = link_pattern.findall(html) + link_pattern2.findall(html)
seen_urls = set()
# For each link, find the surrounding product block
for link_url in all_links:
if link_url in seen_urls:
continue
seen_urls.add(link_url)
# Find context around this link (the product card)
escaped_url = re.escape(link_url)
context_match = re.search(
escaped_url + r'(.*?)</div>\s*</div>',
html, re.DOTALL
)
if not context_match:
continue
block = context_match.group(1)
# Ensure full URL
url = link_url
if not url.startswith("http"):
url = BASE_URL + url
# Skip ads
ad_check_start = max(0, context_match.start() - 500)
ad_block = html[ad_check_start:context_match.start()]
if "advertisment" in ad_block or "advertisement" in ad_block:
continue
# Parse title: <h2 class="c-products__title">prodej bytu 3+kk 79 m2</h2>
title_match = re.search(r'class="c-products__title"[^>]*>(.*?)</h2>', block, re.DOTALL)
title = re.sub(r'<[^>]+>', '', title_match.group(1)).strip().lower() if title_match else ""
# Parse price: <p class="c-products__price"><strong>12 950 000 Kč</strong></p>
price_match = re.search(r'c-products__price[^>]*>.*?<strong>(.*?)</strong>', block, re.DOTALL)
price_text = re.sub(r'<[^>]+>', '', price_match.group(1)).strip() if price_match else ""
# Parse address: <p class="c-products__info">Klečkova, Praha 5 - Stodůlky</p>
info_match = re.search(r'class="c-products__info"[^>]*>(.*?)</p>', block, re.DOTALL)
info = re.sub(r'<[^>]+>', '', info_match.group(1)).strip() if info_match else ""
# Parse disposition and area from title
disp_match = re.search(r'(\d\+(?:kk|\d))', title)
area_match = re.search(r'(\d+)\s*m[²2]', title)
disposition = disp_match.group(1) if disp_match else None
area = int(area_match.group(1)) if area_match else None
if not disposition and ("6 a" in title or "6+" in title):
disposition = "6+"
# Parse price
price = 0
if price_text and "vyžádání" not in price_text.lower():
price_clean = re.sub(r'[^\d]', '', price_text)
if price_clean:
price = int(price_clean)
# Extract listing ID from URL
id_match = re.search(r'/([a-f0-9]{24})/?', url)
listing_id = id_match.group(1) if id_match else url
results.append({
"id": listing_id,
"url": url,
"disposition": DISPOSITION_MAP.get(disposition, disposition or "?"),
"area": area,
"price": price,
"locality": info,
})
return results
def parse_detail(html: str) -> dict:
"""Parse detail page for GPS, floor, construction, ownership."""
detail = {}
# 1. Parse dataLayer.push() for GPS and other data
dl_match = re.search(
r'dataLayer\.push\(\s*(\{[^}]+?"listing_lat"[^}]+?\})\s*\)',
html, re.DOTALL
)
if dl_match:
# Clean up JS object to valid JSON
js_obj = dl_match.group(1)
# Replace single quotes with double, handle trailing commas, etc.
# The dataLayer is usually valid JSON-like, let's try parsing
try:
# Remove JS comments, handle unquoted keys
# Most importantly: listing_lat, listing_lon, listing_price, listing_area
lat_match = re.search(r'"listing_lat"\s*:\s*([\d.]+)', js_obj)
lon_match = re.search(r'"listing_lon"\s*:\s*([\d.]+)', js_obj)
if lat_match:
detail["lat"] = float(lat_match.group(1))
if lon_match:
detail["lon"] = float(lon_match.group(1))
except (ValueError, AttributeError):
pass
# 2. Parse DT/DD pairs for floor, construction, ownership
# Pattern: <dt>Label</dt><dd>Value</dd>
dt_dd_pairs = re.findall(
r'<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>',
html, re.DOTALL
)
for dt, dd in dt_dd_pairs:
dt_clean = re.sub(r'<[^>]+>', '', dt).strip().lower()
dd_clean = re.sub(r'<[^>]+>', '', dd).strip()
if "podlaží" in dt_clean or "podlazi" in dt_clean or "patro" in dt_clean:
# "2. patro (3. NP)" or "3. podlaží z celkem 5"
# Try to find NP first
np_match = re.search(r'(\d+)\.\s*NP', dd_clean)
if np_match:
detail["floor"] = int(np_match.group(1))
else:
# Try "X. patro" — patro = NP - 1 usually, but iDNES seems to use NP directly
patro_match = re.search(r'(\d+)', dd_clean)
if patro_match:
detail["floor"] = int(patro_match.group(1))
if "konstrukce" in dt_clean or "stavba" in dt_clean:
detail["construction"] = dd_clean.lower()
if "vlastnictví" in dt_clean or "vlastnictvi" in dt_clean:
detail["ownership"] = dd_clean
return detail
def format_price(price: int) -> str:
s = str(price)
parts = []
while s:
parts.append(s[-3:])
s = s[:-3]
return " ".join(reversed(parts)) + ""
def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
"""Load previously scraped data as cache keyed by hash_id."""
path = Path(json_path)
if not path.exists():
return {}
try:
data = json.loads(path.read_text(encoding="utf-8"))
return {str(e["hash_id"]): e for e in data if "hash_id" in e}
except (json.JSONDecodeError, KeyError):
return {}
def scrape(max_pages: int | None = None, max_properties: int | None = None):
cache = load_cache()
logger.info("=" * 60)
logger.info("Stahuji inzeráty z Reality iDNES")
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
logger.info(f"Min. plocha: {MIN_AREA}")
logger.info(f"Patro: od {MIN_FLOOR}. NP")
logger.info(f"Region: Praha")
if cache:
logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
if max_pages:
logger.info(f"Max. stran: {max_pages}")
if max_properties:
logger.info(f"Max. bytů: {max_properties}")
logger.info("=" * 60)
# Step 1: Fetch listing pages
logger.info("\nFáze 1: Stahování seznamu inzerátů...")
all_listings = {} # id -> listing dict
page = 0
total = None
while True:
if max_pages and page >= max_pages:
logger.debug(f"Max pages limit reached: {max_pages}")
break
url = build_list_url(page)
logger.info(f"Strana {page + 1} ...")
html = fetch_url(url)
if total is None:
total = parse_total_count(html)
total_pages = math.ceil(total / PER_PAGE) if total > 0 else 1
logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran")
listings = parse_listings(html)
logger.debug(f"Page {page}: found {len(listings)} listings")
if not listings:
logger.debug(f"No listings found on page {page}, stopping")
break
for item in listings:
lid = item["id"]
if lid not in all_listings:
all_listings[lid] = item
page += 1
if total and page >= math.ceil(total / PER_PAGE):
break
time.sleep(1.0)
logger.info(f"\nStaženo: {len(all_listings)} unikátních inzerátů")
# Step 2: Pre-filter by price and area from list data
pre_filtered = []
excluded_price = 0
excluded_area = 0
excluded_disp = 0
for item in all_listings.values():
item_id = item["id"]
if item["price"] <= 0 or item["price"] > MAX_PRICE:
excluded_price += 1
logger.debug(f"Filter: id={item_id} - excluded (price {item['price']})")
continue
if item["area"] is not None and item["area"] < MIN_AREA:
excluded_area += 1
logger.debug(f"Filter: id={item_id} - excluded (area {item['area']} m²)")
continue
if item["disposition"] == "?":
excluded_disp += 1
logger.debug(f"Filter: id={item_id} - excluded (unknown disposition)")
continue
pre_filtered.append(item)
logger.info(f"\nPo předfiltraci:")
logger.info(f" Vyloučeno (cena): {excluded_price}")
logger.info(f" Vyloučeno (plocha): {excluded_area}")
logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
logger.info(f" Zbývá: {len(pre_filtered)}")
# Step 3: Fetch details for GPS, floor, construction
logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
results = []
excluded_panel = 0
excluded_floor = 0
excluded_no_gps = 0
excluded_detail = 0
cache_hits = 0
properties_fetched = 0
for i, item in enumerate(pre_filtered):
if max_properties and properties_fetched >= max_properties:
logger.debug(f"Max properties limit reached: {max_properties}")
break
# Check cache — if hash_id exists and price unchanged, reuse
today = datetime.now().strftime("%Y-%m-%d")
cached = cache.get(str(item["id"]))
if cached and cached.get("price") == item["price"]:
cache_hits += 1
logger.debug(f"Cache hit for id={item['id']}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached)
continue
url = item["url"]
time.sleep(0.4)
try:
html = fetch_url(url)
except Exception as e:
excluded_detail += 1
logger.warning(f"Detail failed for id={item['id']}: {e}")
continue
detail = parse_detail(html)
logger.debug(f"Detail parsed for id={item['id']}: lat={detail.get('lat')}, lon={detail.get('lon')}, floor={detail.get('floor')}")
# Must have GPS
if not detail.get("lat") or not detail.get("lon"):
excluded_no_gps += 1
logger.debug(f"Filter: id={item['id']} - excluded (no GPS)")
continue
# Check construction — exclude panel
construction = detail.get("construction", "")
if "panel" in construction:
excluded_panel += 1
logger.debug(f"Filter: id={item['id']} - excluded (panel construction)")
logger.info(f"✗ Vyloučen {item['id'][:12]}...: panel ({construction})")
continue
# Check for sídliště in construction/description
if "sídliště" in construction or "sidliste" in construction:
excluded_panel += 1
logger.debug(f"Filter: id={item['id']} - excluded (housing estate)")
logger.info(f"✗ Vyloučen {item['id'][:12]}...: sídliště")
continue
# Check floor
floor = detail.get("floor")
if floor is not None and floor < MIN_FLOOR:
excluded_floor += 1
logger.debug(f"Filter: id={item['id']} - excluded (floor {floor})")
continue
# Map construction to Czech label
building_type = "neuvedeno"
if construction:
if "cihlo" in construction or "cihla" in construction:
building_type = "Cihlová"
elif "smíšen" in construction or "smisen" in construction:
building_type = "Smíšená"
elif "skelet" in construction:
building_type = "Skeletová"
elif "dřevo" in construction or "drevo" in construction:
building_type = "Dřevostavba"
elif "mont" in construction:
building_type = "Montovaná"
else:
building_type = construction.capitalize()
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = {
"hash_id": item["id"],
"name": f"Prodej bytu {item['disposition']} {item.get('area', '?')}",
"price": item["price"],
"price_formatted": format_price(item["price"]),
"locality": item["locality"],
"lat": detail["lat"],
"lon": detail["lon"],
"disposition": item["disposition"],
"floor": floor,
"area": item["area"],
"building_type": building_type,
"ownership": detail.get("ownership", "neuvedeno"),
"url": item["url"],
"source": "idnes",
"image": "",
"first_seen": first_seen,
"last_updated": today,
}
results.append(result)
properties_fetched += 1
if (i + 1) % 20 == 0:
logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")
logger.info(f"\n{'=' * 60}")
logger.info(f"Výsledky Reality iDNES:")
logger.info(f" Předfiltrováno: {len(pre_filtered)}")
logger.info(f" Z cache (přeskočeno): {cache_hits}")
logger.info(f" Vyloučeno (panel/síd): {excluded_panel}")
logger.info(f" Vyloučeno (patro): {excluded_floor}")
logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
logger.info(f" Vyloučeno (bez detailu): {excluded_detail}")
logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}")
return results
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape apartments from Reality iDNES")
parser.add_argument("--max-pages", type=int, default=None,
help="Maximum number of listing pages to scrape")
parser.add_argument("--max-properties", type=int, default=None,
help="Maximum number of properties to fetch details for")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)")
args = parser.parse_args()
# Configure logging
logging.basicConfig(
level=getattr(logging, args.log_level),
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
handlers=[logging.StreamHandler()]
)
start = time.time()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates:
json_path = Path("byty_idnes.json")
json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8",
)
elapsed = time.time() - start
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
logger.info("\nŽádné byty z Reality iDNES neodpovídají kritériím :(")