All checks were successful
Build and Push / build (push) Successful in 7s
The Docker entrypoint previously created symlinks from /app/ to /app/data/ so that scripts writing relative paths would persist to the mounted volume. This caused symlink loops in production when stale symlinks leaked into the host data directory. Instead, all scrapers, merge_and_map.py, regen_map.py, and run_all.sh now accept a --data-dir argument (default: ".") that controls where data files are read from and written to. The entrypoint and crontab pass --data-dir /app/data, eliminating the need for symlinks entirely. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
528 lines
19 KiB
Python
528 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Reality iDNES scraper.
|
|
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
|
|
Výstup: byty_idnes.json
|
|
"""
|
|
from __future__ import annotations

import argparse
import json
import logging
import math
import re
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime
from html.parser import HTMLParser
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Configuration ───────────────────────────────────────────────────────────

MAX_PRICE = 13_500_000  # maximum price in CZK
MIN_AREA = 69  # minimum usable area in m²
MIN_FLOOR = 2  # minimum floor, counted in NP (above-ground storeys)
PER_PAGE = 26  # iDNES returns 26 listings per page

# Disposition codes for the s-qc[subtypeFlat] query parameter
DISPOSITION_CODES = "3k|31|4k|41|5k|51|6k"

# Maps the disposition token found in a listing title to a display label
DISPOSITION_MAP = {
    "3+kk": "3+kk", "3+1": "3+1",
    "4+kk": "4+kk", "4+1": "4+1",
    "5+kk": "5+kk", "5+1": "5+1",
    "6+kk": "6+", "6+1": "6+",
    "6 a více": "6+",
}

# Browser-like request headers. "Accept-Encoding: identity" asks the server
# not to compress, so the response body can be decoded directly as UTF-8.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "cs,en;q=0.9",
    "Accept-Encoding": "identity",
    "Connection": "keep-alive",
}

BASE_URL = "https://reality.idnes.cz"

MAX_RETRIES = 5  # attempts per HTTP request before giving up
|
|
|
|
|
|
def fetch_url(url: str) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Retries up to MAX_RETRIES times on connection-level failures with a
    linearly growing backoff (3 s, 6 s, 9 s, ...); re-raises the last
    error once the retry budget is exhausted.

    Raises:
        OSError: when all MAX_RETRIES attempts fail (includes
            urllib.error.URLError and ConnectionError subclasses).
    """
    for attempt in range(MAX_RETRIES):
        try:
            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{MAX_RETRIES}): {url}")
            logger.debug(f"Headers: {HEADERS}")
            req = urllib.request.Request(url, headers=HEADERS)
            # Context manager guarantees the connection is closed even if
            # read() fails — the original leaked the response object.
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = resp.read()
                logger.debug(f"HTTP response: status={resp.status}, size={len(data)} bytes")
            return data.decode("utf-8")
        except OSError as e:
            # OSError already covers ConnectionResetError, ConnectionError
            # and urllib.error.URLError (all are OSError subclasses), so a
            # single clause is equivalent to the previous four-way tuple.
            if attempt < MAX_RETRIES - 1:
                wait = (attempt + 1) * 3  # 3, 6, 9, 12 s
                logger.warning(f"Connection error (retry {attempt + 1}/{MAX_RETRIES} after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after {MAX_RETRIES} attempts: {e}", exc_info=True)
                raise
|
|
|
|
|
|
def build_list_url(page: int = 0) -> str:
    """Compose the search-results URL for the given zero-based page.

    Encodes the disposition and minimum-area filters as query parameters;
    the maximum price and region are part of the URL path itself.
    """
    query = urllib.parse.urlencode({
        "s-qc[subtypeFlat]": DISPOSITION_CODES,
        "s-qc[usableAreaMin]": str(MIN_AREA),
    })
    url = f"{BASE_URL}/s/prodej/byty/cena-do-{MAX_PRICE}/praha/?{query}"
    # Page 0 is the default results page and carries no page parameter.
    return url if page <= 0 else f"{url}&page={page}"
|
|
|
|
|
|
def parse_total_count(html: str) -> int:
    """Return the total listing count advertised on the page, or 0.

    Looks for a phrase like "720 inzerátů"; thousands may be separated
    by regular or non-breaking spaces ("1 234" / "1\xa0234").
    """
    hit = re.search(r'(\d[\d\s]*)\s*inzerát', html)
    if hit is None:
        return 0
    digits = hit.group(1).replace(" ", "").replace("\xa0", "")
    return int(digits)
|
|
|
|
|
|
def parse_listings(html: str) -> list[dict]:
    """Parse listing cards out of a search-results page.

    Finds every ``c-products__link`` anchor pointing at a detail URL and
    extracts title, price, locality, disposition and area from the card
    markup that follows the anchor. Advertisement cards are skipped.

    Returns:
        List of dicts with keys: id, url, disposition, area, price,
        locality. ``area`` is None when not parseable; ``price`` is 0
        for "price on request" or unparseable prices.
    """
    # NOTE: the original version also ran an unused re.findall over
    # c-products__item blocks; that dead code has been removed.
    results = []

    # Anchor variant with href before class ...
    link_pattern = re.compile(
        r'<a[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*class="c-products__link"[^>]*>',
        re.DOTALL
    )
    # ... and the variant with class before href.
    link_pattern2 = re.compile(
        r'<a[^>]*class="c-products__link"[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*>',
        re.DOTALL
    )

    # Collect all c-products__link detail anchors, deduplicated by URL.
    all_links = link_pattern.findall(html) + link_pattern2.findall(html)
    seen_urls = set()

    for link_url in all_links:
        if link_url in seen_urls:
            continue
        seen_urls.add(link_url)

        # Grab the card body that follows this link (up to closing divs).
        escaped_url = re.escape(link_url)
        context_match = re.search(
            escaped_url + r'(.*?)</div>\s*</div>',
            html, re.DOTALL
        )
        if not context_match:
            continue

        block = context_match.group(1)

        # Normalise relative detail links to absolute URLs.
        url = link_url
        if not url.startswith("http"):
            url = BASE_URL + url

        # Skip ad cards: the marker class appears shortly before the link.
        # ("advertisment" is the site's own misspelling — check both.)
        ad_check_start = max(0, context_match.start() - 500)
        ad_block = html[ad_check_start:context_match.start()]
        if "advertisment" in ad_block or "advertisement" in ad_block:
            continue

        # Title: <h2 class="c-products__title">prodej bytu 3+kk 79 m2</h2>
        title_match = re.search(r'class="c-products__title"[^>]*>(.*?)</h2>', block, re.DOTALL)
        title = re.sub(r'<[^>]+>', '', title_match.group(1)).strip().lower() if title_match else ""

        # Price: <p class="c-products__price"><strong>12 950 000 Kč</strong></p>
        price_match = re.search(r'c-products__price[^>]*>.*?<strong>(.*?)</strong>', block, re.DOTALL)
        price_text = re.sub(r'<[^>]+>', '', price_match.group(1)).strip() if price_match else ""

        # Address: <p class="c-products__info">Klečkova, Praha 5 - Stodůlky</p>
        info_match = re.search(r'class="c-products__info"[^>]*>(.*?)</p>', block, re.DOTALL)
        info = re.sub(r'<[^>]+>', '', info_match.group(1)).strip() if info_match else ""

        # Disposition ("3+kk") and area ("79 m2") are parsed from the title.
        disp_match = re.search(r'(\d\+(?:kk|\d))', title)
        area_match = re.search(r'(\d+)\s*m[²2]', title)

        disposition = disp_match.group(1) if disp_match else None
        area = int(area_match.group(1)) if area_match else None

        # "6 a více" / "6+" titles carry no N+M token; normalise to "6+".
        if not disposition and ("6 a" in title or "6+" in title):
            disposition = "6+"

        # Price: strip everything but digits; "na vyžádání" (on request) -> 0.
        price = 0
        if price_text and "vyžádání" not in price_text.lower():
            price_clean = re.sub(r'[^\d]', '', price_text)
            if price_clean:
                price = int(price_clean)

        # Listing ID is the 24-hex-char segment of the detail URL; fall
        # back to the full URL when no such segment exists.
        id_match = re.search(r'/([a-f0-9]{24})/?', url)
        listing_id = id_match.group(1) if id_match else url

        results.append({
            "id": listing_id,
            "url": url,
            "disposition": DISPOSITION_MAP.get(disposition, disposition or "?"),
            "area": area,
            "price": price,
            "locality": info,
        })

    return results
|
|
|
|
|
|
def parse_detail(html: str) -> dict:
    """Extract GPS, floor, construction type and ownership from a detail page.

    Returns a dict that may contain any subset of the keys:
    lat, lon (floats), floor (int), construction (lowercased str),
    ownership (str). Missing data simply yields missing keys.
    """
    detail: dict = {}

    # GPS coordinates are published through a dataLayer.push({...}) call
    # whose payload contains "listing_lat"/"listing_lon".
    dl_match = re.search(
        r'dataLayer\.push\(\s*(\{[^}]+?"listing_lat"[^}]+?\})\s*\)',
        html, re.DOTALL
    )
    if dl_match:
        payload = dl_match.group(1)
        try:
            for json_key, out_key in (("listing_lat", "lat"), ("listing_lon", "lon")):
                coord = re.search(r'"%s"\s*:\s*([\d.]+)' % json_key, payload)
                if coord:
                    detail[out_key] = float(coord.group(1))
        except (ValueError, AttributeError):
            # Malformed numeric payload — leave coordinates unset.
            pass

    # Attribute table rows come as <dt>Label</dt><dd>Value</dd> pairs.
    for raw_dt, raw_dd in re.findall(
        r'<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>',
        html, re.DOTALL
    ):
        label = re.sub(r'<[^>]+>', '', raw_dt).strip().lower()
        value = re.sub(r'<[^>]+>', '', raw_dd).strip()

        if "podlaží" in label or "podlazi" in label or "patro" in label:
            # Prefer the above-ground storey ("NP") number when present,
            # e.g. "2. patro (3. NP)"; otherwise take the first number.
            np_hit = re.search(r'(\d+)\.\s*NP', value)
            if np_hit:
                detail["floor"] = int(np_hit.group(1))
            else:
                first_number = re.search(r'(\d+)', value)
                if first_number:
                    detail["floor"] = int(first_number.group(1))

        if "konstrukce" in label or "stavba" in label:
            detail["construction"] = value.lower()

        if "vlastnictví" in label or "vlastnictvi" in label:
            detail["ownership"] = value

    return detail
|
|
|
|
|
|
def format_price(price: int) -> str:
    """Format an integer CZK amount with space-separated thousands,
    e.g. 12950000 -> "12 950 000 Kč"."""
    text = str(price)
    grouped = []
    # Walk the string right-to-left in chunks of three characters.
    for end in range(len(text), 0, -3):
        grouped.append(text[max(0, end - 3):end])
    return " ".join(reversed(grouped)) + " Kč"
|
|
|
|
|
|
def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
    """Load previously scraped data as cache keyed by hash_id.

    Entries without a "hash_id" key are silently dropped. Returns an
    empty dict when the file is missing, unreadable, not valid JSON, or
    does not contain the expected list of entry dicts.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        return {str(e["hash_id"]): e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError, OSError):
        # TypeError: valid JSON but not a list of dicts (e.g. a bare
        # string or number) — previously crashed the caller.
        # OSError: file vanished/became unreadable between the exists()
        # check and the read (TOCTOU).
        return {}
|
|
|
|
|
|
def scrape(max_pages: int | None = None, max_properties: int | None = None, data_dir: str = ".") -> list[dict]:
    """Scrape Prague apartment listings from Reality iDNES.

    Phase 1 walks the paginated search results and collects listing
    cards; a cheap pre-filter then drops listings by price, area and
    unknown disposition. Phase 2 fetches each remaining listing's detail
    page for GPS/floor/construction/ownership and applies the final
    filters (panel construction, housing estate, minimum floor).

    Listings already present in <data_dir>/byty_idnes.json with an
    unchanged price are reused from cache and their detail page is not
    re-fetched.

    Args:
        max_pages: optional cap on listing pages fetched in phase 1.
        max_properties: optional cap on detail pages fetched in phase 2.
        data_dir: directory holding byty_idnes.json (cache input).

    Returns:
        List of result dicts for listings matching all criteria.
    """
    cache = load_cache(str(Path(data_dir) / "byty_idnes.json"))

    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z Reality iDNES")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info(f"Region: Praha")
    if cache:
        logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
    if max_pages:
        logger.info(f"Max. stran: {max_pages}")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Step 1: Fetch listing pages
    logger.info("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = {}  # id -> listing dict
    page = 0
    total = None  # total listing count, read from the first page

    while True:
        if max_pages and page >= max_pages:
            logger.debug(f"Max pages limit reached: {max_pages}")
            break
        url = build_list_url(page)
        logger.info(f"Strana {page + 1} ...")
        html = fetch_url(url)

        # The first page tells us the total count -> expected page count.
        if total is None:
            total = parse_total_count(html)
            total_pages = math.ceil(total / PER_PAGE) if total > 0 else 1
            logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran")

        listings = parse_listings(html)
        logger.debug(f"Page {page}: found {len(listings)} listings")

        # An empty page means we ran past the last page of results.
        if not listings:
            logger.debug(f"No listings found on page {page}, stopping")
            break

        # Deduplicate by listing id (the same listing can reappear).
        for item in listings:
            lid = item["id"]
            if lid not in all_listings:
                all_listings[lid] = item

        page += 1
        if total and page >= math.ceil(total / PER_PAGE):
            break
        time.sleep(1.0)  # be polite between page fetches

    logger.info(f"\nStaženo: {len(all_listings)} unikátních inzerátů")

    # Step 2: Pre-filter by price and area from list data
    pre_filtered = []
    excluded_price = 0
    excluded_area = 0
    excluded_disp = 0

    for item in all_listings.values():
        item_id = item["id"]
        # price == 0 means "price on request" — excluded along with
        # anything over the budget.
        if item["price"] <= 0 or item["price"] > MAX_PRICE:
            excluded_price += 1
            logger.debug(f"Filter: id={item_id} - excluded (price {item['price']})")
            continue

        # area may be None when the title had no "NN m2" token; such
        # listings pass this filter and are judged on the detail page.
        if item["area"] is not None and item["area"] < MIN_AREA:
            excluded_area += 1
            logger.debug(f"Filter: id={item_id} - excluded (area {item['area']} m²)")
            continue

        if item["disposition"] == "?":
            excluded_disp += 1
            logger.debug(f"Filter: id={item_id} - excluded (unknown disposition)")
            continue

        pre_filtered.append(item)

    logger.info(f"\nPo předfiltraci:")
    logger.info(f"  Vyloučeno (cena): {excluded_price}")
    logger.info(f"  Vyloučeno (plocha): {excluded_area}")
    logger.info(f"  Vyloučeno (dispozice): {excluded_disp}")
    logger.info(f"  Zbývá: {len(pre_filtered)}")

    # Step 3: Fetch details for GPS, floor, construction
    logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_no_gps = 0
    excluded_detail = 0
    cache_hits = 0
    properties_fetched = 0

    for i, item in enumerate(pre_filtered):
        # Cache hits below do not count against this budget — only
        # freshly fetched detail pages do.
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break
        # Check cache — if hash_id exists and price unchanged, reuse
        today = datetime.now().strftime("%Y-%m-%d")
        cached = cache.get(str(item["id"]))
        if cached and cached.get("price") == item["price"]:
            cache_hits += 1
            logger.debug(f"Cache hit for id={item['id']}")
            cached["last_updated"] = today
            if "first_seen" not in cached:
                cached["first_seen"] = today
            results.append(cached)
            continue

        url = item["url"]
        time.sleep(0.4)  # throttle detail-page requests

        try:
            html = fetch_url(url)
        except Exception as e:
            # Best-effort: a failed detail fetch drops this listing but
            # must not abort the whole run.
            excluded_detail += 1
            logger.warning(f"Detail failed for id={item['id']}: {e}")
            continue

        detail = parse_detail(html)
        logger.debug(f"Detail parsed for id={item['id']}: lat={detail.get('lat')}, lon={detail.get('lon')}, floor={detail.get('floor')}")

        # Must have GPS
        # (truthiness check: a coordinate of exactly 0.0 would also be
        # rejected — acceptable, Prague is nowhere near 0,0)
        if not detail.get("lat") or not detail.get("lon"):
            excluded_no_gps += 1
            logger.debug(f"Filter: id={item['id']} - excluded (no GPS)")
            continue

        # Check construction — exclude panel
        construction = detail.get("construction", "")
        if "panel" in construction:
            excluded_panel += 1
            logger.debug(f"Filter: id={item['id']} - excluded (panel construction)")
            logger.info(f"✗ Vyloučen {item['id'][:12]}...: panel ({construction})")
            continue

        # Check for sídliště in construction/description
        if "sídliště" in construction or "sidliste" in construction:
            excluded_panel += 1
            logger.debug(f"Filter: id={item['id']} - excluded (housing estate)")
            logger.info(f"✗ Vyloučen {item['id'][:12]}...: sídliště")
            continue

        # Check floor (unknown floor passes — only a known-too-low
        # floor is rejected)
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            logger.debug(f"Filter: id={item['id']} - excluded (floor {floor})")
            continue

        # Map construction to Czech label
        building_type = "neuvedeno"
        if construction:
            if "cihlo" in construction or "cihla" in construction:
                building_type = "Cihlová"
            elif "smíšen" in construction or "smisen" in construction:
                building_type = "Smíšená"
            elif "skelet" in construction:
                building_type = "Skeletová"
            elif "dřevo" in construction or "drevo" in construction:
                building_type = "Dřevostavba"
            elif "mont" in construction:
                building_type = "Montovaná"
            else:
                building_type = construction.capitalize()

        # Preserve first_seen from cache if this is a price-changed re-fetch
        first_seen = today
        if cached and "first_seen" in cached:
            first_seen = cached["first_seen"]

        result = {
            "hash_id": item["id"],
            "name": f"Prodej bytu {item['disposition']} {item.get('area', '?')} m²",
            "price": item["price"],
            "price_formatted": format_price(item["price"]),
            "locality": item["locality"],
            "lat": detail["lat"],
            "lon": detail["lon"],
            "disposition": item["disposition"],
            "floor": floor,
            "area": item["area"],
            "building_type": building_type,
            "ownership": detail.get("ownership", "neuvedeno"),
            "url": item["url"],
            "source": "idnes",
            "image": "",
            "first_seen": first_seen,
            "last_updated": today,
        }
        results.append(result)
        properties_fetched += 1

        if (i + 1) % 20 == 0:
            logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")

    logger.info(f"\n{'=' * 60}")
    logger.info(f"Výsledky Reality iDNES:")
    logger.info(f"  Předfiltrováno: {len(pre_filtered)}")
    logger.info(f"  Z cache (přeskočeno): {cache_hits}")
    logger.info(f"  Vyloučeno (panel/síd): {excluded_panel}")
    logger.info(f"  Vyloučeno (patro): {excluded_floor}")
    logger.info(f"  Vyloučeno (bez GPS): {excluded_no_gps}")
    logger.info(f"  Vyloučeno (bez detailu): {excluded_detail}")
    logger.info(f"  ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")

    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: parse flags, configure logging, run the scraper,
    # and persist the results under --data-dir.
    parser = argparse.ArgumentParser(description="Scrape apartments from Reality iDNES")
    parser.add_argument("--max-pages", type=int, default=None,
                        help="Maximum number of listing pages to scrape")
    parser.add_argument("--max-properties", type=int, default=None,
                        help="Maximum number of properties to fetch details for")
    parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level (default: INFO)")
    parser.add_argument("--data-dir", type=str, default=".",
                        help="Directory for reading/writing data files (default: current dir)")
    args = parser.parse_args()

    # Configure logging
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()]
    )

    data_dir = Path(args.data_dir)
    start = time.time()
    estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties, data_dir=args.data_dir)

    # Write output only when something matched: a run that found nothing
    # leaves the previous byty_idnes.json untouched.
    if estates:
        json_path = data_dir / "byty_idnes.json"
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
        logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        logger.info("\nŽádné byty z Reality iDNES neodpovídají kritériím :(")