Add validation mode, structured logging, and CLI args to all scrapers

- Replace print() with the Python logging module across all 6 scrapers
  for configurable log levels (DEBUG/INFO/WARNING/ERROR)
- Add --max-pages, --max-properties, and --log-level CLI arguments
  to each scraper via argparse, limiting scrape scope and controlling
  verbosity (usage sketched below, after the commit metadata)
- Add validation Make targets (validation, validation-local,
  validation-local-debug) for quick test runs with limited data
- Update run_all.sh to parse and forward CLI args to all scrapers
- Update mapa_bytu.html with latest scrape results

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Jan Novak authored 2026-02-14 23:12:59 +01:00, committed by kacerr
parent 5207c48890
commit 09a853aa05
9 changed files with 720 additions and 999 deletions
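For orientation before the diff: a validation-style run with the new flags would look like the sketch below. The flag names, the logging format, and the scrape() signature all come from this commit; only the shell invocation's filename is illustrative.

# Shell (filename illustrative):
#   python scrape_bezrealitky.py --max-pages 2 --max-properties 10 --log-level DEBUG
# Programmatic equivalent, using the names defined in the diff below:
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
)
estates = scrape(max_pages=2, max_properties=10)  # stop after 2 list pages / 10 detail fetches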


@@ -6,13 +6,17 @@ Výstup: byty_bezrealitky.json
"""
from __future__ import annotations
import argparse
import json
import logging
import math
import re
import time
import urllib.request
from pathlib import Path
logger = logging.getLogger(__name__)
# ── Konfigurace ─────────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000
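This hunk follows the stdlib convention of configuring logging only at the entry point: the module merely grabs logging.getLogger(__name__), and logging.basicConfig runs once in the __main__ block (last hunks below). A minimal sketch of that split, assuming the other five scrapers mirror it:

import logging

logger = logging.getLogger(__name__)  # module-level logger; no handlers attached here

def work() -> None:
    # Emits through whatever handlers/level the entry point configured;
    # stays silent if this module is merely imported without basicConfig.
    logger.info("doing work")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)  # the entry point owns the configuration
    work()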
@@ -69,51 +73,63 @@ def fetch_page(page: int) -> tuple[list[dict], int]:
     Returns (list of advert dicts from Apollo cache, total count).
     """
     url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
-    req = urllib.request.Request(url, headers=HEADERS)
-    resp = urllib.request.urlopen(req, timeout=30)
-    html = resp.read().decode("utf-8")
-    match = re.search(
-        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
-        html, re.DOTALL
-    )
-    if not match:
-        return [], 0
-    data = json.loads(match.group(1))
-    cache = data["props"]["pageProps"]["apolloCache"]
-    # Extract adverts from cache
-    adverts = []
-    for key, val in cache.items():
-        if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
-            adverts.append(val)
-    # Get total count from ROOT_QUERY
-    total = 0
-    root = cache.get("ROOT_QUERY", {})
-    for key, val in root.items():
-        if "listAdverts" in key and isinstance(val, dict):
-            tc = val.get("totalCount")
-            if tc and tc > total:
-                total = tc
-    return adverts, total
+    logger.debug(f"HTTP GET request: {url}")
+    logger.debug(f"Headers: {HEADERS}")
+    try:
+        req = urllib.request.Request(url, headers=HEADERS)
+        resp = urllib.request.urlopen(req, timeout=30)
+        html = resp.read().decode("utf-8")
+        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+        match = re.search(
+            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
+            html, re.DOTALL
+        )
+        if not match:
+            logger.debug("No __NEXT_DATA__ script found in HTML")
+            return [], 0
+        data = json.loads(match.group(1))
+        cache = data["props"]["pageProps"]["apolloCache"]
+        # Extract adverts from cache
+        adverts = []
+        for key, val in cache.items():
+            if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
+                adverts.append(val)
+        # Get total count from ROOT_QUERY
+        total = 0
+        root = cache.get("ROOT_QUERY", {})
+        for key, val in root.items():
+            if "listAdverts" in key and isinstance(val, dict):
+                tc = val.get("totalCount")
+                if tc and tc > total:
+                    total = tc
+        logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
+        return adverts, total
+    except (urllib.error.URLError, ConnectionError, OSError) as e:
+        logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
+        raise
 
 def fetch_detail(uri: str) -> dict | None:
     """Fetch detail page for a listing."""
     try:
         url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
+        logger.debug(f"HTTP GET request: {url}")
         req = urllib.request.Request(url, headers=HEADERS)
         resp = urllib.request.urlopen(req, timeout=30)
         html = resp.read().decode("utf-8")
+        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
         match = re.search(
             r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
             html, re.DOTALL
         )
         if not match:
+            logger.debug("No __NEXT_DATA__ script found in detail page")
             return None
         data = json.loads(match.group(1))
@@ -124,10 +140,11 @@ def fetch_detail(uri: str) -> dict | None:
if key.startswith("Advert:") and isinstance(val, dict):
# Detail pages have much more fields
if "construction" in val or "etage" in val or "ownership" in val:
logger.debug(f"Detail found for {uri}: construction={val.get('construction')}, etage={val.get('etage')}")
return val
except Exception as e:
print(f" Warning: detail failed for {uri}: {e}")
logger.warning(f"Detail failed for {uri}: {e}", exc_info=True)
return None
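Both fetch_page and fetch_detail parse the same embedded Next.js payload. The schema belongs to Bezrealitky's app and can change; a hypothetical minimal __NEXT_DATA__ body that the extraction logic above would accept (all values invented for illustration):

next_data = {
    "props": {
        "pageProps": {
            "apolloCache": {
                "ROOT_QUERY": {
                    'listAdverts({"page":1})': {"totalCount": 320},
                },
                "Advert:12345": {
                    "__typename": "Advert",
                    "id": 12345,
                    "uri": "byt-3-kk-praha",     # invented
                    "price": 12_900_000,
                    "surface": 78,
                    "disposition": "DISP_3_KK",  # invented; real values live in WANTED_DISPOSITIONS
                    "gps": {"lat": 50.08, "lng": 14.42},
                },
            }
        }
    }
}

cache = next_data["props"]["pageProps"]["apolloCache"]
adverts = [v for k, v in cache.items()
           if k.startswith("Advert:") and isinstance(v, dict) and v.get("__typename") == "Advert"]
# -> one advert; ROOT_QUERY's "listAdverts..." entry supplies totalCount for pagination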
@@ -152,35 +169,43 @@ def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
     return {}
 
-def scrape():
+def scrape(max_pages: int | None = None, max_properties: int | None = None):
     cache = load_cache()
-    print("=" * 60)
-    print("Stahuji inzeráty z Bezrealitky.cz")
-    print(f"Cena: do {format_price(MAX_PRICE)}")
-    print(f"Min. plocha: {MIN_AREA}")
-    print(f"Patro: od {MIN_FLOOR}. NP")
-    print(f"Region: Praha")
+    logger.info("=" * 60)
+    logger.info("Stahuji inzeráty z Bezrealitky.cz")
+    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
+    logger.info(f"Min. plocha: {MIN_AREA}")
+    logger.info(f"Patro: od {MIN_FLOOR}. NP")
+    logger.info(f"Region: Praha")
     if cache:
-        print(f"Cache: {len(cache)} bytů z minulého běhu")
-    print("=" * 60)
+        logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
+    if max_pages:
+        logger.info(f"Max. stran: {max_pages}")
+    if max_properties:
+        logger.info(f"Max. bytů: {max_properties}")
+    logger.info("=" * 60)
 
     # Step 1: Fetch all listing pages
-    print("\nFáze 1: Stahování seznamu inzerátů...")
+    logger.info("\nFáze 1: Stahování seznamu inzerátů...")
     all_adverts = {}  # id -> advert dict (dedup)
     page = 1
     total = None
     while True:
-        print(f" Strana {page} ...")
+        if max_pages and page > max_pages:
+            logger.debug(f"Max pages limit reached: {max_pages}")
+            break
+        logger.info(f"Strana {page} ...")
         adverts, total_count = fetch_page(page)
         if total is None and total_count > 0:
             total = total_count
             total_pages = math.ceil(total / PER_PAGE)
-            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")
+            logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran")
         if not adverts:
+            logger.debug(f"No adverts found on page {page}, stopping")
             break
         for adv in adverts:
@@ -193,7 +218,7 @@ def scrape():
             break
         time.sleep(0.5)
 
-    print(f"\n Staženo: {len(all_adverts)} unikátních inzerátů")
+    logger.info(f"\nStaženo: {len(all_adverts)} unikátních inzerátů")
 
     # Step 2: Pre-filter by disposition, price, area from list data
     pre_filtered = []
@@ -203,47 +228,57 @@ def scrape():
     excluded_no_gps = 0
     for adv in all_adverts.values():
+        adv_id = adv.get("id")
         disp = adv.get("disposition", "")
         if disp not in WANTED_DISPOSITIONS:
             excluded_disp += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (disposition {disp})")
             continue
         price = adv.get("price", 0) or 0
         if price > MAX_PRICE or price == 0:
             excluded_price += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (price {price})")
             continue
         surface = adv.get("surface")
         if surface is not None and surface < MIN_AREA:
             excluded_area += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (area {surface} m²)")
             continue
         gps = adv.get("gps", {})
         if not gps or not gps.get("lat") or not gps.get("lng"):
             excluded_no_gps += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (no GPS)")
             continue
         pre_filtered.append(adv)
 
-    print(f"\nPo předfiltraci:")
-    print(f" Vyloučeno (dispozice): {excluded_disp}")
-    print(f" Vyloučeno (cena): {excluded_price}")
-    print(f" Vyloučeno (plocha): {excluded_area}")
-    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
-    print(f" Zbývá: {len(pre_filtered)}")
+    logger.info(f"\nPo předfiltraci:")
+    logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
+    logger.info(f" Vyloučeno (cena): {excluded_price}")
+    logger.info(f" Vyloučeno (plocha): {excluded_area}")
+    logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
+    logger.info(f" Zbývá: {len(pre_filtered)}")
 
     # Step 3: Fetch details
-    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
+    logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
     results = []
     excluded_panel = 0
     excluded_floor = 0
     excluded_detail = 0
     cache_hits = 0
+    properties_fetched = 0
     for i, adv in enumerate(pre_filtered):
+        if max_properties and properties_fetched >= max_properties:
+            logger.debug(f"Max properties limit reached: {max_properties}")
+            break
         uri = adv.get("uri", "")
         if not uri:
             excluded_detail += 1
+            logger.debug(f"Filter: id={adv.get('id')} - excluded (no URI)")
             continue
 
         # Check cache — if hash_id exists and price unchanged, reuse
@@ -252,6 +287,7 @@ def scrape():
         cached = cache.get(adv_id)
         if cached and cached.get("price") == adv_price:
             cache_hits += 1
+            logger.debug(f"Cache hit for id={adv_id}")
             results.append(cached)
             continue
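The hunk above leans on load_cache, whose body this diff doesn't show. A sketch consistent with how it's used here — results keyed by listing id, reused only while the price is unchanged — assuming the JSON file is simply the previous run's output list:

def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
    """Sketch only: index the previous run's results by listing id."""
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        previous = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return {}
    # Price is compared at the call site; a changed price forces a re-fetch.
    return {item["id"]: item for item in previous if "id" in item}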
@@ -260,26 +296,30 @@ def scrape():
         if not detail:
             excluded_detail += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (detail fetch failed)")
             continue
 
         # Check construction — exclude panel
         construction = detail.get("construction", "")
         if construction == "PANEL":
             excluded_panel += 1
-            print(f" ✗ Vyloučen #{adv['id']}: panel")
+            logger.debug(f"Filter: id={adv['id']} - excluded (panel construction)")
+            logger.info(f"✗ Vyloučen #{adv['id']}: panel")
             continue
 
         # Check situation — exclude sídliště
         situation = detail.get("situation", "")
         if situation and "HOUSING_ESTATE" in str(situation).upper():
             excluded_panel += 1
-            print(f" ✗ Vyloučen #{adv['id']}: sídliště")
+            logger.debug(f"Filter: id={adv['id']} - excluded (housing estate)")
+            logger.info(f"✗ Vyloučen #{adv['id']}: sídliště")
             continue
 
         # Check floor (etage)
         etage = detail.get("etage")
         if etage is not None and etage < MIN_FLOOR:
             excluded_floor += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (floor {etage})")
             continue
 
         gps = adv.get("gps", {})
@@ -317,26 +357,43 @@ def scrape():
"image": "",
}
results.append(result)
properties_fetched += 1
if (i + 1) % 20 == 0:
print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")
logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")
print(f"\n{'=' * 60}")
print(f"Výsledky Bezrealitky:")
print(f" Předfiltrováno: {len(pre_filtered)}")
print(f" Z cache (přeskočeno): {cache_hits}")
print(f" Vyloučeno (panel/síd): {excluded_panel}")
print(f" Vyloučeno (patro): {excluded_floor}")
print(f" Vyloučeno (bez detailu): {excluded_detail}")
print(f" ✓ Vyhovující byty: {len(results)}")
print(f"{'=' * 60}")
logger.info(f"\n{'=' * 60}")
logger.info(f"Výsledky Bezrealitky:")
logger.info(f" Předfiltrováno: {len(pre_filtered)}")
logger.info(f" Z cache (přeskočeno): {cache_hits}")
logger.info(f" Vyloučeno (panel/síd): {excluded_panel}")
logger.info(f" Vyloučeno (patro): {excluded_floor}")
logger.info(f" Vyloučeno (bez detailu): {excluded_detail}")
logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}")
return results
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape apartments from Bezrealitky.cz")
parser.add_argument("--max-pages", type=int, default=None,
help="Maximum number of listing pages to scrape")
parser.add_argument("--max-properties", type=int, default=None,
help="Maximum number of properties to fetch details for")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)")
args = parser.parse_args()
# Configure logging
logging.basicConfig(
level=getattr(logging, args.log_level),
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
handlers=[logging.StreamHandler()]
)
start = time.time()
estates = scrape()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates:
json_path = Path("byty_bezrealitky.json")
@@ -345,7 +402,7 @@ if __name__ == "__main__":
encoding="utf-8",
)
elapsed = time.time() - start
print(f"\n✓ Data uložena: {json_path.resolve()}")
print(f"⏱ Celkový čas: {elapsed:.0f} s")
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
print("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
logger.info("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
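run_all.sh itself is not in this excerpt. Because all six scrapers now accept the same three flags, the forwarding the commit message describes amounts to passing the runner's own arguments through unchanged. A Python equivalent of that forwarding loop (the scraper filenames are invented; the real list lives in run_all.sh):

import subprocess
import sys

# Hypothetical scraper list for illustration only.
SCRAPERS = ["scrape_bezrealitky.py", "scrape_example2.py"]

for script in SCRAPERS:
    # Every scraper understands --max-pages/--max-properties/--log-level,
    # so the runner can forward its own argv verbatim.
    subprocess.run([sys.executable, script, *sys.argv[1:]], check=True)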