Add validation mode, structured logging, and CLI args to all scrapers

- Replace print() with the Python logging module across all 6 scrapers
  for configurable log levels (DEBUG/INFO/WARNING/ERROR)
- Add --max-pages, --max-properties, and --log-level CLI arguments
  to each scraper via argparse, limiting scrape scope and controlling
  verbosity (usage sketched below, after the commit metadata)
- Add validation Make targets (validation, validation-local,
  validation-local-debug) for quick test runs with limited data
- Update run_all.sh to parse and forward CLI args to all scrapers
- Update mapa_bytu.html with latest scrape results

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Jan Novak authored 2026-02-14 23:12:59 +01:00, committed by kacerr
parent 5207c48890
commit 09a853aa05
9 changed files with 720 additions and 999 deletions
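For orientation before the diff: a validation-style run with the new flags would look like the sketch below. The flag names, the logging format, and the scrape() signature all come from this commit; only the shell invocation's filename is illustrative.

# Shell (filename illustrative):
#   python scrape_bezrealitky.py --max-pages 2 --max-properties 10 --log-level DEBUG
# Programmatic equivalent, using the names defined in the diff below:
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
)
estates = scrape(max_pages=2, max_properties=10)  # stop after 2 list pages / 10 detail fetches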


@@ -6,13 +6,17 @@ Výstup: byty_bezrealitky.json
"""
from __future__ import annotations
import argparse
import json
import logging
import math
import re
import time
import urllib.request
from pathlib import Path
logger = logging.getLogger(__name__)
# ── Konfigurace ─────────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000
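This hunk follows the stdlib convention of configuring logging only at the entry point: the module merely grabs logging.getLogger(__name__), and logging.basicConfig runs once in the __main__ block (last hunks below). A minimal sketch of that split, assuming the other five scrapers mirror it:

import logging

logger = logging.getLogger(__name__)  # module-level logger; no handlers attached here

def work() -> None:
    # Emits through whatever handlers/level the entry point configured;
    # stays silent if this module is merely imported without basicConfig.
    logger.info("doing work")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)  # the entry point owns the configuration
    work()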
@@ -69,51 +73,63 @@ def fetch_page(page: int) -> tuple[list[dict], int]:
     Returns (list of advert dicts from Apollo cache, total count).
     """
     url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
-    req = urllib.request.Request(url, headers=HEADERS)
-    resp = urllib.request.urlopen(req, timeout=30)
-    html = resp.read().decode("utf-8")
-    match = re.search(
-        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
-        html, re.DOTALL
-    )
-    if not match:
-        return [], 0
-    data = json.loads(match.group(1))
-    cache = data["props"]["pageProps"]["apolloCache"]
-    # Extract adverts from cache
-    adverts = []
-    for key, val in cache.items():
-        if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
-            adverts.append(val)
-    # Get total count from ROOT_QUERY
-    total = 0
-    root = cache.get("ROOT_QUERY", {})
-    for key, val in root.items():
-        if "listAdverts" in key and isinstance(val, dict):
-            tc = val.get("totalCount")
-            if tc and tc > total:
-                total = tc
-    return adverts, total
+    logger.debug(f"HTTP GET request: {url}")
+    logger.debug(f"Headers: {HEADERS}")
+    try:
+        req = urllib.request.Request(url, headers=HEADERS)
+        resp = urllib.request.urlopen(req, timeout=30)
+        html = resp.read().decode("utf-8")
+        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+        match = re.search(
+            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
+            html, re.DOTALL
+        )
+        if not match:
+            logger.debug("No __NEXT_DATA__ script found in HTML")
+            return [], 0
+        data = json.loads(match.group(1))
+        cache = data["props"]["pageProps"]["apolloCache"]
+        # Extract adverts from cache
+        adverts = []
+        for key, val in cache.items():
+            if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
+                adverts.append(val)
+        # Get total count from ROOT_QUERY
+        total = 0
+        root = cache.get("ROOT_QUERY", {})
+        for key, val in root.items():
+            if "listAdverts" in key and isinstance(val, dict):
+                tc = val.get("totalCount")
+                if tc and tc > total:
+                    total = tc
+        logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
+        return adverts, total
+    except (urllib.error.URLError, ConnectionError, OSError) as e:
+        logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
+        raise
 
 def fetch_detail(uri: str) -> dict | None:
     """Fetch detail page for a listing."""
     try:
         url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
+        logger.debug(f"HTTP GET request: {url}")
         req = urllib.request.Request(url, headers=HEADERS)
         resp = urllib.request.urlopen(req, timeout=30)
         html = resp.read().decode("utf-8")
+        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
         match = re.search(
             r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
             html, re.DOTALL
         )
         if not match:
+            logger.debug("No __NEXT_DATA__ script found in detail page")
             return None
         data = json.loads(match.group(1))
@@ -124,10 +140,11 @@ def fetch_detail(uri: str) -> dict | None:
if key.startswith("Advert:") and isinstance(val, dict):
# Detail pages have much more fields
if "construction" in val or "etage" in val or "ownership" in val:
logger.debug(f"Detail found for {uri}: construction={val.get('construction')}, etage={val.get('etage')}")
return val
except Exception as e:
print(f" Warning: detail failed for {uri}: {e}")
logger.warning(f"Detail failed for {uri}: {e}", exc_info=True)
return None
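Both fetch_page and fetch_detail parse the same embedded Next.js payload. The schema belongs to Bezrealitky's app and can change; a hypothetical minimal __NEXT_DATA__ body that the extraction logic above would accept (all values invented for illustration):

next_data = {
    "props": {
        "pageProps": {
            "apolloCache": {
                "ROOT_QUERY": {
                    'listAdverts({"page":1})': {"totalCount": 320},
                },
                "Advert:12345": {
                    "__typename": "Advert",
                    "id": 12345,
                    "uri": "byt-3-kk-praha",     # invented
                    "price": 12_900_000,
                    "surface": 78,
                    "disposition": "DISP_3_KK",  # invented; real values live in WANTED_DISPOSITIONS
                    "gps": {"lat": 50.08, "lng": 14.42},
                },
            }
        }
    }
}

cache = next_data["props"]["pageProps"]["apolloCache"]
adverts = [v for k, v in cache.items()
           if k.startswith("Advert:") and isinstance(v, dict) and v.get("__typename") == "Advert"]
# -> one advert; ROOT_QUERY's "listAdverts..." entry supplies totalCount for pagination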
@@ -152,35 +169,43 @@ def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
     return {}
 
-def scrape():
+def scrape(max_pages: int | None = None, max_properties: int | None = None):
     cache = load_cache()
-    print("=" * 60)
-    print("Stahuji inzeráty z Bezrealitky.cz")
-    print(f"Cena: do {format_price(MAX_PRICE)}")
-    print(f"Min. plocha: {MIN_AREA}")
-    print(f"Patro: od {MIN_FLOOR}. NP")
-    print(f"Region: Praha")
+    logger.info("=" * 60)
+    logger.info("Stahuji inzeráty z Bezrealitky.cz")
+    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
+    logger.info(f"Min. plocha: {MIN_AREA}")
+    logger.info(f"Patro: od {MIN_FLOOR}. NP")
+    logger.info(f"Region: Praha")
     if cache:
-        print(f"Cache: {len(cache)} bytů z minulého běhu")
-    print("=" * 60)
+        logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
+    if max_pages:
+        logger.info(f"Max. stran: {max_pages}")
+    if max_properties:
+        logger.info(f"Max. bytů: {max_properties}")
+    logger.info("=" * 60)
 
     # Step 1: Fetch all listing pages
-    print("\nFáze 1: Stahování seznamu inzerátů...")
+    logger.info("\nFáze 1: Stahování seznamu inzerátů...")
     all_adverts = {}  # id -> advert dict (dedup)
     page = 1
     total = None
     while True:
-        print(f" Strana {page} ...")
+        if max_pages and page > max_pages:
+            logger.debug(f"Max pages limit reached: {max_pages}")
+            break
+        logger.info(f"Strana {page} ...")
         adverts, total_count = fetch_page(page)
         if total is None and total_count > 0:
             total = total_count
             total_pages = math.ceil(total / PER_PAGE)
-            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")
+            logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran")
         if not adverts:
+            logger.debug(f"No adverts found on page {page}, stopping")
             break
         for adv in adverts:
@@ -193,7 +218,7 @@ def scrape():
             break
         time.sleep(0.5)
 
-    print(f"\n Staženo: {len(all_adverts)} unikátních inzerátů")
+    logger.info(f"\nStaženo: {len(all_adverts)} unikátních inzerátů")
 
     # Step 2: Pre-filter by disposition, price, area from list data
     pre_filtered = []
@@ -203,47 +228,57 @@ def scrape():
     excluded_no_gps = 0
     for adv in all_adverts.values():
+        adv_id = adv.get("id")
         disp = adv.get("disposition", "")
         if disp not in WANTED_DISPOSITIONS:
             excluded_disp += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (disposition {disp})")
             continue
         price = adv.get("price", 0) or 0
         if price > MAX_PRICE or price == 0:
             excluded_price += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (price {price})")
             continue
         surface = adv.get("surface")
         if surface is not None and surface < MIN_AREA:
             excluded_area += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (area {surface} m²)")
             continue
         gps = adv.get("gps", {})
         if not gps or not gps.get("lat") or not gps.get("lng"):
             excluded_no_gps += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (no GPS)")
             continue
         pre_filtered.append(adv)
 
-    print(f"\nPo předfiltraci:")
-    print(f" Vyloučeno (dispozice): {excluded_disp}")
-    print(f" Vyloučeno (cena): {excluded_price}")
-    print(f" Vyloučeno (plocha): {excluded_area}")
-    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
-    print(f" Zbývá: {len(pre_filtered)}")
+    logger.info(f"\nPo předfiltraci:")
+    logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
+    logger.info(f" Vyloučeno (cena): {excluded_price}")
+    logger.info(f" Vyloučeno (plocha): {excluded_area}")
+    logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
+    logger.info(f" Zbývá: {len(pre_filtered)}")
 
     # Step 3: Fetch details
-    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
+    logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
     results = []
     excluded_panel = 0
     excluded_floor = 0
     excluded_detail = 0
     cache_hits = 0
+    properties_fetched = 0
     for i, adv in enumerate(pre_filtered):
+        if max_properties and properties_fetched >= max_properties:
+            logger.debug(f"Max properties limit reached: {max_properties}")
+            break
         uri = adv.get("uri", "")
         if not uri:
             excluded_detail += 1
+            logger.debug(f"Filter: id={adv.get('id')} - excluded (no URI)")
             continue
 
         # Check cache — if hash_id exists and price unchanged, reuse
@@ -252,6 +287,7 @@ def scrape():
         cached = cache.get(adv_id)
         if cached and cached.get("price") == adv_price:
             cache_hits += 1
+            logger.debug(f"Cache hit for id={adv_id}")
             results.append(cached)
             continue
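The hunk above leans on load_cache, whose body this diff doesn't show. A sketch consistent with how it's used here — results keyed by listing id, reused only while the price is unchanged — assuming the JSON file is simply the previous run's output list:

def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
    """Sketch only: index the previous run's results by listing id."""
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        previous = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return {}
    # Price is compared at the call site; a changed price forces a re-fetch.
    return {item["id"]: item for item in previous if "id" in item}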
@@ -260,26 +296,30 @@ def scrape():
         if not detail:
             excluded_detail += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (detail fetch failed)")
             continue
 
         # Check construction — exclude panel
         construction = detail.get("construction", "")
         if construction == "PANEL":
             excluded_panel += 1
-            print(f" ✗ Vyloučen #{adv['id']}: panel")
+            logger.debug(f"Filter: id={adv['id']} - excluded (panel construction)")
+            logger.info(f"✗ Vyloučen #{adv['id']}: panel")
             continue
 
         # Check situation — exclude sídliště
         situation = detail.get("situation", "")
         if situation and "HOUSING_ESTATE" in str(situation).upper():
             excluded_panel += 1
-            print(f" ✗ Vyloučen #{adv['id']}: sídliště")
+            logger.debug(f"Filter: id={adv['id']} - excluded (housing estate)")
+            logger.info(f"✗ Vyloučen #{adv['id']}: sídliště")
             continue
 
         # Check floor (etage)
         etage = detail.get("etage")
         if etage is not None and etage < MIN_FLOOR:
             excluded_floor += 1
+            logger.debug(f"Filter: id={adv_id} - excluded (floor {etage})")
             continue
 
         gps = adv.get("gps", {})
@@ -317,26 +357,43 @@ def scrape():
"image": "",
}
results.append(result)
properties_fetched += 1
if (i + 1) % 20 == 0:
print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")
logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")
print(f"\n{'=' * 60}")
print(f"Výsledky Bezrealitky:")
print(f" Předfiltrováno: {len(pre_filtered)}")
print(f" Z cache (přeskočeno): {cache_hits}")
print(f" Vyloučeno (panel/síd): {excluded_panel}")
print(f" Vyloučeno (patro): {excluded_floor}")
print(f" Vyloučeno (bez detailu): {excluded_detail}")
print(f" ✓ Vyhovující byty: {len(results)}")
print(f"{'=' * 60}")
logger.info(f"\n{'=' * 60}")
logger.info(f"Výsledky Bezrealitky:")
logger.info(f" Předfiltrováno: {len(pre_filtered)}")
logger.info(f" Z cache (přeskočeno): {cache_hits}")
logger.info(f" Vyloučeno (panel/síd): {excluded_panel}")
logger.info(f" Vyloučeno (patro): {excluded_floor}")
logger.info(f" Vyloučeno (bez detailu): {excluded_detail}")
logger.info(f" ✓ Vyhovující byty: {len(results)}")
logger.info(f"{'=' * 60}")
return results
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape apartments from Bezrealitky.cz")
parser.add_argument("--max-pages", type=int, default=None,
help="Maximum number of listing pages to scrape")
parser.add_argument("--max-properties", type=int, default=None,
help="Maximum number of properties to fetch details for")
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level (default: INFO)")
args = parser.parse_args()
# Configure logging
logging.basicConfig(
level=getattr(logging, args.log_level),
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
handlers=[logging.StreamHandler()]
)
start = time.time()
estates = scrape()
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
if estates:
json_path = Path("byty_bezrealitky.json")
@@ -345,7 +402,7 @@ if __name__ == "__main__":
encoding="utf-8",
)
elapsed = time.time() - start
print(f"\n✓ Data uložena: {json_path.resolve()}")
print(f"⏱ Celkový čas: {elapsed:.0f} s")
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
print("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
logger.info("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")
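run_all.sh itself is not in this excerpt. Because all six scrapers now accept the same three flags, the forwarding the commit message describes amounts to passing the runner's own arguments through unchanged. A Python equivalent of that forwarding loop (the scraper filenames are invented; the real list lives in run_all.sh):

import subprocess
import sys

# Hypothetical scraper list for illustration only.
SCRAPERS = ["scrape_bezrealitky.py", "scrape_example2.py"]

for script in SCRAPERS:
    # Every scraper understands --max-pages/--max-properties/--log-level,
    # so the runner can forward its own argv verbatim.
    subprocess.run([sys.executable, script, *sys.argv[1:]], check=True)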