Reliability improvements: retry logic, validation, ratings sync

- Add 3-attempt retry with exponential backoff to Sreality, Realingo, Bezrealitky, and PSN scrapers (CityHome and iDNES already had it)
- Add shared validate_listing() in scraper_stats.py; all 6 scrapers now validate GPS bounds, price, area, and required fields before output
- Wire ratings to server /api/ratings on page load (merge with localStorage) and save (async POST); ratings now persist across browsers and devices
- Namespace JS hash IDs as {source}_{id} to prevent rating collisions between listings from different portals with the same numeric ID
- Replace manual Czech diacritic table with unicodedata.normalize() in merge_and_map.py for correct deduplication of all edge cases
- Correct README schedule docs: every 4 hours, not twice daily

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-27 10:36:37 +01:00
parent 57a9f6f21a
commit 27a7834eb6
9 changed files with 212 additions and 114 deletions
--- a/scrape_bezrealitky.py
+++ b/scrape_bezrealitky.py
@@ -15,7 +15,7 @@ import re
 import time
 import urllib.request
 from pathlib import Path
-from scraper_stats import write_stats
+from scraper_stats import write_stats, validate_listing

 STATS_FILE = "stats_bezrealitky.json"

@@ -71,62 +71,71 @@ HEADERS = {
 BASE_URL = "https://www.bezrealitky.cz"


+def fetch_url(url: str, retries: int = 3) -> str:
+    """Fetch URL and return HTML string with retry on transient errors."""
+    for attempt in range(retries):
+        try:
+            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
+            req = urllib.request.Request(url, headers=HEADERS)
+            resp = urllib.request.urlopen(req, timeout=30)
+            html = resp.read().decode("utf-8")
+            logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+            return html
+        except urllib.error.HTTPError:
+            raise
+        except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
+            if attempt < retries - 1:
+                wait = (attempt + 1) * 2
+                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
+                time.sleep(wait)
+            else:
+                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
+                raise
+
+
 def fetch_page(page: int) -> tuple[list[dict], int]:
    """
    Fetch a listing page from Bezrealitky.
    Returns (list of advert dicts from Apollo cache, total count).
    """
    url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
-    logger.debug(f"HTTP GET request: {url}")
-    logger.debug(f"Headers: {HEADERS}")
-    req = urllib.request.Request(url, headers=HEADERS)
-    try:
-        resp = urllib.request.urlopen(req, timeout=30)
-        html = resp.read().decode("utf-8")
-        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+    html = fetch_url(url)

-        match = re.search(
-            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
-            html, re.DOTALL
-        )
-        if not match:
-            logger.debug("No __NEXT_DATA__ script found in HTML")
-            return [], 0
+    match = re.search(
+        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
+        html, re.DOTALL
+    )
+    if not match:
+        logger.debug("No __NEXT_DATA__ script found in HTML")
+        return [], 0

-        data = json.loads(match.group(1))
-        cache = data["props"]["pageProps"]["apolloCache"]
+    data = json.loads(match.group(1))
+    cache = data["props"]["pageProps"]["apolloCache"]

-        # Extract adverts from cache
-        adverts = []
-        for key, val in cache.items():
-            if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
-                adverts.append(val)
+    # Extract adverts from cache
+    adverts = []
+    for key, val in cache.items():
+        if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
+            adverts.append(val)

-        # Get total count from ROOT_QUERY
-        total = 0
-        root = cache.get("ROOT_QUERY", {})
-        for key, val in root.items():
-            if "listAdverts" in key and isinstance(val, dict):
-                tc = val.get("totalCount")
-                if tc and tc > total:
-                    total = tc
+    # Get total count from ROOT_QUERY
+    total = 0
+    root = cache.get("ROOT_QUERY", {})
+    for key, val in root.items():
+        if "listAdverts" in key and isinstance(val, dict):
+            tc = val.get("totalCount")
+            if tc and tc > total:
+                total = tc

-        logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
-        return adverts, total
-    except (urllib.error.URLError, ConnectionError, OSError) as e:
-        logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
-        raise
+    logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
+    return adverts, total


 def fetch_detail(uri: str) -> dict | None:
    """Fetch detail page for a listing."""
    try:
        url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
-        logger.debug(f"HTTP GET request: {url}")
-        req = urllib.request.Request(url, headers=HEADERS)
-        resp = urllib.request.urlopen(req, timeout=30)
-        html = resp.read().decode("utf-8")
-        logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
+        html = fetch_url(url)

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
@@ -365,6 +374,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
            "first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
            "last_changed": datetime.now().strftime("%Y-%m-%d"),
        }
+        if not validate_listing(result, "bezrealitky"):
+            continue
        results.append(result)
        properties_fetched += 1