Reliability improvements: retry logic, validation, ratings sync
Some checks failed
Build and Push / build (push) Failing after 4s

- Add 3-attempt retry with exponential backoff to Sreality, Realingo,
  Bezrealitky, and PSN scrapers (CityHome and iDNES already had it)
- Add shared validate_listing() in scraper_stats.py; all 6 scrapers now
  validate GPS bounds, price, area, and required fields before output
- Wire ratings to server /api/ratings on page load (merge with
  localStorage) and save (async POST); ratings now persist across
  browsers and devices
- Namespace JS hash IDs as {source}_{id} to prevent rating collisions
  between listings from different portals with the same numeric ID
- Replace manual Czech diacritic table with unicodedata.normalize()
  in merge_and_map.py for correct deduplication of all edge cases
- Correct README schedule docs: every 4 hours, not twice daily

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Jan Novak
2026-02-27 10:36:37 +01:00
parent 57a9f6f21a
commit 27a7834eb6
9 changed files with 212 additions and 114 deletions

View File

@@ -15,7 +15,7 @@ import re
import time
import urllib.request
from pathlib import Path
from scraper_stats import write_stats
from scraper_stats import write_stats, validate_listing
STATS_FILE = "stats_realingo.json"
@@ -56,6 +56,28 @@ HEADERS = {
BASE_URL = "https://www.realingo.cz"
def fetch_url(url: str, retries: int = 3) -> str:
"""Fetch URL and return HTML string with retry on transient errors."""
for attempt in range(retries):
try:
logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
return html
except urllib.error.HTTPError:
raise
except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
if attempt < retries - 1:
wait = (attempt + 1) * 2
logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
time.sleep(wait)
else:
logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
raise
def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
"""Fetch a page of Prague listings. Returns (items, total_count)."""
if page == 1:
@@ -63,41 +85,26 @@ def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
else:
url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"
logger.debug(f"HTTP GET request: {url}")
logger.debug(f"Headers: {HEADERS}")
req = urllib.request.Request(url, headers=HEADERS)
try:
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
html = fetch_url(url)
match = re.search(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
html, re.DOTALL
)
if not match:
logger.debug("No __NEXT_DATA__ script found in HTML")
return [], 0
match = re.search(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
html, re.DOTALL
)
if not match:
logger.debug("No __NEXT_DATA__ script found in HTML")
return [], 0
data = json.loads(match.group(1))
offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}")
return offer_list["data"], offer_list["total"]
except (urllib.error.URLError, ConnectionError, OSError) as e:
logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
raise
data = json.loads(match.group(1))
offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}")
return offer_list["data"], offer_list["total"]
def fetch_detail(listing_url: str) -> dict | None:
"""Fetch detail page for a listing to get floor, building type, etc."""
try:
url = f"{BASE_URL}{listing_url}"
logger.debug(f"HTTP GET request: {url}")
req = urllib.request.Request(url, headers=HEADERS)
resp = urllib.request.urlopen(req, timeout=30)
html = resp.read().decode("utf-8")
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
html = fetch_url(url)
match = re.search(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
html, re.DOTALL
@@ -324,6 +331,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
"last_changed": datetime.now().strftime("%Y-%m-%d"),
}
if not validate_listing(result, "realingo"):
continue
results.append(result)
properties_fetched += 1