Reliability improvements: retry logic, validation, ratings sync
Some checks failed — Build and Push / build (push): failing after 4s
- Add 3-attempt retry with exponential backoff to Sreality, Realingo,
Bezrealitky, and PSN scrapers (CityHome and iDNES already had it)
- Add shared validate_listing() in scraper_stats.py; all 6 scrapers now
validate GPS bounds, price, area, and required fields before output
- Wire ratings to server /api/ratings on page load (merge with
localStorage) and save (async POST); ratings now persist across
browsers and devices
- Namespace JS hash IDs as {source}_{id} to prevent rating collisions
between listings from different portals with the same numeric ID
- Replace manual Czech diacritic table with unicodedata.normalize()
in merge_and_map.py for correct deduplication of all edge cases
- Correct README schedule docs: every 4 hours, not twice daily
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,7 +15,7 @@ import re
|
||||
import time
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from scraper_stats import write_stats
|
||||
from scraper_stats import write_stats, validate_listing
|
||||
|
||||
STATS_FILE = "stats_bezrealitky.json"
|
||||
|
||||
@@ -71,62 +71,71 @@ HEADERS = {
|
||||
BASE_URL = "https://www.bezrealitky.cz"
|
||||
|
||||
|
||||
def fetch_url(url: str, retries: int = 3) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Retries up to *retries* times with linearly increasing backoff
    (2s, 4s, ...) on transient transport errors: connection resets,
    DNS failures, timeouts. HTTP error responses (4xx/5xx) are NOT
    retried — repeating the request would return the same status.

    Args:
        url: Absolute URL to fetch; request carries the module HEADERS.
        retries: Maximum number of attempts (default 3).

    Raises:
        urllib.error.HTTPError: on a non-success HTTP status (no retry).
        urllib.error.URLError, OSError: when every attempt fails.
    """
    for attempt in range(retries):
        try:
            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
            req = urllib.request.Request(url, headers=HEADERS)
            # Context manager guarantees the socket is released even if
            # read()/decode() raises (the original leaked the response).
            with urllib.request.urlopen(req, timeout=30) as resp:
                html = resp.read().decode("utf-8")
                logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
            return html
        except urllib.error.HTTPError:
            # HTTPError subclasses URLError, so it must be handled first;
            # server-side statuses deliberately bypass the retry loop.
            raise
        except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e:
            if attempt < retries - 1:
                wait = (attempt + 1) * 2
                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
                raise
|
||||
|
||||
|
||||
def fetch_page(page: int) -> tuple[list[dict], int]:
    """Fetch one listing page of Prague apartment sales from Bezrealitky.

    The page embeds its data as Next.js ``__NEXT_DATA__`` JSON; adverts are
    read out of the Apollo GraphQL cache inside that payload.

    Args:
        page: 1-based page number of the listing index.

    Returns:
        Tuple of (list of advert dicts from the Apollo cache, total advert
        count reported by the server). Returns ([], 0) when the page has no
        ``__NEXT_DATA__`` script (e.g. layout change or block page).

    Raises:
        Whatever fetch_url raises after its retries are exhausted, plus
        json.JSONDecodeError / KeyError if the embedded payload changes shape.
    """
    url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
    # fetch_url handles retry/backoff on transient connection errors.
    html = fetch_url(url)

    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html, re.DOTALL
    )
    if not match:
        logger.debug("No __NEXT_DATA__ script found in HTML")
        return [], 0

    data = json.loads(match.group(1))
    cache = data["props"]["pageProps"]["apolloCache"]

    # Extract adverts from cache: Apollo keys objects as "Advert:<id>";
    # double-check __typename to skip unrelated cache entries.
    adverts = []
    for key, val in cache.items():
        if key.startswith("Advert:") and isinstance(val, dict) and val.get("__typename") == "Advert":
            adverts.append(val)

    # Get total count from ROOT_QUERY; several listAdverts(...) variants may
    # be cached, so keep the largest totalCount seen.
    total = 0
    root = cache.get("ROOT_QUERY", {})
    for key, val in root.items():
        if "listAdverts" in key and isinstance(val, dict):
            tc = val.get("totalCount")
            if tc and tc > total:
                total = tc

    logger.debug(f"Page {page}: found {len(adverts)} adverts, total={total}")
    return adverts, total
|
||||
|
||||
|
||||
def fetch_detail(uri: str) -> dict | None:
|
||||
"""Fetch detail page for a listing."""
|
||||
try:
|
||||
url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
|
||||
logger.debug(f"HTTP GET request: {url}")
|
||||
req = urllib.request.Request(url, headers=HEADERS)
|
||||
resp = urllib.request.urlopen(req, timeout=30)
|
||||
html = resp.read().decode("utf-8")
|
||||
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
||||
html = fetch_url(url)
|
||||
|
||||
match = re.search(
|
||||
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
||||
@@ -365,6 +374,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
"first_seen": cached.get("first_seen", datetime.now().strftime("%Y-%m-%d")) if cached else datetime.now().strftime("%Y-%m-%d"),
|
||||
"last_changed": datetime.now().strftime("%Y-%m-%d"),
|
||||
}
|
||||
if not validate_listing(result, "bezrealitky"):
|
||||
continue
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user