Reliability improvements: retry logic, validation, ratings sync
Some checks failed
Build and Push / build (push) Failing after 4s
Some checks failed
Build and Push / build (push) Failing after 4s
- Add 3-attempt retry with exponential backoff to Sreality, Realingo,
Bezrealitky, and PSN scrapers (CityHome and iDNES already had it)
- Add shared validate_listing() in scraper_stats.py; all 6 scrapers now
validate GPS bounds, price, area, and required fields before output
- Wire ratings to server /api/ratings on page load (merge with
localStorage) and save (async POST); ratings now persist across
browsers and devices
- Namespace JS hash IDs as {source}_{id} to prevent rating collisions
between listings from different portals with the same numeric ID
- Replace manual Czech diacritic table with unicodedata.normalize()
in merge_and_map.py for correct deduplication of all edge cases
- Correct README schedule docs: every 4 hours, not twice daily
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,13 +1,53 @@
|
||||
"""Shared utility for writing per-scraper run statistics to JSON."""
|
||||
"""Shared utilities for scraper run statistics and listing validation."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
import logging
import math
import os
from pathlib import Path
|
||||
|
||||
# Directory containing this module; doubles as the default data location.
HERE = Path(__file__).parent
# Where stats JSON files are written; overridable via the DATA_DIR env var.
DATA_DIR = Path(os.getenv("DATA_DIR", HERE))
|
||||
|
||||
_val_log = logging.getLogger(__name__)

# Fields every listing must carry with a non-empty value.
_REQUIRED_FIELDS = ("hash_id", "price", "locality", "lat", "lon", "url", "source")

# GPS bounding box of the Czech Republic (degrees).
_LAT_MIN, _LAT_MAX = 47.0, 52.0
_LON_MIN, _LON_MAX = 12.0, 19.0


def _is_number(value) -> bool:
    """Return True only for a real, finite-or-infinite int/float.

    bool is a subclass of int, so ``isinstance(True, int)`` is True — a bare
    isinstance check would accept ``price=True`` as the number 1. NaN is also
    rejected, since every comparison against NaN is False and it would slip
    past range checks like ``price <= 0``.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    return not math.isnan(value)


def validate_listing(listing: dict, context: str = "") -> bool:
    """
    Validate a listing dict before it is written to the output JSON.

    Checks performed, in order:
      1. every field in ``_REQUIRED_FIELDS`` is present and non-empty;
      2. ``price`` is a positive real number (bool/NaN rejected);
      3. ``lat``/``lon`` are numeric and inside the Czech Republic
         bounding box;
      4. ``area``, when present, is a positive real number.

    Args:
        listing: Listing dict produced by a scraper.
        context: Optional label (e.g. scraper name) prefixed to warnings.

    Returns True if valid, False if the listing should be skipped.
    Logs a warning for each invalid listing.
    """
    prefix = f"[{context}] " if context else ""
    hid = listing.get("hash_id", "?")

    for field in _REQUIRED_FIELDS:
        val = listing.get(field)
        if val is None or val == "":
            _val_log.warning(f"{prefix}Skipping listing — missing field '{field}': {hid}")
            return False

    price = listing.get("price")
    if not _is_number(price) or price <= 0:
        _val_log.warning(f"{prefix}Skipping listing — invalid price={price!r}: {hid}")
        return False

    lat, lon = listing.get("lat"), listing.get("lon")
    if not _is_number(lat) or not _is_number(lon):
        _val_log.warning(f"{prefix}Skipping listing — non-numeric GPS lat={lat!r} lon={lon!r}: {hid}")
        return False
    if not (_LAT_MIN <= lat <= _LAT_MAX) or not (_LON_MIN <= lon <= _LON_MAX):
        _val_log.warning(f"{prefix}Skipping listing — GPS outside Czech Republic lat={lat} lon={lon}: {hid}")
        return False

    # area is optional, but if supplied it must be a positive number.
    area = listing.get("area")
    if area is not None and (not _is_number(area) or area <= 0):
        _val_log.warning(f"{prefix}Skipping listing — invalid area={area!r}: {hid}")
        return False

    return True
|
||||
|
||||
|
||||
def write_stats(filename: str, stats: dict) -> None:
|
||||
"""Write scraper run stats dict to the data directory."""
|
||||
|
||||
Reference in New Issue
Block a user