Reliability improvements: retry logic, validation, ratings sync
Some checks failed
Build and Push / build (push) Failing after 4s
- Add 3-attempt retry with exponential backoff to Sreality, Realingo,
Bezrealitky, and PSN scrapers (CityHome and iDNES already had it)
- Add shared validate_listing() in scraper_stats.py; all 6 scrapers now
validate GPS bounds, price, area, and required fields before output
- Wire ratings to server /api/ratings on page load (merge with
localStorage) and save (async POST); ratings now persist across
browsers and devices
- Namespace JS hash IDs as {source}_{id} to prevent rating collisions
between listings from different portals with the same numeric ID
- Replace manual Czech diacritic table with unicodedata.normalize()
in merge_and_map.py for correct deduplication of all edge cases
- Correct README schedule docs: every 4 hours, not twice daily
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,7 +15,7 @@ import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlencode
|
||||
from scraper_stats import write_stats
|
||||
from scraper_stats import write_stats, validate_listing
|
||||
|
||||
STATS_FILE = "stats_psn.json"
|
||||
|
||||
@@ -38,19 +38,25 @@ BASE_URL = "https://psn.cz"
|
||||
UNITS_API = f"{BASE_URL}/api/units-list"
|
||||
|
||||
|
||||
def fetch_json(url: str) -> dict:
    """Fetch a JSON document via a curl subprocess.

    curl is used instead of urllib because Python's SSL stack can be
    rejected by Cloudflare-fronted hosts.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The parsed JSON body as a dict.

    Raises:
        RuntimeError: if curl exits with a non-zero status.
        json.JSONDecodeError: if the response body is not valid JSON.
    """
    logger.debug(f"HTTP GET: {url}")
    cmd = [
        "curl", "-s", "-L", "--max-time", "30",
        "-H", f"User-Agent: {UA}",
        "-H", "Accept: application/json",
        url,
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if proc.returncode != 0:
        raise RuntimeError(f"curl failed ({proc.returncode}): {proc.stderr[:200]}")
    return json.loads(proc.stdout)
|
||||
def fetch_json(url: str, retries: int = 3) -> dict:
    """Fetch a JSON document via curl, retrying with exponential backoff.

    curl is used instead of urllib because Python's SSL stack can be
    rejected by Cloudflare-fronted hosts.

    Args:
        url: Absolute URL to fetch.
        retries: Total number of attempts before giving up (default 3).

    Returns:
        The parsed JSON body as a dict.

    Raises:
        RuntimeError: if every attempt fails, either with a curl transport
            error or with a body that is not valid JSON.
    """
    for attempt in range(retries):
        logger.debug(f"HTTP GET (attempt {attempt + 1}/{retries}): {url}")
        result = subprocess.run(
            ["curl", "-s", "-L", "--max-time", "30",
             "-H", f"User-Agent: {UA}",
             "-H", "Accept: application/json",
             url],
            capture_output=True, text=True, timeout=60
        )
        if result.returncode == 0:
            # A zero exit code does not guarantee JSON: a Cloudflare
            # challenge page can arrive with HTTP 200.  Treat a decode
            # failure like a transport failure so it is retried too.
            try:
                return json.loads(result.stdout)
            except json.JSONDecodeError as exc:
                error = f"invalid JSON: {exc}"
        else:
            error = result.stderr[:200]
        if attempt < retries - 1:
            # Exponential backoff: 2s, 4s, 8s, ... (identical to the old
            # linear schedule for the default retries=3).
            wait = 2 ** (attempt + 1)
            logger.warning(f"curl failed (retry {attempt + 1}/{retries} after {wait}s): {error}")
            time.sleep(wait)
        else:
            raise RuntimeError(f"curl failed after {retries} attempts ({result.returncode}): {error}")
|
||||
|
||||
|
||||
def fix_gps(lat, lng):
|
||||
@@ -255,6 +261,8 @@ def scrape(max_properties: int | None = None):
|
||||
"first_seen": _prev_cache.get(str(unit_id), {}).get("first_seen", datetime.now().strftime("%Y-%m-%d")),
|
||||
"last_changed": datetime.now().strftime("%Y-%m-%d") if _prev_cache.get(str(unit_id), {}).get("price") != int(price) else _prev_cache[str(unit_id)].get("last_changed", datetime.now().strftime("%Y-%m-%d")),
|
||||
}
|
||||
if not validate_listing(result, "psn"):
|
||||
continue
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user