Add first_seen/last_updated date fields to track property freshness

Each property record now carries two date fields:
- first_seen: date the listing first appeared (preserved across runs)
- last_updated: date of the most recent scrape that included it

All 6 scrapers (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
set these fields during scraping. Cached results preserve first_seen and
refresh last_updated. PSN and CityHome gain a load_previous() helper to
track first_seen across runs (they lacked caching before).

The merge script keeps the earliest first_seen and latest last_updated
when deduplicating listings across sources.

The HTML map now shows dates in popups ("Přidáno: DD.MM.YYYY"), displays
a green "NOVÉ" badge on newly discovered listings, and adds a "Přidáno"
dropdown filter (24h / 3 days / 7 days / 14 days) for spotting newly
added listings.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jan Novak
2026-02-15 21:03:08 +01:00
parent c6089f0da9
commit 0b95c847c4
9 changed files with 1604 additions and 11509 deletions

View File

@@ -13,6 +13,7 @@ import math
import re
import time
import urllib.request
from datetime import datetime
from pathlib import Path
logger = logging.getLogger(__name__)
@@ -284,10 +285,14 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
# Check cache — if hash_id exists and price unchanged, reuse
adv_id = int(adv["id"])
adv_price = adv.get("price", 0) or 0
today = datetime.now().strftime("%Y-%m-%d")
cached = cache.get(adv_id)
if cached and cached.get("price") == adv_price:
cache_hits += 1
logger.debug(f"Cache hit for id={adv_id}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached)
continue
@@ -339,6 +344,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
if not address:
address = adv.get('address({"locale":"CS"})', "Praha")
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = {
"hash_id": int(adv["id"]),
"name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')}",
@@ -355,6 +365,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
"source": "bezrealitky",
"image": "",
"first_seen": first_seen,
"last_updated": today,
}
results.append(result)
properties_fetched += 1