Add first_seen/last_updated timestamps to track property freshness

Each property record now carries two date fields:
- first_seen: date a scraper first encountered the listing (preserved across runs)
- last_updated: date of the most recent scrape that included it

All 6 scrapers (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
set these fields during scraping. Cached results preserve first_seen and
refresh last_updated. PSN and CityHome gain a load_previous() helper to
track first_seen across runs (they lacked caching before).

The merge script keeps the earliest first_seen and latest last_updated
when deduplicating listings across sources.

The HTML map now shows dates in popups ("Přidáno: DD.MM.YYYY"), displays
a green "NOVÉ" badge on newly discovered listings, and adds a "Přidáno"
dropdown filter (24 hours / 3 days / 7 days / 14 days) for spotting new listings.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jan Novak
2026-02-15 21:03:08 +01:00
parent c6089f0da9
commit 0b95c847c4
9 changed files with 1604 additions and 11509 deletions

View File

@@ -14,6 +14,7 @@ import re
import time
import urllib.request
import urllib.parse
from datetime import datetime
from html.parser import HTMLParser
from pathlib import Path
@@ -378,10 +379,14 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
logger.debug(f"Max properties limit reached: {max_properties}")
break
# Check cache — if hash_id exists and price unchanged, reuse
today = datetime.now().strftime("%Y-%m-%d")
cached = cache.get(str(item["id"]))
if cached and cached.get("price") == item["price"]:
cache_hits += 1
logger.debug(f"Cache hit for id={item['id']}")
cached["last_updated"] = today
if "first_seen" not in cached:
cached["first_seen"] = today
results.append(cached)
continue
@@ -442,6 +447,11 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
else:
building_type = construction.capitalize()
# Preserve first_seen from cache if this is a price-changed re-fetch
first_seen = today
if cached and "first_seen" in cached:
first_seen = cached["first_seen"]
result = {
"hash_id": item["id"],
"name": f"Prodej bytu {item['disposition']} {item.get('area', '?')}",
@@ -458,6 +468,8 @@ def scrape(max_pages: int | None = None, max_properties: int | None = None):
"url": item["url"],
"source": "idnes",
"image": "",
"first_seen": first_seen,
"last_updated": today,
}
results.append(result)
properties_fetched += 1