Files
maru-hleda-byt/merge_and_map.py
Jan Novak 0b95c847c4 Add first_seen/last_updated timestamps to track property freshness
Each property record now carries two date fields:
- first_seen: date the listing first appeared (preserved across runs)
- last_updated: date of the most recent scrape that included it

All 6 scrapers (Sreality, Realingo, Bezrealitky, iDNES, PSN, CityHome)
set these fields during scraping. Cached results preserve first_seen and
refresh last_updated. PSN and CityHome gain a load_previous() helper to
track first_seen across runs (they lacked caching before).

The merge script keeps the earliest first_seen and latest last_updated
when deduplicating listings across sources.

The HTML map now shows dates in popups ("Přidáno: DD.MM.YYYY"), displays
a green "NOVÉ" badge on newly discovered listings, and adds a "Přidáno"
dropdown filter (24h / 3 days / 7 days / 14 days) for spotting new ones.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 21:03:08 +01:00

127 lines
4.2 KiB
Python

#!/usr/bin/env python3
"""
Sloučí data ze Sreality, Realinga, Bezrealitek, iDNES, PSN a CityHome,
deduplikuje a vygeneruje mapu.
Deduplikace: stejná ulice (z locality) + stejná cena + stejná plocha = duplikát.
PSN a CityHome mají při deduplikaci prioritu (načtou se první).
"""
from __future__ import annotations

import json
import re
import unicodedata
from collections import Counter
from pathlib import Path

from scrape_and_map import generate_map, format_price
def normalize_street(locality: str) -> str:
    """Extract and normalize the street name from a locality string.

    The street is taken to be everything before the first comma. It is
    lower-cased, stripped of diacritics and reduced to ASCII letters and
    digits, so that spelling variants of the same street compare equal.

    Examples:
        "Studentská, Praha 6 - Dejvice" -> "studentska"
        "Rýnská, Praha"                 -> "rynska"

    Args:
        locality: Free-form locality text, typically "<street>, <city part>".

    Returns:
        The normalized street token; an empty string when nothing usable
        remains.
    """
    street = locality.split(",")[0].strip().lower()
    # Canonical decomposition splits every accented letter into its base
    # letter plus combining marks. This covers the whole Czech alphabet and
    # also letters a hand-written table would miss (e.g. "ö", "ü", "ľ").
    street = unicodedata.normalize("NFD", street)
    # Dropping everything outside [a-z0-9] removes the combining marks
    # together with spaces and punctuation.
    return re.sub(r"[^a-z0-9]", "", street)
def _canonical_number(value):
    """Return *value* in a form that formats identically for equal numbers.

    Missing or falsy values collapse to ``0``; floats with no fractional
    part become ``int`` so that ``54`` and ``54.0`` produce the same text.
    Anything else is returned unchanged.
    """
    if not value:
        return 0
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value


def dedup_key(estate: dict) -> str:
    """Create the deduplication key ``<street>_<price>_<area>`` for a listing.

    Two listings are treated as duplicates when their keys are equal.

    Args:
        estate: Listing record; the "locality", "price" and "area" entries
            are read, and each of them may be missing or ``None``.

    Returns:
        The key string built from the normalized street, price and area.
    """
    # `or ""` also covers an explicit None, which .get()'s default would not.
    street = normalize_street(estate.get("locality") or "")
    price = _canonical_number(estate.get("price"))
    area = _canonical_number(estate.get("area"))
    return f"{street}_{price}_{area}"
def _load_estates(sources: list[tuple[str, str]]) -> list[dict]:
    """Load the listings of every available source, in the given order.

    A source whose file does not exist is reported and skipped.
    """
    estates: list[dict] = []
    for label, filename in sources:
        path = Path(filename)
        if not path.exists():
            print(f"{label:12s} data nenalezena ({filename})")
            continue
        data = json.loads(path.read_text(encoding="utf-8"))
        if label == "Sreality":
            # Legacy Sreality records may lack the "source" field.
            for estate in data:
                estate.setdefault("source", "sreality")
        estates.extend(data)
        print(f"{label:12s} {len(data)} bytů")
    return estates


def _merge_timestamps(kept: dict, duplicate: dict) -> None:
    """Fold the timestamps of *duplicate* into *kept*, in place.

    Keeps the earliest "first_seen" and the latest "last_updated".
    """
    # NOTE(review): min/max compare the raw strings, which is chronological
    # only for ISO-8601 style dates — confirm against the scrapers.
    for field, pick in (("first_seen", min), ("last_updated", max)):
        incoming = duplicate.get(field, "")
        if not incoming:
            continue
        current = kept.get(field, "")
        kept[field] = pick(incoming, current) if current else incoming


def _deduplicate(estates: list[dict]) -> tuple[list[dict], int]:
    """Drop listings whose dedup key was already seen.

    The first listing with a given key is kept, so the order of *estates*
    decides which source wins. Returns the unique listings and the number
    of dropped duplicates.
    """
    kept_by_key: dict[str, dict] = {}
    unique: list[dict] = []
    dupes = 0
    for estate in estates:
        key = dedup_key(estate)
        kept = kept_by_key.get(key)
        if kept is None:
            kept_by_key[key] = estate
            unique.append(estate)
            continue
        dupes += 1
        _merge_timestamps(kept, estate)
        # Log with .get(): dedup_key tolerates missing fields, so a duplicate
        # may legitimately lack "locality" or "price".
        price = estate.get("price")
        price_text = format_price(price) if price is not None else "?"
        print(f" Duplikát: {estate.get('locality', '?')} | {price_text} | {estate.get('area', '?')}"
              f"({estate.get('source', '?')} vs {kept.get('source', '?')})")
    return unique, dupes


def main():
    """Merge all scraped sources, deduplicate them and generate the map.

    Reads the per-source JSON files from the current directory, writes the
    deduplicated listings to ``byty_merged.json`` and passes them to
    ``generate_map``. Progress and statistics are printed to stdout.
    """
    # Source definitions — PSN and CityHome come first, so their records are
    # the ones kept when a duplicate shows up in a later source.
    sources = [
        ("PSN", "byty_psn.json"),
        ("CityHome", "byty_cityhome.json"),
        ("Sreality", "byty_sreality.json"),
        ("Realingo", "byty_realingo.json"),
        ("Bezrealitky", "byty_bezrealitky.json"),
        ("iDNES", "byty_idnes.json"),
    ]
    all_estates = _load_estates(sources)
    print(f"Celkem: {len(all_estates)} bytů před deduplikací")

    deduplicated, dupes = _deduplicate(all_estates)
    print(f"\nOdstraněno duplikátů: {dupes}")
    print(f"Výsledek: {len(deduplicated)} unikátních bytů")

    # Count the surviving listings by source.
    by_source = Counter(e.get("source", "unknown") for e in deduplicated)
    for src, count in sorted(by_source.items()):
        print(f" {src}: {count}")

    # Save merged data.
    merged_path = Path("byty_merged.json")
    merged_path.write_text(
        json.dumps(deduplicated, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\n✓ Sloučená data: {merged_path.resolve()}")

    # Generate map.
    generate_map(deduplicated)
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()