#!/usr/bin/env python3
"""
Merge listings from Sreality, Realingo, Bezrealitky, iDNES, PSN and CityHome,
deduplicate them and generate a map.

Deduplication: same street (taken from ``locality``) + same price + same area
= duplicate.
PSN and CityHome take priority during deduplication (they are loaded first).
"""
from __future__ import annotations

import argparse
import json
import re
from collections import Counter
from pathlib import Path

from scrape_and_map import generate_map, format_price

# Input files in load order. The first record seen for a dedup key is the one
# that is kept, so PSN and CityHome (listed first) win over the portals.
_SOURCES = [
    ("PSN", "byty_psn.json"),
    ("CityHome", "byty_cityhome.json"),
    ("Sreality", "byty_sreality.json"),
    ("Realingo", "byty_realingo.json"),
    ("Bezrealitky", "byty_bezrealitky.json"),
    ("iDNES", "byty_idnes.json"),
]

# Lower-case Czech letters with diacritics mapped to their ASCII base letter.
_DIACRITICS = str.maketrans("áčďéěíňóřšťúůýž", "acdeeinorstuuyz")

# Everything that is not a lower-case ASCII letter or a digit.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]")


def normalize_street(locality: str) -> str:
    """Extract and normalize the street name from a locality string.

    The street is the part before the first comma. It is lower-cased, Czech
    diacritics are replaced by their base letters and every remaining
    character outside ``a-z0-9`` is dropped.

    Examples:
        "Studentská, Praha 6 - Dejvice" -> "studentska"
        "Rýnská, Praha" -> "rynska"

    Args:
        locality: Free-form locality text; ``None`` or an empty string is
            treated as "no street".

    Returns:
        The normalized street, possibly an empty string.
    """
    street = (locality or "").split(",")[0].strip().lower()
    return _NON_ALNUM_RE.sub("", street.translate(_DIACRITICS))


def _normalize_number(value):
    """Return *value* in a canonical form for use inside a dedup key.

    Missing/falsy values become ``0`` and integer-valued floats become ints,
    so that ``54`` and ``54.0`` (or ``None`` and ``0``) coming from different
    sources produce the same key text. Anything else is returned unchanged.
    """
    if not value:
        return 0
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value


def dedup_key(estate: dict) -> str:
    """Create the deduplication key ``<street>_<price>_<area>`` for a listing.

    Args:
        estate: Listing record; the ``locality``, ``price`` and ``area`` keys
            are read and each of them may be missing or ``None``.

    Returns:
        The key string; two listings with equal keys are considered duplicates.
    """
    street = normalize_street(estate.get("locality") or "")
    price = _normalize_number(estate.get("price"))
    area = _normalize_number(estate.get("area"))
    return f"{street}_{price}_{area}"


def _load_estates(data_path: Path) -> list[dict]:
    """Load every available source file from *data_path*, in priority order."""
    all_estates: list[dict] = []
    for label, filename in _SOURCES:
        path = data_path / filename
        if not path.exists():
            print(f"{label:12s} data nenalezena ({filename})")
            continue
        data = json.loads(path.read_text(encoding="utf-8"))
        # Ensure source is set (legacy Sreality records may lack the key).
        if label == "Sreality":
            for estate in data:
                estate.setdefault("source", "sreality")
        all_estates.extend(data)
        print(f"{label:12s} {len(data)} bytů")
    return all_estates


def _merge_timestamps(kept: dict, duplicate: dict) -> None:
    """Fold the timestamps of *duplicate* into *kept* (modified in place).

    Keeps the earliest ``first_seen`` and the latest ``last_updated``. Values
    are compared with plain ``min``/``max``.
    NOTE(review): assumes the timestamps are strings that sort
    chronologically (e.g. ISO 8601) — confirm against the scrapers.
    """
    for field, pick in (("first_seen", min), ("last_updated", max)):
        present = [v for v in (kept.get(field, ""), duplicate.get(field, "")) if v]
        if present:
            kept[field] = pick(present)


def _deduplicate(estates: list[dict]) -> tuple[list[dict], int]:
    """Drop duplicate listings, keeping the first record seen for each key.

    Returns:
        ``(unique_listings, number_of_duplicates_removed)``.
    """
    seen_keys: dict[str, dict] = {}
    deduplicated: list[dict] = []
    dupes = 0
    for estate in estates:
        key = dedup_key(estate)
        existing = seen_keys.get(key)
        if existing is None:
            seen_keys[key] = estate
            deduplicated.append(estate)
            continue

        dupes += 1
        _merge_timestamps(existing, estate)

        # Log it. Use .get() throughout: dedup_key() tolerates records without
        # locality/price, so the log line must not raise KeyError on them.
        price = estate.get("price")
        price_text = format_price(price) if price is not None else "?"
        print(
            f" Duplikát: {estate.get('locality', '?')} | {price_text} | "
            f"{estate.get('area', '?')} m² "
            f"({estate.get('source', '?')} vs {existing.get('source', '?')})"
        )
    return deduplicated, dupes


def main(data_dir: str = ".") -> None:
    """Merge all source files, deduplicate them and write the outputs.

    Reads the per-source JSON files from *data_dir*, writes the merged list to
    ``byty_merged.json`` and passes it to ``generate_map`` with the output
    path ``mapa_bytu.html`` in the same directory.

    Args:
        data_dir: Directory used for both reading and writing data files.
    """
    data_path = Path(data_dir)

    all_estates = _load_estates(data_path)
    print(f"Celkem: {len(all_estates)} bytů před deduplikací")

    # Deduplicate — the first occurrence wins, i.e. PSN and CityHome records
    # are preferred because they are loaded first.
    deduplicated, dupes = _deduplicate(all_estates)
    print(f"\nOdstraněno duplikátů: {dupes}")
    print(f"Výsledek: {len(deduplicated)} unikátních bytů")

    # Count by source
    by_source = Counter(e.get("source", "unknown") for e in deduplicated)
    for src, count in sorted(by_source.items()):
        print(f" {src}: {count}")

    # Save merged data
    merged_path = data_path / "byty_merged.json"
    merged_path.write_text(
        json.dumps(deduplicated, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\n✓ Sloučená data: {merged_path.resolve()}")

    # Generate map
    generate_map(deduplicated, output_path=str(data_path / "mapa_bytu.html"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Merge scraped data and generate map")
    parser.add_argument(
        "--data-dir",
        type=str,
        default=".",
        help="Directory for reading/writing data files (default: current dir)",
    )
    args = parser.parse_args()
    main(data_dir=args.data_dir)