v1
This commit is contained in:
113
merge_and_map.py
Normal file
113
merge_and_map.py
Normal file
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Sloučí data ze Sreality, Realinga, Bezrealitek, iDNES, PSN a CityHome,
|
||||
deduplikuje a vygeneruje mapu.
|
||||
Deduplikace: stejná ulice (z locality) + stejná cena + stejná plocha = duplikát.
|
||||
PSN a CityHome mají při deduplikaci prioritu (načtou se první).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from scrape_and_map import generate_map, format_price
|
||||
|
||||
|
||||
def normalize_street(locality: str) -> str:
    """Return a comparison-friendly street name taken from *locality*.

    The street is the text before the first comma. It is stripped,
    lower-cased, Czech accented letters are mapped to their ASCII base
    letters, and every character outside ``a-z0-9`` is then removed.

    Examples:
        "Studentská, Praha 6 - Dejvice" -> "studentska"
        "Rýnská, Praha" -> "rynska"
    """
    # A single translation table does the work of a chain of replace calls.
    ascii_table = str.maketrans({
        "á": "a", "č": "c", "ď": "d", "é": "e", "ě": "e",
        "í": "i", "ň": "n", "ó": "o", "ř": "r", "š": "s",
        "ť": "t", "ú": "u", "ů": "u", "ý": "y", "ž": "z",
    })

    head, _, _ = locality.partition(",")
    folded = head.strip().lower().translate(ascii_table)

    # Anything that is still not a lowercase ASCII letter or digit
    # (spaces, punctuation, unmapped accented letters) is dropped.
    return re.sub(r"[^a-z0-9]", "", folded)
|
||||
|
||||
|
||||
def _normalize_number(value):
    """Collapse integral floats to ints so that 65 and 65.0 render the same."""
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value


def dedup_key(estate: dict) -> str:
    """Build the deduplication key ``"<street>_<price>_<area>"`` for a listing.

    Args:
        estate: Listing record; the keys ``locality``, ``price`` and
            ``area`` are read. Each may be missing or ``None``.

    Returns:
        The normalized street name, price and area joined with
        underscores. Listings that produce the same key are treated as
        duplicates by the caller.
    """
    # `or` rather than a .get() default: the default only covers a missing
    # key, while a key that is present with a null value would hand None to
    # normalize_street and crash on .split().
    street = normalize_street(estate.get("locality") or "")

    # Missing and null values both become 0, and integral floats are
    # collapsed to ints, so numerically equal values from different
    # sources (65 vs 65.0) yield the same key text.
    price = _normalize_number(estate.get("price") or 0)
    area = _normalize_number(estate.get("area") or 0)

    return f"{street}_{price}_{area}"
|
||||
|
||||
|
||||
def main():
    """Merge the scraped listing files, drop duplicates, save and map the result.

    Steps:
      1. Load every source JSON file that exists (paths are relative to
         the current working directory); missing files are reported and
         skipped.
      2. Keep the first listing seen for each ``dedup_key`` and log every
         later listing with the same key as a duplicate.
      3. Print per-source counts, write ``byty_merged.json`` and pass the
         deduplicated list to ``generate_map``.
    """
    # Source definitions — PSN and CityHome come first, so their records
    # are the ones kept when the same flat appears in several sources.
    sources = [
        ("PSN", "byty_psn.json"),
        ("CityHome", "byty_cityhome.json"),
        ("Sreality", "byty_sreality.json"),
        ("Realingo", "byty_realingo.json"),
        ("Bezrealitky", "byty_bezrealitky.json"),
        ("iDNES", "byty_idnes.json"),
    ]

    all_estates = []

    for label, filename in sources:
        path = Path(filename)
        if not path.exists():
            print(f"{label:12s} data nenalezena ({filename})")
            continue

        data = json.loads(path.read_text(encoding="utf-8"))
        # Legacy Sreality records may lack the "source" field; fill it in
        # so the per-source statistics below stay meaningful.
        if label == "Sreality":
            for e in data:
                e.setdefault("source", "sreality")
        all_estates.extend(data)
        print(f"{label:12s} {len(data)} bytů")

    print(f"Celkem: {len(all_estates)} bytů před deduplikací")

    # Deduplicate — the first record seen for a key wins, which (given the
    # load order above) means PSN and CityHome take precedence.
    seen_keys = {}
    deduplicated = []
    dupes = 0

    for e in all_estates:
        key = dedup_key(e)
        if key in seen_keys:
            dupes += 1
            existing = seen_keys[key]
            # Use .get() here as well: dedup_key tolerates records without
            # "locality"/"price", so logging must not raise KeyError on them.
            print(f" Duplikát: {e.get('locality', '?')} | {format_price(e.get('price', 0))} | {e.get('area', '?')} m² "
                  f"({e.get('source', '?')} vs {existing.get('source', '?')})")
        else:
            seen_keys[key] = e
            deduplicated.append(e)

    print(f"\nOdstraněno duplikátů: {dupes}")
    print(f"Výsledek: {len(deduplicated)} unikátních bytů")

    # Count the surviving listings per source.
    by_source = {}
    for e in deduplicated:
        src = e.get("source", "unknown")
        by_source[src] = by_source.get(src, 0) + 1
    for src, count in sorted(by_source.items()):
        print(f" {src}: {count}")

    # Save the merged data.
    merged_path = Path("byty_merged.json")
    merged_path.write_text(
        json.dumps(deduplicated, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\n✓ Sloučená data: {merged_path.resolve()}")

    # Generate the map from the deduplicated listings.
    generate_map(deduplicated)
|
||||
|
||||
|
||||
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user