All checks were successful
Build and Push / build (push) Successful in 7s
The Docker entrypoint previously created symlinks from /app/ to /app/data/ so that scripts writing relative paths would persist to the mounted volume. This caused symlink loops in production when stale symlinks leaked into the host data directory. Instead, all scrapers, merge_and_map.py, regen_map.py, and run_all.sh now accept a --data-dir argument (default: ".") that controls where data files are read from and written to. The entrypoint and crontab pass --data-dir /app/data, eliminating the need for symlinks entirely. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
133 lines
4.6 KiB
Python
133 lines
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Sloučí data ze Sreality, Realinga, Bezrealitek, iDNES, PSN a CityHome,
|
|
deduplikuje a vygeneruje mapu.
|
|
Deduplikace: stejná ulice (z locality) + stejná cena + stejná plocha = duplikát.
|
|
PSN a CityHome mají při deduplikaci prioritu (načtou se první).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
from scrape_and_map import generate_map, format_price
|
|
|
|
|
|
def normalize_street(locality: str) -> str:
    """Extract and normalize the street name from a locality string.

    The street is everything before the first comma. It is lower-cased,
    accented letters are folded to their base letter, and every character
    outside ``a-z0-9`` is dropped.

    Examples:
        "Studentská, Praha 6 - Dejvice" -> "studentska"
        "Rýnská, Praha" -> "rynska"
        "Müllerova, Praha 6" -> "mullerova"

    Args:
        locality: Locality text whose first comma-separated part is the street.

    Returns:
        The normalized street token; an empty string if nothing
        alphanumeric remains.
    """
    # Imported locally so the function does not depend on a new
    # module-level import.
    import unicodedata

    street = locality.split(",")[0].strip().lower()
    # Canonical decomposition splits an accented letter into its base letter
    # plus combining marks; the regex below then discards the marks. This
    # covers every Czech letter as well as others such as "ü" or "ô".
    street = unicodedata.normalize("NFD", street)
    # Drop everything that is not a plain ASCII letter or digit.
    return re.sub(r"[^a-z0-9]", "", street)
|
|
|
|
|
|
def dedup_key(estate: dict) -> str:
    """Build the deduplication key ``<street>_<price>_<area>`` for a listing.

    Args:
        estate: Listing record; the "locality", "price" and "area" entries
            are read, each of them optional.

    Returns:
        A string combining the normalized street, the price and the area.
        Two listings with equal keys are treated as duplicates.
    """
    # `or ""` also covers a locality that is present but null/None, which a
    # plain .get() default would pass straight through to normalize_street.
    street = normalize_street(estate.get("locality") or "")
    price = estate.get("price", 0)
    # A missing or null area is keyed as 0.
    area = estate.get("area") or 0
    return f"{street}_{price}_{area}"
|
|
|
|
|
|
def main(data_dir: str = "."):
    """Merge the scraped listing files, deduplicate them and render the map.

    Each known source file is read from *data_dir*; files that do not exist
    are reported and skipped. Listings are deduplicated with ``dedup_key``,
    the result is written to ``byty_merged.json`` and ``generate_map`` is
    called to produce ``mapa_bytu.html`` in the same directory. Progress is
    printed to stdout.

    Args:
        data_dir: Directory the source files are read from and the merged
            JSON and the HTML map are written to (default: current dir).
    """
    # Order matters: the first listing seen for a key is the one kept, so
    # PSN and CityHome take priority over the portals listed after them.
    sources = [
        ("PSN", "byty_psn.json"),
        ("CityHome", "byty_cityhome.json"),
        ("Sreality", "byty_sreality.json"),
        ("Realingo", "byty_realingo.json"),
        ("Bezrealitky", "byty_bezrealitky.json"),
        ("iDNES", "byty_idnes.json"),
    ]

    data_path = Path(data_dir)
    all_estates = []

    for label, filename in sources:
        path = data_path / filename
        if not path.exists():
            print(f"{label:12s} data nenalezena ({filename})")
            continue
        data = json.loads(path.read_text(encoding="utf-8"))
        if label == "Sreality":
            # Backfill the source tag on Sreality records that lack one.
            for e in data:
                e.setdefault("source", "sreality")
        all_estates.extend(data)
        print(f"{label:12s} {len(data)} bytů")

    print(f"Celkem: {len(all_estates)} bytů před deduplikací")

    # Deduplicate: keep the first listing per key (see the source order
    # above) and fold the timestamps of later duplicates into it.
    seen_keys = {}
    deduplicated = []
    dupes = 0

    for e in all_estates:
        key = dedup_key(e)
        existing = seen_keys.get(key)
        if existing is None:
            seen_keys[key] = e
            deduplicated.append(e)
            continue

        dupes += 1
        # Merge timestamps: keep the earliest first_seen and the latest
        # last_updated; a value present on only one side is carried over.
        for field, pick in (("first_seen", min), ("last_updated", max)):
            incoming = e.get(field, "")
            current = existing.get(field, "")
            if incoming and current:
                existing[field] = pick(incoming, current)
            elif incoming:
                existing[field] = incoming

        # Log it. dedup_key tolerates records without locality/price, so the
        # log line must not index those keys directly.
        price_text = format_price(e["price"]) if "price" in e else "?"
        print(f" Duplikát: {e.get('locality', '?')} | {price_text} | {e.get('area', '?')} m² "
              f"({e.get('source', '?')} vs {existing.get('source', '?')})")

    print(f"\nOdstraněno duplikátů: {dupes}")
    print(f"Výsledek: {len(deduplicated)} unikátních bytů")

    # Count the surviving listings per source.
    by_source = {}
    for e in deduplicated:
        src = e.get("source", "unknown")
        by_source[src] = by_source.get(src, 0) + 1
    for src, count in sorted(by_source.items()):
        print(f" {src}: {count}")

    # Save merged data
    merged_path = data_path / "byty_merged.json"
    merged_path.write_text(
        json.dumps(deduplicated, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\n✓ Sloučená data: {merged_path.resolve()}")

    # Generate map
    generate_map(deduplicated, output_path=str(data_path / "mapa_bytu.html"))
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read --data-dir and run the merge with it.
    cli = argparse.ArgumentParser(description="Merge scraped data and generate map")
    cli.add_argument(
        "--data-dir",
        default=".",
        type=str,
        help="Directory for reading/writing data files (default: current dir)",
    )
    options = cli.parse_args()
    main(data_dir=options.data_dir)
|