312 lines
9.7 KiB
Python
312 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
Realingo.cz scraper.

Downloads flats for sale in Prague and filters them by the configured criteria.

Output: byty_realingo.json
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import math
|
|
import re
|
|
import time
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
# ── Configuration (shared with the Sreality scraper) ────────────────────────

MAX_PRICE = 13_500_000
MIN_AREA = 69
MIN_FLOOR = 2
PER_PAGE = 40  # Realingo returns 40 listings per page

# Categories we want (layouts 3+kk and larger)
WANTED_CATEGORIES = {
    # 3+kk, 3+1
    "FLAT3_KK",
    "FLAT31",
    # 4+kk, 4+1
    "FLAT4_KK",
    "FLAT41",
    # 5+kk, 5+1
    "FLAT5_KK",
    "FLAT51",
    # 6+
    "FLAT6",
    # atypical layouts: the area is checked separately
    "OTHERS_FLAT",
}

# Mapping of category code → display label
CATEGORY_LABELS = {
    "FLAT1_KK": "1+kk",
    "FLAT11": "1+1",
    "FLAT2_KK": "2+kk",
    "FLAT21": "2+1",
    "FLAT3_KK": "3+kk",
    "FLAT31": "3+1",
    "FLAT4_KK": "4+kk",
    "FLAT41": "4+1",
    "FLAT5_KK": "5+kk",
    "FLAT51": "5+1",
    "FLAT6": "6+",
    "OTHERS_FLAT": "Atypický",
}

# HTTP headers sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml",
}

BASE_URL = "https://www.realingo.cz"
|
|
|
|
|
|
def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
    """Fetch one page of Prague listings.

    The listing data is read from the JSON embedded in the page's
    ``__NEXT_DATA__`` script tag.

    Args:
        page: Page number. Page 1 uses the bare listing URL; any other
            value appends ``{page}_strana/``.

    Returns:
        ``(items, total_count)`` taken from ``offer.list.data`` and
        ``offer.list.total`` of the embedded JSON, or ``([], 0)`` when the
        page contains no ``__NEXT_DATA__`` script tag.

    Raises:
        KeyError: If the embedded JSON lacks the expected nesting.
        json.JSONDecodeError: If the embedded payload is not valid JSON.

    Errors raised by ``urllib.request.urlopen`` are not caught here.
    """
    suffix = "" if page == 1 else f"{page}_strana/"
    url = f"{BASE_URL}/prodej_byty/praha/{suffix}"

    req = urllib.request.Request(url, headers=HEADERS)
    # Use the response as a context manager so the connection is always
    # released, even if reading or decoding fails.
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")

    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html,
        re.DOTALL,
    )
    if not match:
        return [], 0

    data = json.loads(match.group(1))
    offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
    return offer_list["data"], offer_list["total"]
|
|
|
|
|
|
def fetch_detail(listing_url: str) -> dict | None:
    """Fetch the detail page of one listing (floor, building type, etc.).

    Args:
        listing_url: Site-relative URL of the listing; it is appended to
            ``BASE_URL``.

    Returns:
        The first value of the ``offer.details`` mapping found in the page's
        embedded ``__NEXT_DATA__`` JSON, or ``None`` when the script tag is
        missing, the mapping is empty, or any error occurs.

    This is deliberately best-effort: every exception is reported as a
    warning on stdout and turned into ``None`` so one bad listing does not
    abort the whole run.
    """
    try:
        url = f"{BASE_URL}{listing_url}"
        req = urllib.request.Request(url, headers=HEADERS)
        # Context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html,
            re.DOTALL,
        )
        if not match:
            return None

        data = json.loads(match.group(1))
        details = data["props"]["pageProps"]["store"]["offer"]["details"]
        # The mapping is expected to hold a single entry; take the first one.
        return next(iter(details.values()), None)
    except Exception as e:
        print(f" Warning: detail fetch failed for {listing_url}: {e}")
        return None
|
|
|
|
|
|
def format_price(price: int) -> str:
    """Format a price with space-separated thousands followed by `` Kč``.

    Uses the ``,`` format specifier instead of manual string slicing, so a
    leading minus sign stays attached to the first digit group
    (``-123456`` → ``"-123 456 Kč"``).

    Args:
        price: The amount to format.

    Returns:
        For example ``"13 500 000 Kč"`` for ``13500000``.
    """
    return f"{price:,}".replace(",", " ") + " Kč"
|
|
|
|
|
|
def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
    """Load previously scraped data as a cache keyed by ``hash_id``.

    Args:
        json_path: Path of the JSON file written by an earlier run.

    Returns:
        Mapping of ``hash_id`` → stored entry. Empty when the file does not
        exist, cannot be read or parsed, or does not contain a JSON list.
        Entries that are not objects, lack ``hash_id``, or have an
        unhashable ``hash_id`` are skipped.
    """
    path = Path(json_path)
    if not path.exists():
        return {}

    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}

    if not isinstance(data, list):
        return {}

    cache = {}
    for entry in data:
        if not isinstance(entry, dict) or "hash_id" not in entry:
            continue
        try:
            cache[entry["hash_id"]] = entry
        except TypeError:
            # Unhashable hash_id (e.g. a list): ignore the corrupt entry.
            continue
    return cache
|
|
|
|
|
|
# Display labels for the building-type codes found in listing details.
_BUILDING_TYPE_LABELS = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "STEEL": "Ocelová",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
}

# Display labels for the ownership codes found in listing details.
_OWNERSHIP_LABELS = {
    "PRIVATE": "Osobní",
    "COOPERATIVE": "Družstevní",
    "STATE": "Státní/obecní",
}


def _listing_price(item):
    """Return the listing's total price, or 0 when it is missing or null."""
    return (item.get("price") or {}).get("total") or 0


def _listing_area(item):
    """Return the listing's main area, or None when it is missing."""
    return (item.get("area") or {}).get("main")


def _download_all_listings():
    """Page through the listing index and return every raw listing item."""
    listings = []
    page = 1
    total_pages = None

    while True:
        print(f" Strana {page} ...")
        items, total_count = fetch_listing_page(page)
        if total_pages is None:
            # The page count is derived once, from the first response.
            total_pages = math.ceil(total_count / PER_PAGE)
            print(f" → Celkem {total_count} inzerátů, {total_pages} stran")

        if not items:
            break

        listings.extend(items)
        page += 1
        if page > total_pages:
            break
        time.sleep(0.5)

    return listings


def _prefilter(listings):
    """Keep listings passing the category, price, area and GPS checks.

    Prints a summary of how many listings each check rejected.
    """
    kept = []
    excluded_category = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0

    for item in listings:
        if item.get("category", "") not in WANTED_CATEGORIES:
            excluded_category += 1
            continue

        price = _listing_price(item)
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue

        # A listing without a stated area is kept.
        area = _listing_area(item)
        if area is not None and area < MIN_AREA:
            excluded_area += 1
            continue

        loc = item.get("location") or {}
        if not loc.get("latitude") or not loc.get("longitude"):
            excluded_no_gps += 1
            continue

        kept.append(item)

    print("\nPo předfiltraci:")
    print(f" Vyloučeno (dispozice): {excluded_category}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Zbývá: {len(kept)}")
    return kept


def _process_listing(item, cache):
    """Resolve one pre-filtered listing to ``(outcome, result)``.

    ``outcome`` is one of ``"cache"``, ``"detail"``, ``"panel"``,
    ``"floor"`` or ``"ok"``; ``result`` is the output record for
    ``"cache"`` and ``"ok"`` and ``None`` otherwise.
    """
    item_id = int(item["id"])
    item_price = _listing_price(item)

    # Reuse the cached record when the id is known and the price is unchanged.
    cached = cache.get(item_id)
    if cached and cached.get("price") == item_price:
        return "cache", cached

    time.sleep(0.3)
    detail_data = fetch_detail(item["url"])
    if not detail_data:
        return "detail", None

    # The detail dict is looked up under "offer" first, then at top level.
    detail = (detail_data.get("offer") or {}).get("detail") or {}
    if not detail:
        detail = detail_data.get("detail") or {}

    # Exclude panel buildings.
    building_type = detail.get("buildingType", "")
    if building_type == "PANEL":
        print(f" ✗ Vyloučen #{item['id']}: panel")
        return "panel", None

    # Exclude housing estates; they share the panel counter.
    building_position = detail.get("buildingPosition", "")
    if building_position and "ESTATE" in str(building_position).upper():
        print(f" ✗ Vyloučen #{item['id']}: sídliště")
        return "panel", None

    # A listing without a stated floor is kept.
    floor = detail.get("floor")
    if floor is not None and floor < MIN_FLOOR:
        return "floor", None

    cat = item.get("category", "")
    loc = item.get("location") or {}
    area = _listing_area(item)
    ownership = detail.get("ownership", "")

    result = {
        "hash_id": item_id,
        "name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {'?' if area is None else area} m²",
        "price": item_price,
        "price_formatted": format_price(item_price),
        "locality": loc.get("address", "Praha"),
        "lat": loc["latitude"],
        "lon": loc["longitude"],
        "disposition": CATEGORY_LABELS.get(cat, "?"),
        "floor": floor,
        "area": area,
        "building_type": _BUILDING_TYPE_LABELS.get(building_type, building_type or "neuvedeno"),
        "ownership": _OWNERSHIP_LABELS.get(ownership, ownership or "neuvedeno"),
        "url": f"{BASE_URL}{item['url']}",
        "source": "realingo",
        "image": "",
    }
    return "ok", result


def scrape() -> list[dict]:
    """Run the full Realingo scrape and return the matching flats.

    Phases:
      1. Download every listing page.
      2. Pre-filter by category, price, area and GPS using listing data.
      3. Fetch details for the remainder (or reuse the cache) and filter by
         building type, building position and floor.

    Progress and summary statistics are printed to stdout.

    Returns:
        List of result records, one dict per matching flat.
    """
    cache = load_cache()

    print("=" * 60)
    print("Stahuji inzeráty z Realingo.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA} m²")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)

    # Phase 1: fetch all listing pages.
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = _download_all_listings()
    print(f"\n Staženo: {len(all_listings)} inzerátů")

    # Phase 2: pre-filter using data already present in the listing index.
    pre_filtered = _prefilter(all_listings)

    # Phase 3: fetch details for the remaining listings.
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    outcomes = dict.fromkeys(("cache", "detail", "panel", "floor", "ok"), 0)

    for done, item in enumerate(pre_filtered, start=1):
        outcome, result = _process_listing(item, cache)
        outcomes[outcome] += 1
        if result is not None:
            results.append(result)

        # Report progress for every 20th item regardless of its outcome.
        if done % 20 == 0:
            print(f" Zpracováno {done}/{len(pre_filtered)} ...")

    print(f"\n{'=' * 60}")
    print("Výsledky Realingo:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {outcomes['cache']}")
    print(f" Vyloučeno (panel/síd): {outcomes['panel']}")
    print(f" Vyloučeno (patro): {outcomes['floor']}")
    print(f" Vyloučeno (bez detailu): {outcomes['detail']}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")

    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the scrape and persist any matches as JSON.
    start = time.time()
    estates = scrape()

    if not estates:
        # Nothing matched, so no file is written.
        print("\nŽádné byty z Realinga neodpovídají kritériím :(")
    else:
        json_path = Path("byty_realingo.json")
        serialized = json.dumps(estates, ensure_ascii=False, indent=2)
        json_path.write_text(serialized, encoding="utf-8")

        # Elapsed time covers the scrape and the file write.
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
|