Upload files to "/"

v1 scrapery
This commit is contained in:
2026-02-13 16:11:28 +00:00
parent 82d1f94104
commit 846d0bd9f2
5 changed files with 1760 additions and 0 deletions

351
scrape_bezrealitky.py Normal file
View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""
Bezrealitky.cz scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_bezrealitky.json
"""
from __future__ import annotations
import json
import math
import re
import time
import urllib.request
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000   # price cap in CZK
MIN_AREA = 69            # minimum area — presumably m²; TODO confirm units
MIN_FLOOR = 2            # minimum floor (NP = above-ground floor numbering)
PER_PAGE = 15  # Bezrealitky returns 15 results per page
# Flat layouts (dispositions) we want to keep
WANTED_DISPOSITIONS = {
    "DISP_3_KK", "DISP_3_1",
    "DISP_4_KK", "DISP_4_1",
    "DISP_5_KK", "DISP_5_1",
    "DISP_6",
    "DISP_OTHER",  # atypical layouts
}
# API disposition code -> human-readable Czech label
DISPOSITION_LABELS = {
    "DISP_1_KK": "1+kk", "DISP_1_1": "1+1",
    "DISP_2_KK": "2+kk", "DISP_2_1": "2+1",
    "DISP_3_KK": "3+kk", "DISP_3_1": "3+1",
    "DISP_4_KK": "4+kk", "DISP_4_1": "4+1",
    "DISP_5_KK": "5+kk", "DISP_5_1": "5+1",
    "DISP_6": "6+",
    "DISP_OTHER": "Atypický",
}
# Construction type code -> Czech label
CONSTRUCTION_MAP = {
    "BRICK": "Cihlová",
    "PANEL": "Panelová",
    "WOOD": "Dřevostavba",
    "MIXED": "Smíšená",
    "MONTAGE": "Montovaná",
    "STEEL": "Ocelová",
}
# Ownership code -> Czech label
OWNERSHIP_MAP = {
    "OSOBNI": "Osobní",
    "DRUZSTEVNI": "Družstevní",
    "STATNI": "Státní/obecní",
}
# Browser-like headers to avoid trivial bot blocking
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}
BASE_URL = "https://www.bezrealitky.cz"
def fetch_page(page: int) -> tuple[list[dict], int]:
    """
    Fetch one listing page from Bezrealitky.

    Parses the Next.js __NEXT_DATA__ JSON blob embedded in the HTML and
    extracts Advert entries from the Apollo client cache.

    Args:
        page: 1-based listing page number.

    Returns:
        (adverts, total): raw Advert dicts from the Apollo cache and the
        total advert count reported by the server (0 if not found).
    """
    url = f"{BASE_URL}/vypis/nabidka-prodej/byt/praha?page={page}"
    req = urllib.request.Request(url, headers=HEADERS)
    # Context manager closes the HTTP connection (the original leaked the
    # response object).
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html, re.DOTALL
    )
    if not match:
        return [], 0
    data = json.loads(match.group(1))
    cache = data["props"]["pageProps"]["apolloCache"]
    # Adverts live in the cache under keys like "Advert:<id>"
    adverts = [
        val for key, val in cache.items()
        if key.startswith("Advert:") and isinstance(val, dict)
        and val.get("__typename") == "Advert"
    ]
    # Total count is stored on ROOT_QUERY under a listAdverts(...) key
    total = 0
    root = cache.get("ROOT_QUERY", {})
    for key, val in root.items():
        if "listAdverts" in key and isinstance(val, dict):
            tc = val.get("totalCount")
            if tc and tc > total:
                total = tc
    return adverts, total
def fetch_detail(uri: str) -> dict | None:
    """
    Fetch the detail page for one listing and return its full Advert dict.

    Detail pages embed a richer Apollo cache entry (construction, etage,
    ownership, ...) than the listing page.  Returns None when the page
    cannot be fetched or no detailed Advert entry is found; any error is
    printed and swallowed (best-effort scraping).
    """
    try:
        url = f"{BASE_URL}/nemovitosti-byty-domy/{uri}"
        req = urllib.request.Request(url, headers=HEADERS)
        # with-block closes the connection (the original leaked it)
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            return None
        data = json.loads(match.group(1))
        cache = data["props"]["pageProps"]["apolloCache"]
        # Find the full advert in the cache
        for key, val in cache.items():
            if key.startswith("Advert:") and isinstance(val, dict):
                # Only detail pages carry these extra fields
                if "construction" in val or "etage" in val or "ownership" in val:
                    return val
    except Exception as e:
        print(f" Warning: detail failed for {uri}: {e}")
    return None
def format_price(price: int) -> str:
    """Render *price* with a space between each 3-digit group, e.g. 13 500 000."""
    text = str(price)
    # Slice 3-char groups from the right, then join them left-to-right.
    # NOTE(review): the original appended an empty string — possibly a lost
    # " Kč" suffix; behavior (no suffix) is preserved here.
    groups = [text[max(i - 3, 0):i] for i in range(len(text), 0, -3)]
    return " ".join(reversed(groups))
def load_cache(json_path: str = "byty_bezrealitky.json") -> dict[int, dict]:
    """
    Load previously scraped listings as a cache keyed by hash_id.

    Returns an empty dict when the file is missing or unreadable, so the
    scraper simply re-fetches everything.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        return {e["hash_id"]: e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError, OSError):
        # Original caught only JSONDecodeError/KeyError; a cache file that
        # holds a non-list (TypeError/AttributeError) or a read failure
        # (OSError) would crash.  Corrupt cache is non-fatal: start fresh.
        return {}
def scrape():
    """
    Scrape Bezrealitky sale listings for Prague and filter them.

    Pipeline: (1) page through the listing index, (2) pre-filter on
    disposition/price/area/GPS from list data, (3) fetch each remaining
    detail page and drop panel buildings, housing estates and low floors.
    Previously saved results are reused when the price is unchanged.

    Returns a list of normalized result dicts (see the `result` literal).
    """
    cache = load_cache()
    print("=" * 60)
    print("Stahuji inzeráty z Bezrealitky.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)
    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_adverts = {}  # id -> advert dict (dedup)
    page = 1
    total = None
    while True:
        print(f" Strana {page} ...")
        adverts, total_count = fetch_page(page)
        if total is None and total_count > 0:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")
        if not adverts:
            break
        for adv in adverts:
            adv_id = adv.get("id")
            if adv_id and adv_id not in all_adverts:
                all_adverts[adv_id] = adv
        page += 1
        if total and page > math.ceil(total / PER_PAGE):
            break
        time.sleep(0.5)  # be polite to the server
    print(f"\n Staženo: {len(all_adverts)} unikátních inzerátů")
    # Step 2: Pre-filter by disposition, price, area from list data
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0
    for adv in all_adverts.values():
        disp = adv.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue
        price = adv.get("price", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue
        surface = adv.get("surface")
        # Listings without an area are kept — only a known-too-small area excludes
        if surface is not None and surface < MIN_AREA:
            excluded_area += 1
            continue
        gps = adv.get("gps", {})
        if not gps or not gps.get("lat") or not gps.get("lng"):
            excluded_no_gps += 1
            continue
        pre_filtered.append(adv)
    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Zbývá: {len(pre_filtered)}")
    # Step 3: Fetch details
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0
    for i, adv in enumerate(pre_filtered):
        uri = adv.get("uri", "")
        if not uri:
            excluded_detail += 1
            continue
        # Check cache — if hash_id exists and price unchanged, reuse
        adv_id = int(adv["id"])
        adv_price = adv.get("price", 0) or 0
        cached = cache.get(adv_id)
        if cached and cached.get("price") == adv_price:
            cache_hits += 1
            results.append(cached)
            continue
        time.sleep(0.4)  # throttle detail requests
        detail = fetch_detail(uri)
        if not detail:
            excluded_detail += 1
            continue
        # Check construction — exclude panel buildings
        construction = detail.get("construction", "")
        if construction == "PANEL":
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: panel")
            continue
        # Check situation — exclude housing estates (sídliště)
        situation = detail.get("situation", "")
        if situation and "HOUSING_ESTATE" in str(situation).upper():
            excluded_panel += 1
            print(f" ✗ Vyloučen #{adv['id']}: sídliště")
            continue
        # Check floor (etage); missing floor is kept
        etage = detail.get("etage")
        if etage is not None and etage < MIN_FLOOR:
            excluded_floor += 1
            continue
        gps = adv.get("gps", {})
        disp = adv.get("disposition", "")
        # Get address — the Apollo cache key includes a locale parameter,
        # e.g. 'address({"locale":"CS"})'; prefer the variant without
        # house number, then any address key, then the list-page fallback.
        address = ""
        for key in detail:
            if key.startswith("address(") and "withHouseNumber" not in key:
                address = detail[key]
                break
        if not address:
            for key in detail:
                if key.startswith("address("):
                    address = detail[key]
                    break
        if not address:
            address = adv.get('address({"locale":"CS"})', "Praha")
        result = {
            "hash_id": int(adv["id"]),
            "name": f"Prodej bytu {DISPOSITION_LABELS.get(disp, '?')} {adv.get('surface', '?')}",
            "price": adv.get("price", 0),
            "price_formatted": format_price(adv.get("price", 0)),
            "locality": address,
            "lat": gps["lat"],
            "lon": gps["lng"],
            "disposition": DISPOSITION_LABELS.get(disp, "?"),
            "floor": etage,
            "area": adv.get("surface"),
            "building_type": CONSTRUCTION_MAP.get(construction, construction or "neuvedeno"),
            "ownership": OWNERSHIP_MAP.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}/nemovitosti-byty-domy/{uri}",
            "source": "bezrealitky",
            "image": "",
        }
        results.append(result)
        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")
    print(f"\n{'=' * 60}")
    print(f"Výsledky Bezrealitky:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # CLI entry point: scrape and persist results as pretty-printed JSON.
    start = time.time()
    estates = scrape()
    if estates:
        json_path = Path("byty_bezrealitky.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Bezrealitek neodpovídají kritériím :(")

328
scrape_cityhome.py Normal file
View File

@@ -0,0 +1,328 @@
#!/usr/bin/env python3
"""
CityHome (city-home.cz) scraper.
Stáhne byty na prodej v Praze z projektů CityHome/SATPO.
Výstup: byty_cityhome.json
"""
from __future__ import annotations
import json
import re
import time
import urllib.request
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 14_000_000   # price cap in CZK
MIN_AREA = 69            # minimum area — presumably m²; TODO confirm units
MIN_FLOOR = 2            # minimum floor (NP = above-ground floor numbering)
# Flat layouts (dispositions) to keep
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
# Browser-like headers to avoid trivial bot blocking
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}
BASE_URL = "https://www.city-home.cz"
def fetch_url(url: str) -> str:
    """
    Fetch *url* and return the decoded HTML.

    Retries up to 3 times on connection errors with linear backoff
    (2 s, 4 s); the last failure is re-raised.
    """
    for attempt in range(3):
        try:
            req = urllib.request.Request(url, headers=HEADERS)
            # Context manager closes the connection (the original leaked the
            # response object).  urllib.error is reachable because importing
            # urllib.request pulls it in.
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
            if attempt < 2:
                time.sleep((attempt + 1) * 2)
                print(f" Retry {attempt + 1}: {e}")
            else:
                raise
def format_price(price: int) -> str:
    """Render *price* with a space between each 3-digit group, e.g. 13 500 000."""
    text = str(price)
    # Slice 3-char groups from the right, then join them left-to-right.
    # NOTE(review): the original appended an empty string — possibly a lost
    # " Kč" suffix; behavior (no suffix) is preserved here.
    groups = [text[max(i - 3, 0):i] for i in range(len(text), 0, -3)]
    return " ".join(reversed(groups))
def parse_filter_page(html: str) -> list[dict]:
    """
    Parse all listing rows from the CityHome filter page.

    Each unit is a <tr> carrying data-* attributes (price, area, unit
    type, availability, project, transaction, disposition, location); the
    cells hold the detail link, the floor ("3.NP" / "2.PP") and the
    project name.

    Returns a list of plain dicts, one per row with a data-cena attribute.
    """
    listings = []
    # Find each <tr> tag together with all of its attributes.  (The
    # original also built an unused row regex and a loop whose result was
    # discarded — dead code, removed.)
    for match in re.finditer(r'<tr\s+([^>]*data-cena="[^"]*"[^>]*)>(.*?)</tr>', html, re.DOTALL):
        attrs_str = match.group(1)
        row_content = match.group(2)
        # Pull the individual data-* attributes out of the tag.
        cena = re.search(r'data-cena="(\d+)"', attrs_str)
        plocha = re.search(r'data-plocha="([\d.]+)"', attrs_str)
        unittype = re.search(r'data-unittype="(\d+)"', attrs_str)
        free = re.search(r'data-free="(yes|no)"', attrs_str)
        project = re.search(r'data-project="(\d+)"', attrs_str)
        transaction = re.search(r'data-transaction="([^"]*)"', attrs_str)
        dispozition = re.search(r'data-dispozition="([^"]*)"', attrs_str)
        location = re.search(r'data-location="([^"]*)"', attrs_str)
        if not cena:
            continue
        # Detail URL and unit name come from the first <a> in the row.
        link_match = re.search(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', row_content, re.DOTALL)
        detail_url = link_match.group(1).strip() if link_match else ""
        unit_name = re.sub(r'<[^>]+>', '', link_match.group(2)).strip() if link_match else ""
        if detail_url and not detail_url.startswith("http"):
            detail_url = BASE_URL + detail_url
        # Floor: look for "3.NP" (above ground) or "2.PP" (underground).
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL)
        floor = None
        project_name = ""
        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            np_match = re.search(r'(\d+)\.\s*NP', cell_text)
            pp_match = re.search(r'(\d+)\.\s*PP', cell_text)
            if np_match:
                floor = int(np_match.group(1))
            elif pp_match:
                floor = -int(pp_match.group(1))  # underground floor
        # Project name: first textual cell that is not a number, a price,
        # a floor marker or the unit name itself.
        # NOTE(review): the original tested `"" not in cell_text`, which is
        # always False (the empty string is a substring of everything), so
        # project_name was never set — the currency symbols ("Kč", "€")
        # were evidently lost; restored here.
        for cell in cells:
            cell_text = re.sub(r'<[^>]+>', '', cell).strip()
            if (cell_text and not re.match(r'^[\d\s.,]+$', cell_text)
                    and "NP" not in cell_text and "PP" not in cell_text
                    and "Kč" not in cell_text and "€" not in cell_text
                    and "EUR" not in cell_text and "CZK" not in cell_text):
                if len(cell_text) > 3 and cell_text != unit_name:
                    project_name = cell_text
                    break
        listing = {
            "price": int(cena.group(1)),
            "area": float(plocha.group(1)) if plocha else 0,
            "unittype": int(unittype.group(1)) if unittype else 0,
            "free": free.group(1) if free else "no",
            "project_id": project.group(1) if project else "",
            "transaction": transaction.group(1) if transaction else "",
            "disposition": dispozition.group(1) if dispozition else "",
            "location": location.group(1) if location else "",
            "url": detail_url,
            "unit_name": unit_name,
            "floor": floor,
            "project_name": project_name,
        }
        listings.append(listing)
    return listings
def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
    """Extract project GPS coordinates from a locality page's embedded JS."""
    # JS array entries look like: ['<h4>Project Name</h4>...', 'LAT', 'LON', '1', 'Name']
    marker = re.compile(r"\['[^']*<h4>([^<]+)</h4>[^']*',\s*'([\d.]+)',\s*'([\d.]+)'")
    return {
        hit.group(1).strip(): (float(hit.group(2)), float(hit.group(3)))
        for hit in marker.finditer(html)
    }
def scrape():
    """
    Scrape CityHome unit listings for Prague and filter them.

    Pipeline: (1) fetch the single filter page holding every unit,
    (2) resolve project GPS from each project's locality page,
    (3) filter on availability, type, disposition, price, area, floor
    and GPS presence.

    Returns a list of normalized result dicts.
    """
    print("=" * 60)
    print("Stahuji inzeráty z CityHome (city-home.cz)")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print("=" * 60)
    # Step 1: Fetch the main filter page
    print("\nFáze 1: Stahování seznamu bytů...")
    html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
    all_listings = parse_filter_page(html)
    print(f" Nalezeno: {len(all_listings)} jednotek")
    # Step 2: Collect unique project slugs from detail URLs to fetch GPS
    print("\nFáze 2: Stahování GPS souřadnic projektů...")
    project_slugs = set()
    for listing in all_listings:
        url = listing.get("url", "")
        # e.g. /projekty/zateckych-14/nabidka-nemovitosti/byt-a31
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        if slug_match:
            project_slugs.add(slug_match.group(1))
    # Fetch GPS for each project from its locality page
    project_gps = {}
    for slug in sorted(project_slugs):
        time.sleep(0.5)  # be polite to the server
        try:
            locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
            loc_html = fetch_url(locality_url)
            gps = extract_project_gps(loc_html)
            if gps:
                # Take first entry (the project itself)
                first_name, (lat, lon) = next(iter(gps.items()))
                project_gps[slug] = (lat, lon)
                print(f"{slug}: {lat}, {lon}")
            else:
                print(f"{slug}: GPS nenalezeno")
        except Exception as e:
            # Best-effort: a failed locality page only loses that project.
            print(f"{slug}: chyba ({e})")
    # Step 3: Filter listings
    print(f"\nFáze 3: Filtrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_no_gps = 0
    for listing in all_listings:
        # Only available units
        if listing["free"] != "yes":
            excluded_sold += 1
            continue
        # Only apartments (unittype=2)
        if listing["unittype"] != 2:
            excluded_type += 1
            continue
        # Only sales
        if listing["transaction"] != "prodej":
            excluded_type += 1
            continue
        # Disposition
        disp = listing["disposition"]
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue
        # Price
        price = listing["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue
        # Area
        area = listing["area"]
        if area < MIN_AREA:
            excluded_area += 1
            continue
        # Floor (None = unknown, kept)
        floor = listing["floor"]
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue
        # GPS comes from the project, keyed by its URL slug
        url = listing.get("url", "")
        slug_match = re.search(r'/(?:projekty|bytove-domy)/([^/]+)/', url)
        slug = slug_match.group(1) if slug_match else ""
        gps = project_gps.get(slug)
        if not gps:
            excluded_no_gps += 1
            continue
        lat, lon = gps
        result = {
            "hash_id": f"cityhome_{slug}_{listing['unit_name']}",
            "name": f"Prodej bytu {disp} {area} m² — {listing['project_name']}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": f"{listing['project_name']}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": "Cihlová",  # CityHome renovates brick buildings
            "ownership": "neuvedeno",
            "url": url,
            "source": "cityhome",
            "image": "",
        }
        results.append(result)
    print(f"\n{'=' * 60}")
    print(f"Výsledky CityHome:")
    print(f" Celkem jednotek: {len(all_listings)}")
    print(f" Vyloučeno (prodáno): {excluded_sold}")
    print(f" Vyloučeno (typ): {excluded_type}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # CLI entry point: scrape and persist results as pretty-printed JSON.
    start = time.time()
    estates = scrape()
    if estates:
        json_path = Path("byty_cityhome.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z CityHome neodpovídají kritériím :(")

464
scrape_idnes.py Normal file
View File

@@ -0,0 +1,464 @@
#!/usr/bin/env python3
"""
Reality iDNES scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_idnes.json
"""
from __future__ import annotations
import json
import math
import re
import time
import urllib.request
import urllib.parse
from html.parser import HTMLParser
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 13_500_000   # price cap in CZK
MIN_AREA = 69            # minimum area — presumably m²; TODO confirm units
MIN_FLOOR = 2            # minimum floor (NP = above-ground floor numbering)
PER_PAGE = 26  # iDNES returns 26 results per page
# Disposition codes for the s-qc[subtypeFlat] query parameter
DISPOSITION_CODES = "3k|31|4k|41|5k|51|6k"
# Disposition as written in the listing title -> normalized label
DISPOSITION_MAP = {
    "3+kk": "3+kk", "3+1": "3+1",
    "4+kk": "4+kk", "4+1": "4+1",
    "5+kk": "5+kk", "5+1": "5+1",
    "6+kk": "6+", "6+1": "6+",
    "6 a více": "6+",
}
# Browser-like headers; "identity" encoding keeps responses un-gzipped
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "cs,en;q=0.9",
    "Accept-Encoding": "identity",
    "Connection": "keep-alive",
}
BASE_URL = "https://reality.idnes.cz"
MAX_RETRIES = 5  # network retry attempts in fetch_url
def fetch_url(url: str) -> str:
    """
    Fetch *url* and return decoded HTML, retrying on connection errors.

    Retries up to MAX_RETRIES times with linear backoff (3, 6, 9, 12 s);
    the final failure is re-raised to the caller.
    """
    for attempt in range(MAX_RETRIES):
        try:
            req = urllib.request.Request(url, headers=HEADERS)
            # Context manager closes the connection (the original leaked
            # the response object).
            with urllib.request.urlopen(req, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except (ConnectionResetError, ConnectionError, urllib.error.URLError,
                OSError) as e:
            if attempt < MAX_RETRIES - 1:
                wait = (attempt + 1) * 3  # 3, 6, 9, 12s
                print(f" Retry {attempt + 1}/{MAX_RETRIES} (wait {wait}s): {e}")
                time.sleep(wait)
            else:
                raise
def build_list_url(page: int = 0) -> str:
    """Build the filtered listing URL (price cap, dispositions, min area)."""
    query = urllib.parse.urlencode({
        "s-qc[subtypeFlat]": DISPOSITION_CODES,
        "s-qc[usableAreaMin]": str(MIN_AREA),
    })
    url = f"{BASE_URL}/s/prodej/byty/cena-do-{MAX_PRICE}/praha/?{query}"
    # Page 0 is the unpaginated first page; only later pages get &page=N.
    return url if page <= 0 else f"{url}&page={page}"
def parse_total_count(html: str) -> int:
    """
    Extract the total listing count, e.g. from "720 inzerátů".

    Returns 0 when no count is found.
    """
    match = re.search(r'(\d[\d\s]*)\s*inzerát', html)
    if not match:
        return 0
    # The captured group may contain any whitespace (space, NBSP, newline);
    # the original stripped only " " and "\xa0" and int() could fail on a
    # newline inside the number.
    return int(re.sub(r'\s', '', match.group(1)))
def parse_listings(html: str) -> list[dict]:
    """
    Parse listing cards from a result page using regexes.

    Strategy: find every c-products__link detail anchor (href before or
    after the class attribute), then parse the card block that follows
    each link for title, price and address.  Ad cards are skipped.
    (The original also ran a `re.findall` whose result was never used —
    dead work on every page, removed.)

    Returns a list of dicts with id/url/disposition/area/price/locality.
    """
    results = []
    # href attribute before class ...
    link_pattern = re.compile(
        r'<a[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*class="c-products__link"[^>]*>',
        re.DOTALL
    )
    # ... and class before href
    link_pattern2 = re.compile(
        r'<a[^>]*class="c-products__link"[^>]*href="([^"]*?/detail/[^"]*?)"[^>]*>',
        re.DOTALL
    )
    all_links = link_pattern.findall(html) + link_pattern2.findall(html)
    seen_urls = set()
    for link_url in all_links:
        if link_url in seen_urls:
            continue
        seen_urls.add(link_url)
        # Grab the card block that follows this link.
        escaped_url = re.escape(link_url)
        context_match = re.search(
            escaped_url + r'(.*?)</div>\s*</div>',
            html, re.DOTALL
        )
        if not context_match:
            continue
        block = context_match.group(1)
        # Ensure full URL
        url = link_url
        if not url.startswith("http"):
            url = BASE_URL + url
        # Skip advertisement cards (marker sits shortly before the link).
        ad_check_start = max(0, context_match.start() - 500)
        ad_block = html[ad_check_start:context_match.start()]
        if "advertisment" in ad_block or "advertisement" in ad_block:
            continue
        # Title: <h2 class="c-products__title">prodej bytu 3+kk 79 m2</h2>
        title_match = re.search(r'class="c-products__title"[^>]*>(.*?)</h2>', block, re.DOTALL)
        title = re.sub(r'<[^>]+>', '', title_match.group(1)).strip().lower() if title_match else ""
        # Price: <p class="c-products__price"><strong>12 950 000 Kč</strong></p>
        price_match = re.search(r'c-products__price[^>]*>.*?<strong>(.*?)</strong>', block, re.DOTALL)
        price_text = re.sub(r'<[^>]+>', '', price_match.group(1)).strip() if price_match else ""
        # Address: <p class="c-products__info">Klečkova, Praha 5 - Stodůlky</p>
        info_match = re.search(r'class="c-products__info"[^>]*>(.*?)</p>', block, re.DOTALL)
        info = re.sub(r'<[^>]+>', '', info_match.group(1)).strip() if info_match else ""
        # Disposition ("3+kk") and area ("79 m2") parsed out of the title.
        disp_match = re.search(r'(\d\+(?:kk|\d))', title)
        area_match = re.search(r'(\d+)\s*m[²2]', title)
        disposition = disp_match.group(1) if disp_match else None
        area = int(area_match.group(1)) if area_match else None
        if not disposition and ("6 a" in title or "6+" in title):
            disposition = "6+"
        # Price text "12 950 000 Kč" -> 12950000; "na vyžádání" -> 0.
        price = 0
        if price_text and "vyžádání" not in price_text.lower():
            price_clean = re.sub(r'[^\d]', '', price_text)
            if price_clean:
                price = int(price_clean)
        # 24-hex listing id from the URL (fall back to the URL itself).
        id_match = re.search(r'/([a-f0-9]{24})/?', url)
        listing_id = id_match.group(1) if id_match else url
        results.append({
            "id": listing_id,
            "url": url,
            "disposition": DISPOSITION_MAP.get(disposition, disposition or "?"),
            "area": area,
            "price": price,
            "locality": info,
        })
    return results
def parse_detail(html: str) -> dict:
    """
    Parse an iDNES detail page for GPS, floor, construction and ownership.

    Returns a dict with any of the keys "lat", "lon", "floor",
    "construction" (lowercased) and "ownership" that could be extracted;
    missing data simply leaves the key out.
    """
    detail = {}
    # 1. GPS comes from a dataLayer.push({...}) JS call that contains
    #    "listing_lat" / "listing_lon".
    dl_match = re.search(
        r'dataLayer\.push\(\s*(\{[^}]+?"listing_lat"[^}]+?\})\s*\)',
        html, re.DOTALL
    )
    if dl_match:
        # The JS object is only JSON-like; rather than parsing it fully,
        # pull the two coordinates out with targeted regexes.
        js_obj = dl_match.group(1)
        try:
            lat_match = re.search(r'"listing_lat"\s*:\s*([\d.]+)', js_obj)
            lon_match = re.search(r'"listing_lon"\s*:\s*([\d.]+)', js_obj)
            if lat_match:
                detail["lat"] = float(lat_match.group(1))
            if lon_match:
                detail["lon"] = float(lon_match.group(1))
        except (ValueError, AttributeError):
            pass
    # 2. Attribute table: <dt>Label</dt><dd>Value</dd> pairs hold floor,
    #    construction and ownership.
    dt_dd_pairs = re.findall(
        r'<dt[^>]*>(.*?)</dt>\s*<dd[^>]*>(.*?)</dd>',
        html, re.DOTALL
    )
    for dt, dd in dt_dd_pairs:
        dt_clean = re.sub(r'<[^>]+>', '', dt).strip().lower()
        dd_clean = re.sub(r'<[^>]+>', '', dd).strip()
        if "podlaží" in dt_clean or "podlazi" in dt_clean or "patro" in dt_clean:
            # Values look like "2. patro (3. NP)" or "3. podlaží z celkem 5".
            # Prefer the explicit "X. NP" form when present.
            np_match = re.search(r'(\d+)\.\s*NP', dd_clean)
            if np_match:
                detail["floor"] = int(np_match.group(1))
            else:
                # Fall back to the first number — iDNES appears to use NP
                # numbering directly; TODO confirm against live pages.
                patro_match = re.search(r'(\d+)', dd_clean)
                if patro_match:
                    detail["floor"] = int(patro_match.group(1))
        if "konstrukce" in dt_clean or "stavba" in dt_clean:
            detail["construction"] = dd_clean.lower()
        if "vlastnictví" in dt_clean or "vlastnictvi" in dt_clean:
            detail["ownership"] = dd_clean
    return detail
def format_price(price: int) -> str:
    """Render *price* with a space between each 3-digit group, e.g. 13 500 000."""
    text = str(price)
    # Slice 3-char groups from the right, then join them left-to-right.
    # NOTE(review): the original appended an empty string — possibly a lost
    # " Kč" suffix; behavior (no suffix) is preserved here.
    groups = [text[max(i - 3, 0):i] for i in range(len(text), 0, -3)]
    return " ".join(reversed(groups))
def load_cache(json_path: str = "byty_idnes.json") -> dict[str, dict]:
    """
    Load previously scraped listings as a cache keyed by hash_id (string).

    Returns an empty dict when the file is missing or unreadable, so the
    scraper simply re-fetches everything.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        return {str(e["hash_id"]): e for e in data if "hash_id" in e}
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError, OSError):
        # Original caught only JSONDecodeError/KeyError; a non-list cache
        # (TypeError/AttributeError) or a read failure (OSError) crashed.
        return {}
def scrape():
    """
    Scrape Reality iDNES sale listings for Prague and filter them.

    Pipeline: (1) page through the filtered listing URL, (2) pre-filter on
    price/area/disposition from list data, (3) fetch detail pages for
    GPS/floor/construction and drop panel buildings, housing estates and
    low floors.  Cached results are reused when the price is unchanged.

    Returns a list of normalized result dicts.
    """
    cache = load_cache()
    print("=" * 60)
    print("Stahuji inzeráty z Reality iDNES")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)
    # Step 1: Fetch listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = {}  # id -> listing dict
    page = 0
    total = None
    while True:
        url = build_list_url(page)
        print(f" Strana {page + 1} ...")
        html = fetch_url(url)
        if total is None:
            total = parse_total_count(html)
            total_pages = math.ceil(total / PER_PAGE) if total > 0 else 1
            print(f" → Celkem {total} inzerátů, ~{total_pages} stran")
        listings = parse_listings(html)
        if not listings:
            break
        for item in listings:
            lid = item["id"]
            if lid not in all_listings:
                all_listings[lid] = item
        page += 1
        if total and page >= math.ceil(total / PER_PAGE):
            break
        time.sleep(1.0)  # be polite to the server
    print(f"\n Staženo: {len(all_listings)} unikátních inzerátů")
    # Step 2: Pre-filter by price and area from list data
    pre_filtered = []
    excluded_price = 0
    excluded_area = 0
    excluded_disp = 0
    for item in all_listings.values():
        if item["price"] <= 0 or item["price"] > MAX_PRICE:
            excluded_price += 1
            continue
        # Unknown area (None) is kept; only a known-too-small area excludes
        if item["area"] is not None and item["area"] < MIN_AREA:
            excluded_area += 1
            continue
        if item["disposition"] == "?":
            excluded_disp += 1
            continue
        pre_filtered.append(item)
    print(f"\nPo předfiltraci:")
    print(f" Vyloučeno (cena): {excluded_price}")
    print(f" Vyloučeno (plocha): {excluded_area}")
    print(f" Vyloučeno (dispozice): {excluded_disp}")
    print(f" Zbývá: {len(pre_filtered)}")
    # Step 3: Fetch details for GPS, floor, construction
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_no_gps = 0
    excluded_detail = 0
    cache_hits = 0
    for i, item in enumerate(pre_filtered):
        # Check cache — if hash_id exists and price unchanged, reuse
        cached = cache.get(str(item["id"]))
        if cached and cached.get("price") == item["price"]:
            cache_hits += 1
            results.append(cached)
            continue
        url = item["url"]
        time.sleep(0.4)  # throttle detail requests
        try:
            html = fetch_url(url)
        except Exception as e:
            print(f" Warning: detail failed for {item['id']}: {e}")
            excluded_detail += 1
            continue
        detail = parse_detail(html)
        # Must have GPS
        if not detail.get("lat") or not detail.get("lon"):
            excluded_no_gps += 1
            continue
        # Check construction — exclude panel buildings
        construction = detail.get("construction", "")
        if "panel" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: panel ({construction})")
            continue
        # Check for housing estates (sídliště) mentioned in construction
        if "sídliště" in construction or "sidliste" in construction:
            excluded_panel += 1
            print(f" ✗ Vyloučen {item['id'][:12]}...: sídliště")
            continue
        # Check floor (None = unknown, kept)
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue
        # Map the free-text construction value to a Czech label
        building_type = "neuvedeno"
        if construction:
            if "cihlo" in construction or "cihla" in construction:
                building_type = "Cihlová"
            elif "smíšen" in construction or "smisen" in construction:
                building_type = "Smíšená"
            elif "skelet" in construction:
                building_type = "Skeletová"
            elif "dřevo" in construction or "drevo" in construction:
                building_type = "Dřevostavba"
            elif "mont" in construction:
                building_type = "Montovaná"
            else:
                building_type = construction.capitalize()
        result = {
            "hash_id": item["id"],
            "name": f"Prodej bytu {item['disposition']} {item.get('area', '?')}",
            "price": item["price"],
            "price_formatted": format_price(item["price"]),
            "locality": item["locality"],
            "lat": detail["lat"],
            "lon": detail["lon"],
            "disposition": item["disposition"],
            "floor": floor,
            "area": item["area"],
            "building_type": building_type,
            "ownership": detail.get("ownership", "neuvedeno"),
            "url": item["url"],
            "source": "idnes",
            "image": "",
        }
        results.append(result)
        if (i + 1) % 20 == 0:
            print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")
    print(f"\n{'=' * 60}")
    print(f"Výsledky Reality iDNES:")
    print(f" Předfiltrováno: {len(pre_filtered)}")
    print(f" Z cache (přeskočeno): {cache_hits}")
    print(f" Vyloučeno (panel/síd): {excluded_panel}")
    print(f" Vyloučeno (patro): {excluded_floor}")
    print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f" Vyloučeno (bez detailu): {excluded_detail}")
    print(f" ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
    # CLI entry point: scrape and persist results as pretty-printed JSON.
    start = time.time()
    estates = scrape()
    if estates:
        json_path = Path("byty_idnes.json")
        json_path.write_text(
            json.dumps(estates, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        elapsed = time.time() - start
        print(f"\n✓ Data uložena: {json_path.resolve()}")
        print(f"⏱ Celkový čas: {elapsed:.0f} s")
    else:
        print("\nŽádné byty z Reality iDNES neodpovídají kritériím :(")

306
scrape_psn.py Normal file
View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
PSN.cz scraper.
Stáhne byty na prodej v Praze z projektů PSN a vyfiltruje podle kritérií.
Výstup: byty_psn.json
"""
from __future__ import annotations
import json
import re
import subprocess
import time
from pathlib import Path
# ── Configuration ───────────────────────────────────────────────────────────
MAX_PRICE = 14_000_000   # price cap in CZK
MIN_AREA = 69            # minimum area — presumably m²; TODO confirm units
MIN_FLOOR = 2            # minimum floor (NP = above-ground floor numbering)
# Flat layouts (dispositions) to keep
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
# Browser User-Agent passed to curl
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
BASE_URL = "https://psn.cz"
# Known Prague project slugs with GPS (from research)
PRAGUE_PROJECTS = [
    {"slug": "zit-branik", "name": "Žít Braník", "lat": 50.0353, "lon": 14.4125},
    {"slug": "rostislavova-4", "name": "Rostislavova 4", "lat": 50.0620, "lon": 14.4463},
    {"slug": "pod-drinopolem", "name": "Pod Drinopolem", "lat": 50.0851, "lon": 14.3720},
    {"slug": "skyline-chodov", "name": "Skyline Chodov", "lat": 50.0418, "lon": 14.4990},
    {"slug": "jitro", "name": "Jitro", "lat": 50.0729, "lon": 14.4768},
    {"slug": "maroldka", "name": "Maroldka", "lat": 50.0614, "lon": 14.4517},
    {"slug": "belehradska-29", "name": "Bělehradská 29", "lat": 50.0682, "lon": 14.4348},
    {"slug": "jeseniova-93", "name": "Jeseniova 93", "lat": 50.0887, "lon": 14.4692},
    {"slug": "vanguard", "name": "Vanguard", "lat": 50.0164, "lon": 14.4036},
    {"slug": "vinohradska-160", "name": "Vinohradská 160", "lat": 50.0780, "lon": 14.4653},
    {"slug": "hermanova24", "name": "Heřmanova 24", "lat": 50.1009, "lon": 14.4313},
    {"slug": "vinohradska-8", "name": "Vinohradská 8", "lat": 50.0787, "lon": 14.4342},
    {"slug": "bydleni-na-vysinach", "name": "Bydlení Na Výšinách", "lat": 50.1003, "lon": 14.4187},
    {"slug": "bydleni-u-pekaren", "name": "Bydlení U Pekáren", "lat": 50.0555, "lon": 14.5414},
    {"slug": "pechackova-6", "name": "Pechackova 6", "lat": 50.0734, "lon": 14.4063},
    {"slug": "ahoj-vanguard", "name": "Ahoj Vanguard", "lat": 50.0164, "lon": 14.4033},
]
def fetch_url(url: str) -> str:
    """Fetch URL via curl (urllib SSL too old for Cloudflare)."""
    cmd = [
        "curl", "-s", "-L", "--max-time", "30",
        "-H", f"User-Agent: {UA}",
        "-H", "Accept: text/html",
        url,
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    # Non-zero exit means curl itself failed (timeout, DNS, TLS, ...).
    if proc.returncode == 0:
        return proc.stdout
    raise RuntimeError(f"curl failed ({proc.returncode}): {proc.stderr[:200]}")
def extract_units_from_html(html: str) -> list[dict]:
    """Extract unit JSON objects from raw HTML with escaped quotes.

    The page embeds RSC data whose JSON quotes are escaped (\\" sequences).
    After unescaping, each unit is located by its '"title":"Byt' marker and
    the surrounding JSON object is decoded by walking backwards to its
    opening brace.

    Args:
        html: raw page HTML containing the escaped JSON payload.

    Returns:
        All decoded unit dicts that carry a 'price_czk' key.
    """
    # Step 1: Unescape the double-backslash-quotes to regular quotes
    cleaned = html.replace('\\"', '"')
    # Step 2: Find each unit by looking for "title":"Byt and walking back to {
    units: list[dict] = []
    decoder = json.JSONDecoder()
    for m in re.finditer(r'"title":"Byt', cleaned):
        pos = m.start()
        # Walk backwards (bounded ~3000-char window) to the '{' that opens
        # the object containing this "title" key; nested objects that close
        # before `pos` are skipped via the depth counter.
        # NOTE: stop is max(pos - 3001, -1) so index 0 is still visited when
        # the object starts at the very beginning of the payload — range()
        # excludes its stop value (the old `max(pos - 3000, 0)` missed it).
        depth = 0
        for i in range(pos - 1, max(pos - 3001, -1), -1):
            ch = cleaned[i]
            if ch == '}':
                depth += 1
            elif ch == '{':
                if depth == 0:
                    try:
                        obj, _ = decoder.raw_decode(cleaned, i)
                        if isinstance(obj, dict) and 'price_czk' in obj:
                            units.append(obj)
                    except (json.JSONDecodeError, ValueError):
                        # Not a valid JSON object at this brace — skip marker.
                        pass
                    break
                depth -= 1
    return units
def format_price(price: int) -> str:
    """Render an integer price with spaces as thousands separators."""
    # Format-spec grouping emits ',' — swap it for the Czech thousands space.
    return f"{price:,}".replace(",", " ") + ""
def scrape():
    """Scrape PSN.cz Prague projects and return units passing all filters.

    Two phases:
      1. Download — page through every project in PRAGUE_PROJECTS (network
         via fetch_url + extract_units_from_html), tag each unit with its
         project, then deduplicate by unit slug.
      2. Filter — keep only free (unsold) apartments with a wanted
         disposition, price <= MAX_PRICE, area >= MIN_AREA, floor >=
         MIN_FLOOR and non-panel construction.

    Returns:
        list[dict]: normalized records (hash_id, name, price, GPS, url, ...)
        ready to be dumped to byty_psn.json.
    """
    print("=" * 60)
    print("Stahuji inzeráty z PSN.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
    print("=" * 60)
    # Fetch units from each Prague project
    all_units = []
    for proj in PRAGUE_PROJECTS:
        page = 1
        project_units = []
        while True:
            url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
            print(f"  {proj['name']} — strana {page} ...")
            # Be polite to the server between page fetches.
            time.sleep(0.5)
            try:
                html = fetch_url(url)
            except Exception as e:
                # Network/curl failure: give up on this project, keep others.
                print(f"    Chyba: {e}")
                break
            units = extract_units_from_html(html)
            if not units:
                if page == 1:
                    print(f"    → 0 jednotek")
                break
            # Add project info to each unit
            for unit in units:
                # Fall back to the project's GPS when the unit has none.
                if not unit.get("latitude") or not unit.get("longitude"):
                    unit["latitude"] = proj["lat"]
                    unit["longitude"] = proj["lon"]
                unit["_project_name"] = proj["name"]
                unit["_project_slug"] = proj["slug"]
            project_units.extend(units)
            if page == 1:
                print(f"    → {len(units)} jednotek na stránce")
            # Check if there might be more pages
            # If we got fewer than expected or same units, stop
            if len(units) < 10:
                break
            page += 1
            if page > 10:  # Safety limit
                break
        all_units.extend(project_units)
    # Deduplicate by slug
    seen_slugs = set()
    unique_units = []
    for u in all_units:
        slug = u.get("slug", "")
        if slug and slug not in seen_slugs:
            seen_slugs.add(slug)
            unique_units.append(u)
        elif not slug:
            # Units without a slug cannot be deduplicated — keep them all.
            unique_units.append(u)
    print(f"\n  Staženo celkem: {len(unique_units)} unikátních jednotek")
    # Filter
    print(f"\nFiltrování...")
    results = []
    excluded_sold = 0
    excluded_type = 0
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_floor = 0
    excluded_panel = 0
    for unit in unique_units:
        # Only free units
        is_free = unit.get("is_free", False)
        is_sold = unit.get("is_sold", False)
        if is_sold or not is_free:
            excluded_sold += 1
            continue
        # Only apartments
        category = str(unit.get("category", "")).lower()
        if "byt" not in category and "ateliér" not in category:
            excluded_type += 1
            continue
        # Disposition
        disp = unit.get("disposition", "")
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            continue
        # Price — a discounted (action) price counts as the price.
        price = unit.get("price_czk") or unit.get("action_price_czk") or 0
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            continue
        # Area
        area = unit.get("total_area") or unit.get("floor_area") or 0
        if area < MIN_AREA:
            excluded_area += 1
            continue
        # Floor — may be a plain number or free text; take the first integer.
        floor_str = str(unit.get("floor", ""))
        floor = None
        if floor_str:
            try:
                floor = int(floor_str)
            except ValueError:
                floor_match = re.search(r'(-?\d+)', floor_str)
                if floor_match:
                    floor = int(floor_match.group(1))
        # Unknown floor is NOT excluded — only a known floor below the limit.
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue
        # Construction — check for panel
        build_type = str(unit.get("build_type", "")).lower()
        if "panel" in build_type:
            excluded_panel += 1
            print(f"  ✗ Vyloučen: panel ({build_type})")
            continue
        # Build construction label
        building_type = "neuvedeno"
        if build_type and build_type != "nevybráno":
            if "cihlo" in build_type or "cihla" in build_type:
                building_type = "Cihlová"
            elif "skelet" in build_type:
                building_type = "Skeletová"
            else:
                building_type = build_type.capitalize()
        lat = unit.get("latitude", 0)
        lon = unit.get("longitude", 0)
        slug = unit.get("slug", "")
        project_slug = unit.get("_project_slug", "")
        detail_url = f"{BASE_URL}/projekt/{project_slug}/{slug}" if slug else f"{BASE_URL}/projekt/{project_slug}"
        # Normalized record shared with the other scrapers (source tags differ).
        result = {
            "hash_id": unit.get("id", slug),
            "name": f"Prodej bytu {disp} {area} m² — {unit.get('_project_name', '')}",
            "price": int(price),
            "price_formatted": format_price(int(price)),
            "locality": f"{unit.get('street', unit.get('_project_name', ''))}, Praha",
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": building_type,
            "ownership": unit.get("ownership", "neuvedeno") or "neuvedeno",
            "url": detail_url,
            "source": "psn",
            "image": "",
        }
        results.append(result)
    print(f"\n{'=' * 60}")
    print(f"Výsledky PSN:")
    print(f"  Celkem jednotek: {len(unique_units)}")
    print(f"  Vyloučeno (prodáno): {excluded_sold}")
    print(f"  Vyloučeno (typ): {excluded_type}")
    print(f"  Vyloučeno (dispozice): {excluded_disp}")
    print(f"  Vyloučeno (cena): {excluded_price}")
    print(f"  Vyloučeno (plocha): {excluded_area}")
    print(f"  Vyloučeno (patro): {excluded_floor}")
    print(f"  Vyloučeno (panel): {excluded_panel}")
    print(f"  ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
start = time.time()
estates = scrape()
if estates:
json_path = Path("byty_psn.json")
json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8",
)
elapsed = time.time() - start
print(f"\n✓ Data uložena: {json_path.resolve()}")
print(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
print("\nŽádné byty z PSN neodpovídají kritériím :(")

311
scrape_realingo.py Normal file
View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
Realingo.cz scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_realingo.json
"""
from __future__ import annotations
import json
import math
import re
import time
import urllib.request
from pathlib import Path
# ── Konfigurace (sdílená se Sreality scraperem) ─────────────────────────────
MAX_PRICE = 13_500_000
MIN_AREA = 69
MIN_FLOOR = 2
PER_PAGE = 40 # Realingo vrací 40 na stránku
# Kategorie které chceme (dispozice 3+kk a větší)
WANTED_CATEGORIES = {
"FLAT3_KK", "FLAT31", # 3+kk, 3+1
"FLAT4_KK", "FLAT41", # 4+kk, 4+1
"FLAT5_KK", "FLAT51", # 5+kk, 5+1
"FLAT6", # 6+
"OTHERS_FLAT", # atypické — zkontrolujeme plochu
}
# Mapování category → label
CATEGORY_LABELS = {
"FLAT1_KK": "1+kk", "FLAT11": "1+1",
"FLAT2_KK": "2+kk", "FLAT21": "2+1",
"FLAT3_KK": "3+kk", "FLAT31": "3+1",
"FLAT4_KK": "4+kk", "FLAT41": "4+1",
"FLAT5_KK": "5+kk", "FLAT51": "5+1",
"FLAT6": "6+",
"OTHERS_FLAT": "Atypický",
}
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
"Accept": "text/html,application/xhtml+xml",
}
BASE_URL = "https://www.realingo.cz"
def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
    """Fetch a page of Prague listings. Returns (items, total_count).

    Parses the Next.js `__NEXT_DATA__` JSON blob embedded in the page and
    pulls the offer list out of the serialized store.

    Args:
        page: 1-based page index; page 1 has no suffix in the URL.

    Returns:
        (items, total_count), or ([], 0) when the JSON blob is missing
        (layout change or error page).
    """
    if page == 1:
        url = f"{BASE_URL}/prodej_byty/praha/"
    else:
        url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"
    req = urllib.request.Request(url, headers=HEADERS)
    # Context manager closes the HTTP response even if read() raises
    # (the previous version leaked the connection).
    with urllib.request.urlopen(req, timeout=30) as resp:
        html = resp.read().decode("utf-8")
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        html, re.DOTALL
    )
    if not match:
        return [], 0
    data = json.loads(match.group(1))
    offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
    return offer_list["data"], offer_list["total"]
def fetch_detail(listing_url: str) -> dict | None:
    """Fetch detail page for a listing to get floor, building type, etc.

    The detail data lives in the `__NEXT_DATA__` blob under
    store.offer.details, keyed by offer id.

    Args:
        listing_url: site-relative listing path (appended to BASE_URL).

    Returns:
        The first (only) detail entry, or None when the page cannot be
        fetched or parsed — callers count those as excluded.
    """
    try:
        url = f"{BASE_URL}{listing_url}"
        req = urllib.request.Request(url, headers=HEADERS)
        # Close the HTTP response deterministically (was leaked before).
        with urllib.request.urlopen(req, timeout=30) as resp:
            html = resp.read().decode("utf-8")
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            return None
        data = json.loads(match.group(1))
        details = data["props"]["pageProps"]["store"]["offer"]["details"]
        # Get first (only) detail entry
        for detail_data in details.values():
            return detail_data
    except Exception as e:
        print(f"    Warning: detail fetch failed for {listing_url}: {e}")
    # Empty details dict, or an exception above: explicit None (was implicit).
    return None
def format_price(price: int) -> str:
    """Render an integer price with spaces as thousands separators."""
    # Format-spec grouping emits ',' — swap it for the Czech thousands space.
    return f"{price:,}".replace(",", " ") + ""
def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
"""Load previously scraped data as cache keyed by hash_id."""
path = Path(json_path)
if not path.exists():
return {}
try:
data = json.loads(path.read_text(encoding="utf-8"))
return {e["hash_id"]: e for e in data if "hash_id" in e}
except (json.JSONDecodeError, KeyError):
return {}
def scrape():
    """Scrape Realingo.cz Prague sale listings and return filtered records.

    Three phases:
      1. Page through the listing index (fetch_listing_page).
      2. Pre-filter cheaply on data already present in the index:
         category/disposition, price, area, GPS presence.
      3. Fetch each surviving listing's detail page (fetch_detail) for
         floor / building type / ownership and apply the remaining filters;
         the JSON output of the previous run acts as a cache so unchanged
         listings skip the detail request.

    Returns:
        list[dict]: normalized records ready for byty_realingo.json.
    """
    cache = load_cache()
    print("=" * 60)
    print("Stahuji inzeráty z Realingo.cz")
    print(f"Cena: do {format_price(MAX_PRICE)}")
    print(f"Min. plocha: {MIN_AREA}")
    print(f"Patro: od {MIN_FLOOR}. NP")
    print(f"Region: Praha")
    if cache:
        print(f"Cache: {len(cache)} bytů z minulého běhu")
    print("=" * 60)
    # Step 1: Fetch all listing pages
    print("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = []
    page = 1
    total = None
    while True:
        print(f"  Strana {page} ...")
        items, total_count = fetch_listing_page(page)
        # First response tells us the total; derive the page count from it.
        if total is None:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            print(f"  → Celkem {total} inzerátů, {total_pages} stran")
        if not items:
            break
        all_listings.extend(items)
        page += 1
        if page > total_pages:
            break
        # Be polite to the server between page fetches.
        time.sleep(0.5)
    print(f"\n  Staženo: {len(all_listings)} inzerátů")
    # Step 2: Pre-filter by category, price, area from listing data
    pre_filtered = []
    excluded_category = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_gps = 0
    for item in all_listings:
        cat = item.get("category", "")
        if cat not in WANTED_CATEGORIES:
            excluded_category += 1
            continue
        price = item.get("price", {}).get("total", 0) or 0
        if price > MAX_PRICE or price == 0:
            excluded_price += 1
            continue
        # Unknown area is allowed through — only a known-too-small one fails.
        area = item.get("area", {}).get("main")
        if area is not None and area < MIN_AREA:
            excluded_area += 1
            continue
        loc = item.get("location", {})
        if not loc.get("latitude") or not loc.get("longitude"):
            excluded_no_gps += 1
            continue
        pre_filtered.append(item)
    print(f"\nPo předfiltraci:")
    print(f"  Vyloučeno (dispozice): {excluded_category}")
    print(f"  Vyloučeno (cena): {excluded_price}")
    print(f"  Vyloučeno (plocha): {excluded_area}")
    print(f"  Vyloučeno (bez GPS): {excluded_no_gps}")
    print(f"  Zbývá: {len(pre_filtered)}")
    # Step 3: Fetch details for remaining listings (floor, building type)
    print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_detail = 0
    cache_hits = 0
    for i, item in enumerate(pre_filtered):
        # Check cache — if hash_id exists and price unchanged, reuse
        item_id = int(item["id"])
        item_price = item.get("price", {}).get("total", 0) or 0
        cached = cache.get(item_id)
        if cached and cached.get("price") == item_price:
            cache_hits += 1
            results.append(cached)
            continue
        # Throttle detail requests.
        time.sleep(0.3)
        detail_data = fetch_detail(item["url"])
        if not detail_data:
            excluded_detail += 1
            continue
        # The detail payload shape varies — try the nested path first.
        detail = detail_data.get("offer", {}).get("detail", {})
        if not detail and "detail" in detail_data:
            detail = detail_data["detail"]
        # Check building type — exclude panel
        building_type = detail.get("buildingType", "")
        if building_type == "PANEL":
            excluded_panel += 1
            print(f"  ✗ Vyloučen #{item['id']}: panel")
            continue
        # Check building position — exclude sídliště
        building_position = detail.get("buildingPosition", "")
        if building_position and "ESTATE" in str(building_position).upper():
            excluded_panel += 1
            print(f"  ✗ Vyloučen #{item['id']}: sídliště")
            continue
        # Check floor — unknown floor is allowed through.
        floor = detail.get("floor")
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            continue
        # Map building type
        bt_map = {
            "BRICK": "Cihlová",
            "PANEL": "Panelová",
            "WOOD": "Dřevostavba",
            "STEEL": "Ocelová",
            "MIXED": "Smíšená",
            "MONTAGE": "Montovaná",
        }
        ownership_map = {
            "PRIVATE": "Osobní",
            "COOPERATIVE": "Družstevní",
            "STATE": "Státní/obecní",
        }
        cat = item.get("category", "")
        loc = item.get("location", {})
        # Normalized record shared with the other scrapers (source tags differ).
        result = {
            "hash_id": int(item["id"]),
            "name": f"Prodej bytu {CATEGORY_LABELS.get(cat, '?')} {item.get('area', {}).get('main', '?')}",
            "price": item.get("price", {}).get("total", 0),
            "price_formatted": format_price(item.get("price", {}).get("total", 0)),
            "locality": loc.get("address", "Praha"),
            "lat": loc["latitude"],
            "lon": loc["longitude"],
            "disposition": CATEGORY_LABELS.get(cat, "?"),
            "floor": floor,
            "area": item.get("area", {}).get("main"),
            "building_type": bt_map.get(building_type, building_type or "neuvedeno"),
            "ownership": ownership_map.get(detail.get("ownership", ""), detail.get("ownership") or "neuvedeno"),
            "url": f"{BASE_URL}{item['url']}",
            "source": "realingo",
            "image": "",
        }
        results.append(result)
        # Progress indicator every 20 detail fetches.
        if (i + 1) % 20 == 0:
            print(f"  Zpracováno {i + 1}/{len(pre_filtered)} ...")
    print(f"\n{'=' * 60}")
    print(f"Výsledky Realingo:")
    print(f"  Předfiltrováno: {len(pre_filtered)}")
    print(f"  Z cache (přeskočeno): {cache_hits}")
    print(f"  Vyloučeno (panel/síd): {excluded_panel}")
    print(f"  Vyloučeno (patro): {excluded_floor}")
    print(f"  Vyloučeno (bez detailu): {excluded_detail}")
    print(f"  ✓ Vyhovující byty: {len(results)}")
    print(f"{'=' * 60}")
    return results
if __name__ == "__main__":
start = time.time()
estates = scrape()
if estates:
json_path = Path("byty_realingo.json")
json_path.write_text(
json.dumps(estates, ensure_ascii=False, indent=2),
encoding="utf-8",
)
elapsed = time.time() - start
print(f"\n✓ Data uložena: {json_path.resolve()}")
print(f"⏱ Celkový čas: {elapsed:.0f} s")
else:
print("\nŽádné byty z Realinga neodpovídají kritériím :(")