#!/usr/bin/env python3 """ Bazoš.cz scraper. Stáhne byty na prodej v Praze a vyfiltruje podle kritérií. Výstup: byty_bazos.json """ from __future__ import annotations import argparse from datetime import datetime import json import logging import math import re import time import urllib.request import urllib.parse from pathlib import Path from scraper_stats import write_stats, validate_listing STATS_FILE = "stats_bazos.json" logger = logging.getLogger(__name__) # ── Konfigurace ───────────────────────────────────────────────────────────── MAX_PRICE = 14_000_000 MIN_AREA = 69 MIN_FLOOR = 2 PER_PAGE = 20 # Bazoš vrací 20 na stránku WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"} # Regex patterns pro parsování dispozice, plochy a patra z textu DISP_RE = re.compile(r'(\d)\s*\+\s*(kk|1)', re.IGNORECASE) AREA_RE = re.compile(r'(\d+(?:[.,]\d+)?)\s*m[²2\s,.]', re.IGNORECASE) FLOOR_RE = re.compile(r'(\d+)\s*[./]\s*(\d+)\s*(?:NP|patr|podlaž|floor)', re.IGNORECASE) FLOOR_RE2 = re.compile(r'(\d+)\.\s*(?:NP|patr[eouě]|podlaž[ií])', re.IGNORECASE) FLOOR_RE3 = re.compile(r'(?:patr[eouě]|podlaž[ií]|NP)\s*[:\s]*(\d+)', re.IGNORECASE) PANEL_RE = re.compile(r'panel(?:ov|ák|\.)', re.IGNORECASE) SIDLISTE_RE = re.compile(r'sídliště|sidliste|panelák', re.IGNORECASE) HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Accept": "text/html,application/xhtml+xml", "Accept-Language": "cs,en;q=0.9", } BASE_URL = "https://reality.bazos.cz" SEARCH_PARAMS = "hledat=&rubriky=reality&hlokalita=Praha&humkreis=25&cenado={max_price}&kitx=ano" def fetch_url(url: str, retries: int = 3) -> str: """Fetch URL and return HTML string with retry on transient errors.""" for attempt in range(retries): try: logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}") req = urllib.request.Request(url, headers=HEADERS) resp = urllib.request.urlopen(req, timeout=30) html = resp.read().decode("utf-8", errors="replace") logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") return html except urllib.error.HTTPError: raise except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e: if attempt < retries - 1: wait = (attempt + 1) * 3 logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}") time.sleep(wait) else: logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True) raise def format_price(price: int) -> str: s = str(price) parts = [] while s: parts.append(s[-3:]) s = s[:-3] return " ".join(reversed(parts)) + " Kč" def parse_price(text: str) -> int: """Parse price from text like '5 250 000 Kč' → 5250000.""" cleaned = re.sub(r'[^\d]', '', text) return int(cleaned) if cleaned else 0 def parse_disposition(text: str) -> str | None: """Parse disposition from title/description like '3+kk', '4+1'.""" m = DISP_RE.search(text) if m: rooms = m.group(1) suffix = m.group(2).lower() return f"{rooms}+{suffix}" return None def parse_area(text: str) -> float | None: """Parse area from text like '82 m²' → 82.0.""" m = AREA_RE.search(text) if m: return float(m.group(1).replace(',', '.')) return None def parse_floor(text: str) -> int | None: """Parse floor number from description.""" for pattern in [FLOOR_RE, FLOOR_RE2, FLOOR_RE3]: m = pattern.search(text) if m: return int(m.group(1)) return None def is_panel(text: str) -> bool: """Check if description mentions panel construction.""" return bool(PANEL_RE.search(text)) def is_sidliste(text: str) -> bool: """Check if description mentions housing estate.""" return bool(SIDLISTE_RE.search(text)) def fetch_listing_page(offset: int = 0, pagination_params: str | None = None) -> tuple[list[dict], int, str | None]: """ Fetch a page of listings from Bazoš. Returns (list of basic listing dicts, total count, pagination_params for next pages). """ if pagination_params and offset > 0: # Use resolved numeric params from first page's pagination links url = f"{BASE_URL}/prodam/byt/{offset}/?{pagination_params}" else: params = SEARCH_PARAMS.format(max_price=MAX_PRICE) if offset > 0: url = f"{BASE_URL}/prodam/byt/{offset}/?{params}" else: url = f"{BASE_URL}/prodam/byt/?{params}" html = fetch_url(url) # Parse total count: "Zobrazeno 1-20 z 727" total = 0 total_match = re.search(r'z\s+([\d\s]+)\s', html) if total_match: total = int(total_match.group(1).replace(' ', '')) # Extract resolved pagination params from first page (Bazoš converts # hlokalita=Praha → hlokalita=11000, and pagination only works with numeric form) resolved_params = None pag_link = re.search(r'href="/prodam/byt/\d+/\?([^"]+)"', html) if pag_link: resolved_params = pag_link.group(1) # Parse listings — split by listing blocks (class="inzeraty inzeratyflex") listings = [] all_blocks = re.split(r'
', html)[1:] # skip before first for block in all_blocks: # Extract URL and ID from first link (/inzerat/XXXXXX/slug.php) url_match = re.search(r'href="(/inzerat/(\d+)/[^"]*)"', block) if not url_match: continue detail_path = url_match.group(1) listing_id = int(url_match.group(2)) # Title — class=nadpis (without quotes) or class="nadpis" title_match = re.search(r'class=.?nadpis.?[^>]*>\s*]*>([^<]+)', block) title = title_match.group(1).strip() if title_match else "" # Price — inside within inzeratycena price_match = re.search(r'class="inzeratycena"[^>]*>.*?]*>([^<]+)', block, re.DOTALL) if not price_match: # Fallback: direct text in inzeratycena price_match = re.search(r'class="inzeratycena"[^>]*>\s*(?:)?([^<]+)', block) price_text = price_match.group(1).strip() if price_match else "" price = parse_price(price_text) # Location loc_match = re.search(r'class="inzeratylok"[^>]*>(.*?)
', block, re.DOTALL) location = "" if loc_match: location = re.sub(r'<[^>]+>', ' ', loc_match.group(1)).strip() location = re.sub(r'\s+', ' ', location) # Date — [5.3. 2026] date_match = re.search(r'\[(\d+\.\d+\.\s*\d{4})\]', block) date_str = date_match.group(1).strip() if date_match else "" # Description preview — class=popis (without quotes) or class="popis" desc_match = re.search(r'class=.?popis.?[^>]*>(.*?)', block, re.DOTALL) description = "" if desc_match: description = re.sub(r'<[^>]+>', ' ', desc_match.group(1)).strip() description = re.sub(r'\s+', ' ', description) # Image — img_match = re.search(r']*src="([^"]+)"[^>]*class="obrazek"', block) if not img_match: img_match = re.search(r'class="obrazek"[^>]*src="([^"]+)"', block) image = img_match.group(1) if img_match else "" if "empty.gif" in image: image = "" listings.append({ "id": listing_id, "title": title, "price": price, "location": location, "date": date_str, "description": description, "detail_path": detail_path, "image": image, }) logger.debug(f"Offset {offset}: found {len(listings)} listings, total={total}") return listings, total, resolved_params def fetch_detail(path: str) -> dict | None: """Fetch listing detail page and extract GPS, full description.""" try: url = f"{BASE_URL}{path}" html = fetch_url(url) result = {} # GPS from Google Maps link gps_match = re.search(r'google\.com/maps[^"]*place/([\d.]+),([\d.]+)', html) if gps_match: result["lat"] = float(gps_match.group(1)) result["lon"] = float(gps_match.group(2)) # Full description — Bazoš uses unquoted class=popisdetail desc_match = re.search(r'class=.?popisdetail.?[^>]*>(.*?)', html, re.DOTALL) if desc_match: desc = re.sub(r'<[^>]+>', ' ', desc_match.group(1)).strip() desc = re.sub(r'\s+', ' ', desc) result["description"] = desc # Location from detail loc_match = re.search(r'Lokalita:\s*]*>(.*?)', html, re.DOTALL) if loc_match: loc = re.sub(r'<[^>]+>', ' ', loc_match.group(1)).strip() loc = re.sub(r'\s+', ' ', loc) result["detail_location"] = loc return result except Exception as e: logger.warning(f"Detail fetch failed for {path}: {e}") return None def load_cache(json_path: str = "byty_bazos.json") -> dict[int, dict]: """Load previously scraped data as cache keyed by hash_id.""" path = Path(json_path) if not path.exists(): return {} try: data = json.loads(path.read_text(encoding="utf-8")) return {e["hash_id"]: e for e in data if "hash_id" in e} except (json.JSONDecodeError, KeyError): return {} def scrape(max_pages: int | None = None, max_properties: int | None = None): _run_start = time.time() _run_ts = datetime.now().isoformat(timespec="seconds") cache = load_cache() today = datetime.now().strftime("%Y-%m-%d") logger.info("=" * 60) logger.info("Stahuji inzeráty z Bazoš.cz") logger.info(f"Cena: do {format_price(MAX_PRICE)}") logger.info(f"Min. plocha: {MIN_AREA} m²") logger.info(f"Patro: od {MIN_FLOOR}. NP") logger.info(f"Region: Praha") if cache: logger.info(f"Cache: {len(cache)} bytů z minulého běhu") if max_pages: logger.info(f"Max. stran: {max_pages}") if max_properties: logger.info(f"Max. bytů: {max_properties}") logger.info("=" * 60) # Step 1: Fetch listing pages logger.info("\nFáze 1: Stahování seznamu inzerátů...") all_listings = {} # id -> listing dict (dedup) page = 1 offset = 0 total = None pagination_params = None # resolved numeric params from first page while True: if max_pages and page > max_pages: logger.debug(f"Max pages limit reached: {max_pages}") break logger.info(f"Strana {page} (offset {offset}) ...") listings, total_count, resolved = fetch_listing_page(offset, pagination_params) if resolved and not pagination_params: pagination_params = resolved logger.debug(f"Resolved pagination params: {pagination_params}") if total is None and total_count > 0: total = total_count total_pages = math.ceil(total / PER_PAGE) logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran") if not listings: logger.debug(f"No listings found on page {page}, stopping") break for lst in listings: lid = lst["id"] if lid not in all_listings: all_listings[lid] = lst page += 1 offset += PER_PAGE if total and offset >= total: break time.sleep(0.5) logger.info(f"\nStaženo: {len(all_listings)} unikátních inzerátů") # Step 2: Pre-filter by disposition, price, area from listing data pre_filtered = [] excluded_disp = 0 excluded_price = 0 excluded_area = 0 excluded_no_disp = 0 for lst in all_listings.values(): title_and_desc = f"{lst['title']} {lst['description']}" # Parse disposition disp = parse_disposition(title_and_desc) if not disp: excluded_no_disp += 1 logger.debug(f"Filter: id={lst['id']} - excluded (no disposition found in '{lst['title']}')") continue if disp not in WANTED_DISPOSITIONS: excluded_disp += 1 logger.debug(f"Filter: id={lst['id']} - excluded (disposition {disp})") continue # Price price = lst["price"] if price <= 0 or price > MAX_PRICE: excluded_price += 1 logger.debug(f"Filter: id={lst['id']} - excluded (price {price})") continue # Area (if parseable from listing) area = parse_area(title_and_desc) if area is not None and area < MIN_AREA: excluded_area += 1 logger.debug(f"Filter: id={lst['id']} - excluded (area {area} m²)") continue lst["_disposition"] = disp lst["_area"] = area pre_filtered.append(lst) logger.info(f"\nPo předfiltraci:") logger.info(f" Vyloučeno (bez dispozice): {excluded_no_disp}") logger.info(f" Vyloučeno (dispozice): {excluded_disp}") logger.info(f" Vyloučeno (cena): {excluded_price}") logger.info(f" Vyloučeno (plocha): {excluded_area}") logger.info(f" Zbývá: {len(pre_filtered)}") # Step 3: Fetch details (for GPS + full description) logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...") results = [] excluded_panel = 0 excluded_floor = 0 excluded_no_gps = 0 excluded_detail = 0 excluded_area_detail = 0 cache_hits = 0 properties_fetched = 0 for i, lst in enumerate(pre_filtered): if max_properties and properties_fetched >= max_properties: logger.debug(f"Max properties limit reached: {max_properties}") break listing_id = lst["id"] price = lst["price"] # Check cache cached = cache.get(listing_id) if cached and cached.get("price") == price: cache_hits += 1 logger.debug(f"Cache hit for id={listing_id}") results.append(cached) continue time.sleep(0.4) detail = fetch_detail(lst["detail_path"]) if not detail: excluded_detail += 1 logger.debug(f"Filter: id={listing_id} - excluded (detail fetch failed)") continue # GPS required lat = detail.get("lat") lon = detail.get("lon") if not lat or not lon: excluded_no_gps += 1 logger.debug(f"Filter: id={listing_id} - excluded (no GPS)") continue # Full text for filtering full_desc = detail.get("description", "") full_text = f"{lst['title']} {lst['description']} {full_desc}" # Panel check if is_panel(full_text): excluded_panel += 1 logger.info(f"✗ Vyloučen #{listing_id}: panelová stavba") continue # Sídliště check if is_sidliste(full_text): excluded_panel += 1 logger.info(f"✗ Vyloučen #{listing_id}: sídliště") continue # Floor floor = parse_floor(full_text) if floor is not None and floor < MIN_FLOOR: excluded_floor += 1 logger.debug(f"Filter: id={listing_id} - excluded (floor {floor})") continue # Area — re-check from detail if not found before area = lst.get("_area") or parse_area(full_desc) if area is not None and area < MIN_AREA: excluded_area_detail += 1 logger.debug(f"Filter: id={listing_id} - excluded (area {area} m² from detail)") continue disp = lst["_disposition"] locality = detail.get("detail_location") or lst["location"] result = { "hash_id": listing_id, "name": f"Prodej bytu {disp} {int(area) if area else '?'} m²", "price": price, "price_formatted": format_price(price), "locality": locality, "lat": lat, "lon": lon, "disposition": disp, "floor": floor, "area": area, "building_type": "neuvedeno", "ownership": "neuvedeno", "url": f"{BASE_URL}{lst['detail_path']}", "source": "bazos", "image": lst.get("image", ""), "scraped_at": today, "first_seen": cached.get("first_seen", today) if cached else today, "last_changed": today if not cached or cached.get("price") != price else cached.get("last_changed", today), } if not validate_listing(result, "bazos"): continue results.append(result) properties_fetched += 1 if (i + 1) % 20 == 0: logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...") logger.info(f"\n{'=' * 60}") logger.info(f"Výsledky Bazoš:") logger.info(f" Předfiltrováno: {len(pre_filtered)}") logger.info(f" Z cache (přeskočeno): {cache_hits}") logger.info(f" Vyloučeno (panel/síd): {excluded_panel}") logger.info(f" Vyloučeno (patro): {excluded_floor}") logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}") logger.info(f" Vyloučeno (bez detailu): {excluded_detail}") logger.info(f" Vyloučeno (plocha det): {excluded_area_detail}") logger.info(f" ✓ Vyhovující byty: {len(results)}") logger.info(f"{'=' * 60}") write_stats(STATS_FILE, { "source": "Bazoš", "timestamp": _run_ts, "duration_sec": round(time.time() - _run_start, 1), "success": True, "accepted": len(results), "fetched": len(all_listings), "pages": page - 1, "cache_hits": cache_hits, "excluded": { "bez dispozice": excluded_no_disp, "dispozice": excluded_disp, "cena": excluded_price, "plocha": excluded_area + excluded_area_detail, "bez GPS": excluded_no_gps, "panel/síd": excluded_panel, "patro": excluded_floor, "bez detailu": excluded_detail, }, }) return results if __name__ == "__main__": parser = argparse.ArgumentParser(description="Scrape apartments from Bazoš.cz") parser.add_argument("--max-pages", type=int, default=None, help="Maximum number of listing pages to scrape") parser.add_argument("--max-properties", type=int, default=None, help="Maximum number of properties to fetch details for") parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging level (default: INFO)") args = parser.parse_args() logging.basicConfig( level=getattr(logging, args.log_level), format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s", handlers=[logging.StreamHandler()] ) _run_ts = datetime.now().isoformat(timespec="seconds") start = time.time() try: estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) except Exception as e: logger.error(f"Scraper failed: {e}", exc_info=True) write_stats(STATS_FILE, { "source": "Bazoš", "timestamp": _run_ts, "duration_sec": round(time.time() - start, 1), "success": False, "accepted": 0, "fetched": 0, "error": str(e), }) raise if estates: json_path = Path("byty_bazos.json") json_path.write_text( json.dumps(estates, ensure_ascii=False, indent=2), encoding="utf-8", ) elapsed = time.time() - start logger.info(f"\n✓ Data uložena: {json_path.resolve()}") logger.info(f"⏱ Celkový čas: {elapsed:.0f} s") else: logger.info("\nŽádné byty z Bazoše neodpovídají kritériím :(")