From 27e5b05f882c82475c2c61ac3aff6d12cc1b48c1 Mon Sep 17 00:00:00 2001 From: Marie Michalova Date: Fri, 6 Mar 2026 09:47:37 +0100 Subject: [PATCH] =?UTF-8?q?Add=20Bazo=C5=A1.cz=20as=20new=20apartment=20sc?= =?UTF-8?q?raper=20source?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New scraper for reality.bazos.cz with full HTML parsing (no API), GPS extraction from Google Maps links, panel/sídliště filtering, floor/area parsing from free text, and pagination fix for Bazoš's numeric locality codes. Integrated into merge pipeline and map with purple (#7B1FA2) markers. Co-Authored-By: Claude Opus 4.6 --- merge_and_map.py | 3 +- run_all.sh | 7 +- scrape_and_map.py | 4 +- scrape_bazos.py | 560 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 569 insertions(+), 5 deletions(-) create mode 100644 scrape_bazos.py diff --git a/merge_and_map.py b/merge_and_map.py index 1eb9406..335b758 100644 --- a/merge_and_map.py +++ b/merge_and_map.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Sloučí data ze Sreality, Realinga, Bezrealitek, iDNES, PSN a CityHome, +Sloučí data ze Sreality, Realinga, Bezrealitek, iDNES, PSN, CityHome a Bazoše, deduplikuje a vygeneruje mapu. Deduplikace: stejná ulice (z locality) + stejná cena + stejná plocha = duplikát. PSN a CityHome mají při deduplikaci prioritu (načtou se první). @@ -44,6 +44,7 @@ def main(): ("Realingo", "byty_realingo.json"), ("Bezrealitky", "byty_bezrealitky.json"), ("iDNES", "byty_idnes.json"), + ("Bazoš", "byty_bazos.json"), ] all_estates = [] diff --git a/run_all.sh b/run_all.sh index 79f682d..e19b75f 100755 --- a/run_all.sh +++ b/run_all.sh @@ -13,7 +13,7 @@ RED='\033[0;31m' BOLD='\033[1m' NC='\033[0m' -TOTAL=6 +TOTAL=7 CURRENT=0 FAILED=0 START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S") @@ -98,6 +98,9 @@ PID_CH=$! wait $PID_PSN || { echo -e "${RED}✗ PSN selhalo${NC}"; FAILED=$((FAILED + 1)); } wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED + 1)); } +step "Bazoš" +python3 scrape_bazos.py $SCRAPER_ARGS || { echo -e "${RED}✗ Bazoš selhalo${NC}"; FAILED=$((FAILED + 1)); } + step "Realingo" python3 scrape_realingo.py $SCRAPER_ARGS || { echo -e "${RED}✗ Realingo selhalo${NC}"; FAILED=$((FAILED + 1)); } @@ -117,7 +120,7 @@ python3 generate_status.py --start-time "$START_TIME" --duration "$DURATION" $KE echo "" echo "============================================================" if [ $FAILED -eq 0 ]; then - echo -e "${GREEN}${BOLD}Hotovo! Všech 6 zdrojů úspěšně staženo.${NC}" + echo -e "${GREEN}${BOLD}Hotovo! Všech 7 zdrojů úspěšně staženo.${NC}" else echo -e "${RED}${BOLD}Hotovo s $FAILED chybami.${NC}" fi diff --git a/scrape_and_map.py b/scrape_and_map.py index 0b49717..f8ee4db 100644 --- a/scrape_and_map.py +++ b/scrape_and_map.py @@ -480,8 +480,8 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"): floor_note = '
⚠ 2. NP — zvážit klidnost lokality' source = e.get("source", "sreality") - source_labels = {"sreality": "Sreality", "realingo": "Realingo", "bezrealitky": "Bezrealitky", "idnes": "iDNES", "psn": "PSN", "cityhome": "CityHome"} - source_colors = {"sreality": "#1976D2", "realingo": "#00897B", "bezrealitky": "#E91E63", "idnes": "#FF6F00", "psn": "#D32F2F", "cityhome": "#D32F2F"} + source_labels = {"sreality": "Sreality", "realingo": "Realingo", "bezrealitky": "Bezrealitky", "idnes": "iDNES", "psn": "PSN", "cityhome": "CityHome", "bazos": "Bazoš"} + source_colors = {"sreality": "#1976D2", "realingo": "#00897B", "bezrealitky": "#E91E63", "idnes": "#FF6F00", "psn": "#D32F2F", "cityhome": "#D32F2F", "bazos": "#7B1FA2"} source_label = source_labels.get(source, source) source_color = source_colors.get(source, "#999") diff --git a/scrape_bazos.py b/scrape_bazos.py new file mode 100644 index 0000000..21091d4 --- /dev/null +++ b/scrape_bazos.py @@ -0,0 +1,560 @@ +#!/usr/bin/env python3 +""" +Bazoš.cz scraper. +Stáhne byty na prodej v Praze a vyfiltruje podle kritérií. +Výstup: byty_bazos.json +""" +from __future__ import annotations + +import argparse +from datetime import datetime +import json +import logging +import math +import re +import time +import urllib.request +import urllib.parse +from pathlib import Path +from scraper_stats import write_stats, validate_listing + +STATS_FILE = "stats_bazos.json" + +logger = logging.getLogger(__name__) + +# ── Konfigurace ───────────────────────────────────────────────────────────── + +MAX_PRICE = 14_000_000 +MIN_AREA = 69 +MIN_FLOOR = 2 +PER_PAGE = 20 # Bazoš vrací 20 na stránku + +WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"} + +# Regex patterns pro parsování dispozice, plochy a patra z textu +DISP_RE = re.compile(r'(\d)\s*\+\s*(kk|1)', re.IGNORECASE) +AREA_RE = re.compile(r'(\d+(?:[.,]\d+)?)\s*m[²2\s,.]', re.IGNORECASE) +FLOOR_RE = re.compile(r'(\d+)\s*[./]\s*(\d+)\s*(?:NP|patr|podlaž|floor)', re.IGNORECASE) +FLOOR_RE2 = re.compile(r'(\d+)\.\s*(?:NP|patr[eouě]|podlaž[ií])', re.IGNORECASE) +FLOOR_RE3 = re.compile(r'(?:patr[eouě]|podlaž[ií]|NP)\s*[:\s]*(\d+)', re.IGNORECASE) +PANEL_RE = re.compile(r'panel(?:ov|ák|\.)', re.IGNORECASE) +SIDLISTE_RE = re.compile(r'sídliště|sidliste|panelák', re.IGNORECASE) + +HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml", + "Accept-Language": "cs,en;q=0.9", +} + +BASE_URL = "https://reality.bazos.cz" +SEARCH_PARAMS = "hledat=&rubriky=reality&hlokalita=Praha&humkreis=25&cenado={max_price}&kitx=ano" + + +def fetch_url(url: str, retries: int = 3) -> str: + """Fetch URL and return HTML string with retry on transient errors.""" + for attempt in range(retries): + try: + logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}") + req = urllib.request.Request(url, headers=HEADERS) + resp = urllib.request.urlopen(req, timeout=30) + html = resp.read().decode("utf-8", errors="replace") + logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes") + return html + except urllib.error.HTTPError: + raise + except (ConnectionResetError, ConnectionError, urllib.error.URLError, OSError) as e: + if attempt < retries - 1: + wait = (attempt + 1) * 3 + logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}") + time.sleep(wait) + else: + logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True) + raise + + +def format_price(price: int) -> str: + s = str(price) + parts = [] + while s: + parts.append(s[-3:]) + s = s[:-3] + return " ".join(reversed(parts)) + " Kč" + + +def parse_price(text: str) -> int: + """Parse price from text like '5 250 000 Kč' → 5250000.""" + cleaned = re.sub(r'[^\d]', '', text) + return int(cleaned) if cleaned else 0 + + +def parse_disposition(text: str) -> str | None: + """Parse disposition from title/description like '3+kk', '4+1'.""" + m = DISP_RE.search(text) + if m: + rooms = m.group(1) + suffix = m.group(2).lower() + return f"{rooms}+{suffix}" + return None + + +def parse_area(text: str) -> float | None: + """Parse area from text like '82 m²' → 82.0.""" + m = AREA_RE.search(text) + if m: + return float(m.group(1).replace(',', '.')) + return None + + +def parse_floor(text: str) -> int | None: + """Parse floor number from description.""" + for pattern in [FLOOR_RE, FLOOR_RE2, FLOOR_RE3]: + m = pattern.search(text) + if m: + return int(m.group(1)) + return None + + +def is_panel(text: str) -> bool: + """Check if description mentions panel construction.""" + return bool(PANEL_RE.search(text)) + + +def is_sidliste(text: str) -> bool: + """Check if description mentions housing estate.""" + return bool(SIDLISTE_RE.search(text)) + + +def fetch_listing_page(offset: int = 0, pagination_params: str | None = None) -> tuple[list[dict], int, str | None]: + """ + Fetch a page of listings from Bazoš. + Returns (list of basic listing dicts, total count, pagination_params for next pages). + """ + if pagination_params and offset > 0: + # Use resolved numeric params from first page's pagination links + url = f"{BASE_URL}/prodam/byt/{offset}/?{pagination_params}" + else: + params = SEARCH_PARAMS.format(max_price=MAX_PRICE) + if offset > 0: + url = f"{BASE_URL}/prodam/byt/{offset}/?{params}" + else: + url = f"{BASE_URL}/prodam/byt/?{params}" + + html = fetch_url(url) + + # Parse total count: "Zobrazeno 1-20 z 727" + total = 0 + total_match = re.search(r'z\s+([\d\s]+)\s', html) + if total_match: + total = int(total_match.group(1).replace(' ', '')) + + # Extract resolved pagination params from first page (Bazoš converts + # hlokalita=Praha → hlokalita=11000, and pagination only works with numeric form) + resolved_params = None + pag_link = re.search(r'href="/prodam/byt/\d+/\?([^"]+)"', html) + if pag_link: + resolved_params = pag_link.group(1) + + # Parse listings — split by listing blocks (class="inzeraty inzeratyflex") + listings = [] + all_blocks = re.split(r'
', html)[1:] # skip before first + + for block in all_blocks: + # Extract URL and ID from first link (/inzerat/XXXXXX/slug.php) + url_match = re.search(r'href="(/inzerat/(\d+)/[^"]*)"', block) + if not url_match: + continue + detail_path = url_match.group(1) + listing_id = int(url_match.group(2)) + + # Title — class=nadpis (without quotes) or class="nadpis" + title_match = re.search(r'class=.?nadpis.?[^>]*>\s*]*>([^<]+)', block) + title = title_match.group(1).strip() if title_match else "" + + # Price — inside within inzeratycena + price_match = re.search(r'class="inzeratycena"[^>]*>.*?]*>([^<]+)', block, re.DOTALL) + if not price_match: + # Fallback: direct text in inzeratycena + price_match = re.search(r'class="inzeratycena"[^>]*>\s*(?:)?([^<]+)', block) + price_text = price_match.group(1).strip() if price_match else "" + price = parse_price(price_text) + + # Location + loc_match = re.search(r'class="inzeratylok"[^>]*>(.*?)
', block, re.DOTALL) + location = "" + if loc_match: + location = re.sub(r'<[^>]+>', ' ', loc_match.group(1)).strip() + location = re.sub(r'\s+', ' ', location) + + # Date — [5.3. 2026] + date_match = re.search(r'\[(\d+\.\d+\.\s*\d{4})\]', block) + date_str = date_match.group(1).strip() if date_match else "" + + # Description preview — class=popis (without quotes) or class="popis" + desc_match = re.search(r'class=.?popis.?[^>]*>(.*?)', block, re.DOTALL) + description = "" + if desc_match: + description = re.sub(r'<[^>]+>', ' ', desc_match.group(1)).strip() + description = re.sub(r'\s+', ' ', description) + + # Image — + img_match = re.search(r']*src="([^"]+)"[^>]*class="obrazek"', block) + if not img_match: + img_match = re.search(r'class="obrazek"[^>]*src="([^"]+)"', block) + image = img_match.group(1) if img_match else "" + if "empty.gif" in image: + image = "" + + listings.append({ + "id": listing_id, + "title": title, + "price": price, + "location": location, + "date": date_str, + "description": description, + "detail_path": detail_path, + "image": image, + }) + + logger.debug(f"Offset {offset}: found {len(listings)} listings, total={total}") + return listings, total, resolved_params + + +def fetch_detail(path: str) -> dict | None: + """Fetch listing detail page and extract GPS, full description.""" + try: + url = f"{BASE_URL}{path}" + html = fetch_url(url) + + result = {} + + # GPS from Google Maps link + gps_match = re.search(r'google\.com/maps[^"]*place/([\d.]+),([\d.]+)', html) + if gps_match: + result["lat"] = float(gps_match.group(1)) + result["lon"] = float(gps_match.group(2)) + + # Full description — Bazoš uses unquoted class=popisdetail + desc_match = re.search(r'class=.?popisdetail.?[^>]*>(.*?)', html, re.DOTALL) + if desc_match: + desc = re.sub(r'<[^>]+>', ' ', desc_match.group(1)).strip() + desc = re.sub(r'\s+', ' ', desc) + result["description"] = desc + + # Location from detail + loc_match = re.search(r'Lokalita:\s*]*>(.*?)', html, re.DOTALL) + if loc_match: + loc = re.sub(r'<[^>]+>', ' ', loc_match.group(1)).strip() + loc = re.sub(r'\s+', ' ', loc) + result["detail_location"] = loc + + return result + + except Exception as e: + logger.warning(f"Detail fetch failed for {path}: {e}") + return None + + +def load_cache(json_path: str = "byty_bazos.json") -> dict[int, dict]: + """Load previously scraped data as cache keyed by hash_id.""" + path = Path(json_path) + if not path.exists(): + return {} + try: + data = json.loads(path.read_text(encoding="utf-8")) + return {e["hash_id"]: e for e in data if "hash_id" in e} + except (json.JSONDecodeError, KeyError): + return {} + + +def scrape(max_pages: int | None = None, max_properties: int | None = None): + _run_start = time.time() + _run_ts = datetime.now().isoformat(timespec="seconds") + cache = load_cache() + today = datetime.now().strftime("%Y-%m-%d") + + logger.info("=" * 60) + logger.info("Stahuji inzeráty z Bazoš.cz") + logger.info(f"Cena: do {format_price(MAX_PRICE)}") + logger.info(f"Min. plocha: {MIN_AREA} m²") + logger.info(f"Patro: od {MIN_FLOOR}. NP") + logger.info(f"Region: Praha") + if cache: + logger.info(f"Cache: {len(cache)} bytů z minulého běhu") + if max_pages: + logger.info(f"Max. stran: {max_pages}") + if max_properties: + logger.info(f"Max. bytů: {max_properties}") + logger.info("=" * 60) + + # Step 1: Fetch listing pages + logger.info("\nFáze 1: Stahování seznamu inzerátů...") + all_listings = {} # id -> listing dict (dedup) + page = 1 + offset = 0 + total = None + pagination_params = None # resolved numeric params from first page + + while True: + if max_pages and page > max_pages: + logger.debug(f"Max pages limit reached: {max_pages}") + break + + logger.info(f"Strana {page} (offset {offset}) ...") + listings, total_count, resolved = fetch_listing_page(offset, pagination_params) + if resolved and not pagination_params: + pagination_params = resolved + logger.debug(f"Resolved pagination params: {pagination_params}") + + if total is None and total_count > 0: + total = total_count + total_pages = math.ceil(total / PER_PAGE) + logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran") + + if not listings: + logger.debug(f"No listings found on page {page}, stopping") + break + + for lst in listings: + lid = lst["id"] + if lid not in all_listings: + all_listings[lid] = lst + + page += 1 + offset += PER_PAGE + if total and offset >= total: + break + time.sleep(0.5) + + logger.info(f"\nStaženo: {len(all_listings)} unikátních inzerátů") + + # Step 2: Pre-filter by disposition, price, area from listing data + pre_filtered = [] + excluded_disp = 0 + excluded_price = 0 + excluded_area = 0 + excluded_no_disp = 0 + + for lst in all_listings.values(): + title_and_desc = f"{lst['title']} {lst['description']}" + + # Parse disposition + disp = parse_disposition(title_and_desc) + if not disp: + excluded_no_disp += 1 + logger.debug(f"Filter: id={lst['id']} - excluded (no disposition found in '{lst['title']}')") + continue + if disp not in WANTED_DISPOSITIONS: + excluded_disp += 1 + logger.debug(f"Filter: id={lst['id']} - excluded (disposition {disp})") + continue + + # Price + price = lst["price"] + if price <= 0 or price > MAX_PRICE: + excluded_price += 1 + logger.debug(f"Filter: id={lst['id']} - excluded (price {price})") + continue + + # Area (if parseable from listing) + area = parse_area(title_and_desc) + if area is not None and area < MIN_AREA: + excluded_area += 1 + logger.debug(f"Filter: id={lst['id']} - excluded (area {area} m²)") + continue + + lst["_disposition"] = disp + lst["_area"] = area + pre_filtered.append(lst) + + logger.info(f"\nPo předfiltraci:") + logger.info(f" Vyloučeno (bez dispozice): {excluded_no_disp}") + logger.info(f" Vyloučeno (dispozice): {excluded_disp}") + logger.info(f" Vyloučeno (cena): {excluded_price}") + logger.info(f" Vyloučeno (plocha): {excluded_area}") + logger.info(f" Zbývá: {len(pre_filtered)}") + + # Step 3: Fetch details (for GPS + full description) + logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...") + results = [] + excluded_panel = 0 + excluded_floor = 0 + excluded_no_gps = 0 + excluded_detail = 0 + excluded_area_detail = 0 + cache_hits = 0 + properties_fetched = 0 + + for i, lst in enumerate(pre_filtered): + if max_properties and properties_fetched >= max_properties: + logger.debug(f"Max properties limit reached: {max_properties}") + break + + listing_id = lst["id"] + price = lst["price"] + + # Check cache + cached = cache.get(listing_id) + if cached and cached.get("price") == price: + cache_hits += 1 + logger.debug(f"Cache hit for id={listing_id}") + results.append(cached) + continue + + time.sleep(0.4) + detail = fetch_detail(lst["detail_path"]) + + if not detail: + excluded_detail += 1 + logger.debug(f"Filter: id={listing_id} - excluded (detail fetch failed)") + continue + + # GPS required + lat = detail.get("lat") + lon = detail.get("lon") + if not lat or not lon: + excluded_no_gps += 1 + logger.debug(f"Filter: id={listing_id} - excluded (no GPS)") + continue + + # Full text for filtering + full_desc = detail.get("description", "") + full_text = f"{lst['title']} {lst['description']} {full_desc}" + + # Panel check + if is_panel(full_text): + excluded_panel += 1 + logger.info(f"✗ Vyloučen #{listing_id}: panelová stavba") + continue + + # Sídliště check + if is_sidliste(full_text): + excluded_panel += 1 + logger.info(f"✗ Vyloučen #{listing_id}: sídliště") + continue + + # Floor + floor = parse_floor(full_text) + if floor is not None and floor < MIN_FLOOR: + excluded_floor += 1 + logger.debug(f"Filter: id={listing_id} - excluded (floor {floor})") + continue + + # Area — re-check from detail if not found before + area = lst.get("_area") or parse_area(full_desc) + if area is not None and area < MIN_AREA: + excluded_area_detail += 1 + logger.debug(f"Filter: id={listing_id} - excluded (area {area} m² from detail)") + continue + + disp = lst["_disposition"] + locality = detail.get("detail_location") or lst["location"] + + result = { + "hash_id": listing_id, + "name": f"Prodej bytu {disp} {int(area) if area else '?'} m²", + "price": price, + "price_formatted": format_price(price), + "locality": locality, + "lat": lat, + "lon": lon, + "disposition": disp, + "floor": floor, + "area": area, + "building_type": "neuvedeno", + "ownership": "neuvedeno", + "url": f"{BASE_URL}{lst['detail_path']}", + "source": "bazos", + "image": lst.get("image", ""), + "scraped_at": today, + "first_seen": cached.get("first_seen", today) if cached else today, + "last_changed": today if not cached or cached.get("price") != price else cached.get("last_changed", today), + } + + if not validate_listing(result, "bazos"): + continue + + results.append(result) + properties_fetched += 1 + + if (i + 1) % 20 == 0: + logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...") + + logger.info(f"\n{'=' * 60}") + logger.info(f"Výsledky Bazoš:") + logger.info(f" Předfiltrováno: {len(pre_filtered)}") + logger.info(f" Z cache (přeskočeno): {cache_hits}") + logger.info(f" Vyloučeno (panel/síd): {excluded_panel}") + logger.info(f" Vyloučeno (patro): {excluded_floor}") + logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}") + logger.info(f" Vyloučeno (bez detailu): {excluded_detail}") + logger.info(f" Vyloučeno (plocha det): {excluded_area_detail}") + logger.info(f" ✓ Vyhovující byty: {len(results)}") + logger.info(f"{'=' * 60}") + + write_stats(STATS_FILE, { + "source": "Bazoš", + "timestamp": _run_ts, + "duration_sec": round(time.time() - _run_start, 1), + "success": True, + "accepted": len(results), + "fetched": len(all_listings), + "pages": page - 1, + "cache_hits": cache_hits, + "excluded": { + "bez dispozice": excluded_no_disp, + "dispozice": excluded_disp, + "cena": excluded_price, + "plocha": excluded_area + excluded_area_detail, + "bez GPS": excluded_no_gps, + "panel/síd": excluded_panel, + "patro": excluded_floor, + "bez detailu": excluded_detail, + }, + }) + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Scrape apartments from Bazoš.cz") + parser.add_argument("--max-pages", type=int, default=None, + help="Maximum number of listing pages to scrape") + parser.add_argument("--max-properties", type=int, default=None, + help="Maximum number of properties to fetch details for") + parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], + help="Logging level (default: INFO)") + args = parser.parse_args() + + logging.basicConfig( + level=getattr(logging, args.log_level), + format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s", + handlers=[logging.StreamHandler()] + ) + + _run_ts = datetime.now().isoformat(timespec="seconds") + start = time.time() + try: + estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties) + except Exception as e: + logger.error(f"Scraper failed: {e}", exc_info=True) + write_stats(STATS_FILE, { + "source": "Bazoš", + "timestamp": _run_ts, + "duration_sec": round(time.time() - start, 1), + "success": False, + "accepted": 0, + "fetched": 0, + "error": str(e), + }) + raise + + if estates: + json_path = Path("byty_bazos.json") + json_path.write_text( + json.dumps(estates, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + elapsed = time.time() - start + logger.info(f"\n✓ Data uložena: {json_path.resolve()}") + logger.info(f"⏱ Celkový čas: {elapsed:.0f} s") + else: + logger.info("\nŽádné byty z Bazoše neodpovídají kritériím :(")