Add Bazoš.cz scraper + project docs #7

Merged
kacerr merged 2 commits from feature/bazos-scraper into main 2026-03-09 10:28:33 +00:00
4 changed files with 569 additions and 5 deletions
Showing only changes of commit 27e5b05f88 - Show all commits

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Sloučí data ze Sreality, Realinga, Bezrealitek, iDNES, PSN a CityHome, Sloučí data ze Sreality, Realinga, Bezrealitek, iDNES, PSN, CityHome a Bazoše,
deduplikuje a vygeneruje mapu. deduplikuje a vygeneruje mapu.
Deduplikace: stejná ulice (z locality) + stejná cena + stejná plocha = duplikát. Deduplikace: stejná ulice (z locality) + stejná cena + stejná plocha = duplikát.
PSN a CityHome mají při deduplikaci prioritu (načtou se první). PSN a CityHome mají při deduplikaci prioritu (načtou se první).
@@ -44,6 +44,7 @@ def main():
("Realingo", "byty_realingo.json"), ("Realingo", "byty_realingo.json"),
("Bezrealitky", "byty_bezrealitky.json"), ("Bezrealitky", "byty_bezrealitky.json"),
("iDNES", "byty_idnes.json"), ("iDNES", "byty_idnes.json"),
("Bazoš", "byty_bazos.json"),
] ]
all_estates = [] all_estates = []

View File

@@ -13,7 +13,7 @@ RED='\033[0;31m'
BOLD='\033[1m' BOLD='\033[1m'
NC='\033[0m' NC='\033[0m'
TOTAL=6 TOTAL=7
CURRENT=0 CURRENT=0
FAILED=0 FAILED=0
START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S") START_TIME=$(date -u +"%Y-%m-%dT%H:%M:%S")
@@ -98,6 +98,9 @@ PID_CH=$!
wait $PID_PSN || { echo -e "${RED}✗ PSN selhalo${NC}"; FAILED=$((FAILED + 1)); } wait $PID_PSN || { echo -e "${RED}✗ PSN selhalo${NC}"; FAILED=$((FAILED + 1)); }
wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED + 1)); } wait $PID_CH || { echo -e "${RED}✗ CityHome selhalo${NC}"; FAILED=$((FAILED + 1)); }
step "Bazoš"
python3 scrape_bazos.py $SCRAPER_ARGS || { echo -e "${RED}✗ Bazoš selhalo${NC}"; FAILED=$((FAILED + 1)); }
step "Realingo" step "Realingo"
python3 scrape_realingo.py $SCRAPER_ARGS || { echo -e "${RED}✗ Realingo selhalo${NC}"; FAILED=$((FAILED + 1)); } python3 scrape_realingo.py $SCRAPER_ARGS || { echo -e "${RED}✗ Realingo selhalo${NC}"; FAILED=$((FAILED + 1)); }
@@ -117,7 +120,7 @@ python3 generate_status.py --start-time "$START_TIME" --duration "$DURATION" $KE
echo "" echo ""
echo "============================================================" echo "============================================================"
if [ $FAILED -eq 0 ]; then if [ $FAILED -eq 0 ]; then
echo -e "${GREEN}${BOLD}Hotovo! Všech 6 zdrojů úspěšně staženo.${NC}" echo -e "${GREEN}${BOLD}Hotovo! Všech 7 zdrojů úspěšně staženo.${NC}"
else else
echo -e "${RED}${BOLD}Hotovo s $FAILED chybami.${NC}" echo -e "${RED}${BOLD}Hotovo s $FAILED chybami.${NC}"
fi fi

View File

@@ -480,8 +480,8 @@ def generate_map(estates: list[dict], output_path: str = "mapa_bytu.html"):
floor_note = '<br><span style="color:#FF9800;font-weight:bold;">⚠ 2. NP — zvážit klidnost lokality</span>' floor_note = '<br><span style="color:#FF9800;font-weight:bold;">⚠ 2. NP — zvážit klidnost lokality</span>'
source = e.get("source", "sreality") source = e.get("source", "sreality")
source_labels = {"sreality": "Sreality", "realingo": "Realingo", "bezrealitky": "Bezrealitky", "idnes": "iDNES", "psn": "PSN", "cityhome": "CityHome"} source_labels = {"sreality": "Sreality", "realingo": "Realingo", "bezrealitky": "Bezrealitky", "idnes": "iDNES", "psn": "PSN", "cityhome": "CityHome", "bazos": "Bazoš"}
source_colors = {"sreality": "#1976D2", "realingo": "#00897B", "bezrealitky": "#E91E63", "idnes": "#FF6F00", "psn": "#D32F2F", "cityhome": "#D32F2F"} source_colors = {"sreality": "#1976D2", "realingo": "#00897B", "bezrealitky": "#E91E63", "idnes": "#FF6F00", "psn": "#D32F2F", "cityhome": "#D32F2F", "bazos": "#7B1FA2"}
source_label = source_labels.get(source, source) source_label = source_labels.get(source, source)
source_color = source_colors.get(source, "#999") source_color = source_colors.get(source, "#999")

560
scrape_bazos.py Normal file
View File

@@ -0,0 +1,560 @@
#!/usr/bin/env python3
"""
Bazoš.cz scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_bazos.json
"""
from __future__ import annotations
import argparse
from datetime import datetime
import json
import logging
import math
import re
import time
import urllib.request
import urllib.parse
from pathlib import Path
from scraper_stats import write_stats, validate_listing
# File name handed to write_stats() for this scraper's run statistics.
STATS_FILE = "stats_bazos.json"
logger = logging.getLogger(__name__)
# ── Configuration ───────────────────────────────────────────────────────────
# Upper price bound: substituted into SEARCH_PARAMS and re-checked locally.
MAX_PRICE = 14_000_000
# Listings whose parsed area (m²) is below this value are excluded.
MIN_AREA = 69
# Listings whose parsed floor number is below this value are excluded.
MIN_FLOOR = 2
PER_PAGE = 20  # Bazoš returns 20 listings per page
# Only listings whose parsed disposition is in this set are kept.
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}
# Regex patterns for parsing disposition, area and floor from free text
# One digit, "+", then "kk" or "1" (whitespace around "+" allowed).
DISP_RE = re.compile(r'(\d)\s*\+\s*(kk|1)', re.IGNORECASE)
# A number (decimal comma or dot allowed) followed by "m" and one of ², 2,
# whitespace, comma or period — a bare "m" at the very end of text does not match.
AREA_RE = re.compile(r'(\d+(?:[.,]\d+)?)\s*m[²2\s,.]', re.IGNORECASE)
# "N/M" or "N.M" followed by a floor keyword; group 1 is used as the floor.
FLOOR_RE = re.compile(r'(\d+)\s*[./]\s*(\d+)\s*(?:NP|patr|podlaž|floor)', re.IGNORECASE)
# "N." followed by a floor keyword.
FLOOR_RE2 = re.compile(r'(\d+)\.\s*(?:NP|patr[eouě]|podlaž[ií])', re.IGNORECASE)
# Floor keyword followed by a number.
FLOOR_RE3 = re.compile(r'(?:patr[eouě]|podlaž[ií]|NP)\s*[:\s]*(\d+)', re.IGNORECASE)
# Keywords treated as a sign of panel construction.
PANEL_RE = re.compile(r'panel(?:ov|ák|\.)', re.IGNORECASE)
# Keywords treated as a sign of a housing estate.
SIDLISTE_RE = re.compile(r'sídliště|sidliste|panelák', re.IGNORECASE)
# Request headers sent with every HTTP request made by fetch_url().
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}
BASE_URL = "https://reality.bazos.cz"
# Query string for the first search page; {max_price} is filled from MAX_PRICE.
# NOTE(review): humkreis=25 is presumably a search radius and kitx=ano a site
# flag — confirm against the Bazoš search form.
SEARCH_PARAMS = "hledat=&rubriky=reality&hlokalita=Praha&humkreis=25&cenado={max_price}&kitx=ano"
def fetch_url(url: str, retries: int = 3) -> str:
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Connection-level failures are retried with a linearly growing pause
    (3 s, 6 s, ...). HTTP status errors are not retried; they propagate to
    the caller immediately.

    Args:
        url: Absolute URL to request; sent with the module-level HEADERS.
        retries: Total number of attempts, must be at least 1.

    Returns:
        The response body, with undecodable bytes replaced.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        urllib.error.HTTPError: On the first HTTP error response.
        OSError: (including urllib.error.URLError) when the last attempt fails.
    """
    if retries < 1:
        # Without this guard the loop body never runs and None is returned,
        # which violates the declared ``str`` return type.
        raise ValueError(f"retries must be >= 1, got {retries}")
    for attempt in range(retries):
        try:
            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
            req = urllib.request.Request(url, headers=HEADERS)
            # The context manager closes the connection even when read() or
            # decoding raises, so no socket is left open between retries.
            with urllib.request.urlopen(req, timeout=30) as resp:
                html = resp.read().decode("utf-8", errors="replace")
                logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
            return html
        except urllib.error.HTTPError:
            # A status error is a definitive answer from the server: do not retry.
            raise
        except OSError as e:
            # ConnectionError, ConnectionResetError and URLError are all
            # OSError subclasses, so this single clause covers them.
            if attempt < retries - 1:
                wait = (attempt + 1) * 3
                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
                raise
def format_price(price: int) -> str:
    """Return *price* with a space between each group of three digits.

    Uses the format mini-language's thousands separator, so a negative value
    keeps its sign attached to the first group ("-123 456").

    Args:
        price: Integer amount to format.

    Returns:
        The grouped number as text, e.g. 14000000 -> "14 000 000".
    """
    return f"{price:,}".replace(",", " ")
def parse_price(text: str) -> int:
    """Return the integer formed by every digit in *text*, or 0 if there is none.

    Example: '5 250 000 Kč' → 5250000.
    """
    digits_only = re.sub(r'[^\d]', '', text)
    if not digits_only:
        return 0
    return int(digits_only)
def parse_disposition(text: str) -> str | None:
    """Return the first disposition found in *text* (e.g. '3+kk', '4+1').

    The suffix is lower-cased; None is returned when DISP_RE does not match.
    """
    found = DISP_RE.search(text)
    if found is None:
        return None
    room_count, kind = found.groups()
    return f"{room_count}+{kind.lower()}"
def parse_area(text: str) -> float | None:
    """Return the first area found in *text* as a float ('82 m²' → 82.0).

    A decimal comma is accepted; None is returned when AREA_RE does not match.
    """
    found = AREA_RE.search(text)
    if found is None:
        return None
    number = found.group(1).replace(',', '.')
    return float(number)
def parse_floor(text: str) -> int | None:
    """Return the floor number found in *text*, or None.

    FLOOR_RE, FLOOR_RE2 and FLOOR_RE3 are tried in that order and the first
    one that matches decides the result.
    """
    # Lazy generator: later patterns are only searched if earlier ones miss.
    hits = (rx.search(text) for rx in (FLOOR_RE, FLOOR_RE2, FLOOR_RE3))
    first_hit = next((hit for hit in hits if hit), None)
    return int(first_hit.group(1)) if first_hit else None
def is_panel(text: str) -> bool:
    """Return True when *text* contains a panel-construction keyword (PANEL_RE)."""
    return PANEL_RE.search(text) is not None
def is_sidliste(text: str) -> bool:
    """Return True when *text* contains a housing-estate keyword (SIDLISTE_RE)."""
    return SIDLISTE_RE.search(text) is not None
def _clean_listing_text(fragment: str) -> str:
    """Strip HTML tags from *fragment* and collapse whitespace runs to one space."""
    without_tags = re.sub(r'<[^>]+>', ' ', fragment).strip()
    return re.sub(r'\s+', ' ', without_tags)


def fetch_listing_page(offset: int = 0, pagination_params: str | None = None) -> tuple[list[dict], int, str | None]:
    """
    Fetch one page of search results from Bazoš and parse its listing blocks.

    Args:
        offset: Index of the first listing of the requested page; 0 requests
            the first page.
        pagination_params: Query string taken from a pagination link of an
            earlier page. When given (and offset > 0) it is used instead of
            SEARCH_PARAMS.

    Returns:
        A tuple ``(listings, total, resolved_params)`` where ``listings`` is a
        list of dicts with the keys id, title, price, location, date,
        description, detail_path and image; ``total`` is the result count
        reported by the page (0 if it could not be parsed); and
        ``resolved_params`` is the query string of the first pagination link
        on the page, or None when there is no such link.

    Raises:
        Whatever fetch_url raises for the page request.
    """
    if pagination_params and offset > 0:
        # Use resolved numeric params from first page's pagination links
        url = f"{BASE_URL}/prodam/byt/{offset}/?{pagination_params}"
    else:
        params = SEARCH_PARAMS.format(max_price=MAX_PRICE)
        if offset > 0:
            url = f"{BASE_URL}/prodam/byt/{offset}/?{params}"
        else:
            url = f"{BASE_URL}/prodam/byt/?{params}"
    html = fetch_url(url)

    # Parse total count: "Zobrazeno 1-20 z 727".
    # The capture must start with a digit and may only contain digits with
    # single space / NBSP thousands separators. A looser "[\d\s]+" capture can
    # match pure whitespace (any "z" followed by indentation) or digits joined
    # by a newline, and int() then raises ValueError.
    total = 0
    total_match = re.search(r'z\s+(\d+(?:[ \xa0]\d{3})*)\s', html)
    if total_match:
        total = int(re.sub(r'\D', '', total_match.group(1)))

    # Extract resolved pagination params from first page (Bazoš converts
    # hlokalita=Praha → hlokalita=11000, and pagination only works with numeric form)
    resolved_params = None
    pag_link = re.search(r'href="/prodam/byt/\d+/\?([^"]+)"', html)
    if pag_link:
        # An href attribute may encode "&" as "&amp;"; decode it so the query
        # string can be reused verbatim in a URL.
        resolved_params = pag_link.group(1).replace('&amp;', '&')

    # Parse listings — split by listing blocks (class="inzeraty inzeratyflex")
    listings = []
    all_blocks = re.split(r'<div class="inzeraty\s+inzeratyflex">', html)[1:]  # skip before first
    for block in all_blocks:
        # Extract URL and ID from first link (/inzerat/XXXXXX/slug.php)
        url_match = re.search(r'href="(/inzerat/(\d+)/[^"]*)"', block)
        if not url_match:
            continue
        detail_path = url_match.group(1)
        listing_id = int(url_match.group(2))

        # Title — class=nadpis (without quotes) or class="nadpis"
        title_match = re.search(r'class=.?nadpis.?[^>]*>\s*<a[^>]*>([^<]+)</a>', block)
        title = title_match.group(1).strip() if title_match else ""

        # Price — inside <span translate="no"> within inzeratycena
        price_match = re.search(r'class="inzeratycena"[^>]*>.*?<span[^>]*>([^<]+)</span>', block, re.DOTALL)
        if not price_match:
            # Fallback: direct text in inzeratycena
            price_match = re.search(r'class="inzeratycena"[^>]*>\s*(?:<b>)?([^<]+)', block)
        price_text = price_match.group(1).strip() if price_match else ""
        price = parse_price(price_text)

        # Location
        loc_match = re.search(r'class="inzeratylok"[^>]*>(.*?)</div>', block, re.DOTALL)
        location = _clean_listing_text(loc_match.group(1)) if loc_match else ""

        # Date — [5.3. 2026]
        date_match = re.search(r'\[(\d+\.\d+\.\s*\d{4})\]', block)
        date_str = date_match.group(1).strip() if date_match else ""

        # Description preview — class=popis (without quotes) or class="popis"
        desc_match = re.search(r'class=.?popis.?[^>]*>(.*?)</div>', block, re.DOTALL)
        description = _clean_listing_text(desc_match.group(1)) if desc_match else ""

        # Image — <img ... class="obrazek" ... src="..."> in either attribute order
        img_match = re.search(r'<img[^>]*src="([^"]+)"[^>]*class="obrazek"', block)
        if not img_match:
            img_match = re.search(r'class="obrazek"[^>]*src="([^"]+)"', block)
        image = img_match.group(1) if img_match else ""
        if "empty.gif" in image:
            # Placeholder picture: treat as "no image".
            image = ""

        listings.append({
            "id": listing_id,
            "title": title,
            "price": price,
            "location": location,
            "date": date_str,
            "description": description,
            "detail_path": detail_path,
            "image": image,
        })
    logger.debug(f"Offset {offset}: found {len(listings)} listings, total={total}")
    return listings, total, resolved_params
def fetch_detail(path: str) -> dict | None:
    """Download a listing's detail page and pull out GPS, description and location.

    Args:
        path: Site-relative path of the detail page; appended to BASE_URL.

    Returns:
        A dict holding whichever of the keys ``lat``, ``lon``, ``description``
        and ``detail_location`` could be parsed (it may be empty), or None
        when fetching or parsing raised any exception (logged as a warning).
    """
    def _plain_text(fragment: str) -> str:
        # Replace tags with spaces, trim, then squeeze whitespace runs.
        trimmed = re.sub(r'<[^>]+>', ' ', fragment).strip()
        return re.sub(r'\s+', ' ', trimmed)

    try:
        page = fetch_url(f"{BASE_URL}{path}")
        info = {}

        # GPS from Google Maps link
        coords = re.search(r'google\.com/maps[^"]*place/([\d.]+),([\d.]+)', page)
        if coords is not None:
            lat_text, lon_text = coords.groups()
            info["lat"] = float(lat_text)
            info["lon"] = float(lon_text)

        # Full description — Bazoš uses unquoted class=popisdetail
        body = re.search(r'class=.?popisdetail.?[^>]*>(.*?)</div>', page, re.DOTALL)
        if body is not None:
            info["description"] = _plain_text(body.group(1))

        # Location from detail
        place = re.search(r'Lokalita:</td>\s*<td[^>]*>(.*?)</td>', page, re.DOTALL)
        if place is not None:
            info["detail_location"] = _plain_text(place.group(1))

        return info
    except Exception as e:
        # Best effort: one broken detail page must not abort the whole run.
        logger.warning(f"Detail fetch failed for {path}: {e}")
        return None
def load_cache(json_path: str = "byty_bazos.json") -> dict[int, dict]:
    """Load previously scraped listings as a cache keyed by ``hash_id``.

    The cache is best effort: a missing, unreadable or malformed file yields
    an empty cache instead of an exception.

    Args:
        json_path: Path of the JSON file written by a previous run; expected
            to contain a list of listing dicts.

    Returns:
        Mapping ``hash_id -> listing dict`` for every list item that is a dict
        with a ``hash_id`` key; ``{}`` when nothing usable was found.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    log = logging.getLogger(__name__)
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        log.warning("Ignoring unreadable cache file %s: %s", path, exc)
        return {}
    if not isinstance(data, list):
        # Iterating e.g. a dict would yield its keys and break the lookup below.
        log.warning("Ignoring cache file %s: expected a JSON list", path)
        return {}
    # Skip non-dict items: '"hash_id" in item' raises TypeError for numbers or null.
    return {
        entry["hash_id"]: entry
        for entry in data
        if isinstance(entry, dict) and "hash_id" in entry
    }
def scrape(max_pages: int | None = None, max_properties: int | None = None):
    """Run the whole Bazoš pipeline and return the accepted listings.

    Phases:
      1. Page through the search results and collect listings, de-duplicated
         by listing id.
      2. Pre-filter on data available in the result list (disposition, price,
         area).
      3. For each remaining listing, reuse the cached entry from the previous
         run if its price is unchanged; otherwise fetch the detail page and
         filter on GPS presence, panel / housing-estate keywords, floor and
         area.

    Run statistics are written with write_stats() before returning.

    Args:
        max_pages: Stop after this many result pages (None = no limit).
        max_properties: Stop once this many newly fetched listings have been
            accepted; cache hits are not counted (None = no limit).

    Returns:
        List of result dicts — cached entries plus newly built ones.

    Raises:
        Whatever fetch_listing_page raises while paging. Detail-page failures
        are handled per listing and do not propagate.
    """
    _run_start = time.time()
    _run_ts = datetime.now().isoformat(timespec="seconds")
    cache = load_cache()
    today = datetime.now().strftime("%Y-%m-%d")
    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z Bazoš.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA}")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info("Region: Praha")
    if cache:
        logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
    if max_pages:
        logger.info(f"Max. stran: {max_pages}")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Step 1: Fetch listing pages
    logger.info("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = {}  # id -> listing dict (dedup)
    page = 1
    offset = 0
    total = None
    pagination_params = None  # resolved numeric params from first page
    while True:
        if max_pages and page > max_pages:
            logger.debug(f"Max pages limit reached: {max_pages}")
            break
        logger.info(f"Strana {page} (offset {offset}) ...")
        listings, total_count, resolved = fetch_listing_page(offset, pagination_params)
        if resolved and not pagination_params:
            pagination_params = resolved
            logger.debug(f"Resolved pagination params: {pagination_params}")
        if total is None and total_count > 0:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran")
        if not listings:
            logger.debug(f"No listings found on page {page}, stopping")
            break
        new_on_page = 0
        for lst in listings:
            lid = lst["id"]
            if lid not in all_listings:
                all_listings[lid] = lst
                new_on_page += 1
        page += 1
        offset += PER_PAGE
        if total and offset >= total:
            break
        if new_on_page == 0:
            # When the total could not be parsed, the only other stop condition
            # is an empty page. If the site keeps serving already-seen listings
            # for out-of-range offsets, the loop would never end without this.
            logger.debug(f"No new listings on page {page - 1}, stopping")
            break
        time.sleep(0.5)
    logger.info(f"\nStaženo: {len(all_listings)} unikátních inzerátů")

    # Step 2: Pre-filter by disposition, price, area from listing data
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_disp = 0
    for lst in all_listings.values():
        title_and_desc = f"{lst['title']} {lst['description']}"

        # Parse disposition
        disp = parse_disposition(title_and_desc)
        if not disp:
            excluded_no_disp += 1
            logger.debug(f"Filter: id={lst['id']} - excluded (no disposition found in '{lst['title']}')")
            continue
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            logger.debug(f"Filter: id={lst['id']} - excluded (disposition {disp})")
            continue

        # Price
        price = lst["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            logger.debug(f"Filter: id={lst['id']} - excluded (price {price})")
            continue

        # Area (if parseable from listing); unknown area passes this stage
        area = parse_area(title_and_desc)
        if area is not None and area < MIN_AREA:
            excluded_area += 1
            logger.debug(f"Filter: id={lst['id']} - excluded (area {area} m²)")
            continue

        lst["_disposition"] = disp
        lst["_area"] = area
        pre_filtered.append(lst)

    logger.info("\nPo předfiltraci:")
    logger.info(f" Vyloučeno (bez dispozice): {excluded_no_disp}")
    logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
    logger.info(f" Vyloučeno (cena): {excluded_price}")
    logger.info(f" Vyloučeno (plocha): {excluded_area}")
    logger.info(f" Zbývá: {len(pre_filtered)}")

    # Step 3: Fetch details (for GPS + full description)
    logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_no_gps = 0
    excluded_detail = 0
    excluded_area_detail = 0
    cache_hits = 0
    properties_fetched = 0
    for i, lst in enumerate(pre_filtered):
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break
        listing_id = lst["id"]
        price = lst["price"]

        # Check cache: an unchanged price means the stored entry is reused as is
        cached = cache.get(listing_id)
        if cached and cached.get("price") == price:
            cache_hits += 1
            logger.debug(f"Cache hit for id={listing_id}")
            results.append(cached)
            continue

        time.sleep(0.4)
        detail = fetch_detail(lst["detail_path"])
        # fetch_detail returns None only on failure. An empty dict means the
        # page was fetched but nothing was parsed; that case belongs to the
        # "no GPS" bucket below, not to "detail fetch failed".
        if detail is None:
            excluded_detail += 1
            logger.debug(f"Filter: id={listing_id} - excluded (detail fetch failed)")
            continue

        # GPS required
        lat = detail.get("lat")
        lon = detail.get("lon")
        if not lat or not lon:
            excluded_no_gps += 1
            logger.debug(f"Filter: id={listing_id} - excluded (no GPS)")
            continue

        # Full text for filtering
        full_desc = detail.get("description", "")
        full_text = f"{lst['title']} {lst['description']} {full_desc}"

        # Panel check
        if is_panel(full_text):
            excluded_panel += 1
            logger.info(f"✗ Vyloučen #{listing_id}: panelová stavba")
            continue

        # Housing-estate check (shares the panel counter, reported as "panel/síd")
        if is_sidliste(full_text):
            excluded_panel += 1
            logger.info(f"✗ Vyloučen #{listing_id}: sídliště")
            continue

        # Floor
        floor = parse_floor(full_text)
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            logger.debug(f"Filter: id={listing_id} - excluded (floor {floor})")
            continue

        # Area — re-check from detail if not found before
        area = lst.get("_area") or parse_area(full_desc)
        if area is not None and area < MIN_AREA:
            excluded_area_detail += 1
            logger.debug(f"Filter: id={listing_id} - excluded (area {area} m² from detail)")
            continue

        disp = lst["_disposition"]
        locality = detail.get("detail_location") or lst["location"]
        result = {
            "hash_id": listing_id,
            "name": f"Prodej bytu {disp} {int(area) if area else '?'}",
            "price": price,
            "price_formatted": format_price(price),
            "locality": locality,
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": "neuvedeno",
            "ownership": "neuvedeno",
            "url": f"{BASE_URL}{lst['detail_path']}",
            "source": "bazos",
            "image": lst.get("image", ""),
            "scraped_at": today,
            # Keep the original discovery date when the listing was seen before
            "first_seen": cached.get("first_seen", today) if cached else today,
            "last_changed": today if not cached or cached.get("price") != price else cached.get("last_changed", today),
        }
        if not validate_listing(result, "bazos"):
            continue
        results.append(result)
        properties_fetched += 1
        if (i + 1) % 20 == 0:
            logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky Bazoš:")
    logger.info(f" Předfiltrováno: {len(pre_filtered)}")
    logger.info(f" Z cache (přeskočeno): {cache_hits}")
    logger.info(f" Vyloučeno (panel/síd): {excluded_panel}")
    logger.info(f" Vyloučeno (patro): {excluded_floor}")
    logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
    logger.info(f" Vyloučeno (bez detailu): {excluded_detail}")
    logger.info(f" Vyloučeno (plocha det): {excluded_area_detail}")
    logger.info(f" ✓ Vyhovující byty: {len(results)}")
    logger.info(f"{'=' * 60}")

    write_stats(STATS_FILE, {
        "source": "Bazoš",
        "timestamp": _run_ts,
        "duration_sec": round(time.time() - _run_start, 1),
        "success": True,
        "accepted": len(results),
        "fetched": len(all_listings),
        "pages": page - 1,
        "cache_hits": cache_hits,
        "excluded": {
            "bez dispozice": excluded_no_disp,
            "dispozice": excluded_disp,
            "cena": excluded_price,
            "plocha": excluded_area + excluded_area_detail,
            "bez GPS": excluded_no_gps,
            "panel/síd": excluded_panel,
            "patro": excluded_floor,
            "bez detailu": excluded_detail,
        },
    })
    return results
if __name__ == "__main__":
    # Command-line entry point: parse options, run the scraper, persist results.
    cli = argparse.ArgumentParser(description="Scrape apartments from Bazoš.cz")
    cli.add_argument(
        "--max-pages",
        type=int,
        default=None,
        help="Maximum number of listing pages to scrape",
    )
    cli.add_argument(
        "--max-properties",
        type=int,
        default=None,
        help="Maximum number of properties to fetch details for",
    )
    cli.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level (default: INFO)",
    )
    opts = cli.parse_args()

    logging.basicConfig(
        level=getattr(logging, opts.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    run_stamp = datetime.now().isoformat(timespec="seconds")
    started = time.time()
    try:
        found = scrape(max_pages=opts.max_pages, max_properties=opts.max_properties)
    except Exception as e:
        # Record the failed run in the stats file, then let the error
        # propagate so the process exits with a non-zero status.
        logger.error(f"Scraper failed: {e}", exc_info=True)
        failure_stats = {
            "source": "Bazoš",
            "timestamp": run_stamp,
            "duration_sec": round(time.time() - started, 1),
            "success": False,
            "accepted": 0,
            "fetched": 0,
            "error": str(e),
        }
        write_stats(STATS_FILE, failure_stats)
        raise

    if not found:
        # Nothing matched: the output file from an earlier run is left untouched.
        logger.info("\nŽádné byty z Bazoše neodpovídají kritériím :(")
    else:
        out_file = Path("byty_bazos.json")
        payload = json.dumps(found, ensure_ascii=False, indent=2)
        out_file.write_text(payload, encoding="utf-8")
        took = time.time() - started
        logger.info(f"\n✓ Data uložena: {out_file.resolve()}")
        logger.info(f"⏱ Celkový čas: {took:.0f} s")