Add validation mode, structured logging, and CLI args to all scrapers
- Replace print() with the Python logging module across all 6 scrapers for configurable log levels (DEBUG/INFO/WARNING/ERROR)
- Add --max-pages, --max-properties, and --log-level CLI arguments to each scraper via argparse for limiting scrape scope
- Add validation Make targets (validation, validation-local, validation-local-debug) for quick test runs with limited data
- Update run_all.sh to parse and forward CLI args to all scrapers
- Update mapa_bytu.html with latest scrape results

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,13 +6,17 @@ Výstup: byty_realingo.json
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Konfigurace (sdílená se Sreality scraperem) ─────────────────────────────
|
||||
|
||||
MAX_PRICE = 13_500_000
|
||||
@@ -55,44 +59,57 @@ def fetch_listing_page(page: int = 1) -> tuple[list[dict], int]:
|
||||
else:
|
||||
url = f"{BASE_URL}/prodej_byty/praha/{page}_strana/"
|
||||
|
||||
logger.debug(f"HTTP GET request: {url}")
|
||||
logger.debug(f"Headers: {HEADERS}")
|
||||
req = urllib.request.Request(url, headers=HEADERS)
|
||||
resp = urllib.request.urlopen(req, timeout=30)
|
||||
html = resp.read().decode("utf-8")
|
||||
|
||||
match = re.search(
|
||||
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
||||
html, re.DOTALL
|
||||
)
|
||||
if not match:
|
||||
return [], 0
|
||||
|
||||
data = json.loads(match.group(1))
|
||||
offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
|
||||
return offer_list["data"], offer_list["total"]
|
||||
|
||||
|
||||
def fetch_detail(listing_url: str) -> dict | None:
|
||||
"""Fetch detail page for a listing to get floor, building type, etc."""
|
||||
try:
|
||||
url = f"{BASE_URL}{listing_url}"
|
||||
req = urllib.request.Request(url, headers=HEADERS)
|
||||
resp = urllib.request.urlopen(req, timeout=30)
|
||||
html = resp.read().decode("utf-8")
|
||||
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
||||
|
||||
match = re.search(
|
||||
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
|
||||
html, re.DOTALL
|
||||
)
|
||||
if not match:
|
||||
logger.debug("No __NEXT_DATA__ script found in HTML")
|
||||
return [], 0
|
||||
|
||||
data = json.loads(match.group(1))
|
||||
offer_list = data["props"]["pageProps"]["store"]["offer"]["list"]
|
||||
logger.debug(f"Page {page}: found {len(offer_list['data'])} items, total={offer_list['total']}")
|
||||
return offer_list["data"], offer_list["total"]
|
||||
except (urllib.error.URLError, ConnectionError, OSError) as e:
|
||||
logger.error(f"HTTP request failed for {url}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
|
||||
def fetch_detail(listing_url: str) -> dict | None:
    """Fetch a listing's detail page and return its first detail entry.

    Requests ``BASE_URL + listing_url`` with the module-level ``HEADERS``
    (both defined outside this block), extracts the JSON embedded in the
    ``__NEXT_DATA__`` script tag and returns the first value stored under
    ``props.pageProps.store.offer.details``.

    Args:
        listing_url: Listing path that is appended to ``BASE_URL``.

    Returns:
        The first detail entry, or ``None`` when the page contains no
        ``__NEXT_DATA__`` script, the details mapping is empty, or any
        error occurs while downloading or parsing. The function is
        best-effort: failures are logged as warnings and never raised.
    """
    try:
        url = f"{BASE_URL}{listing_url}"
        logger.debug(f"HTTP GET request: {url}")
        req = urllib.request.Request(url, headers=HEADERS)
        # Use the response as a context manager so the connection is closed
        # even if reading or decoding the body fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            status = resp.status
            html = resp.read().decode("utf-8")
        logger.debug(f"HTTP response: status={status}, size={len(html)} bytes")

        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html, re.DOTALL
        )
        if not match:
            logger.debug("No __NEXT_DATA__ script found in detail page")
            return None

        data = json.loads(match.group(1))
        details = data["props"]["pageProps"]["store"]["offer"]["details"]
        # Only the first entry of the mapping is used.
        detail_data = next(iter(details.values()), None)
        if detail_data is None:
            logger.debug(f"No detail entries found for {listing_url}")
            return None
        logger.debug(f"Detail fetched for {listing_url}")
        return detail_data
    except Exception as e:
        # Deliberate best-effort boundary: one broken listing must not abort
        # the whole scrape, so log with traceback and report "no detail".
        logger.warning(f"Detail fetch failed for {listing_url}: {e}", exc_info=True)
    return None
|
||||
|
||||
|
||||
@@ -117,34 +134,42 @@ def load_cache(json_path: str = "byty_realingo.json") -> dict[int, dict]:
|
||||
return {}
|
||||
|
||||
|
||||
def scrape():
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
cache = load_cache()
|
||||
|
||||
print("=" * 60)
|
||||
print("Stahuji inzeráty z Realingo.cz")
|
||||
print(f"Cena: do {format_price(MAX_PRICE)}")
|
||||
print(f"Min. plocha: {MIN_AREA} m²")
|
||||
print(f"Patro: od {MIN_FLOOR}. NP")
|
||||
print(f"Region: Praha")
|
||||
logger.info("=" * 60)
|
||||
logger.info("Stahuji inzeráty z Realingo.cz")
|
||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||
logger.info(f"Min. plocha: {MIN_AREA} m²")
|
||||
logger.info(f"Patro: od {MIN_FLOOR}. NP")
|
||||
logger.info(f"Region: Praha")
|
||||
if cache:
|
||||
print(f"Cache: {len(cache)} bytů z minulého běhu")
|
||||
print("=" * 60)
|
||||
logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
|
||||
if max_pages:
|
||||
logger.info(f"Max. stran: {max_pages}")
|
||||
if max_properties:
|
||||
logger.info(f"Max. bytů: {max_properties}")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Step 1: Fetch all listing pages
|
||||
print("\nFáze 1: Stahování seznamu inzerátů...")
|
||||
logger.info("\nFáze 1: Stahování seznamu inzerátů...")
|
||||
all_listings = []
|
||||
page = 1
|
||||
total = None
|
||||
|
||||
while True:
|
||||
print(f" Strana {page} ...")
|
||||
if max_pages and page > max_pages:
|
||||
logger.debug(f"Max pages limit reached: {max_pages}")
|
||||
break
|
||||
logger.info(f"Strana {page} ...")
|
||||
items, total_count = fetch_listing_page(page)
|
||||
if total is None:
|
||||
total = total_count
|
||||
total_pages = math.ceil(total / PER_PAGE)
|
||||
print(f" → Celkem {total} inzerátů, {total_pages} stran")
|
||||
logger.info(f"→ Celkem {total} inzerátů, {total_pages} stran")
|
||||
|
||||
if not items:
|
||||
logger.debug(f"No items found on page {page}, stopping")
|
||||
break
|
||||
|
||||
all_listings.extend(items)
|
||||
@@ -153,7 +178,7 @@ def scrape():
|
||||
break
|
||||
time.sleep(0.5)
|
||||
|
||||
print(f"\n Staženo: {len(all_listings)} inzerátů")
|
||||
logger.info(f"\nStaženo: {len(all_listings)} inzerátů")
|
||||
|
||||
# Step 2: Pre-filter by category, price, area from listing data
|
||||
pre_filtered = []
|
||||
@@ -163,50 +188,60 @@ def scrape():
|
||||
excluded_no_gps = 0
|
||||
|
||||
for item in all_listings:
|
||||
item_id = item.get("id")
|
||||
cat = item.get("category", "")
|
||||
if cat not in WANTED_CATEGORIES:
|
||||
excluded_category += 1
|
||||
logger.debug(f"Filter: id={item_id} - excluded (category {cat})")
|
||||
continue
|
||||
|
||||
price = item.get("price", {}).get("total", 0) or 0
|
||||
if price > MAX_PRICE or price == 0:
|
||||
excluded_price += 1
|
||||
logger.debug(f"Filter: id={item_id} - excluded (price {price})")
|
||||
continue
|
||||
|
||||
area = item.get("area", {}).get("main")
|
||||
if area is not None and area < MIN_AREA:
|
||||
excluded_area += 1
|
||||
logger.debug(f"Filter: id={item_id} - excluded (area {area} m²)")
|
||||
continue
|
||||
|
||||
loc = item.get("location", {})
|
||||
if not loc.get("latitude") or not loc.get("longitude"):
|
||||
excluded_no_gps += 1
|
||||
logger.debug(f"Filter: id={item_id} - excluded (no GPS)")
|
||||
continue
|
||||
|
||||
pre_filtered.append(item)
|
||||
|
||||
print(f"\nPo předfiltraci:")
|
||||
print(f" Vyloučeno (dispozice): {excluded_category}")
|
||||
print(f" Vyloučeno (cena): {excluded_price}")
|
||||
print(f" Vyloučeno (plocha): {excluded_area}")
|
||||
print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
|
||||
print(f" Zbývá: {len(pre_filtered)}")
|
||||
logger.info(f"\nPo předfiltraci:")
|
||||
logger.info(f" Vyloučeno (dispozice): {excluded_category}")
|
||||
logger.info(f" Vyloučeno (cena): {excluded_price}")
|
||||
logger.info(f" Vyloučeno (plocha): {excluded_area}")
|
||||
logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
|
||||
logger.info(f" Zbývá: {len(pre_filtered)}")
|
||||
|
||||
# Step 3: Fetch details for remaining listings (floor, building type)
|
||||
print(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
|
||||
logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
|
||||
results = []
|
||||
excluded_panel = 0
|
||||
excluded_floor = 0
|
||||
excluded_detail = 0
|
||||
cache_hits = 0
|
||||
properties_fetched = 0
|
||||
|
||||
for i, item in enumerate(pre_filtered):
|
||||
if max_properties and properties_fetched >= max_properties:
|
||||
logger.debug(f"Max properties limit reached: {max_properties}")
|
||||
break
|
||||
# Check cache — if hash_id exists and price unchanged, reuse
|
||||
item_id = int(item["id"])
|
||||
item_price = item.get("price", {}).get("total", 0) or 0
|
||||
cached = cache.get(item_id)
|
||||
if cached and cached.get("price") == item_price:
|
||||
cache_hits += 1
|
||||
logger.debug(f"Cache hit for id={item_id}")
|
||||
results.append(cached)
|
||||
continue
|
||||
|
||||
@@ -215,6 +250,7 @@ def scrape():
|
||||
|
||||
if not detail_data:
|
||||
excluded_detail += 1
|
||||
logger.debug(f"Filter: id={item_id} - excluded (detail fetch failed)")
|
||||
continue
|
||||
|
||||
detail = detail_data.get("offer", {}).get("detail", {})
|
||||
@@ -225,20 +261,23 @@ def scrape():
|
||||
building_type = detail.get("buildingType", "")
|
||||
if building_type == "PANEL":
|
||||
excluded_panel += 1
|
||||
print(f" ✗ Vyloučen #{item['id']}: panel")
|
||||
logger.debug(f"Filter: id={item['id']} - excluded (panel construction)")
|
||||
logger.info(f"✗ Vyloučen #{item['id']}: panel")
|
||||
continue
|
||||
|
||||
# Check building position — exclude sídliště
|
||||
building_position = detail.get("buildingPosition", "")
|
||||
if building_position and "ESTATE" in str(building_position).upper():
|
||||
excluded_panel += 1
|
||||
print(f" ✗ Vyloučen #{item['id']}: sídliště")
|
||||
logger.debug(f"Filter: id={item['id']} - excluded (building estate)")
|
||||
logger.info(f"✗ Vyloučen #{item['id']}: sídliště")
|
||||
continue
|
||||
|
||||
# Check floor
|
||||
floor = detail.get("floor")
|
||||
if floor is not None and floor < MIN_FLOOR:
|
||||
excluded_floor += 1
|
||||
logger.debug(f"Filter: id={item_id} - excluded (floor {floor})")
|
||||
continue
|
||||
|
||||
# Map building type
|
||||
@@ -277,26 +316,43 @@ def scrape():
|
||||
"image": "",
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
if (i + 1) % 20 == 0:
|
||||
print(f" Zpracováno {i + 1}/{len(pre_filtered)} ...")
|
||||
logger.info(f"Zpracováno {i + 1}/{len(pre_filtered)} ...")
|
||||
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"Výsledky Realingo:")
|
||||
print(f" Předfiltrováno: {len(pre_filtered)}")
|
||||
print(f" Z cache (přeskočeno): {cache_hits}")
|
||||
print(f" Vyloučeno (panel/síd): {excluded_panel}")
|
||||
print(f" Vyloučeno (patro): {excluded_floor}")
|
||||
print(f" Vyloučeno (bez detailu): {excluded_detail}")
|
||||
print(f" ✓ Vyhovující byty: {len(results)}")
|
||||
print(f"{'=' * 60}")
|
||||
logger.info(f"\n{'=' * 60}")
|
||||
logger.info(f"Výsledky Realingo:")
|
||||
logger.info(f" Předfiltrováno: {len(pre_filtered)}")
|
||||
logger.info(f" Z cache (přeskočeno): {cache_hits}")
|
||||
logger.info(f" Vyloučeno (panel/síd): {excluded_panel}")
|
||||
logger.info(f" Vyloučeno (patro): {excluded_floor}")
|
||||
logger.info(f" Vyloučeno (bez detailu): {excluded_detail}")
|
||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||
logger.info(f"{'=' * 60}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Scrape apartments from Realingo.cz")
|
||||
parser.add_argument("--max-pages", type=int, default=None,
|
||||
help="Maximum number of listing pages to scrape")
|
||||
parser.add_argument("--max-properties", type=int, default=None,
|
||||
help="Maximum number of properties to fetch details for")
|
||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
help="Logging level (default: INFO)")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, args.log_level),
|
||||
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
|
||||
handlers=[logging.StreamHandler()]
|
||||
)
|
||||
|
||||
start = time.time()
|
||||
estates = scrape()
|
||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||
|
||||
if estates:
|
||||
json_path = Path("byty_realingo.json")
|
||||
@@ -305,7 +361,7 @@ if __name__ == "__main__":
|
||||
encoding="utf-8",
|
||||
)
|
||||
elapsed = time.time() - start
|
||||
print(f"\n✓ Data uložena: {json_path.resolve()}")
|
||||
print(f"⏱ Celkový čas: {elapsed:.0f} s")
|
||||
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
|
||||
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
|
||||
else:
|
||||
print("\nŽádné byty z Realinga neodpovídají kritériím :(")
|
||||
logger.info("\nŽádné byty z Realinga neodpovídají kritériím :(")
|
||||
|
||||
Reference in New Issue
Block a user