Add validation mode, structured logging, and CLI args to all scrapers
- Replace print() with the Python logging module across all 6 scrapers for configurable log levels (DEBUG/INFO/WARNING/ERROR)
- Add --max-pages, --max-properties, and --log-level CLI arguments to each scraper via argparse for limiting scrape scope
- Add validation Make targets (validation, validation-local, validation-local-debug) for quick test runs with limited data
- Update run_all.sh to parse and forward CLI args to all scrapers
- Update mapa_bytu.html with latest scrape results

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
107
scrape_psn.py
107
scrape_psn.py
@@ -6,12 +6,16 @@ Výstup: byty_psn.json
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Konfigurace ─────────────────────────────────────────────────────────────
|
||||
|
||||
MAX_PRICE = 14_000_000
|
||||
@@ -47,6 +51,8 @@ PRAGUE_PROJECTS = [
|
||||
|
||||
def fetch_url(url: str) -> str:
|
||||
"""Fetch URL via curl (urllib SSL too old for Cloudflare)."""
|
||||
logger.debug(f"HTTP GET request (via curl): {url}")
|
||||
logger.debug(f"User-Agent: {UA}")
|
||||
result = subprocess.run(
|
||||
["curl", "-s", "-L", "--max-time", "30",
|
||||
"-H", f"User-Agent: {UA}",
|
||||
@@ -55,7 +61,9 @@ def fetch_url(url: str) -> str:
|
||||
capture_output=True, text=True, timeout=60
|
||||
)
|
||||
if result.returncode != 0:
|
||||
logger.error(f"curl failed (return code {result.returncode}): {result.stderr[:200]}")
|
||||
raise RuntimeError(f"curl failed ({result.returncode}): {result.stderr[:200]}")
|
||||
logger.debug(f"HTTP response: size={len(result.stdout)} bytes")
|
||||
return result.stdout
|
||||
|
||||
|
||||
@@ -101,14 +109,18 @@ def format_price(price: int) -> str:
|
||||
return " ".join(reversed(parts)) + " Kč"
|
||||
|
||||
|
||||
def scrape():
|
||||
print("=" * 60)
|
||||
print("Stahuji inzeráty z PSN.cz")
|
||||
print(f"Cena: do {format_price(MAX_PRICE)}")
|
||||
print(f"Min. plocha: {MIN_AREA} m²")
|
||||
print(f"Patro: od {MIN_FLOOR}. NP")
|
||||
print(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
|
||||
print("=" * 60)
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
logger.info("=" * 60)
|
||||
logger.info("Stahuji inzeráty z PSN.cz")
|
||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||
logger.info(f"Min. plocha: {MIN_AREA} m²")
|
||||
logger.info(f"Patro: od {MIN_FLOOR}. NP")
|
||||
logger.info(f"Region: Praha ({len(PRAGUE_PROJECTS)} projektů)")
|
||||
if max_pages:
|
||||
logger.info(f"Max. stran: {max_pages}")
|
||||
if max_properties:
|
||||
logger.info(f"Max. bytů: {max_properties}")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Fetch units from each Prague project
|
||||
all_units = []
|
||||
@@ -118,21 +130,25 @@ def scrape():
|
||||
project_units = []
|
||||
|
||||
while True:
|
||||
if max_pages and page > max_pages:
|
||||
logger.debug(f"Max pages limit reached: {max_pages}")
|
||||
break
|
||||
url = f"{BASE_URL}/projekt/{proj['slug']}?page={page}"
|
||||
print(f" {proj['name']} — strana {page} ...")
|
||||
logger.info(f"{proj['name']} — strana {page} ...")
|
||||
time.sleep(0.5)
|
||||
|
||||
try:
|
||||
html = fetch_url(url)
|
||||
except Exception as e:
|
||||
print(f" Chyba: {e}")
|
||||
logger.error(f"Fetch error for {proj['name']}: {e}", exc_info=True)
|
||||
break
|
||||
|
||||
units = extract_units_from_html(html)
|
||||
logger.debug(f"Project {proj['slug']} page {page}: extracted {len(units)} units")
|
||||
|
||||
if not units:
|
||||
if page == 1:
|
||||
print(f" → 0 jednotek")
|
||||
logger.info(f"→ 0 jednotek")
|
||||
break
|
||||
|
||||
# Add project info to each unit
|
||||
@@ -146,7 +162,7 @@ def scrape():
|
||||
project_units.extend(units)
|
||||
|
||||
if page == 1:
|
||||
print(f" → {len(units)} jednotek na stránce")
|
||||
logger.info(f"→ {len(units)} jednotek na stránce")
|
||||
|
||||
# Check if there might be more pages
|
||||
# If we got fewer than expected or same units, stop
|
||||
@@ -170,10 +186,10 @@ def scrape():
|
||||
elif not slug:
|
||||
unique_units.append(u)
|
||||
|
||||
print(f"\n Staženo celkem: {len(unique_units)} unikátních jednotek")
|
||||
logger.info(f"\nStaženo celkem: {len(unique_units)} unikátních jednotek")
|
||||
|
||||
# Filter
|
||||
print(f"\nFiltrování...")
|
||||
logger.info(f"\nFiltrování...")
|
||||
results = []
|
||||
excluded_sold = 0
|
||||
excluded_type = 0
|
||||
@@ -182,37 +198,47 @@ def scrape():
|
||||
excluded_area = 0
|
||||
excluded_floor = 0
|
||||
excluded_panel = 0
|
||||
properties_fetched = 0
|
||||
|
||||
for unit in unique_units:
|
||||
if max_properties and properties_fetched >= max_properties:
|
||||
logger.debug(f"Max properties limit reached: {max_properties}")
|
||||
break
|
||||
unit_id = unit.get("id", unit.get("slug", "unknown"))
|
||||
# Only free units
|
||||
is_free = unit.get("is_free", False)
|
||||
is_sold = unit.get("is_sold", False)
|
||||
if is_sold or not is_free:
|
||||
excluded_sold += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (sold/not free)")
|
||||
continue
|
||||
|
||||
# Only apartments
|
||||
category = str(unit.get("category", "")).lower()
|
||||
if "byt" not in category and "ateliér" not in category:
|
||||
excluded_type += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (not apartment, category={category})")
|
||||
continue
|
||||
|
||||
# Disposition
|
||||
disp = unit.get("disposition", "")
|
||||
if disp not in WANTED_DISPOSITIONS:
|
||||
excluded_disp += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (disposition {disp})")
|
||||
continue
|
||||
|
||||
# Price
|
||||
price = unit.get("price_czk") or unit.get("action_price_czk") or 0
|
||||
if price <= 0 or price > MAX_PRICE:
|
||||
excluded_price += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (price {price})")
|
||||
continue
|
||||
|
||||
# Area
|
||||
area = unit.get("total_area") or unit.get("floor_area") or 0
|
||||
if area < MIN_AREA:
|
||||
excluded_area += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (area {area} m²)")
|
||||
continue
|
||||
|
||||
# Floor
|
||||
@@ -228,13 +254,15 @@ def scrape():
|
||||
|
||||
if floor is not None and floor < MIN_FLOOR:
|
||||
excluded_floor += 1
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (floor {floor})")
|
||||
continue
|
||||
|
||||
# Construction — check for panel
|
||||
build_type = str(unit.get("build_type", "")).lower()
|
||||
if "panel" in build_type:
|
||||
excluded_panel += 1
|
||||
print(f" ✗ Vyloučen: panel ({build_type})")
|
||||
logger.debug(f"Filter: id={unit_id} - excluded (panel construction)")
|
||||
logger.info(f"✗ Vyloučen: panel ({build_type})")
|
||||
continue
|
||||
|
||||
# Build construction label
|
||||
@@ -272,26 +300,43 @@ def scrape():
|
||||
"image": "",
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"Výsledky PSN:")
|
||||
print(f" Celkem jednotek: {len(unique_units)}")
|
||||
print(f" Vyloučeno (prodáno): {excluded_sold}")
|
||||
print(f" Vyloučeno (typ): {excluded_type}")
|
||||
print(f" Vyloučeno (dispozice): {excluded_disp}")
|
||||
print(f" Vyloučeno (cena): {excluded_price}")
|
||||
print(f" Vyloučeno (plocha): {excluded_area}")
|
||||
print(f" Vyloučeno (patro): {excluded_floor}")
|
||||
print(f" Vyloučeno (panel): {excluded_panel}")
|
||||
print(f" ✓ Vyhovující byty: {len(results)}")
|
||||
print(f"{'=' * 60}")
|
||||
logger.info(f"\n{'=' * 60}")
|
||||
logger.info(f"Výsledky PSN:")
|
||||
logger.info(f" Celkem jednotek: {len(unique_units)}")
|
||||
logger.info(f" Vyloučeno (prodáno): {excluded_sold}")
|
||||
logger.info(f" Vyloučeno (typ): {excluded_type}")
|
||||
logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
|
||||
logger.info(f" Vyloučeno (cena): {excluded_price}")
|
||||
logger.info(f" Vyloučeno (plocha): {excluded_area}")
|
||||
logger.info(f" Vyloučeno (patro): {excluded_floor}")
|
||||
logger.info(f" Vyloučeno (panel): {excluded_panel}")
|
||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||
logger.info(f"{'=' * 60}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Scrape apartments from PSN.cz")
|
||||
parser.add_argument("--max-pages", type=int, default=None,
|
||||
help="Maximum number of listing pages per project to scrape")
|
||||
parser.add_argument("--max-properties", type=int, default=None,
|
||||
help="Maximum number of properties to include in results")
|
||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
help="Logging level (default: INFO)")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, args.log_level),
|
||||
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
|
||||
handlers=[logging.StreamHandler()]
|
||||
)
|
||||
|
||||
start = time.time()
|
||||
estates = scrape()
|
||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||
|
||||
if estates:
|
||||
json_path = Path("byty_psn.json")
|
||||
@@ -300,7 +345,7 @@ if __name__ == "__main__":
|
||||
encoding="utf-8",
|
||||
)
|
||||
elapsed = time.time() - start
|
||||
print(f"\n✓ Data uložena: {json_path.resolve()}")
|
||||
print(f"⏱ Celkový čas: {elapsed:.0f} s")
|
||||
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
|
||||
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
|
||||
else:
|
||||
print("\nŽádné byty z PSN neodpovídají kritériím :(")
|
||||
logger.info("\nŽádné byty z PSN neodpovídají kritériím :(")
|
||||
|
||||
Reference in New Issue
Block a user