Add validation mode, structured logging, and CLI args to all scrapers
- Replace print() with the Python logging module across all 6 scrapers for configurable log levels (DEBUG/INFO/WARNING/ERROR)
- Add --max-pages, --max-properties, and --log-level CLI arguments to each scraper via argparse for limiting scrape scope
- Add validation Make targets (validation, validation-local, validation-local-debug) for quick test runs with limited data
- Update run_all.sh to parse and forward CLI args to all scrapers
- Update mapa_bytu.html with latest scrape results

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,12 +6,16 @@ Výstup: byty_cityhome.json
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Konfigurace ─────────────────────────────────────────────────────────────
|
||||
|
||||
MAX_PRICE = 14_000_000
|
||||
@@ -33,14 +37,20 @@ def fetch_url(url: str) -> str:
|
||||
"""Fetch URL and return HTML string."""
|
||||
for attempt in range(3):
|
||||
try:
|
||||
logger.debug(f"HTTP GET request (attempt {attempt + 1}/3): {url}")
|
||||
logger.debug(f"Headers: {HEADERS}")
|
||||
req = urllib.request.Request(url, headers=HEADERS)
|
||||
resp = urllib.request.urlopen(req, timeout=30)
|
||||
return resp.read().decode("utf-8")
|
||||
html = resp.read().decode("utf-8")
|
||||
logger.debug(f"HTTP response: status={resp.status}, size={len(html)} bytes")
|
||||
return html
|
||||
except (ConnectionResetError, ConnectionError, urllib.error.URLError) as e:
|
||||
if attempt < 2:
|
||||
time.sleep((attempt + 1) * 2)
|
||||
print(f" Retry {attempt + 1}: {e}")
|
||||
wait = (attempt + 1) * 2
|
||||
logger.warning(f"Connection error (retry {attempt + 1}/3 after {wait}s): {e}")
|
||||
time.sleep(wait)
|
||||
else:
|
||||
logger.error(f"HTTP request failed after 3 attempts: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
|
||||
@@ -171,22 +181,24 @@ def extract_project_gps(html: str) -> dict[str, tuple[float, float]]:
|
||||
return gps_data
|
||||
|
||||
|
||||
def scrape():
|
||||
print("=" * 60)
|
||||
print("Stahuji inzeráty z CityHome (city-home.cz)")
|
||||
print(f"Cena: do {format_price(MAX_PRICE)}")
|
||||
print(f"Min. plocha: {MIN_AREA} m²")
|
||||
print(f"Patro: od {MIN_FLOOR}. NP")
|
||||
print("=" * 60)
|
||||
def scrape(max_pages: int | None = None, max_properties: int | None = None):
|
||||
logger.info("=" * 60)
|
||||
logger.info("Stahuji inzeráty z CityHome (city-home.cz)")
|
||||
logger.info(f"Cena: do {format_price(MAX_PRICE)}")
|
||||
logger.info(f"Min. plocha: {MIN_AREA} m²")
|
||||
logger.info(f"Patro: od {MIN_FLOOR}. NP")
|
||||
if max_properties:
|
||||
logger.info(f"Max. bytů: {max_properties}")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Step 1: Fetch the main filter page
|
||||
print("\nFáze 1: Stahování seznamu bytů...")
|
||||
logger.info("\nFáze 1: Stahování seznamu bytů...")
|
||||
html = fetch_url(f"{BASE_URL}/filtr-nemovitosti1")
|
||||
all_listings = parse_filter_page(html)
|
||||
print(f" Nalezeno: {len(all_listings)} jednotek")
|
||||
logger.info(f"Nalezeno: {len(all_listings)} jednotek")
|
||||
|
||||
# Step 2: Collect unique project slugs from detail URLs to fetch GPS
|
||||
print("\nFáze 2: Stahování GPS souřadnic projektů...")
|
||||
logger.info("\nFáze 2: Stahování GPS souřadnic projektů...")
|
||||
project_slugs = set()
|
||||
for listing in all_listings:
|
||||
url = listing.get("url", "")
|
||||
@@ -201,20 +213,22 @@ def scrape():
|
||||
time.sleep(0.5)
|
||||
try:
|
||||
locality_url = f"{BASE_URL}/projekty/{slug}/lokalita"
|
||||
logger.debug(f"Fetching project GPS: {locality_url}")
|
||||
loc_html = fetch_url(locality_url)
|
||||
gps = extract_project_gps(loc_html)
|
||||
if gps:
|
||||
# Take first entry (the project itself)
|
||||
first_name, (lat, lon) = next(iter(gps.items()))
|
||||
project_gps[slug] = (lat, lon)
|
||||
print(f" ✓ {slug}: {lat}, {lon}")
|
||||
logger.info(f"✓ {slug}: {lat}, {lon}")
|
||||
else:
|
||||
print(f" ✗ {slug}: GPS nenalezeno")
|
||||
logger.info(f"✗ {slug}: GPS nenalezeno")
|
||||
except Exception as e:
|
||||
print(f" ✗ {slug}: chyba ({e})")
|
||||
logger.warning(f"Error fetching GPS for {slug}: {e}", exc_info=True)
|
||||
logger.info(f"✗ {slug}: chyba ({e})")
|
||||
|
||||
# Step 3: Filter listings
|
||||
print(f"\nFáze 3: Filtrování...")
|
||||
logger.info(f"\nFáze 3: Filtrování...")
|
||||
results = []
|
||||
excluded_sold = 0
|
||||
excluded_type = 0
|
||||
@@ -223,45 +237,57 @@ def scrape():
|
||||
excluded_area = 0
|
||||
excluded_floor = 0
|
||||
excluded_no_gps = 0
|
||||
properties_fetched = 0
|
||||
|
||||
for listing in all_listings:
|
||||
if max_properties and properties_fetched >= max_properties:
|
||||
logger.debug(f"Max properties limit reached: {max_properties}")
|
||||
break
|
||||
unit_name = listing.get("unit_name", "unknown")
|
||||
# Only available units
|
||||
if listing["free"] != "yes":
|
||||
excluded_sold += 1
|
||||
logger.debug(f"Filter: {unit_name} - excluded (not free)")
|
||||
continue
|
||||
|
||||
# Only apartments (unittype=2)
|
||||
if listing["unittype"] != 2:
|
||||
excluded_type += 1
|
||||
logger.debug(f"Filter: {unit_name} - excluded (not apartment, unittype={listing['unittype']})")
|
||||
continue
|
||||
|
||||
# Only sales
|
||||
if listing["transaction"] != "prodej":
|
||||
excluded_type += 1
|
||||
logger.debug(f"Filter: {unit_name} - excluded (not sale, transaction={listing['transaction']})")
|
||||
continue
|
||||
|
||||
# Disposition
|
||||
disp = listing["disposition"]
|
||||
if disp not in WANTED_DISPOSITIONS:
|
||||
excluded_disp += 1
|
||||
logger.debug(f"Filter: {unit_name} - excluded (disposition {disp})")
|
||||
continue
|
||||
|
||||
# Price
|
||||
price = listing["price"]
|
||||
if price <= 0 or price > MAX_PRICE:
|
||||
excluded_price += 1
|
||||
logger.debug(f"Filter: {unit_name} - excluded (price {price})")
|
||||
continue
|
||||
|
||||
# Area
|
||||
area = listing["area"]
|
||||
if area < MIN_AREA:
|
||||
excluded_area += 1
|
||||
logger.debug(f"Filter: {unit_name} - excluded (area {area} m²)")
|
||||
continue
|
||||
|
||||
# Floor
|
||||
floor = listing["floor"]
|
||||
if floor is not None and floor < MIN_FLOOR:
|
||||
excluded_floor += 1
|
||||
logger.debug(f"Filter: {unit_name} - excluded (floor {floor})")
|
||||
continue
|
||||
|
||||
# GPS from project
|
||||
@@ -272,6 +298,7 @@ def scrape():
|
||||
|
||||
if not gps:
|
||||
excluded_no_gps += 1
|
||||
logger.debug(f"Filter: {unit_name} - excluded (no GPS for project {slug})")
|
||||
continue
|
||||
|
||||
lat, lon = gps
|
||||
@@ -294,26 +321,43 @@ def scrape():
|
||||
"image": "",
|
||||
}
|
||||
results.append(result)
|
||||
properties_fetched += 1
|
||||
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"Výsledky CityHome:")
|
||||
print(f" Celkem jednotek: {len(all_listings)}")
|
||||
print(f" Vyloučeno (prodáno): {excluded_sold}")
|
||||
print(f" Vyloučeno (typ): {excluded_type}")
|
||||
print(f" Vyloučeno (dispozice): {excluded_disp}")
|
||||
print(f" Vyloučeno (cena): {excluded_price}")
|
||||
print(f" Vyloučeno (plocha): {excluded_area}")
|
||||
print(f" Vyloučeno (patro): {excluded_floor}")
|
||||
print(f" Vyloučeno (bez GPS): {excluded_no_gps}")
|
||||
print(f" ✓ Vyhovující byty: {len(results)}")
|
||||
print(f"{'=' * 60}")
|
||||
logger.info(f"\n{'=' * 60}")
|
||||
logger.info(f"Výsledky CityHome:")
|
||||
logger.info(f" Celkem jednotek: {len(all_listings)}")
|
||||
logger.info(f" Vyloučeno (prodáno): {excluded_sold}")
|
||||
logger.info(f" Vyloučeno (typ): {excluded_type}")
|
||||
logger.info(f" Vyloučeno (dispozice): {excluded_disp}")
|
||||
logger.info(f" Vyloučeno (cena): {excluded_price}")
|
||||
logger.info(f" Vyloučeno (plocha): {excluded_area}")
|
||||
logger.info(f" Vyloučeno (patro): {excluded_floor}")
|
||||
logger.info(f" Vyloučeno (bez GPS): {excluded_no_gps}")
|
||||
logger.info(f" ✓ Vyhovující byty: {len(results)}")
|
||||
logger.info(f"{'=' * 60}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Scrape apartments from CityHome")
|
||||
parser.add_argument("--max-pages", type=int, default=None,
|
||||
help="Maximum number of listing pages to scrape (not applicable for CityHome)")
|
||||
parser.add_argument("--max-properties", type=int, default=None,
|
||||
help="Maximum number of properties to include in results")
|
||||
parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
help="Logging level (default: INFO)")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, args.log_level),
|
||||
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
|
||||
handlers=[logging.StreamHandler()]
|
||||
)
|
||||
|
||||
start = time.time()
|
||||
estates = scrape()
|
||||
estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
|
||||
|
||||
if estates:
|
||||
json_path = Path("byty_cityhome.json")
|
||||
@@ -322,7 +366,7 @@ if __name__ == "__main__":
|
||||
encoding="utf-8",
|
||||
)
|
||||
elapsed = time.time() - start
|
||||
print(f"\n✓ Data uložena: {json_path.resolve()}")
|
||||
print(f"⏱ Celkový čas: {elapsed:.0f} s")
|
||||
logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
|
||||
logger.info(f"⏱ Celkový čas: {elapsed:.0f} s")
|
||||
else:
|
||||
print("\nŽádné byty z CityHome neodpovídají kritériím :(")
|
||||
logger.info("\nŽádné byty z CityHome neodpovídají kritériím :(")
|
||||
|
||||
Reference in New Issue
Block a user