#!/usr/bin/env python3
"""
Bazoš.cz scraper.
Stáhne byty na prodej v Praze a vyfiltruje podle kritérií.
Výstup: byty_bazos.json
"""
from __future__ import annotations

import argparse
from datetime import datetime
import json
import logging
import math
import re
import time
import urllib.request
import urllib.parse
from pathlib import Path
from scraper_stats import write_stats, validate_listing

# File name handed to write_stats() for this scraper's run statistics.
STATS_FILE = "stats_bazos.json"

logger = logging.getLogger(__name__)

# ── Configuration ───────────────────────────────────────────────────────────

# Filter thresholds used by scrape().
MAX_PRICE = 14_000_000  # upper price limit; also substituted into SEARCH_PARAMS
MIN_AREA = 69  # listings with a parsed area below this (m²) are dropped
MIN_FLOOR = 2  # listings with a parsed floor below this are dropped
PER_PAGE = 20  # Bazoš returns 20 listings per page

# Dispositions (as produced by parse_disposition) that scrape() keeps.
WANTED_DISPOSITIONS = {"3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1"}

# Regex patterns for parsing disposition, area and floor out of free text.
# A single digit, "+", then "kk" or "1" (e.g. "3+kk", "4 + 1").
DISP_RE = re.compile(r'(\d)\s*\+\s*(kk|1)', re.IGNORECASE)
# A number (decimal part with "." or ",") followed by "m" and one of: ², 2,
# whitespace, comma or period.
AREA_RE = re.compile(r'(\d+(?:[.,]\d+)?)\s*m[²2\s,.]', re.IGNORECASE)
# Two numbers separated by "." or "/" and followed by a floor keyword;
# parse_floor() reads the first number.
FLOOR_RE = re.compile(r'(\d+)\s*[./]\s*(\d+)\s*(?:NP|patr|podlaž|floor)', re.IGNORECASE)
# A number with a trailing period followed by a floor keyword (e.g. "3. NP").
FLOOR_RE2 = re.compile(r'(\d+)\.\s*(?:NP|patr[eouě]|podlaž[ií])', re.IGNORECASE)
# A floor keyword followed by a number (optionally after ":" / whitespace).
FLOOR_RE3 = re.compile(r'(?:patr[eouě]|podlaž[ií]|NP)\s*[:\s]*(\d+)', re.IGNORECASE)
# "panel" followed by "ov", "ák" or a period; used by is_panel().
PANEL_RE = re.compile(r'panel(?:ov|ák|\.)', re.IGNORECASE)
# Housing-estate keywords (with and without diacritics); used by is_sidliste().
SIDLISTE_RE = re.compile(r'sídliště|sidliste|panelák', re.IGNORECASE)

# Request headers sent with every call made through fetch_url().
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "cs,en;q=0.9",
}

# Site root; listing and detail paths are appended to it.
BASE_URL = "https://reality.bazos.cz"
# Query-string template for the search; {max_price} is filled in by
# fetch_listing_page() with MAX_PRICE.
SEARCH_PARAMS = "hledat=&rubriky=reality&hlokalita=Praha&humkreis=25&cenado={max_price}&kitx=ano"


def fetch_url(url: str, retries: int = 3) -> str:
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Connection-level failures are retried with a linearly growing pause
    (3 s, 6 s, ...). HTTP error statuses are not retried.

    Args:
        url: URL to request; sent with the module-level HEADERS.
        retries: Total number of attempts (must be at least 1).

    Returns:
        The response body; undecodable bytes are replaced, never raised on.

    Raises:
        ValueError: If *retries* is smaller than 1.
        urllib.error.HTTPError: Immediately, when the server answers with an
            HTTP error status.
        OSError: The last connection-level error (including
            urllib.error.URLError) once every attempt has failed.
    """
    if retries < 1:
        # Previously the loop body never ran and None leaked out of a
        # function annotated as returning str.
        raise ValueError("retries must be at least 1")

    for attempt in range(retries):
        try:
            logger.debug(f"HTTP GET request (attempt {attempt + 1}/{retries}): {url}")
            req = urllib.request.Request(url, headers=HEADERS)
            # The context manager closes the connection even when read()
            # fails part-way through.
            with urllib.request.urlopen(req, timeout=30) as resp:
                status = resp.status
                html = resp.read().decode("utf-8", errors="replace")
            logger.debug(f"HTTP response: status={status}, size={len(html)} bytes")
            return html
        except urllib.error.HTTPError:
            # HTTPError is itself an OSError, so it has to be filtered out
            # before the retry handler below.
            raise
        except OSError as e:
            # Covers ConnectionError, URLError and socket timeouts alike.
            if attempt < retries - 1:
                wait = (attempt + 1) * 3
                logger.warning(f"Connection error (retry {attempt + 1}/{retries} after {wait}s): {e}")
                time.sleep(wait)
            else:
                logger.error(f"HTTP request failed after {retries} attempts: {e}", exc_info=True)
                raise


def format_price(price: int) -> str:
    """Format *price* with space-separated thousands and a " Kč" suffix.

    Args:
        price: Amount in whole crowns.

    Returns:
        For example ``"5 250 000 Kč"``; a minus sign stays attached to the
        leading digit group (``"-500 000 Kč"``).
    """
    # The "," format spec groups by thousands and handles the sign, which the
    # former manual 3-character slicing split off as its own group.
    return f"{price:,}".replace(",", " ") + " Kč"


def parse_price(text: str) -> int:
    """Read an integer price out of *text*, e.g. '5 250 000 Kč' → 5250000.

    Every non-digit character is dropped and the remaining digits are read as
    a single number. Text without any digit yields 0.
    """
    digits_only = re.sub(r'[^\d]', '', text)
    if not digits_only:
        return 0
    return int(digits_only)


def parse_disposition(text: str) -> str | None:
    """Return the first disposition DISP_RE finds in *text*, or None.

    The result is normalised to ``"<digit>+<suffix>"`` with a lower-case
    suffix, e.g. '3+kk' or '4+1'.
    """
    found = DISP_RE.search(text)
    if found is None:
        return None
    room_count, kind = found.groups()
    return f"{room_count}+{kind.lower()}"


def parse_area(text: str) -> float | None:
    """Return the first area AREA_RE finds in *text* as a float, or None.

    A decimal comma is accepted: '82,5 m²' → 82.5.
    """
    found = AREA_RE.search(text)
    if found is None:
        return None
    number = found.group(1)
    return float(number.replace(',', '.'))


def parse_floor(text: str) -> int | None:
    """Return the floor number found in *text*, or None.

    FLOOR_RE, FLOOR_RE2 and FLOOR_RE3 are tried in that order; the first
    capture group of the first pattern that matches is returned.
    """
    # Lazy generator: later patterns are not evaluated once one has matched.
    hits = (rx.search(text) for rx in (FLOOR_RE, FLOOR_RE2, FLOOR_RE3))
    first_hit = next((hit for hit in hits if hit), None)
    return int(first_hit.group(1)) if first_hit else None


def is_panel(text: str) -> bool:
    """Tell whether PANEL_RE matches anywhere in *text*."""
    return PANEL_RE.search(text) is not None


def is_sidliste(text: str) -> bool:
    """Tell whether SIDLISTE_RE (housing-estate keywords) matches in *text*."""
    return SIDLISTE_RE.search(text) is not None


def _strip_tags(fragment: str) -> str:
    """Replace HTML tags in *fragment* with spaces and collapse whitespace runs."""
    text = re.sub(r'<[^>]+>', ' ', fragment).strip()
    return re.sub(r'\s+', ' ', text)


def _parse_total_count(html: str) -> int:
    """Return the total number of results announced on the page, or 0."""
    # Prefer the number inside the "Zobrazeno 1-20 z 727" summary so that an
    # unrelated "z <number>" elsewhere on the page is not picked up first.
    match = re.search(r'Zobrazeno[^<]*?\bz\s+(\d[\d\s]*)', html)
    if match is None:
        # Loose fallback. At least one digit is required: the former
        # [\d\s]+ group could capture whitespace only and crash int().
        match = re.search(r'\bz\s+(\d[\d\s]*)\s', html)
    if match is None:
        return 0
    # Thousands may be separated by any whitespace (space, NBSP, newline).
    return int(re.sub(r'\s+', '', match.group(1)))


def _parse_listing_block(block: str) -> dict | None:
    """Extract one listing from its HTML block; None if it has no detail link."""
    # URL and ID come from the first link of the form /inzerat/XXXXXX/slug.php
    url_match = re.search(r'href="(/inzerat/(\d+)/[^"]*)"', block)
    if not url_match:
        return None

    # Title — class=nadpis (without quotes) or class="nadpis"
    title_match = re.search(r'class=.?nadpis.?[^>]*>\s*<a[^>]*>([^<]+)</a>', block)

    # Price — inside a <span> within inzeratycena, else its direct text
    price_match = re.search(r'class="inzeratycena"[^>]*>.*?<span[^>]*>([^<]+)</span>', block, re.DOTALL)
    if not price_match:
        price_match = re.search(r'class="inzeratycena"[^>]*>\s*(?:<b>)?([^<]+)', block)
    price_text = price_match.group(1).strip() if price_match else ""

    loc_match = re.search(r'class="inzeratylok"[^>]*>(.*?)</div>', block, re.DOTALL)

    # Date — e.g. [5.3. 2026]
    date_match = re.search(r'\[(\d+\.\d+\.\s*\d{4})\]', block)

    # Description preview — class=popis (without quotes) or class="popis"
    desc_match = re.search(r'class=.?popis.?[^>]*>(.*?)</div>', block, re.DOTALL)

    # Image — src may come before or after class="obrazek"
    img_match = re.search(r'<img[^>]*src="([^"]+)"[^>]*class="obrazek"', block)
    if not img_match:
        img_match = re.search(r'class="obrazek"[^>]*src="([^"]+)"', block)
    image = img_match.group(1) if img_match else ""
    if "empty.gif" in image:
        # Placeholder picture: treat as "no image".
        image = ""

    return {
        "id": int(url_match.group(2)),
        "title": title_match.group(1).strip() if title_match else "",
        "price": parse_price(price_text),
        "location": _strip_tags(loc_match.group(1)) if loc_match else "",
        "date": date_match.group(1).strip() if date_match else "",
        "description": _strip_tags(desc_match.group(1)) if desc_match else "",
        "detail_path": url_match.group(1),
        "image": image,
    }


def fetch_listing_page(offset: int = 0, pagination_params: str | None = None) -> tuple[list[dict], int, str | None]:
    """Fetch one page of search results from Bazoš and parse its listings.

    Args:
        offset: Index of the first listing of the page (0 for the first page).
        pagination_params: Query string taken from an earlier page's
            pagination link; used instead of SEARCH_PARAMS when offset > 0.

    Returns:
        A tuple ``(listings, total, resolved_params)``: the parsed listing
        dicts, the total result count announced on the page (0 if it could
        not be found) and the query string of the first pagination link on
        this page (None if there is none).

    Raises:
        OSError: Propagated from fetch_url() (includes HTTPError/URLError).
    """
    if offset > 0:
        query = pagination_params or SEARCH_PARAMS.format(max_price=MAX_PRICE)
        url = f"{BASE_URL}/prodam/byt/{offset}/?{query}"
    else:
        url = f"{BASE_URL}/prodam/byt/?{SEARCH_PARAMS.format(max_price=MAX_PRICE)}"

    html = fetch_url(url)

    total = _parse_total_count(html)

    # Extract resolved pagination params from the page (Bazoš converts
    # hlokalita=Praha → hlokalita=11000, and pagination only works with the
    # numeric form).
    pag_link = re.search(r'href="/prodam/byt/\d+/\?([^"]+)"', html)
    resolved_params = pag_link.group(1) if pag_link else None

    listings = []
    # [1:] skips everything before the first listing container.
    for block in re.split(r'<div class="inzeraty\s+inzeratyflex">', html)[1:]:
        listing = _parse_listing_block(block)
        if listing is not None:
            listings.append(listing)

    logger.debug(f"Offset {offset}: found {len(listings)} listings, total={total}")
    return listings, total, resolved_params


def fetch_detail(path: str) -> dict | None:
    """Download a listing's detail page and pull out the extra fields.

    Args:
        path: Site-relative path of the detail page (appended to BASE_URL).

    Returns:
        A dict holding whichever of ``lat``/``lon`` (from a Google Maps
        link), ``description`` and ``detail_location`` were found on the
        page, or None when fetching or parsing raised any exception (a
        warning is logged in that case).
    """
    def plain_text(fragment: str) -> str:
        # Tags become spaces, then whitespace runs collapse to one space.
        without_tags = re.sub(r'<[^>]+>', ' ', fragment).strip()
        return re.sub(r'\s+', ' ', without_tags)

    try:
        html = fetch_url(f"{BASE_URL}{path}")
        detail = {}

        # Coordinates are embedded in the Google Maps link.
        coords = re.search(r'google\.com/maps[^"]*place/([\d.]+),([\d.]+)', html)
        if coords:
            detail["lat"] = float(coords.group(1))
            detail["lon"] = float(coords.group(2))

        # Bazoš uses an unquoted class=popisdetail for the full description.
        body = re.search(r'class=.?popisdetail.?[^>]*>(.*?)</div>', html, re.DOTALL)
        if body:
            detail["description"] = plain_text(body.group(1))

        # Locality cell of the detail table.
        place = re.search(r'Lokalita:</td>\s*<td[^>]*>(.*?)</td>', html, re.DOTALL)
        if place:
            detail["detail_location"] = plain_text(place.group(1))

        return detail

    except Exception as e:
        logger.warning(f"Detail fetch failed for {path}: {e}")
        return None


def load_cache(json_path: str = "byty_bazos.json") -> dict[int, dict]:
    """Load the previous run's output as a cache keyed by ``hash_id``.

    The cache is best-effort: anything that prevents reading it results in an
    empty (or partial) cache rather than an exception.

    Args:
        json_path: Path of the JSON file written by an earlier run; expected
            to hold a list of listing dicts.

    Returns:
        Mapping ``hash_id -> entry`` for every dict entry that has a
        ``hash_id``. Empty when the file is missing, unreadable, not valid
        JSON, or does not contain a list.
    """
    path = Path(json_path)
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        logging.getLogger(__name__).warning(f"Cache {json_path} is unreadable, ignoring it: {e}")
        return {}

    if not isinstance(data, list):
        # E.g. a top-level JSON object: iterating it would yield bare keys.
        return {}

    cache: dict[int, dict] = {}
    for entry in data:
        if not isinstance(entry, dict) or "hash_id" not in entry:
            continue
        try:
            cache[entry["hash_id"]] = entry
        except TypeError:
            # Unhashable hash_id (list/dict) cannot serve as a key; skip it.
            continue
    return cache


def scrape(max_pages: int | None = None, max_properties: int | None = None) -> list[dict]:
    """Run the whole Bazoš scrape and return the listings that pass every filter.

    The run has three stages:

    1. Page through the search results and collect listings, de-duplicated
       by listing id.
    2. Pre-filter on disposition, price and (when parseable) area using only
       the data from the result pages.
    3. For each survivor, reuse the cached record when its price is
       unchanged; otherwise fetch the detail page and apply the GPS,
       panel / housing-estate, floor and area filters.

    Run statistics are handed to write_stats() before returning.

    Args:
        max_pages: Stop after this many result pages (None or 0: no limit).
        max_properties: Stop once this many freshly fetched listings have
            been accepted; cache hits do not count (None or 0: no limit).

    Returns:
        The accepted listing records, cached and newly built ones alike.

    Raises:
        OSError: Propagated from fetch_listing_page() when a result page
            cannot be downloaded. Detail-page failures are not raised;
            those listings are counted as excluded.
    """
    _run_start = time.time()
    _run_ts = datetime.now().isoformat(timespec="seconds")
    cache = load_cache()
    today = datetime.now().strftime("%Y-%m-%d")

    logger.info("=" * 60)
    logger.info("Stahuji inzeráty z Bazoš.cz")
    logger.info(f"Cena: do {format_price(MAX_PRICE)}")
    logger.info(f"Min. plocha: {MIN_AREA} m²")
    logger.info(f"Patro: od {MIN_FLOOR}. NP")
    logger.info("Region: Praha")
    if cache:
        logger.info(f"Cache: {len(cache)} bytů z minulého běhu")
    if max_pages:
        logger.info(f"Max. stran: {max_pages}")
    if max_properties:
        logger.info(f"Max. bytů: {max_properties}")
    logger.info("=" * 60)

    # Step 1: Fetch listing pages
    logger.info("\nFáze 1: Stahování seznamu inzerátů...")
    all_listings = {}  # id -> listing dict (dedup)
    page = 1
    offset = 0
    total = None
    pagination_params = None  # resolved numeric params from first page

    while True:
        if max_pages and page > max_pages:
            logger.debug(f"Max pages limit reached: {max_pages}")
            break

        logger.info(f"Strana {page} (offset {offset}) ...")
        listings, total_count, resolved = fetch_listing_page(offset, pagination_params)
        if resolved and not pagination_params:
            pagination_params = resolved
            logger.debug(f"Resolved pagination params: {pagination_params}")

        if total is None and total_count > 0:
            total = total_count
            total_pages = math.ceil(total / PER_PAGE)
            logger.info(f"→ Celkem {total} inzerátů, ~{total_pages} stran")

        if not listings:
            logger.debug(f"No listings found on page {page}, stopping")
            break

        new_on_page = 0
        for lst in listings:
            lid = lst["id"]
            if lid not in all_listings:
                all_listings[lid] = lst
                new_on_page += 1

        page += 1
        offset += PER_PAGE
        if total and offset >= total:
            break
        if new_on_page == 0:
            # The site is repeating a page already seen (e.g. the offset is
            # being ignored). Without this guard the loop would re-fetch the
            # same page until `total` is reached, or forever when the total
            # could not be parsed.
            logger.debug(f"No new listings on page {page - 1}, stopping pagination")
            break
        time.sleep(0.5)

    logger.info(f"\nStaženo: {len(all_listings)} unikátních inzerátů")

    # Step 2: Pre-filter by disposition, price, area from listing data
    pre_filtered = []
    excluded_disp = 0
    excluded_price = 0
    excluded_area = 0
    excluded_no_disp = 0

    for lst in all_listings.values():
        title_and_desc = f"{lst['title']} {lst['description']}"

        # Parse disposition
        disp = parse_disposition(title_and_desc)
        if not disp:
            excluded_no_disp += 1
            logger.debug(f"Filter: id={lst['id']} - excluded (no disposition found in '{lst['title']}')")
            continue
        if disp not in WANTED_DISPOSITIONS:
            excluded_disp += 1
            logger.debug(f"Filter: id={lst['id']} - excluded (disposition {disp})")
            continue

        # Price (0 means the price text held no digits)
        price = lst["price"]
        if price <= 0 or price > MAX_PRICE:
            excluded_price += 1
            logger.debug(f"Filter: id={lst['id']} - excluded (price {price})")
            continue

        # Area (if parseable from listing); an unknown area is kept for now
        area = parse_area(title_and_desc)
        if area is not None and area < MIN_AREA:
            excluded_area += 1
            logger.debug(f"Filter: id={lst['id']} - excluded (area {area} m²)")
            continue

        lst["_disposition"] = disp
        lst["_area"] = area
        pre_filtered.append(lst)

    logger.info("\nPo předfiltraci:")
    logger.info(f"  Vyloučeno (bez dispozice): {excluded_no_disp}")
    logger.info(f"  Vyloučeno (dispozice):     {excluded_disp}")
    logger.info(f"  Vyloučeno (cena):          {excluded_price}")
    logger.info(f"  Vyloučeno (plocha):        {excluded_area}")
    logger.info(f"  Zbývá:                     {len(pre_filtered)}")

    # Step 3: Fetch details (for GPS + full description)
    logger.info(f"\nFáze 2: Stahování detailů ({len(pre_filtered)} bytů)...")
    results = []
    excluded_panel = 0
    excluded_floor = 0
    excluded_no_gps = 0
    excluded_detail = 0
    excluded_area_detail = 0
    cache_hits = 0
    properties_fetched = 0

    for i, lst in enumerate(pre_filtered):
        if max_properties and properties_fetched >= max_properties:
            logger.debug(f"Max properties limit reached: {max_properties}")
            break

        # Report progress at the top of the iteration so the message is not
        # lost when an item is served from cache or filtered out.
        if i and i % 20 == 0:
            logger.info(f"Zpracováno {i}/{len(pre_filtered)} ...")

        listing_id = lst["id"]
        price = lst["price"]

        # Check cache: an unchanged price means the stored record is reused
        cached = cache.get(listing_id)
        if cached and cached.get("price") == price:
            cache_hits += 1
            logger.debug(f"Cache hit for id={listing_id}")
            results.append(cached)
            continue

        time.sleep(0.4)
        detail = fetch_detail(lst["detail_path"])

        if not detail:
            excluded_detail += 1
            logger.debug(f"Filter: id={listing_id} - excluded (detail fetch failed)")
            continue

        # GPS required
        lat = detail.get("lat")
        lon = detail.get("lon")
        if not lat or not lon:
            excluded_no_gps += 1
            logger.debug(f"Filter: id={listing_id} - excluded (no GPS)")
            continue

        # Full text for filtering
        full_desc = detail.get("description", "")
        full_text = f"{lst['title']} {lst['description']} {full_desc}"

        # Panel check
        if is_panel(full_text):
            excluded_panel += 1
            logger.info(f"✗ Vyloučen #{listing_id}: panelová stavba")
            continue

        # Housing-estate check (shares the panel counter in the statistics)
        if is_sidliste(full_text):
            excluded_panel += 1
            logger.info(f"✗ Vyloučen #{listing_id}: sídliště")
            continue

        # Floor
        floor = parse_floor(full_text)
        if floor is not None and floor < MIN_FLOOR:
            excluded_floor += 1
            logger.debug(f"Filter: id={listing_id} - excluded (floor {floor})")
            continue

        # Area — re-check from detail if not found before
        area = lst.get("_area") or parse_area(full_desc)
        if area is not None and area < MIN_AREA:
            excluded_area_detail += 1
            logger.debug(f"Filter: id={listing_id} - excluded (area {area} m² from detail)")
            continue

        disp = lst["_disposition"]
        locality = detail.get("detail_location") or lst["location"]

        result = {
            "hash_id": listing_id,
            "name": f"Prodej bytu {disp} {int(area) if area else '?'} m²",
            "price": price,
            "price_formatted": format_price(price),
            "locality": locality,
            "lat": lat,
            "lon": lon,
            "disposition": disp,
            "floor": floor,
            "area": area,
            "building_type": "neuvedeno",
            "ownership": "neuvedeno",
            "url": f"{BASE_URL}{lst['detail_path']}",
            "source": "bazos",
            "image": lst.get("image", ""),
            "scraped_at": today,
            "first_seen": cached.get("first_seen", today) if cached else today,
            "last_changed": today if not cached or cached.get("price") != price else cached.get("last_changed", today),
        }

        if not validate_listing(result, "bazos"):
            continue

        results.append(result)
        properties_fetched += 1

    logger.info(f"\n{'=' * 60}")
    logger.info("Výsledky Bazoš:")
    logger.info(f"  Předfiltrováno:        {len(pre_filtered)}")
    logger.info(f"  Z cache (přeskočeno): {cache_hits}")
    logger.info(f"  Vyloučeno (panel/síd): {excluded_panel}")
    logger.info(f"  Vyloučeno (patro):     {excluded_floor}")
    logger.info(f"  Vyloučeno (bez GPS):   {excluded_no_gps}")
    logger.info(f"  Vyloučeno (bez detailu): {excluded_detail}")
    logger.info(f"  Vyloučeno (plocha det): {excluded_area_detail}")
    logger.info(f"  ✓ Vyhovující byty:    {len(results)}")
    logger.info(f"{'=' * 60}")

    write_stats(STATS_FILE, {
        "source": "Bazoš",
        "timestamp": _run_ts,
        "duration_sec": round(time.time() - _run_start, 1),
        "success": True,
        "accepted": len(results),
        "fetched": len(all_listings),
        "pages": page - 1,
        "cache_hits": cache_hits,
        "excluded": {
            "bez dispozice": excluded_no_disp,
            "dispozice": excluded_disp,
            "cena": excluded_price,
            "plocha": excluded_area + excluded_area_detail,
            "bez GPS": excluded_no_gps,
            "panel/síd": excluded_panel,
            "patro": excluded_floor,
            "bez detailu": excluded_detail,
        },
    })
    return results


if __name__ == "__main__":
    # Command-line interface: optional crawl limits plus log verbosity.
    cli = argparse.ArgumentParser(description="Scrape apartments from Bazoš.cz")
    cli.add_argument(
        "--max-pages",
        type=int,
        default=None,
        help="Maximum number of listing pages to scrape",
    )
    cli.add_argument(
        "--max-properties",
        type=int,
        default=None,
        help="Maximum number of properties to fetch details for",
    )
    cli.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level (default: INFO)",
    )
    args = cli.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler()],
    )

    _run_ts = datetime.now().isoformat(timespec="seconds")
    start = time.time()
    try:
        estates = scrape(max_pages=args.max_pages, max_properties=args.max_properties)
    except Exception as e:
        # Record the failed run in the statistics file, then let the
        # exception propagate.
        logger.error(f"Scraper failed: {e}", exc_info=True)
        failure_stats = {
            "source": "Bazoš",
            "timestamp": _run_ts,
            "duration_sec": round(time.time() - start, 1),
            "success": False,
            "accepted": 0,
            "fetched": 0,
            "error": str(e),
        }
        write_stats(STATS_FILE, failure_stats)
        raise

    if not estates:
        # Nothing matched: the output file is left untouched.
        logger.info("\nŽádné byty z Bazoše neodpovídají kritériím :(")
    else:
        json_path = Path("byty_bazos.json")
        payload = json.dumps(estates, ensure_ascii=False, indent=2)
        json_path.write_text(payload, encoding="utf-8")
        elapsed = time.time() - start
        logger.info(f"\n✓ Data uložena: {json_path.resolve()}")
        logger.info(f"⏱  Celkový čas: {elapsed:.0f} s")