#!/usr/bin/env python3
from __future__ import annotations

import csv
import hashlib
import shutil
import tempfile
import time
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple
from urllib.parse import urlparse

import requests
import xml.etree.ElementTree as ET


# Entry point of the crawl: the sitemap index whose <loc> entries are read.
SITEMAP_INDEX_URL = "https://draeger-it.blog/sitemap_index.xml"
# Name of the CSV report; opened as a relative path when the report is written.
OUT_CSV = "html_report.csv"

# Passed as `timeout=` to every HTTP request (seconds).
TIMEOUT_SEC = 30
# Pause after each page download (seconds).
DELAY_SEC = 0.05

# When True, the temp directory is deleted at the end of the run.
CLEANUP_TEMP_DIR = False

HEADERS = {
    "User-Agent": "HTMLSizeAuditBot/1.0 (+https://draeger-it.blog/)",
    "Accept": (
        "text/html,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    # requests decompresses gzip automatically, so the bytes that get stored
    # are the uncompressed HTML.
    "Accept-Encoding": "gzip",
}


@dataclass
class Row:
    """One line of the CSV report: the audit result for a single sitemap URL.

    The field order matches the column order written to the CSV file.
    """

    url: str  # page URL as listed in the sitemap
    lastmod: str  # text of the sitemap's <lastmod> element; "" when absent
    status: int  # HTTP status code of the response; 0 when no response was obtained
    final_url: str  # `url` attribute of the response (redirects are followed); "" without a response
    content_type: str  # raw Content-Type response header; "" when missing or without a response
    file_path: str  # target path inside the temp directory; set even if nothing was written
    size_bytes: int  # number of bytes saved for the page; 0 when the download failed
    size_kb: float  # size_bytes / 1024, rounded to two decimals
    lines: int  # line count of the saved HTML; 0 when the download failed
    error: str  # error description from the download step; "" on success


def is_http_url(u: str) -> bool:
    """Return True if *u* parses as an http(s) URL with a non-empty host part.

    Anything that makes ``urlparse`` raise (for example a malformed IPv6
    literal) is treated as "not a URL" instead of propagating the error.
    """
    try:
        scheme, netloc = urlparse(u)[:2]
    except Exception:
        return False
    return bool(netloc) and scheme in {"http", "https"}


def fetch_xml(session: requests.Session, url: str) -> bytes:
    """Fetch *url* with the shared headers/timeout and return the raw body bytes.

    Raises whatever ``raise_for_status`` raises for a 4xx/5xx response, plus
    any error raised by the request itself.
    """
    response = session.get(url, timeout=TIMEOUT_SEC, headers=HEADERS)
    response.raise_for_status()
    return response.content


def parse_sitemap_index(session: requests.Session, sitemap_index_url: str) -> List[str]:
    """Download a sitemap index and return the http(s) URLs of its <loc> entries.

    Every <loc> element below the root is considered, whatever its XML
    namespace; entries without text or without an http(s) URL are dropped.
    """
    root = ET.fromstring(fetch_xml(session, sitemap_index_url))
    # Expected layout: <sitemapindex><sitemap><loc>...</loc></sitemap>...
    stripped = (
        node.text.strip()
        for node in root.findall(".//{*}loc")
        if node.text
    )
    return [candidate for candidate in stripped if is_http_url(candidate)]


def parse_urlset(session: requests.Session, sitemap_url: str) -> List[Tuple[str, str]]:
    """Download one sitemap (urlset) and return its pages as (url, lastmod) pairs.

    ``lastmod`` is "" when the <url> entry has no <lastmod> text. Entries
    without a <loc> text or with a non-http(s) location are skipped.
    """

    def stripped_text(node) -> str:
        # Missing element and empty element both collapse to "".
        if node is None or not node.text:
            return ""
        return node.text.strip()

    root = ET.fromstring(fetch_xml(session, sitemap_url))

    pages: List[Tuple[str, str]] = []
    for entry in root.findall(".//{*}url"):
        # Only direct children are inspected, as in a plain <url> record.
        location = stripped_text(entry.find("{*}loc"))
        if not location or not is_http_url(location):
            continue
        pages.append((location, stripped_text(entry.find("{*}lastmod"))))

    return pages


def collect_all_pages(session: requests.Session) -> List[Tuple[str, str]]:
    """Gather (url, lastmod) pairs from every sitemap listed in SITEMAP_INDEX_URL.

    A sitemap that cannot be fetched or parsed is reported with a warning and
    skipped. Failing to read the index itself is not caught here. Duplicate
    URLs are collapsed to their first occurrence, keeping discovery order.
    """
    collected: List[Tuple[str, str]] = []
    for sitemap_url in parse_sitemap_index(session, SITEMAP_INDEX_URL):
        try:
            entries = parse_urlset(session, sitemap_url)
        except Exception as exc:
            print(f"[WARN] Konnte Sitemap nicht lesen: {sitemap_url} ({exc})")
        else:
            collected.extend(entries)

    # dicts keep insertion order, and setdefault never overwrites, so the
    # first lastmod seen for a URL wins.
    first_seen: dict = {}
    for page_url, lastmod in collected:
        first_seen.setdefault(page_url, lastmod)
    return list(first_seen.items())


# Characters that are not allowed in Windows file names: the reserved
# punctuation plus the control characters 0-31. Each is mapped to "_".
_UNSAFE_FILENAME_CHARS = str.maketrans(
    dict.fromkeys([*'<>:"/\\|?*', *map(chr, range(32))], "_")
)


def safe_filename(url: str) -> str:
    """Return a stable, filesystem-safe file name for *url*.

    The name has the form ``<readable>__<hash>.html``:

    * ``<readable>`` is the URL path with slashes turned into underscores,
      cut to 60 characters ("root" for an empty path). Characters that
      Windows rejects in file names are replaced with "_" so the name is
      usable on every platform.
    * ``<hash>`` is the first 12 hex digits of the SHA-1 of the full URL, so
      URLs that differ only in query string, or that collide after
      sanitizing/truncation, still get distinct names.
    """
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:12]
    raw_path = urlparse(url).path.strip("/").replace("/", "_")
    readable = raw_path[:60].translate(_UNSAFE_FILENAME_CHARS) or "root"
    return f"{readable}__{digest}.html"


def download_html_to_file(
    session: requests.Session, url: str, target_file: Path
) -> Tuple[requests.Response | None, str]:
    """Download *url* and store the HTML response body in *target_file*.

    Returns ``(response, error_msg)``:

    * ``response`` is None when the request itself failed (no response was
      obtained); otherwise it is the response object, also for error cases.
    * ``error_msg`` is "" on success. Otherwise it starts with one of
      ``request_error:``, ``http_status:``, ``not_html:`` or
      ``file_write_error:``.

    The file is only written for a 200 response whose Content-Type mentions
    ``text/html`` or ``application/xhtml``. The function does not raise for
    request or write failures; they are reported through ``error_msg``.
    """
    try:
        r = session.get(url, headers=HEADERS, timeout=TIMEOUT_SEC, allow_redirects=True)
    except Exception as e:
        # Best-effort crawl: one unreachable page must not abort the run.
        return None, f"request_error: {e}"

    if r.status_code != 200:
        return r, f"http_status: {r.status_code}"

    # Skip anything that does not declare itself as HTML/XHTML.
    ct = (r.headers.get("Content-Type") or "").lower()
    if "text/html" not in ct and "application/xhtml" not in ct:
        return r, f"not_html: {ct}"

    # r.content holds the already-decompressed body bytes.
    try:
        target_file.write_bytes(r.content)
    except Exception as e:
        return r, f"file_write_error: {e}"

    return r, ""


def count_lines_from_bytes(data: bytes, encoding_hint: str | None = None) -> int:
    """Count the lines of an HTML document given as raw bytes.

    The bytes are decoded best-effort: *encoding_hint* first (if given), then
    utf-8, then cp1252; latin-1 is the final fallback because it can decode
    any byte sequence. An unknown or unusable hint is skipped silently.

    Only ``\\n``, ``\\r`` and ``\\r\\n`` count as line breaks. ``str.splitlines``
    is deliberately not used: it also breaks on characters such as U+0085,
    which shows up as a stray character whenever UTF-8 bytes are decoded with
    a latin-1 hint and would inflate the result.

    Returns at least 1, also for empty input.
    """
    for enc in filter(None, (encoding_hint, "utf-8", "cp1252")):
        try:
            text = data.decode(enc)
        except (LookupError, ValueError):
            # LookupError: unknown codec name; ValueError covers UnicodeDecodeError.
            continue
        break
    else:
        text = data.decode("latin-1")

    if not text:
        return 1

    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    # A trailing newline terminates the last line rather than starting a new one.
    return normalized.count("\n") + (0 if normalized.endswith("\n") else 1)


def _audit_page(session: requests.Session, url: str, lastmod: str, temp_dir: Path) -> Tuple[Row, int]:
    """Download one page into *temp_dir*, measure it and build its report row.

    Returns ``(row, elapsed_ms)`` where ``elapsed_ms`` is the time spent in
    the download step, in milliseconds.
    """
    fpath = temp_dir / safe_filename(url)

    # perf_counter is monotonic, so the measured duration is not distorted
    # by system clock adjustments.
    t0 = time.perf_counter()
    resp, err = download_html_to_file(session, url, fpath)
    elapsed_ms = int((time.perf_counter() - t0) * 1000)

    size_bytes = 0
    lines = 0
    if not err and fpath.exists():
        data = fpath.read_bytes()
        size_bytes = len(data)
        # Use the encoding reported on the response as a decoding hint, if any.
        encoding_hint = resp.encoding if (resp is not None and resp.encoding) else None
        lines = count_lines_from_bytes(data, encoding_hint=encoding_hint)

    row = Row(
        url=url,
        lastmod=lastmod,
        status=resp.status_code if resp is not None else 0,
        final_url=resp.url if resp is not None else "",
        content_type=resp.headers.get("Content-Type", "") if resp is not None else "",
        file_path=str(fpath),
        size_bytes=size_bytes,
        size_kb=round(size_bytes / 1024, 2),
        lines=lines,
        error=err,
    )
    return row, elapsed_ms


def _write_report(rows: List[Row], out_path: str) -> None:
    """Write *rows* to *out_path* as a semicolon-separated UTF-8 CSV with a header line."""
    # Single source of truth for header and cell order.
    columns = (
        "url",
        "lastmod",
        "status",
        "final_url",
        "content_type",
        "file_path",
        "size_bytes",
        "size_kb",
        "lines",
        "error",
    )
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerow(columns)
        writer.writerows([getattr(row, col) for col in columns] for row in rows)


def main() -> None:
    """Crawl all sitemap URLs, store each HTML page and write the CSV report.

    Steps: create a temp directory, collect the page list from the sitemap
    index, download and measure every page (pausing DELAY_SEC after each),
    then write OUT_CSV. The HTTP session is closed when the crawl ends, and
    the temp directory handling (delete or keep, per CLEANUP_TEMP_DIR) also
    runs when the crawl aborts with an exception.
    """
    temp_dir = Path(tempfile.mkdtemp(prefix="html_audit_"))
    print(f"[INFO] Temp-Verzeichnis: {temp_dir}")

    try:
        rows: List[Row] = []

        with requests.Session() as session:
            pages = collect_all_pages(session)
            total = len(pages)
            print(f"[INFO] Gefundene URLs: {total}")

            for i, (url, lastmod) in enumerate(pages, start=1):
                row, elapsed_ms = _audit_page(session, url, lastmod, temp_dir)
                rows.append(row)

                print(f"[{i}/{total}] {row.status} {url} -> {row.size_kb} KB, {row.lines} lines ({elapsed_ms} ms)"
                      + (f"  ERROR: {row.error}" if row.error else ""))

                time.sleep(DELAY_SEC)

        _write_report(rows, OUT_CSV)
        print(f"[DONE] CSV gespeichert: {OUT_CSV}")
    finally:
        if CLEANUP_TEMP_DIR:
            shutil.rmtree(temp_dir, ignore_errors=True)
            print("[INFO] Temp-Verzeichnis gelöscht.")
        else:
            print("[INFO] Temp-Verzeichnis bleibt erhalten (CLEANUP_TEMP_DIR=False).")


# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()