"""
LLM-based product categorizer using Claude via the Anthropic API.

Normal flow (no research site configured, or product data is sufficient):
    Claude receives product info + categories list → calls categorize_product tool → done.

Research flow (manufacturer research site configured and Claude needs more data):
    Claude receives product info + categories list + fetch_product_info tool.
    Claude may call fetch_product_info (fetches the manufacturer's product page),
    then uses the returned content to call categorize_product.
    The loop runs for at most MAX_FETCH_ITERATIONS fetch calls before forcing
    Claude to categorize with what it has.

Prompt-caching strategy:
    - System prompt: marked ephemeral (static across all products).
    - Categories list: marked ephemeral (same for every product in a batch).
    - Product description: never cached (unique per product).
"""

from __future__ import annotations

import logging
import re
import urllib.parse
import urllib.request
from typing import Any, Dict, List, Optional

try:
    import anthropic as _anthropic
    _AVAILABLE = True
except ImportError:
    _AVAILABLE = False

# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Model ID passed as `model=` to every messages.create call in categorize().
_MODEL = "claude-haiku-4-5-20251001"
MAX_FETCH_ITERATIONS = 3   # max web lookups per product before forcing categorization
_FETCH_CONTENT_LIMIT = 4000  # characters of page text returned to Claude

# Static system prompt sent with every request. categorize() attaches an
# ephemeral cache_control marker to it because the text is identical for every
# product. The guidelines mention fetch_product_info conditionally ("if ... a
# fetch_product_info tool is available") since that tool is only offered when a
# research site matches the product's manufacturer.
_SYSTEM_PROMPT = (
    "You are an expert product categorization AI for an electronics component distributor. "
    "Your task is to analyze product information and assign it to the most appropriate "
    "category from the provided list.\n\n"
    "Guidelines:\n"
    "- First examine all product data provided (SKU, MPN, name, manufacturer, attributes).\n"
    "- If the data is sufficient to confidently categorize the product, call categorize_product immediately.\n"
    "- If the data is insufficient and a fetch_product_info tool is available, use it to look up "
    "the product on the manufacturer's website before categorizing.\n"
    "- Only use category codes that appear in the provided list — never invent new codes.\n"
    "- Set confidence between 0.0 (no confidence) and 1.0 (completely certain).\n"
    "- Provide a brief reasoning sentence explaining your choice."
)

# Tool definition for the final answer. categorize() treats a call to this tool
# as the result: it reads the four fields below, rejects category_code values
# that are not in the supplied categories list, and clamps confidence to
# [0.0, 1.0] itself rather than relying only on the schema bounds.
_CATEGORIZE_TOOL: Dict[str, Any] = {
    "name": "categorize_product",
    "description": "Assign the product to the most appropriate category from the provided list.",
    "input_schema": {
        "type": "object",
        "properties": {
            "category_code": {
                "type": "string",
                "description": "The code of the selected category (must be from the provided list).",
            },
            "category_label": {
                "type": "string",
                "description": "The human-readable label of the selected category.",
            },
            "confidence": {
                "type": "number",
                "description": "Confidence in this categorization from 0.0 (none) to 1.0 (certain).",
                "minimum": 0.0,
                "maximum": 1.0,
            },
            "reasoning": {
                "type": "string",
                "description": "One or two sentences explaining why this category was chosen.",
            },
        },
        "required": ["category_code", "category_label", "confidence", "reasoning"],
    },
}


def _make_fetch_tool(site_name: str) -> Dict[str, Any]:
    return {
        "name": "fetch_product_info",
        "description": (
            f"Look up a product on {site_name}'s website to gather more information before "
            "categorizing. Use this when the provided product data is insufficient to make a "
            "confident categorization decision. Provide the exact part number or SKU."
        ),
        "input_schema": {
            "type": "object",
            "properties": {
                "part_number": {
                    "type": "string",
                    "description": "The part number or SKU to look up on the manufacturer website.",
                }
            },
            "required": ["part_number"],
        },
    }


def _fetch_product_page(part_number: str, research_site: Dict[str, Any]) -> str:
    """
    Fetch the manufacturer's product page for part_number and return stripped text.

    Enforces that the constructed URL stays within the site's configured domain.
    Returns an error string (not raised) so Claude can continue gracefully.
    """
    template: str = research_site.get("product_url_template", "")
    if not template or "{part_number}" not in template:
        return "Error: research site URL template is not configured correctly."

    # Construct URL — URL-encode the part number but keep common separator chars.
    encoded = urllib.parse.quote(str(part_number).strip(), safe="-._~")
    url = template.replace("{part_number}", encoded)

    # Restrict to the domain declared in the template.
    allowed_host = urllib.parse.urlparse(template).netloc
    actual_host  = urllib.parse.urlparse(url).netloc
    if actual_host != allowed_host:
        log.warning("Fetch URL host mismatch: %s vs %s — blocked.", actual_host, allowed_host)
        return "Error: constructed URL is outside the allowed domain."

    log.info("Fetching manufacturer product page: %s", url)
    try:
        req = urllib.request.Request(
            url,
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (compatible; AICatsBot/1.0; product categorization research)"
                ),
                "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
            },
        )
        with urllib.request.urlopen(req, timeout=10) as resp:
            charset = resp.headers.get_content_charset("utf-8")
            raw_html = resp.read(131072).decode(charset, errors="replace")  # cap at 128 KB
    except Exception as exc:
        log.warning("Failed to fetch %s: %s", url, exc)
        return f"Could not fetch product page ({exc}). Try categorizing with available data."

    # Strip scripts, styles, then all HTML tags; collapse whitespace.
    text = re.sub(r"<script[^>]*>.*?</script>", " ", raw_html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>",  " ", text,     flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    if not text:
        return "Page loaded but no readable content was found."

    return text[:_FETCH_CONTENT_LIMIT]


def _find_research_site(
    product: Dict[str, Any],
    research_sites: List[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """
    Match the product's manufacturer field against research site keywords.

    Returns the first matching active site, or None.
    """
    manufacturer = (product.get("manufacturer") or "").lower().strip()
    if not manufacturer:
        return None

    for site in research_sites:
        if not site.get("is_active", True):
            continue
        keywords = [
            k.strip().lower()
            for k in str(site.get("name_match_keywords", "")).split(",")
            if k.strip()
        ]
        for kw in keywords:
            if kw and kw in manufacturer:
                return site

    return None


def _build_product_text(product: Dict[str, Any], attributes: List[Dict[str, Any]]) -> str:
    """Render the product row plus up to 20 attributes as "Label: value" lines."""
    field_labels = (
        ("sku", "SKU"),
        ("mpn", "MPN/Part Number"),
        ("name", "Name"),
        ("manufacturer", "Manufacturer"),
        ("manufacturer_category", "Manufacturer Category"),
    )
    lines: List[str] = [
        f"{label}: {product[key]}" for key, label in field_labels if product.get(key)
    ]
    for attr in (attributes or [])[:20]:  # cap to avoid token bloat
        raw_name = attr.get("attribute_name")
        raw_val = attr.get("attribute_value")
        # str() so non-string values (e.g. numbers, including 0) are kept
        # instead of crashing on .strip() or being dropped as falsy.
        name = "" if raw_name is None else str(raw_name).strip()
        val = "" if raw_val is None else str(raw_val).strip()
        if name and val:
            lines.append(f"{name}: {val}")

    return "\n".join(lines) or "No product information available."


def _parse_categorization(inp: Any, valid_codes: set) -> Optional[Dict[str, Any]]:
    """Validate a categorize_product tool input and convert it to the result dict."""
    if not isinstance(inp, dict):
        log.warning("LLM returned malformed categorize_product input; discarding.")
        return None

    code = str(inp.get("category_code", "")).strip()
    if code not in valid_codes:
        log.warning("LLM returned unknown category code '%s'; discarding.", code)
        return None

    raw_confidence = inp.get("confidence", 0.5)
    try:
        confidence = float(raw_confidence)
    except (TypeError, ValueError):
        # Schema asks for a number, but the model's output is not guaranteed to
        # comply; fall back to the same default used when the field is missing.
        log.warning("LLM returned non-numeric confidence %r; using 0.5.", raw_confidence)
        confidence = 0.5

    return {
        "code":       code,
        "label":      str(inp.get("category_label", "")).strip(),
        "confidence": min(1.0, max(0.0, confidence)),
        "reasoning":  str(inp.get("reasoning", "")).strip(),
    }


def categorize(
    api_key: str,
    product: Dict[str, Any],
    attributes: List[Dict[str, Any]],
    categories: List[Dict[str, Any]],
    research_sites: Optional[List[Dict[str, Any]]] = None,
) -> Optional[Dict[str, Any]]:
    """
    Categorize a product using Claude, optionally looking up the manufacturer website.

    Without a matching research site the model is forced to call
    categorize_product on the first request. With one, the model must call
    either fetch_product_info or categorize_product on each turn; after
    MAX_FETCH_ITERATIONS fetches it is forced to categorize.

    Args:
        api_key:        Anthropic API key from system_settings.
        product:        Product DB row as a dict.
        attributes:     List of product_attribute rows ({attribute_name, attribute_value}).
        categories:     List of {code, label} dicts representing available categories.
        research_sites: List of manufacturer research site rows from DB (may be None or []).

    Returns:
        {"code": str, "label": str, "confidence": float, "reasoning": str}
        on success, or None if unavailable / failed (package missing, no API
        key, no categories, API error, unknown category code, or no usable
        tool call from the model).
    """
    if not _AVAILABLE:
        log.debug("anthropic package not installed; skipping LLM categorization.")
        return None

    if not api_key:
        log.debug("No claude_api_key configured; skipping LLM categorization.")
        return None

    if not categories:
        log.debug("No categories loaded; skipping LLM categorization.")
        return None

    # ------------------------------------------------------------------
    # Match product manufacturer to a research site (if any configured)
    # ------------------------------------------------------------------
    research_site = _find_research_site(product, research_sites or [])

    # ------------------------------------------------------------------
    # Build tool list
    # ------------------------------------------------------------------
    tools: List[Dict[str, Any]] = []
    if research_site:
        # Fall back to the product's own manufacturer so a site row without a
        # manufacturer_name does not raise KeyError.
        site_name = (
            research_site.get("manufacturer_name")
            or product.get("manufacturer")
            or "the manufacturer"
        )
        tools.append(_make_fetch_tool(str(site_name)))
    tools.append(_CATEGORIZE_TOOL)

    # When no research site: force categorize_product immediately.
    # When research site available: "any" — Claude picks which tool to call but
    # must call one, so a text-only reply cannot abort the categorization.
    force_categorize: Dict[str, Any] = {"type": "tool", "name": "categorize_product"}
    tool_choice: Dict[str, Any] = {"type": "any"} if research_site else force_categorize

    # ------------------------------------------------------------------
    # Build categories text (cacheable — same for every product in batch)
    # ------------------------------------------------------------------
    categories_text = "\n".join(
        f"- {c['code']}: {c['label']}" for c in categories
    )
    valid_codes = {c["code"] for c in categories}

    # ------------------------------------------------------------------
    # Build product description (dynamic per product — not cached)
    # ------------------------------------------------------------------
    product_text = _build_product_text(product, attributes)

    # ------------------------------------------------------------------
    # Initial messages
    # ------------------------------------------------------------------
    messages: List[Dict[str, Any]] = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Available categories:\n{categories_text}",
                    "cache_control": {"type": "ephemeral"},
                },
                {
                    "type": "text",
                    "text": f"Product to categorize:\n{product_text}",
                },
            ],
        }
    ]

    # ------------------------------------------------------------------
    # Agentic loop
    # ------------------------------------------------------------------
    client = _anthropic.Anthropic(api_key=api_key)
    fetch_count = 0

    # One request per allowed fetch, plus a final one for the forced categorization.
    for iteration in range(MAX_FETCH_ITERATIONS + 1):
        try:
            response = client.messages.create(
                model=_MODEL,
                max_tokens=1024,
                system=[
                    {
                        "type": "text",
                        "text": _SYSTEM_PROMPT,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
                tools=tools,
                tool_choice=tool_choice,
                messages=messages,
            )
        except Exception as exc:
            # Best-effort: any SDK/network failure means "no LLM result".
            log.warning("LLM API call failed: %s", exc)
            return None

        # Append the assistant turn so tool results can reference its tool_use ids.
        messages.append({"role": "assistant", "content": response.content})

        # Process tool use blocks.
        tool_results: List[Dict[str, Any]] = []

        for block in response.content:
            if block.type != "tool_use":
                continue

            if block.name == "categorize_product":
                # ── Success path ──────────────────────────────────────────
                return _parse_categorization(block.input, valid_codes)

            if block.name == "fetch_product_info" and research_site:
                # ── Fetch path ────────────────────────────────────────────
                if fetch_count >= MAX_FETCH_ITERATIONS:
                    # Tell Claude it has hit the fetch limit.
                    content = (
                        "Fetch limit reached. Please categorize the product using "
                        "the information already available."
                    )
                else:
                    part_number = str(block.input.get("part_number", "")).strip()
                    content     = _fetch_product_page(part_number, research_site)
                    fetch_count += 1
                    log.info(
                        "Product %s: fetch #%d returned %d chars.",
                        product.get("id"), fetch_count, len(content),
                    )

                tool_results.append({
                    "type":        "tool_result",
                    "tool_use_id": block.id,
                    "content":     content,
                })
            else:
                # Every tool_use needs a matching tool_result in the next user
                # turn, so answer unexpected tool names with an error result.
                log.warning("LLM called unexpected tool '%s'.", block.name)
                tool_results.append({
                    "type":        "tool_result",
                    "tool_use_id": block.id,
                    "content":     "Unknown tool. Call categorize_product to finish.",
                    "is_error":    True,
                })

        if not tool_results:
            # No tool use block at all — nothing to reply to, so bail safely.
            log.warning("LLM response contained no tool_use block (iteration %d).", iteration)
            return None

        # After hitting the fetch limit, force Claude to categorize.
        if fetch_count >= MAX_FETCH_ITERATIONS:
            tool_choice = force_categorize

        messages.append({"role": "user", "content": tool_results})

    log.warning("LLM agentic loop exhausted without a categorize_product call.")
    return None