"""
Merchant name normalizer — Story 9.3.

Cleans raw statement strings using a built-in normalization table,
then falls back to regex heuristics.

Example: "AMZN*MARKETPLACE 04/22" → "Amazon"
         "SQ *COFFEE SHOP 123456" → "Coffee Shop"
"""
from __future__ import annotations

import re

# ── built-in normalization table ──────────────────────────────────────────────
# Each entry is (pattern, normalized_name). Rules are tried in order with
# ``pattern.search`` and the first match wins, so a more specific rule must
# appear before a broader one (e.g. "Uber Eats" before "Uber", "Google
# Workspace" before "Google").
#
# A ``None`` name marks an extraction rule: the display name is taken from
# capture group 1 of the match instead of from the table.
#
# Short brand tokens that also occur inside unrelated words (AWS, UBER, ALDI,
# GOOGLE, KOHLS) are anchored with ``\b`` so that e.g. "PAWS" or "HUBER" do not
# trigger them.
_BUILTIN_RULES: list[tuple[re.Pattern, str | None]] = [
    (re.compile(r"AMZN\*|AMAZON\s*MKTPL\*|AMAZON\s*MARKETPLACE|AMAZON\.COM\*", re.I), "Amazon"),
    (re.compile(r"AMAZON\s*PRIME\*|AMAZON\s*PRIME\b", re.I),        "Amazon Prime"),
    # "AWS" needs word boundaries, otherwise it fires on PAWS, LAWSON, DRAWS, ...
    (re.compile(r"AMAZON\s*WEB\s*SERVICES|\bAWS\b", re.I),           "AWS"),
    (re.compile(r"AMAZON\.COM\s+Amzn\.com", re.I),                   "Amazon"),
    (re.compile(r"AMAZON\.COM", re.I),                                "Amazon"),
    (re.compile(r"APPLE\.COM/BILL|APPLE\s+ITUNES|APPLE\s+STORE", re.I), "Apple"),
    (re.compile(r"NETFLIX\.COM|NETFLIX", re.I),                      "Netflix"),
    (re.compile(r"SPOTIFY\s*USA|SPOTIFY", re.I),                     "Spotify"),
    (re.compile(r"HULU\.COM|HULU", re.I),                            "Hulu"),
    (re.compile(r"DISNEY\+|DISNEY\s*PLUS", re.I),                    "Disney+"),
    (re.compile(r"YOUTUBE\s*PREMIUM|GOOGLE\s*YOUTUBE", re.I),        "YouTube Premium"),
    (re.compile(r"GOOGLE\s*\*GSUITE|GOOGLE\s*WORKSPACE", re.I),      "Google Workspace"),
    (re.compile(r"\bGOOGLE\b", re.I),                                "Google"),
    (re.compile(r"MICROSOFT\s*\*|MICROSOFT\s*365|MSFT", re.I),       "Microsoft"),
    (re.compile(r"PAYPAL\s*\*", re.I),                               "PayPal"),
    (re.compile(r"VENMO", re.I),                                     "Venmo"),
    (re.compile(r"UBER\s*\*EATS|UBEREATS", re.I),                    "Uber Eats"),
    # Leading \b keeps surnames/places such as HUBER or GRUBER from matching.
    (re.compile(r"UBER\s*\*TRIP|\bUBER\b", re.I),                    "Uber"),
    (re.compile(r"LYFT\s*\*", re.I),                                 "Lyft"),
    (re.compile(r"DOORDASH\*|DOORDASH", re.I),                       "DoorDash"),
    (re.compile(r"GRUBHUB\*|GRUBHUB", re.I),                         "Grubhub"),
    (re.compile(r"INSTACART\*|INSTACART", re.I),                     "Instacart"),
    (re.compile(r"WALMART(?:\s*\.COM)?", re.I),                      "Walmart"),
    (re.compile(r"TARGET\s*(?:\.COM)?(?:\s*T-\d+)?", re.I),         "Target"),
    (re.compile(r"COSTCO\s*WHSE|COSTCO", re.I),                      "Costco"),
    # The whole word MARKET is optional (a bare ``MARKET?`` only makes the
    # final "T" optional and would require "MARKE" to be present).
    (re.compile(r"WHOLE\s*FOODS(?:\s*MARKET)?", re.I),               "Whole Foods"),
    (re.compile(r"TRADER\s*JOE", re.I),                              "Trader Joe's"),
    (re.compile(r"KROGER", re.I),                                    "Kroger"),
    (re.compile(r"\bALDI\b", re.I),                                  "Aldi"),
    (re.compile(r"STARBUCKS", re.I),                                 "Starbucks"),
    (re.compile(r"DUNKIN|DUNKIN'\s*DONUTS", re.I),                   "Dunkin'"),
    (re.compile(r"MCDONALD'?S|MCDONALDS", re.I),                     "McDonald's"),
    (re.compile(r"CHICK-FIL-A|CHICKFILA", re.I),                    "Chick-fil-A"),
    (re.compile(r"CHIPOTLE", re.I),                                  "Chipotle"),
    (re.compile(r"SUBWAY\b", re.I),                                  "Subway"),
    (re.compile(r"CVS\s*PHARMACY|CVS\s*\d+", re.I),                  "CVS Pharmacy"),
    (re.compile(r"WALGREEN|WALGREENS", re.I),                        "Walgreens"),
    (re.compile(r"RITE\s*AID", re.I),                                "Rite Aid"),
    (re.compile(r"BEST\s*BUY", re.I),                                "Best Buy"),
    (re.compile(r"THE\s+HOME\s+DEPOT|HOME\s*DEPOT", re.I),           "Home Depot"),
    (re.compile(r"LOWE'?S|LOWES\s+#", re.I),                        "Lowe's"),
    (re.compile(r"\bKOHL'?S\b|\bKOHLS\b", re.I),                    "Kohl's"),
    (re.compile(r"SHELL\s*OIL|SHELL\s*\d+", re.I),                  "Shell"),
    (re.compile(r"BP\s*#\d+|BP\s*GAS", re.I),                       "BP"),
    (re.compile(r"EXXON\s*MOBIL|EXXON", re.I),                      "ExxonMobil"),
    (re.compile(r"CHEVRON\s+\d+|CHEVRON", re.I),                     "Chevron"),
    # Square: "SQ *MERCHANT NAME CityState" → extract just merchant name.
    # The lazy group stops at the first digit run, double space, " - XX" or end.
    (re.compile(r"SQ\s*\*([^#]+?)(?:\s+\d|\s{2,}|\s*-\s*[A-Z]{2,}|$)", re.I), None),
    # Toast: "TST*MERCHANT - City State" → stop at " - "
    (re.compile(r"TST\*\s*(.+?)\s+-\s+[A-Z]", re.I),                None),
    # Toast without a " - " separator: take everything after the prefix.
    (re.compile(r"TST\*\s*(.+)", re.I),                              None),
]

# ── regex heuristics applied after table lookup ───────────────────────────────
# Noise-stripping passes, compiled once at import time and applied in this
# order. The ``\s{2,}`` entry must keep exactly that source text: the cleaning
# step recognises it by its pattern string and collapses the match to a single
# space, whereas every other match is deleted.
_STRIP_PATTERNS: list[re.Pattern] = [
    re.compile(source)
    for source in (
        r"\s+\d{2}/\d{2}(?:/\d{2,4})?$",  # date suffix: " MM/DD" or " MM/DD/YY(YY)"
        r"\s+\d{4,}$",                    # reference number suffix (4+ digits)
        r"#\s*\d+\s*$",                   # store number suffix: "#123"
        r"\s+[A-Z]{2}$",                  # two-letter state code preceded by whitespace
        r"\s{2,}",                        # whitespace run → single space
        r"^\s+|\s+$",                     # whitespace at either end
    )
]


def normalize(raw: str) -> str:
    """
    Turn a raw statement merchant string into a display name.

    Falsy input (empty string, None) is handed back unchanged. Otherwise the
    surrounding whitespace is trimmed and the built-in rules are scanned in
    order; the first rule whose pattern is found decides the outcome:

    - rule with a fixed name: that name is returned exactly as written in
      the table (no further cleaning or re-casing)
    - extraction rule (name is None): capture group 1 is trimmed and passed
      through the stripping heuristics

    When no rule matches, the whole trimmed string goes through the
    stripping heuristics instead.
    """
    if not raw:
        return raw

    text = raw.strip()

    for rule, display_name in _BUILTIN_RULES:
        match = rule.search(text)
        if match is None:
            continue

        # Fixed-name rule: the table already holds the final spelling.
        if display_name is not None:
            return display_name

        # Extraction rule: use the captured merchant portion when the pattern
        # produced a group, otherwise fall back to the full trimmed string.
        if match.lastindex:
            return _clean_heuristics(match.group(1).strip())
        return _clean_heuristics(text)

    # No table entry applied — heuristics only.
    return _clean_heuristics(text)


# Possessive / contraction suffix that ``str.title()`` capitalizes because it
# treats the apostrophe as a word boundary ("JOE'S" → "Joe'S", "DON'T" → "Don'T").
# The lookbehind requires a letter before the apostrophe and ``\b`` requires the
# suffix to end the word, so names like "O'Reilly" or "O'Shea" are left alone.
_TITLE_SUFFIX_RE = re.compile(r"(?<=[^\W\d_])['’](?:S|T|D|M|Ll|Re|Ve)\b")


def _clean_heuristics(text: str) -> str:
    """
    Strip statement noise from *text* and return it title-cased.

    Every pattern in ``_STRIP_PATTERNS`` is applied once, in list order.
    Matches of the whitespace-run pattern (``\\s{2,}``) collapse to a single
    space; matches of all other patterns are removed. The result is trimmed
    and title-cased, with possessive/contraction suffixes kept lowercase
    ("JOE'S PIZZA" → "Joe's Pizza" rather than "Joe'S Pizza").
    """
    result = text
    for pattern in _STRIP_PATTERNS:
        # The replacement depends only on which pattern is running, so pick it
        # once per pattern instead of once per match.
        replacement = " " if pattern.pattern == r"\s{2,}" else ""
        result = pattern.sub(replacement, result)

    titled = result.strip().title()
    # Undo title()'s capitalization of the letters following an apostrophe.
    return _TITLE_SUFFIX_RE.sub(lambda m: m.group(0).lower(), titled)


def apply_user_mappings(raw: str, user_mappings: list[dict]) -> str | None:
    """
    Check user-confirmed MerchantMapping records against raw merchant string.

    A mapping applies when its 'raw_pattern' occurs anywhere in *raw*,
    compared case-insensitively as a plain substring (not a regex). Mappings
    are tried in list order and the first applicable one wins.

    raw: raw merchant string from the statement; falsy input never matches
    user_mappings: list of dicts with 'raw_pattern' and 'normalized' keys;
        records missing either value (or holding an empty one) are skipped
    Returns normalized name if a user mapping matches, else None.
    """
    # Same falsy-input tolerance as normalize(): nothing to match against.
    if not raw or not user_mappings:
        return None

    raw_upper = raw.upper()
    for mapping in user_mappings:
        pattern = mapping.get("raw_pattern")
        normalized = mapping.get("normalized")
        # An incomplete record must not win the lookup: returning its empty
        # name would blank the merchant and hide later, valid mappings.
        if not pattern or not normalized:
            continue
        if pattern.upper() in raw_upper:
            return normalized
    return None