"""
Kohl's Rewards Visa (Capital One) statement parser — Story 9.8.

Tested against: Kohl's Visa ending 0532, period Apr 26 – May 26, 2026.

Format (Transactions section, page 2+):
  SAYRE GREENE #XXXX: Transactions
  Trans Date  Post Date  Description  Amount
  Apr 29  Apr 30  TST*CHUNKYS - MANCHESTEManchesterNH  $80.09

  - Dates: abbreviated month + day (e.g. "Apr 29"), year from statement header
  - Amount: "$XXX.XX" (positive for purchases)
  - Section headers: "SAYRE GREENE #XXXX: Transactions"
                     "SAYRE GREENE #XXXX: Payments, Credits and Adjustments"
  - Skip: "Trans Date", "Total Transactions", "Fees", "Interest Charged"

Year is extracted from: "Apr 26, 2026 - May 26, 2026 | N days in Billing Cycle"
"""
from __future__ import annotations

import re
from datetime import date, datetime
from decimal import Decimal

from app.services.pdf_parsers.base import StagedTransaction, ParseError, compute_dedup_hash
from app.services.merchant_normalizer import normalize

_PARSER_VERSION = "1.0.0"

# "Apr 29 Apr 30 DESCRIPTION $80.09"
# Two dates (each = 3-letter month + space + 1-2 digit day), then description, then $amount
_TXN_RE = re.compile(
    r"^([A-Za-z]{3}\s+\d{1,2})\s+([A-Za-z]{3}\s+\d{1,2})\s+(.+?)\s+\$(-?\d{1,3}(?:,\d{3})*\.\d{2})\s*$"
)

_MONTH_ABBR = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}

_SECTION_TRANSACTIONS  = re.compile(r":\s*Transactions\s*$", re.I)
_SECTION_PAYMENTS      = re.compile(r":\s*Payments,?\s*Credits", re.I)
_ENDS_SECTION          = re.compile(
    r"^(Total (Transactions|Fees)|Fees$|Interest Charged|Totals Year|"
    r"Interest Charge Calculation|Additional Information)",
    re.I,
)
_SKIP_RE = re.compile(
    r"^(Trans Date|Visit Kohls|Page \d+|Kohl.s Rewards|Apr \d+, 20\d\d|"
    r"Payment Information|Account Summary|SAYRE |© 20|ETC-)",
    re.I,
)


# Billing-cycle header, e.g. "Apr 26, 2026 - May 26, 2026 | 31 days in Billing Cycle".
# Captures the closing date's month abbreviation and year; hyphen, en dash and
# em dash are all accepted as the range separator.
_BILLING_CYCLE_RE = re.compile(
    r"[A-Za-z]{3}\s+\d{1,2},\s*20\d{2}\s*[-\u2013\u2014]\s*"
    r"([A-Za-z]{3})\s+\d{1,2},\s*(20\d{2})"
)


def _find_statement_close(lines: list[tuple[int, str]]) -> tuple[int, int] | None:
    """Return (year, month) of the billing cycle's closing date, or None if no cycle line is found."""
    for _, line in lines:
        m = _BILLING_CYCLE_RE.search(line)
        if not m:
            continue
        month = _MONTH_ABBR.get(m.group(1).lower())
        if month:
            return int(m.group(2)), month
    return None


def _resolve_txn_year(month: int | None, close: tuple[int, int] | None, default_year: int) -> int:
    """Pick the calendar year for a transaction month relative to the statement close."""
    if month is None or close is None:
        return default_year
    close_year, close_month = close
    # A transaction cannot be dated after the statement closes, so a month
    # later in the calendar than the closing month belongs to the prior year
    # (e.g. a "Dec 28" row on a statement closing Jan 26).
    return close_year if month <= close_month else close_year - 1


def parse(pdf_path: str) -> tuple[list[StagedTransaction], list[ParseError]]:
    """Parse a Kohl's Rewards Visa (Capital One) statement PDF.

    Text is extracted page by page with pdfplumber, then scanned line by line.
    Rows are only read while inside a "...: Transactions" or
    "...: Payments, Credits..." section; rows in the payments section, and rows
    with a negative amount, are flagged as credits. Amounts are stored as
    absolute values.

    Statement rows carry no year. The year is resolved per row from the closing
    date on the billing-cycle line, so a statement that spans the New Year
    dates its December and January rows correctly. If no billing-cycle line is
    found, the year from ``_extract_year`` is used for every row.

    Args:
        pdf_path: Filesystem path of the statement PDF.

    Returns:
        ``(transactions, errors)``. This function does not raise: a missing
        pdfplumber install, unreadable PDF, or any unexpected exception is
        reported as a ``ParseError`` with page number 0; rows whose date or
        amount cannot be converted are reported with their page and raw line.
    """
    try:
        import pdfplumber
    except ImportError:
        return [], [ParseError(0, "", "pdfplumber not installed", _PARSER_VERSION)]

    transactions: list[StagedTransaction] = []
    errors: list[ParseError] = []

    try:
        with pdfplumber.open(pdf_path) as pdf:
            all_lines: list[tuple[int, str]] = []
            for page_num, page in enumerate(pdf.pages, start=1):
                # extract_text() may return None for pages without text.
                for line in (page.extract_text() or "").split("\n"):
                    all_lines.append((page_num, line.strip()))

        close = _find_statement_close(all_lines)
        statement_year = close[0] if close else _extract_year(all_lines)

        # State: 'none' | 'payments' | 'transactions' | 'done'
        # Section headers are checked before the state gate, so a later header
        # re-activates parsing even after a section has ended.
        state = "none"

        for page_num, line in all_lines:
            if not line:
                continue

            if _SECTION_TRANSACTIONS.search(line):
                state = "transactions"
                continue
            if _SECTION_PAYMENTS.search(line):
                state = "payments"
                continue
            if _ENDS_SECTION.match(line):
                if state != "none":
                    state = "done"
                continue

            if state not in ("transactions", "payments"):
                continue
            if _SKIP_RE.match(line):
                continue

            m = _TXN_RE.match(line)
            if not m:
                continue

            trans_date_str = m.group(1)   # e.g. "Apr 29"
            description = m.group(3).strip()
            amount_str = m.group(4)

            # _TXN_RE guarantees a leading 3-letter token; an unknown
            # abbreviation yields None here and is reported by
            # _parse_month_day below.
            txn_month = _MONTH_ABBR.get(trans_date_str.split()[0].lower())
            txn_year = _resolve_txn_year(txn_month, close, statement_year)

            try:
                txn_date = _parse_month_day(trans_date_str, txn_year)
            except ValueError as exc:
                errors.append(ParseError(page_num, line, str(exc), _PARSER_VERSION))
                continue

            try:
                amount = Decimal(amount_str.replace(",", ""))
            except ArithmeticError as exc:
                # decimal.InvalidOperation derives from ArithmeticError.
                errors.append(ParseError(page_num, line, str(exc), _PARSER_VERSION))
                continue

            is_credit = amount < Decimal("0") or state == "payments"
            abs_amount = abs(amount)
            # Strip Capital One city+state PDF artifacts before normalizing.
            # Pattern: CityNameCityNameST or text after " - CityState"
            clean_desc = _strip_location(description)
            norm = normalize(clean_desc)
            dedup_hash = compute_dedup_hash(norm, abs_amount, txn_date)

            transactions.append(StagedTransaction(
                date=txn_date,
                merchant_raw=description,
                merchant_normalized=norm,
                amount=abs_amount,
                is_credit=is_credit,
                issuer="kohls",
                dedup_hash=dedup_hash,
                confidence_score=0.90,
                raw_text=line,
            ))

    except Exception as exc:
        # Top-level boundary: report instead of raising so callers always get
        # the (transactions, errors) pair. Rows parsed before the failure are kept.
        errors.append(ParseError(0, str(exc), "Failed to open or read PDF", _PARSER_VERSION))

    return transactions, errors


def _parse_month_day(text: str, year: int) -> date:
    """Parse an abbreviated month and day such as "Apr 29" into a date.

    Args:
        text: Month abbreviation and day number separated by whitespace.
            The month is matched case-insensitively against ``_MONTH_ABBR``;
            tokens after the first two are ignored.
        year: Calendar year to attach, since the text carries none.

    Returns:
        ``date(year, month, day)``.

    Raises:
        ValueError: If ``text`` has fewer than two tokens, the month
            abbreviation is unknown, the day is not an integer, or the
            resulting date does not exist (e.g. "Feb 30").
    """
    parts = text.split()
    if len(parts) < 2:
        # Raise ValueError rather than letting parts[...] raise IndexError,
        # so callers that catch ValueError per row handle malformed input too.
        raise ValueError(f"Expected '<month> <day>', got: {text!r}")
    month_token, day_token = parts[0], parts[1]
    month = _MONTH_ABBR.get(month_token.lower())
    if month is None:
        raise ValueError(f"Unknown month abbreviation: {month_token}")
    return date(year, month, int(day_token))


_US_STATES = {
    "AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA",
    "KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
    "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT",
    "VA","WA","WV","WI","WY","DC",
}

# Matches: text followed by " - " and a location suffix
_LOCATION_DASH_RE = re.compile(r"^(.+?)\s+-\s+[A-Za-z].{0,20}[A-Z]{2}$")
# Matches: word directly concatenated with a known 2-letter state code at the end
_LOCATION_CONCAT_RE = re.compile(r"^(.*?)\s*\S+([A-Z]{2})$")


def _strip_location(text: str) -> str:
    """
    Strip Capital One city+state PDF rendering artifacts.
    e.g. "TST*CHUNKYS - MANCHESTEManchesterNH" → "TST*CHUNKYS"
         "WALGREENS #10378EAST HAMPSTEANH"      → "WALGREENS #10378"
         "KOHLS 0538SALEMNH"                    → "KOHLS 0538"
    """
    # Strip " - CityState" suffix
    m = _LOCATION_DASH_RE.match(text)
    if m:
        last_two = m.group(0)[-2:].upper()
        if last_two in _US_STATES:
            return m.group(1).strip()

    # Strip concatenated CityStateCode at the end (no space before state)
    # Only if the last 2 chars are a known US state abbreviation
    if len(text) >= 2 and text[-2:].upper() in _US_STATES:
        # Walk back to find where the city name starts (last run of uppercase letters)
        stripped = re.sub(r'[A-Za-z]+[A-Z]{2}$', '', text).strip()
        if stripped and len(stripped) >= 3:
            return stripped

    return text


def _extract_year(lines: list[tuple[int, str]]) -> int:
    """Extract year from 'Apr 26, 2026 - May 26, 2026 | N days in Billing Cycle'."""
    for _, line in lines:
        m = re.search(r"\b(20\d{2})\b", line)
        if m:
            return int(m.group(1))
    return datetime.now().year
