"""
Extractor de PDF mejorado — Convastro
Reconoce datos de:
- Menús de restaurante (platos, precios, categorías)
- Facturas de proveedores (costes, IVA)
- Resúmenes de TPV (ventas, platos vendidos, servicios)
- Escandallo / fichas técnicas (ingredientes, costes)
"""

import re
from typing import Dict, List, Optional

try:
    import pdfplumber
    HAS_PDF = True
except ImportError:
    HAS_PDF = False


class PDFExtractor:
    """Extracción inteligente de datos desde PDFs de hostelería."""

    def __init__(self) -> None:
        """Remember whether PDF support is usable in this process."""
        # HAS_PDF is decided once at import time, depending on whether
        # ``import pdfplumber`` succeeded.
        pdf_backend_present = HAS_PDF
        self.available = pdf_backend_present

    def extract(self, filepath: str) -> Dict:
        """Read a PDF, classify it and run the matching extractor.

        Args:
            filepath: Path handed to ``pdfplumber.open``.

        Returns:
            A dict that always carries ``"available"``. On failure it also
            carries an ``"error"`` message. On success it carries
            ``"tipo_documento"``, ``"texto_completo"`` (first 5000 characters
            of the extracted text), ``"tablas_encontradas"`` and the payload
            produced by the extractor chosen for the detected document type.
        """
        if not HAS_PDF:
            return {"error": "pdfplumber no instalado", "available": False}

        try:
            page_texts = []
            all_tables = []
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    # Each page contributes its text (possibly empty) plus a
                    # newline, and whatever tables were found on it.
                    page_texts.append((page.extract_text() or "") + "\n")
                    all_tables.extend(page.extract_tables() or [])
            all_text = "".join(page_texts)

            if not all_text.strip():
                return {"error": "PDF vacío o escaneado (sin texto)", "available": True}

            doc_type = self._detect_type(all_text)

            result = {
                "tipo_documento": doc_type,
                "texto_completo": all_text[:5000],
                "tablas_encontradas": len(all_tables),
                "available": True,
            }

            # Document type -> (result key, extractor method).
            handlers = {
                "menu": ("platos", self._extract_menu),
                "factura": ("factura", self._extract_invoice),
                "tpv": ("tpv", self._extract_tpv),
                "escandallo": ("escandallo", self._extract_escandallo),
            }

            if doc_type in handlers:
                result_key, handler = handlers[doc_type]
                result[result_key] = handler(all_text, all_tables)
            else:
                # Unknown document: salvage whatever can be extracted.
                result["platos"] = self._extract_menu(all_text, all_tables)
                result["datos_generales"] = self._extract_numbers(all_text)

            return result

        except Exception as e:
            # Top-level boundary: any failure is reported as an error payload
            # instead of being raised to the caller.
            return {"error": f"Error leyendo PDF: {str(e)}", "available": True}

    # ========================================
    # DETECCIÓN DE TIPO DE DOCUMENTO
    # ========================================

    def _detect_type(self, text: str) -> str:
        """Classify the document by counting keyword hits in its text.

        Each rule is checked in order; the first one whose number of distinct
        keywords found (case-insensitive substring match) reaches its
        threshold decides the result.

        Returns:
            One of ``"tpv"``, ``"factura"``, ``"escandallo"``, ``"menu"`` or
            ``"generico"`` when no rule reaches its threshold.
        """
        haystack = text.lower()

        # (label, keywords, minimum number of distinct keywords required).
        rules = (
            ("tpv",
             ("cierre", "resumen de ventas", "z report", "caja", "turno",
              "ticket", "mesa", "camarero", "servicio comida", "servicio cena",
              "total ventas", "operaciones", "forma de pago", "efectivo",
              "tarjeta", "agora", "revo", "glop", "icg", "last.app",
              "informe diario", "informe z", "arqueo"),
             3),
            ("factura",
             ("factura", "nif", "cif", "base imponible", "iva",
              "proveedor", "albarán", "nº factura", "total factura",
              "forma de pago", "vencimiento", "irpf"),
             3),
            ("escandallo",
             ("escandallo", "ficha técnica", "ingrediente", "rendimiento",
              "merma", "coste unitario", "gramaje", "ración", "mise en place"),
             2),
            ("menu",
             ("menú", "carta", "entrante", "principal", "postre", "precio",
              "plato", "pvp", "€", "primero", "segundo", "aperitivo",
              "bebida", "ensalada", "carne", "pescado", "marisco"),
             2),
        )

        for label, keywords, min_hits in rules:
            hits = len([kw for kw in keywords if kw in haystack])
            if hits >= min_hits:
                return label

        return "generico"

    # ========================================
    # EXTRACCIÓN DE MENÚ / CARTA
    # ========================================

    def _extract_menu(self, text: str, tables: List) -> List[Dict]:
        """Collect dishes from tables, falling back to free text, deduplicated.

        Tables are parsed first. The free-text parser is only consulted when
        the tables produced fewer than three dishes. Dishes sharing the same
        (lower-cased, stripped name, price) pair are reported once, keeping
        the first occurrence; dishes with a blank name are dropped.
        """
        candidates = [
            dish
            for table in tables
            for dish in self._parse_menu_table(table)
        ]

        if len(candidates) < 3:
            candidates += self._parse_menu_text(text)

        # Dicts keep insertion order, so setdefault gives "first one wins".
        by_identity = {}
        for dish in candidates:
            name = dish["nombre"]
            if not name.strip():
                continue
            identity = (name.lower().strip(), dish.get("precio", 0))
            by_identity.setdefault(identity, dish)

        return list(by_identity.values())

    def _parse_menu_table(self, table: List) -> List[Dict]:
        """Extract dish records from one PDF table.

        The first row is treated as the header and used to locate the name,
        price, cost and units columns. Every following row that has a name of
        at least two characters and at least one of price / cost / units
        becomes a dish dict with keys ``nombre``, ``precio``, ``coste_mp``,
        ``unidades`` and ``categoria``.

        Rows shorter than the header are still parsed: a missing optional
        cell simply counts as "no value" instead of discarding the whole row.

        Args:
            table: List of rows, each row a list of cell values (str or None).

        Returns:
            List of dish dicts; empty when the table has fewer than two rows.
        """
        platos = []
        if not table or len(table) < 2:
            return platos

        # Locate the columns from the (normalised) header row.
        header = [str(c).lower().strip() if c else "" for c in table[0]]
        col_map = self._detect_columns(header)

        def cell(row, field):
            """Return the cell for *field*, or None if unmapped / out of range."""
            idx = col_map.get(field)
            if idx is None or idx < 0 or idx >= len(row):
                return None
            return row[idx]

        for row in table[1:]:
            if not row or all(not c for c in row):
                continue

            nombre = str(cell(row, "nombre") or "").strip()
            if len(nombre) < 2:
                continue

            precio = self._parse_price(cell(row, "precio"))
            coste = self._parse_price(cell(row, "coste"))
            uds = self._parse_int(cell(row, "unidades"))

            # No price column value: the price may be embedded at the end of
            # the name ("Sopa 5,00€"); if so, strip it from the name.
            if not precio:
                precio = self._find_price_in_text(nombre)
                if precio:
                    nombre = re.sub(r'\s*[\d,.]+\s*€?\s*$', '', nombre).strip()

            if nombre and (precio or coste or uds):
                platos.append({
                    "nombre": nombre,
                    "precio": precio or 0,
                    "coste_mp": coste or 0,
                    "unidades": uds or 0,
                    "categoria": self._guess_category(nombre),
                })

        return platos

    def _detect_columns(self, header: List[str]) -> Dict:
        """Map logical fields to column indices based on header keywords.

        Each header cell is matched (case-insensitive substring) against the
        keyword groups below, in order; the first group that matches claims
        the column. When several columns match the same field, the last one
        wins.

        If no column is recognised as the name column, the first column that
        is not already assigned to another field and whose header looks like
        text (more than one character, no digits) is used; failing that, the
        first unassigned column; failing that, column 0.

        Args:
            header: Header cells of the table.

        Returns:
            Dict with a subset of the keys ``nombre``, ``precio``, ``coste``,
            ``unidades``, ``categoria`` mapped to column indices. ``nombre``
            is always present.
        """
        # Order matters: a header is assigned to the first group it matches.
        rules = (
            ("nombre", ("plato", "nombre", "descripci", "artículo", "producto", "concepto")),
            ("precio", ("pvp", "precio", "venta", "p.v", "importe", "€")),
            ("coste", ("coste", "costo", "mp", "compra")),
            ("unidades", ("ud", "cant", "unidad", "vendid", "qty", "cantidad")),
            ("categoria", ("categ", "tipo", "familia", "grupo")),
        )

        labels = [str(h or "").lower() for h in header]

        col_map = {}
        for i, label in enumerate(labels):
            for field, needles in rules:
                if any(w in label for w in needles):
                    col_map[field] = i
                    break

        if "nombre" not in col_map:
            # Never reuse a column that was already identified as something
            # else (e.g. the price column) as the name column.
            taken = set(col_map.values())
            free = [i for i in range(len(labels)) if i not in taken]
            text_like = [
                i for i in free
                if len(labels[i]) > 1 and not any(c.isdigit() for c in labels[i])
            ]
            if text_like:
                col_map["nombre"] = text_like[0]
            elif free:
                col_map["nombre"] = free[0]
            else:
                col_map["nombre"] = 0

        return col_map

    def _parse_menu_text(self, text: str) -> List[Dict]:
        """Extract dishes from free text using several regex layouts.

        Recognised layouts (name must start with an upper-case letter unless
        it is a bullet item):

        * tabular: ``Solomillo    18.50    6.50    120`` (price, cost, units)
        * ``Solomillo al roquefort 18,50€``
        * ``Solomillo al roquefort ... 18,50``
        * ``Solomillo al roquefort    18.50``
        * ``- Solomillo 18,50€``

        The tabular layout is tried first so that, when the same line also
        matches a poorer layout, the record carrying cost and units is the
        one that is kept. A dish (same lower-cased name and price) is
        reported only once.

        Lines whose name contains an accounting word (total, subtotal, IVA,
        base, descuento, página) as a whole word are ignored.

        Returns:
            List of dish dicts with keys ``nombre``, ``precio``, ``coste_mp``,
            ``unidades`` and ``categoria``.
        """
        patterns = [
            # Tabular: "Solomillo    18.50    6.50    120"
            r'([A-ZÀ-Ú][a-záéíóúñ\s]{3,30})\s+(\d{1,3}[.,]\d{2})\s+(\d{1,3}[.,]\d{2})\s+(\d{1,5})',
            # "Solomillo al roquefort 18,50€"
            r'([A-ZÀ-Ú][a-záéíóúñ\s]{3,40})\s+(\d{1,3}[.,]\d{2})\s*€',
            # "Solomillo al roquefort ... 18,50"
            r'([A-ZÀ-Ú][a-záéíóúñ\s]{3,40})\s*\.{2,}\s*(\d{1,3}[.,]\d{2})',
            # "Solomillo al roquefort    18.50"
            r'([A-ZÀ-Ú][a-záéíóúñ\s]{3,40})\s{3,}(\d{1,3}[.,]\d{2})',
            # "- Solomillo 18,50€"
            r'[-•·]\s*([A-Za-záéíóúñÀ-Ú][a-záéíóúñ\s]{3,40})\s+(\d{1,3}[.,]\d{2})\s*€?',
        ]

        # Whole-word match: a plain substring test would reject real dishes
        # such as "Olivas" (contains "iva").
        noise = re.compile(
            r'\b(?:(?:sub)?total(?:es)?|iva|bases?|descuentos?|páginas?)\b'
        )

        platos = []
        seen = set()
        for pattern in patterns:
            for m in re.finditer(pattern, text):
                groups = m.groups()
                # The name class allows any whitespace, so a match can span
                # a line break; collapse runs of whitespace to single spaces.
                nombre = " ".join(groups[0].split())

                if noise.search(nombre.lower()):
                    continue

                precio = self._parse_price(groups[1])
                key = (nombre.lower(), precio)
                if key in seen:
                    continue
                seen.add(key)

                platos.append({
                    "nombre": nombre,
                    "precio": precio,
                    "coste_mp": self._parse_price(groups[2]) if len(groups) > 2 else 0,
                    "unidades": self._parse_int(groups[3]) if len(groups) > 3 else 0,
                    "categoria": self._guess_category(nombre),
                })

        return platos

    # ========================================
    # EXTRACCIÓN TPV (resumen de ventas)
    # ========================================

    def _extract_tpv(self, text: str, tables: List) -> Dict:
        """Extract figures from a POS (TPV) sales summary / Z report.

        Looks for labelled amounts in the lower-cased text and parses every
        table as a list of sold dishes. Amounts may use thousands separators
        (``1.234,56`` or ``1,234.56``) as well as the plain ``1234,56`` form.

        Args:
            text: Full text of the document.
            tables: Tables extracted from the document.

        Returns:
            Dict with ``total_ventas``, ``num_operaciones``, ``ticket_medio``
            (read from the text, or total / operations when absent),
            ``formas_pago`` and ``servicios`` (label -> amount, only for the
            labels found) and ``platos_vendidos``.
        """
        result = {
            "total_ventas": 0,
            "num_operaciones": 0,
            "ticket_medio": 0,
            "formas_pago": {},
            "platos_vendidos": [],
            "servicios": {},
        }

        text_lower = text.lower()

        # Amount with optional thousands groups. The grouped alternative must
        # come first, otherwise "1.234,56" would be read as "1.23".
        amount = r'(\d{1,3}(?:[.,]\d{3})+[.,]\d{2}|\d{1,7}[.,]\d{2})(?!\d)'

        def first_amount(patterns):
            """Return the amount captured by the first pattern that matches."""
            for pattern in patterns:
                m = re.search(pattern, text_lower)
                if m:
                    return self._parse_price(m.group(1))
            return 0

        # Total sales. The generic "total ... €" pattern uses a word boundary
        # so that "subtotal" is not mistaken for the total.
        result["total_ventas"] = first_amount([
            r'total\s*(?:ventas|neto|bruto)\s*[:\s]*' + amount,
            r'\btotal\b\s*[:\s]*' + amount + r'\s*€',
            r'ventas\s*totales?\s*[:\s]*' + amount,
            r'importe\s*total\s*[:\s]*' + amount,
        ])

        # Number of operations / tickets
        for pattern in [
            r'(?:nº|num|número)\s*(?:de\s*)?(?:operaciones?|tickets?|comandas?)\s*[:\s]*(\d+)',
            r'(\d+)\s*(?:operaciones?|tickets?|comandas?)',
        ]:
            m = re.search(pattern, text_lower)
            if m:
                result["num_operaciones"] = int(m.group(1))
                break

        # Average ticket
        result["ticket_medio"] = first_amount([
            r'ticket\s*medio\s*[:\s]*' + amount,
            r'media\s*(?:por\s*)?(?:ticket|mesa|comensal)\s*[:\s]*' + amount,
        ])

        # Derive the average ticket when the report does not state it.
        if result["total_ventas"] > 0 and result["num_operaciones"] > 0 and result["ticket_medio"] == 0:
            result["ticket_medio"] = result["total_ventas"] / result["num_operaciones"]

        # Payment methods
        for fp in ["efectivo", "tarjeta", "bizum", "transferencia"]:
            m = re.search(fp + r'\s*[:\s]*' + amount, text_lower)
            if m:
                result["formas_pago"][fp] = self._parse_price(m.group(1))

        # Sold dishes (from tables)
        for table in tables:
            result["platos_vendidos"].extend(self._parse_menu_table(table))

        # Services (meal periods)
        for serv in ["comida", "cena", "desayuno", "almuerzo"]:
            m = re.search(serv + r'\s*[:\s]*' + amount, text_lower)
            if m:
                result["servicios"][serv] = self._parse_price(m.group(1))

        return result

    # ========================================
    # EXTRACCIÓN FACTURA
    # ========================================

    def _extract_invoice(self, text: str, tables: List) -> Dict:
        """Extract header figures and line items from a supplier invoice.

        Args:
            text: Full text of the document.
            tables: Tables extracted from the document; every row after the
                header with a name in the first cell and a positive amount in
                one of the other cells becomes an item.

        Returns:
            Dict with ``proveedor`` (text after a ``Proveedor:`` label, if
            any), ``nif``, ``fecha``, ``base_imponible``, ``iva``, ``total``
            and ``items`` (list of ``{"item", "importe"}``). Fields that are
            not found keep their empty / zero default.
        """
        result = {
            "proveedor": "",
            "nif": "",
            "fecha": "",
            "base_imponible": 0,
            "iva": 0,
            "total": 0,
            "items": [],
        }

        text_lower = text.lower()

        # Amount with optional thousands groups. The grouped alternative must
        # come first, otherwise "1.234,56" would be read as "1.23".
        amount = r'(\d{1,3}(?:[.,]\d{3})+[.,]\d{2}|\d{1,7}[.,]\d{2})(?!\d)'

        # Supplier: only taken from an explicit "Proveedor: ..." label.
        m = re.search(r'\bproveedor\b[ \t]*:[ \t]*(\S[^\n]*)', text, re.IGNORECASE)
        if m:
            result["proveedor"] = m.group(1).strip()[:80]

        # NIF/CIF: the label must start a word and the identifier must
        # contain at least one digit, so ordinary words that merely contain
        # "nif"/"cif" are not mistaken for a tax id.
        m = re.search(
            r'\b[nc]\.?i\.?f\.?\s*[:\s]*((?=[A-Z0-9]*\d)[A-Z0-9]{8,10})',
            text, re.IGNORECASE,
        )
        if m:
            result["nif"] = m.group(1)

        # Date
        m = re.search(r'fecha\s*[:\s]*(\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4})', text_lower)
        if m:
            result["fecha"] = m.group(1)

        # Taxable base
        m = re.search(r'base\s*imponible\s*[:\s]*' + amount, text_lower)
        if m:
            result["base_imponible"] = self._parse_price(m.group(1))

        # VAT amount. The optional rate is only skipped when it is clearly a
        # rate ("21%", "(21 %)", or a bare 1-2 digit number followed by
        # whitespace); otherwise it would eat the first digits of the amount
        # ("iva 210,00" -> "0,00").
        m = re.search(
            r'\biva\b\s*'
            r'(?:\(?\s*\d{1,2}(?:[.,]\d{1,2})?\s*%\s*\)?|\d{1,2}(?=\s))?'
            r'\s*[:\s]*' + amount,
            text_lower,
        )
        if m:
            result["iva"] = self._parse_price(m.group(1))

        # Total. The word boundary keeps "subtotal" from matching.
        for p in [r'\btotal\s*factura\s*[:\s]*' + amount,
                  r'\btotal\b\s*[:\s]*' + amount + r'\s*€']:
            m = re.search(p, text_lower)
            if m:
                result["total"] = self._parse_price(m.group(1))
                break

        # Line items from tables (first row of each table is the header).
        for table in tables:
            for row in (table or [])[1:]:
                if row and len(row) >= 2:
                    nombre = str(row[0] or "").strip()
                    # Skip the name cell: digits inside a description
                    # ("Tomate 2kg") are not an amount.
                    precio = self._find_price_in_row(row[1:])
                    if nombre and precio:
                        result["items"].append({"item": nombre, "importe": precio})

        return result

    # ========================================
    # EXTRACCIÓN ESCANDALLO
    # ========================================

    def _extract_escandallo(self, text: str, tables: List) -> Dict:
        """Extract a recipe costing sheet (escandallo / ficha técnica).

        Args:
            text: Full text of the document.
            tables: Tables extracted from the document; every row after the
                header with a name longer than one character in the first
                cell becomes an ingredient, except total / subtotal rows.

        Returns:
            Dict with ``plato`` (at most 60 characters), ``raciones``,
            ``ingredientes`` (list of ``{"ingrediente", "coste"}``),
            ``coste_total`` (sum of ingredient costs) and ``coste_racion``
            (``coste_total / raciones`` when ``raciones`` is positive).
        """
        result = {
            "plato": "",
            "raciones": 0,
            "coste_total": 0,
            "coste_racion": 0,
            "ingredientes": [],
        }

        # Dish name: prefer an explicit "label: value" pair, so that a
        # heading such as "Ficha técnica" does not yield "técnica" as the
        # dish name. Fall back to the looser pattern when there is none.
        m = re.search(
            r'\b(?:plato|receta|ficha(?:\s+t[eé]cnica)?)\b\s*:\s*(.+)',
            text, re.IGNORECASE,
        )
        if not m:
            m = re.search(r'(?:plato|receta|ficha)\s*[:\s]*(.+)', text, re.IGNORECASE)
        if m:
            result["plato"] = m.group(1).strip()[:60]

        m = re.search(r'(?:raciones?|porciones?)\s*[:\s]*(\d+)', text, re.IGNORECASE)
        if m:
            result["raciones"] = int(m.group(1))

        for table in tables:
            for row in (table or [])[1:]:
                if not row or len(row) < 2:
                    continue
                nombre = str(row[0] or "").strip()
                if len(nombre) <= 1:
                    continue
                # Summary rows would otherwise be added as ingredients and
                # counted a second time in coste_total.
                if re.match(r'(?:sub)?total(?:es)?\b|coste\s+total\b', nombre.lower()):
                    continue
                # Skip the name cell: digits inside an ingredient name
                # ("Huevo L 12") are not a cost.
                coste = self._find_price_in_row(row[1:])
                result["ingredientes"].append({"ingrediente": nombre, "coste": coste or 0})

        # Costs are 2-decimal values; rounding removes float summation noise.
        result["coste_total"] = round(sum(i["coste"] for i in result["ingredientes"]), 2)
        if result["raciones"] > 0:
            result["coste_racion"] = result["coste_total"] / result["raciones"]

        return result

    # ========================================
    # UTILIDADES
    # ========================================

    def _parse_price(self, value) -> float:
        """Parse a monetary value written in Spanish or English notation.

        Everything except digits, ``.``, ``,`` and ``-`` is discarded, and
        trailing separators (e.g. the full stop ending a sentence) are
        dropped. The separators are then interpreted as follows:

        * both ``.`` and ``,`` present: the right-most one is the decimal
          mark, the other is a thousands separator (``1.234,56`` and
          ``1,234.56`` both give ``1234.56``);
        * only commas: the last comma is the decimal mark when followed by
          at most two digits (``18,50``), otherwise all commas are thousands
          separators (``1,234``);
        * several dots and no comma: thousands separators, unless the last
          group is not three digits long, in which case the last dot is the
          decimal mark (``1.234.567`` -> ``1234567``);
        * a single dot: decimal mark (``18.50``).

        Args:
            value: Any value; it is converted with ``str``. ``None`` gives 0.

        Returns:
            The amount rounded to two decimals, or 0 when nothing parseable
            remains.
        """
        if value is None:
            return 0.0

        s = re.sub(r'[^\d.,\-]', '', str(value)).rstrip('.,')
        if not s:
            return 0.0

        has_comma = ',' in s
        has_dot = '.' in s

        if has_comma and has_dot:
            # 1.234,56 -> 1234.56   /   1,234.56 -> 1234.56
            if s.rindex(',') > s.rindex('.'):
                s = s.replace('.', '').replace(',', '.')
            else:
                s = s.replace(',', '')
        elif has_comma or s.count('.') > 1:
            sep = ',' if has_comma else '.'
            head, _, tail = s.rpartition(sep)
            head = head.replace(sep, '')
            if has_comma:
                last_is_decimal = len(tail) <= 2
            else:
                last_is_decimal = len(tail) != 3
            s = f"{head}.{tail}" if last_is_decimal else head + tail

        try:
            return round(float(s), 2)
        except ValueError:
            return 0.0

    def _parse_int(self, value) -> int:
        """Parse a whole quantity from a cell value.

        Non-numeric characters are ignored. A trailing decimal part of one or
        two digits is dropped rather than concatenated, so ``"120,00"`` gives
        120 (not 12000); three-digit groups are kept as thousands, so
        ``"1.234"`` gives 1234. Any sign is ignored.

        Args:
            value: Any value; it is converted with ``str``. ``None`` gives 0.

        Returns:
            The parsed integer, or 0 when the value contains no digits.
        """
        if value is None:
            return 0
        s = re.sub(r'[^\d.,]', '', str(value)).rstrip('.,')
        # Drop "<sep><1-2 digits>" at the end only when digits precede it, so
        # that a stray leading separator ("Ref. 12") does not erase the number.
        s = re.sub(r'(?<=\d)[.,]\d{1,2}$', '', s)
        digits = re.sub(r'\D', '', s)
        return int(digits) if digits else 0

    def _find_price_in_text(self, text: str) -> float:
        """Return the first price-like number (two decimals) found in *text*.

        The number must be complete: it may not start in the middle of a
        longer number, nor be followed by further digits. Thousands groups
        are accepted, so ``"1.234,56"`` is read as 1234.56 instead of 1.23.

        Returns:
            The parsed price, or 0 when the text contains no such number.
        """
        m = re.search(
            r'(?<!\d)(?<!\d[.,])'
            r'(\d{1,3}(?:[.,]\d{3})+[.,]\d{2}|\d+[.,]\d{2})'
            r'(?!\d)',
            text,
        )
        return self._parse_price(m.group(1)) if m else 0

    def _find_price_in_row(self, row: List) -> float:
        """Return the right-most positive price in a table row.

        Cells are scanned from the last one to the first; each is parsed with
        ``_parse_price`` and the first result greater than zero is returned.

        Returns:
            The price found, or 0 when no cell parses to a positive amount.
        """
        parsed_right_to_left = (self._parse_price(cell) for cell in reversed(row))
        return next((amount for amount in parsed_right_to_left if amount > 0), 0)

    def _extract_numbers(self, text: str) -> Dict:
        """Collect every amount in *text* that is followed by a euro sign.

        Amounts must have two decimals and may use thousands separators;
        each one is matched as a whole, so ``"1.234,56 €"`` yields 1234.56
        rather than the tail ``234,56``.

        Returns:
            Dict with ``precios_encontrados`` (parsed amounts, in order of
            appearance) and ``total_precios`` (how many were found).
        """
        prices = re.findall(
            r'(?<!\d)(?<!\d[.,])'
            r'(\d{1,3}(?:[.,]\d{3})+[.,]\d{2}|\d+[.,]\d{2})'
            r'\s*€',
            text,
        )
        return {
            "precios_encontrados": [self._parse_price(p) for p in prices],
            "total_precios": len(prices),
        }

    def _guess_category(self, nombre: str) -> str:
        """Guess the menu category of a dish from keywords in its name.

        Keywords are matched as whole words (optionally pluralised with
        ``s`` / ``es``), case-insensitively. Matching whole words avoids
        false hits such as "agua" inside "aguacate" or "té" inside "paté".
        Categories are checked in the order entrante, postre, bebida, menu;
        the first one with a matching keyword wins.

        Returns:
            ``"entrante"``, ``"postre"``, ``"bebida"``, ``"menu"`` or, when no
            keyword matches, ``"principal"``.
        """
        n = str(nombre or "").lower()

        rules = (
            ("entrante", ["ensalada", "entrante", "aperitivo", "croqueta", "tapa",
                          "sopa", "crema", "gazpacho", "jamón"]),
            ("postre", ["tarta", "postre", "helado", "flan", "brownie", "tiramisú",
                        "natilla", "fruta", "coulant"]),
            ("bebida", ["café", "agua", "vino", "cerveza", "copa", "refresco",
                        "zumo", "infusión", "té", "gin"]),
            ("menu", ["menú", "menu del día", "menu completo", "combinado"]),
        )

        for category, keywords in rules:
            alternatives = "|".join(re.escape(k) for k in keywords)
            if re.search(rf'\b(?:{alternatives})(?:e?s)?\b', n):
                return category

        return "principal"
