"""
Ollama Integration Service for BMC Hub
Handles supplier invoice extraction using Ollama LLM with CVR matching
"""

import json
import hashlib
import logging
import os
from pathlib import Path
from typing import Optional, Dict, List, Tuple
from datetime import datetime
import re

from app.core.config import settings
from app.core.database import execute_insert, execute_query, execute_update, execute_query_single

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class OllamaService:
    """Service for extracting supplier invoice data using Ollama LLM"""
    
    def __init__(self):
        """Read the Ollama endpoint and model from settings and cache the system prompt."""
        self.endpoint, self.model = settings.OLLAMA_ENDPOINT, settings.OLLAMA_MODEL
        # The prompt is built once here and reused for every extraction call.
        self.system_prompt = self._build_system_prompt()
        logger.info(f"🤖 Initialized OllamaService: {self.endpoint}, model={self.model}")
    
    def _build_system_prompt(self) -> str:
        """
        Build the Danish system prompt used for invoice extraction.

        The prompt is a fixed template (rules, the expected JSON layout and two
        worked examples) with one extra rule block, numbered "4b", spliced in
        between rule 4 and rule 5. That block instructs the model never to
        report our own CVR numbers as ``vendor_cvr``.

        Returns:
            The complete prompt text.
        """
        # Own CVR comes from settings.OWN_CVR, falling back to '29522790' when unset.
        own_cvr = getattr(settings, 'OWN_CVR', '29522790')
        # BMC has two CVR numbers – both are OURS (buyer), never the supplier
        own_cvr_rule = (
            f"4b. KRITISK - LEVERANDØR vs. MODTAGER:\n"
            f"  - På en dansk faktura er LEVERANDØREN (vendor) det firma der HAR SENDT fakturaen.\n"
            f"    De kendes på: firmalogo øverst, bankkonto/IBAN/Gironr. nedad, ingen 'Faktureres til' label.\n"
            f"  - MODTAGEREN (os, buyer) kendes på: navnes under 'Faktureres til', 'Att.', 'Kundenr.', adresseblok med vores navn.\n"
            f"  - BMC DENMARK APS og alle varianter af 'BMC' er ALDRIG leverandøren – det er os (modtageren).\n"
            f"  - CVR {own_cvr} er VORES eget CVR. Sæt ALDRIG vendor_cvr til {own_cvr}.\n"
            f"  - CVR 14416285 er også VORES CVR. Sæt ALDRIG vendor_cvr til 14416285.\n"
            f"  - Ignorer 'SE/CVR-nr.' der hører til modtager-blokken – brug KUN afsenderens CVR som vendor_cvr.\n"
        )
        # The template is split around rule 4b; own_cvr_rule ends with a newline so rule 5 starts on its own line.
        return ("""Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer, kreditnotaer og leverandørdokumenter.

VIGTIGE REGLER:
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
2. Hvis et felt ikke findes, sæt det til null
3. Beregn confidence baseret på hvor sikker du er på hvert felt (0.0-1.0)
4. Datoer skal være i format YYYY-MM-DD
""" + own_cvr_rule + """5. DANSKE PRISFORMATER: 
   - Tusind-separator kan være . (punkt) eller mellemrum: "5.965,18" eller "5 965,18"
   - Decimal-separator er , (komma): "1.234,56 kr"
   - I JSON output skal du bruge . (punkt) som decimal: 1234.56
   - Eksempel: "5.965,18 kr" → 5965.18 i JSON
   - Eksempel: "1.234,56 DKK" → 1234.56 i JSON
6. CVR-nummer skal være 8 cifre uden mellemrum
7. Moms/VAT skal udtrækkes fra hver linje hvis muligt
8. DOKUMENTTYPE DETEKTION:
   - "invoice" = Almindelig faktura
   - "credit_note" = Kreditnota (refusion, tilbagebetaling, korrektion)
   - Kig efter ord som: "Kreditnota", "Credit Note", "Refusion", "Tilbagebetaling", "Godtgørelse"
9. BELØB OG FORTEGN (ABSOLUT KRITISK):
   - **ALMINDELIGE FAKTURAER**: Alle beløb skal være POSITIVE tal (total_amount > 0, line_total > 0)
   - **KREDITNOTAER**: Alle beløb skal være NEGATIVE tal (total_amount < 0, line_total < 0)
   - Hvis dokumentet siger "Faktura" → document_type: "invoice" → POSITIVE beløb
   - Hvis dokumentet siger "Kreditnota" → document_type: "credit_note" → NEGATIVE beløb

JSON format skal være:
{
  "document_type": "invoice" eller "credit_note",
  "invoice_number": "fakturanummer eller kreditnota nummer",
  "vendor_name": "leverandør firmanavn",
  "vendor_cvr": "12345678",
  "invoice_date": "YYYY-MM-DD",
  "due_date": "YYYY-MM-DD",
  "currency": "DKK",
  "total_amount": 1234.56 (NEGATIVT for kreditnotaer),
  "vat_amount": 123.45 (NEGATIVT for kreditnotaer),
  "original_invoice_reference": "reference til original faktura (kun for kreditnotaer)",
  "lines": [
    {
      "line_number": 1,
      "description": "beskrivelse af varen/ydelsen",
      "quantity": antal_som_tal,
      "unit_price": pris_per_stk (NEGATIVT for kreditnotaer),
      "line_total": total_for_linjen (NEGATIVT for kreditnotaer),
      "vat_rate": 25.00,
      "vat_amount": moms_beløb (NEGATIVT for kreditnotaer),
      "confidence": 0.0_til_1.0
    }
  ],
  "confidence": gennemsnits_confidence,
  "raw_text_snippet": "første 200 tegn fra dokumentet"
}

EKSEMPEL PÅ FAKTURA (POSITIVE BELØB):
Input: "FAKTURA 2025-001\\nGlobalConnect A/S\\nCVR: 12345678\\n1 stk iPhone 16 @ 5.965,18 DKK\\nMoms (25%): 1.491,30 DKK\\nTotal: 7.456,48 DKK"

Output: {
  "document_type": "invoice",
  "invoice_number": "2025-001",
  "vendor_name": "GlobalConnect A/S",
  "vendor_cvr": "12345678",
  "total_amount": 7456.48,
  "vat_amount": 1491.30,
  "lines": [{
    "line_number": 1,
    "description": "iPhone 16",
    "quantity": 1,
    "unit_price": 5965.18,
    "line_total": 5965.18,
    "vat_rate": 25.00,
    "vat_amount": 1491.30,
    "confidence": 0.95
  }],
  "confidence": 0.95
}

EKSEMPEL PÅ KREDITNOTA (NEGATIVE BELØB):
Input: "KREDITNOTA CN-2025-042\\nGlobalConnect A/S\\nCVR: 12345678\\nReference: Faktura 2025-001\\nTilbagebetaling:\\n1 stk iPhone 16 returneret @ -5.965,18 DKK\\nMoms (25%): -1.491,30 DKK\\nTotal: -7.456,48 DKK"

Output: {
  "document_type": "credit_note",
  "invoice_number": "CN-2025-042",
  "vendor_name": "GlobalConnect A/S",
  "vendor_cvr": "12345678",
  "original_invoice_reference": "2025-001",
  "total_amount": -7456.48,
  "vat_amount": -1491.30,
  "lines": [{
    "line_number": 1,
    "description": "iPhone 16 returneret",
    "quantity": 1,
    "unit_price": -5965.18,
    "line_total": -5965.18,
    "vat_rate": 25.00,
    "vat_amount": -1491.30,
    "confidence": 0.95
  }],
  "confidence": 0.95
}""")
    
    async def extract_from_text(self, text: str) -> Dict:
        """
        Extract structured invoice data from text using Ollama.

        Models whose name starts with ``qwen3`` are called through
        ``/api/chat`` (system + user message, JSON format, thinking disabled);
        every other model is called through ``/api/generate`` with a single
        combined prompt. The parsed result then has its amount signs
        normalised: positive for invoices, negative for credit notes.

        Args:
            text: Document text content (sent in full, no truncation)

        Returns:
            Extracted data as dict with CVR, invoice number, amounts, etc.,
            plus ``_raw_llm_response`` holding the unparsed model output.
            Failures after the initial logging are not raised; they are
            reported as a dict with an ``error`` message and ``confidence``
            0.0.
        """
        # Single source for the client timeout and the timeout error message.
        timeout_seconds = 1000.0

        logger.info(f"🤖 Extracting invoice data from text (length: {len(text)})")

        # Import separately so the exception classes are guaranteed to be
        # bound when the error handler below inspects the failure.
        try:
            import httpx
        except ImportError as e:
            error_msg = f"Ollama extraction failed: {str(e)}"
            logger.error(f"❌ {error_msg}")
            return {"error": error_msg, "confidence": 0.0}

        try:
            # Detect if using qwen3 model (requires Chat API)
            use_chat_api = self.model.startswith('qwen3')
            generation_options = {
                "temperature": 0.1,
                "top_p": 0.9,
                "num_predict": 8000
            }

            async with httpx.AsyncClient(timeout=timeout_seconds) as client:
                if use_chat_api:
                    # qwen3 models use Chat API format
                    logger.info(f"🤖 Using Chat API for {self.model}")
                    response = await client.post(
                        f"{self.endpoint}/api/chat",
                        json={
                            "model": self.model,
                            "messages": [
                                {
                                    "role": "system",
                                    "content": self.system_prompt
                                },
                                {
                                    "role": "user",
                                    "content": f"NU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA:\n{text}\n\nVIGTIGT: Dit svar skal STARTE med {{ og SLUTTE med }} - ingen forklaring før eller efter JSON!"
                                }
                            ],
                            "stream": False,
                            "format": "json",
                            "think": False,
                            "options": generation_options
                        }
                    )
                else:
                    # qwen2.5 and other models use Generate API format
                    logger.info(f"🤖 Using Generate API for {self.model}")
                    prompt = f"{self.system_prompt}\n\nNU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA:\n{text}\n\nReturner kun gyldig JSON:"
                    response = await client.post(
                        f"{self.endpoint}/api/generate",
                        json={
                            "model": self.model,
                            "prompt": prompt,
                            "stream": False,
                            "options": generation_options
                        }
                    )

            # The request is non-streaming, so the body is already loaded and
            # can be inspected after the client has been closed.
            if response.status_code != 200:
                raise Exception(f"Ollama returned status {response.status_code}: {response.text}")

            result = response.json()

            # Extract response based on API type
            if use_chat_api:
                # "or" guards against explicit nulls in the payload.
                message = result.get("message") or {}
                raw_response = message.get("content") or ""
                thinking = message.get("thinking") or ""

                # qwen3 models sometimes put the actual response in "thinking"
                if not raw_response and thinking:
                    logger.info(f"💭 Content empty, attempting to extract JSON from thinking field (length: {len(thinking)})")
                    json_start = thinking.find('{')
                    json_end = thinking.rfind('}') + 1
                    if json_start >= 0 and json_end > json_start:
                        potential_json = thinking[json_start:json_end]
                        logger.info(f"📦 Found potential JSON in thinking field (length: {len(potential_json)})")
                        raw_response = potential_json
                    else:
                        logger.warning(f"⚠️ No JSON found in thinking field, using full thinking as fallback")
                        raw_response = thinking
                elif thinking:
                    logger.info(f"💭 Model thinking (length: {len(thinking)})")

                # DEBUG: Log full result structure
                logger.info(f"📊 Chat API result keys: {list(result.keys())}")
                logger.info(f"📊 Message keys: {list(message.keys())}")
            else:
                raw_response = result.get("response") or ""

            logger.info(f"✅ Ollama extraction completed (response length: {len(raw_response)})")

            # Parse JSON from response
            extraction = self._parse_json_response(raw_response)

            # CRITICAL: the LLM sometimes returns the wrong sign for the
            # document type, so amounts are corrected after parsing.
            self._normalize_amount_signs(extraction)

            # Add raw response for debugging
            extraction['_raw_llm_response'] = raw_response

            return extraction

        except Exception as e:
            # Some httpx transport errors carry an empty message, so fall back
            # to the exception class name to keep the log/error informative.
            detail = str(e) or type(e).__name__
            error_msg = f"Ollama extraction failed: {detail}"
            logger.error(f"❌ {error_msg}")

            # Classify by exception type first; the substring checks are kept
            # as a fallback for errors raised outside httpx's hierarchy.
            error_str = detail.lower()
            if isinstance(e, httpx.TimeoutException) or "timeout" in error_str:
                return {
                    "error": f"Ollama timeout efter {int(timeout_seconds)} sekunder",
                    "confidence": 0.0
                }
            elif isinstance(e, httpx.ConnectError) or "connection" in error_str or "connect" in error_str:
                return {
                    "error": f"Kan ikke forbinde til Ollama på {self.endpoint}",
                    "confidence": 0.0
                }
            else:
                return {
                    "error": error_msg,
                    "confidence": 0.0
                }

    @staticmethod
    def _normalize_amount_signs(extraction: Dict) -> None:
        """Force amounts positive for invoices and negative for credit notes, in place.

        Only real numbers are touched: missing values, zero, strings and
        booleans are left as they are, and a missing/null/non-list ``lines``
        entry is skipped instead of raising.
        """
        document_type = extraction.get('document_type', 'invoice')
        if document_type == 'invoice':
            sign = 1
        elif document_type == 'credit_note':
            sign = -1
        else:
            # Unknown document type: there is no rule for the expected sign.
            return

        def _fix(container: Dict, key: str) -> bool:
            value = container.get(key)
            # bool is a subclass of int, so it is excluded explicitly.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                return False
            if value * sign >= 0:
                return False
            container[key] = sign * abs(value)
            return True

        original_total = extraction.get('total_amount')
        if _fix(extraction, 'total_amount'):
            wrong_sign = 'negative' if sign > 0 else 'positive'
            logger.warning(f"⚠️ Fixing {wrong_sign} total_amount for {document_type}: {original_total} → {extraction['total_amount']}")

        _fix(extraction, 'vat_amount')

        # "lines" may be null or malformed in LLM output.
        lines = extraction.get('lines')
        if not isinstance(lines, list):
            return
        for line in lines:
            if not isinstance(line, dict):
                continue
            for key in ('unit_price', 'line_total', 'vat_amount'):
                _fix(line, key)
    
    def _parse_json_response(self, response: str) -> Dict:
        """
        Parse JSON from an LLM response, trying progressively looser repairs.

        The text between the first ``{`` and the last ``}`` is tried as-is,
        then with trailing commas removed, then with JS-style comments removed
        (and trailing commas removed again, since stripping a comment can
        expose one). If that still fails, the first complete JSON object at
        the start of the candidate is decoded and any trailing garbage is
        ignored. As a last resort the key fields are pulled out with regexes.

        Args:
            response: Raw text returned by the model

        Returns:
            The parsed object, or the partial dict produced by
            ``_extract_fields_with_regex`` when no strategy succeeds.
        """
        logger.info(f"🔍 Response length: {len(response)}, preview: {response[:200]}")

        # Find outermost JSON object
        start = response.find('{')
        end = response.rfind('}') + 1
        if start < 0 or end <= start:
            logger.error("❌ No JSON object found in response")
            return self._extract_fields_with_regex(response)

        json_str = response[start:end]
        trailing_comma = r',(\s*[}\]])'

        # Strategy 2 input: remove trailing commas before } or ]
        without_commas = re.sub(trailing_comma, r'\1', json_str)
        # Strategy 3 input: remove JS-style comments (// and /* */).
        # NOTE: this also strips "//" inside string values (e.g. URLs); it is
        # only reached after a direct parse has already failed.
        without_comments = re.sub(r'//[^\n]*', '', without_commas)
        without_comments = re.sub(r'/\*.*?\*/', '', without_comments, flags=re.DOTALL)
        # Removing a comment can leave ", }" behind, so clean commas once more.
        cleaned = re.sub(trailing_comma, r'\1', without_comments)

        # Strategies 1-3: full parse of each progressively repaired candidate
        for candidate in (json_str, without_commas, without_comments, cleaned):
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        # Strategy 4: decode the first complete object and ignore whatever
        # follows it. All candidates start at the opening brace, so offsets
        # are relative to the candidate itself, never to the raw response.
        decoder = json.JSONDecoder()
        for candidate in (cleaned, json_str):
            try:
                data, consumed = decoder.raw_decode(candidate)
            except json.JSONDecodeError:
                continue
            logger.warning(f"⚠️ JSON truncated to position {consumed - 1} — partial parse OK")
            return data

        # Strategy 5: regex extraction of key fields (always succeeds with partial data)
        logger.warning("⚠️ All JSON strategies failed — using regex field extraction")
        return self._extract_fields_with_regex(response)

    def _extract_fields_with_regex(self, text: str) -> Dict:
        """
        Extract invoice fields from text using regex when JSON parsing fails.

        The patterns look for the JSON keys the model was asked to emit.
        Amounts are accepted both in JSON notation (``7456.48``) and in Danish
        notation (``7.456,48``), with an optional leading minus sign and
        optional surrounding quotes.

        Args:
            text: Raw model output that could not be parsed as JSON

        Returns:
            Dict with the header fields that could be found (``None``
            otherwise), an empty ``lines`` list, ``confidence`` 0.5 and
            ``_partial`` set to True.
        """
        def _find(pattern, default=None):
            m = re.search(pattern, text, re.IGNORECASE)
            if not m:
                return default
            value = m.group(1).strip()
            # A bare JSON null (or an empty capture) means "not present".
            if not value or value.lower() == 'null':
                return default
            return value

        def _to_float(raw):
            negative = raw.startswith('-')
            # The character class also swallows the JSON field separator
            # that follows the number, so trailing separators are dropped.
            digits = raw.lstrip('-').rstrip('.,')
            if not digits:
                return None
            dots, commas = digits.count('.'), digits.count(',')
            if dots and commas:
                # Whichever separator comes last is the decimal separator.
                if digits.rfind(',') > digits.rfind('.'):
                    digits = digits.replace('.', '').replace(',', '.')
                else:
                    digits = digits.replace(',', '')
            elif commas:
                # One comma is a Danish decimal comma; several are grouping.
                digits = digits.replace(',', '.') if commas == 1 else digits.replace(',', '')
            elif dots > 1:
                # Several dots can only be thousands grouping.
                digits = digits.replace('.', '')
            # A single dot is kept as the decimal point (JSON notation).
            try:
                value = float(digits)
            except ValueError:
                return None
            return -value if negative else value

        def _find_num(key):
            m = re.search(rf'"{key}"\s*:\s*"?(-?[\d.,]+)', text, re.IGNORECASE)
            return _to_float(m.group(1)) if m else None

        result = {
            "document_type":   _find(r'"document_type"\s*:\s*"([^"]+)"', 'invoice'),
            "invoice_number":  _find(r'"invoice_number"\s*:\s*"?([^",\n}]+)"?'),
            "vendor_name":     _find(r'"vendor_name"\s*:\s*"([^"]+)"'),
            "vendor_cvr":      _find(r'"vendor_cvr"\s*:\s*"?(\d{8})"?'),
            "invoice_date":    _find(r'"invoice_date"\s*:\s*"([^"]+)"'),
            "due_date":        _find(r'"due_date"\s*:\s*"([^"]+)"'),
            "currency":        _find(r'"currency"\s*:\s*"([^"]+)"', 'DKK'),
            "total_amount":    _find_num('total_amount'),
            "vat_amount":      _find_num('vat_amount'),
            "confidence":      0.5,
            "lines":           [],
            "_partial":        True,
        }
        logger.info(f"🔧 Regex extraction: vendor={result['vendor_name']}, cvr={result['vendor_cvr']}, total={result['total_amount']}")
        return result

    
    def calculate_file_checksum(self, file_path: Path) -> str:
        """Return the hex SHA-256 digest of the file, used for duplicate detection."""
        digest = hashlib.sha256()
        with open(file_path, 'rb') as handle:
            # Read in 8 KiB pieces; iter() stops at the empty bytes read at EOF.
            for block in iter(lambda: handle.read(8192), b''):
                digest.update(block)
        checksum = digest.hexdigest()
        logger.info(f"📋 Calculated checksum: {checksum[:16]}... for {file_path.name}")
        return checksum
    
    async def _extract_text_from_file(self, file_path: Path) -> str:
        """Return the text of a PDF, image or plain-text file, chosen by file extension.

        Any failure (including an unsupported extension, reported as
        ``ValueError``) is logged and re-raised.
        """
        suffix = file_path.suffix.lower()

        try:
            # Delegated formats first, each returning as soon as it matches.
            if suffix == '.pdf':
                return await self._extract_text_from_pdf(file_path)
            if suffix in ('.png', '.jpg', '.jpeg'):
                return await self._extract_text_from_image(file_path)
            if suffix not in ('.txt', '.csv'):
                raise ValueError(f"Unsupported file type: {suffix}")

            # Plain text: undecodable bytes are dropped rather than failing.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
                content = handle.read()
            return content

        except Exception as e:
            logger.error(f"❌ Text extraction failed for {file_path.name}: {e}")
            raise
    
    async def _extract_text_from_pdf(self, file_path: Path) -> str:
        """
        Extract text from a PDF using pdfplumber (better table/layout support).

        Pages that yield no text are skipped; the remaining page texts are
        joined with a newline.

        Args:
            file_path: Path to the PDF file

        Returns:
            The concatenated page text (empty string if no page had text).

        Raises:
            Exception: any error from importing or running pdfplumber is
                logged and re-raised unchanged.
        """
        try:
            import pdfplumber

            page_texts = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # Strategy: Use regular text extraction (includes tables)
                    page_text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)

                    if page_text:
                        page_texts.append(page_text)

            # A real newline between pages; the previous "\\n" inserted a
            # literal backslash followed by "n" into the extracted text.
            text = "\n".join(page_texts)
            logger.info(f"📄 Extracted {len(text)} chars from PDF with pdfplumber")
            return text

        except Exception as e:
            logger.error(f"❌ PDF extraction failed: {e}")
            raise
    
    async def _extract_text_from_image(self, file_path: Path) -> str:
        """
        Extract text from an image using Tesseract OCR.

        OCR is attempted with Danish + English first; if that attempt raises,
        the error is logged and OCR is retried with English only.

        Args:
            file_path: Path to the image file

        Returns:
            The recognised text.

        Raises:
            ImportError: if pytesseract or Pillow is not installed.
            Exception: errors from opening the image, or from the
                English-only retry, propagate to the caller.
        """
        # Imported before any try block: previously a failed import ended up in
        # the fallback path, which then failed on the unbound names and hid the
        # real ImportError.
        import pytesseract
        from PIL import Image

        # The context manager closes the image file handle in every outcome.
        with Image.open(file_path) as image:
            try:
                # Use Danish + English for OCR
                text = pytesseract.image_to_string(image, lang='dan+eng')
            except Exception as e:
                logger.error(f"❌ OCR extraction failed: {e}")
                # Fallback to English only
                text = pytesseract.image_to_string(image, lang='eng')
                logger.warning(f"⚠️ Fallback to English OCR: {len(text)} chars")
                return text

        logger.info(f"🖼️ Extracted {len(text)} chars from image via OCR")
        return text
    
    def _get_mime_type(self, file_path: Path) -> str:
        """Map the file's (case-insensitive) extension to a MIME type.

        Unknown extensions map to ``application/octet-stream``.
        """
        known_types = dict([
            ('.pdf', 'application/pdf'),
            ('.png', 'image/png'),
            ('.jpg', 'image/jpeg'),
            ('.jpeg', 'image/jpeg'),
            ('.txt', 'text/plain'),
            ('.csv', 'text/csv'),
        ])
        extension = file_path.suffix.lower()
        try:
            return known_types[extension]
        except KeyError:
            return 'application/octet-stream'
    
    async def quick_analysis_on_upload(self, pdf_text: str) -> Dict:
        """
        Quick analysis when file is uploaded - extracts critical fields only:
        - CVR number (to match vendor)
        - Document type (invoice vs credit note)
        - Invoice/credit note number

        This runs BEFORE template matching for early vendor detection and
        uses regexes only (no LLM call).

        Args:
            pdf_text: Extracted text from PDF

        Returns:
            Dict with cvr, document_type, document_number, vendor_id, vendor_name, is_own_invoice
        """
        logger.info("⚡ Running quick analysis on upload...")

        # Treat missing text as an empty document instead of crashing in re.
        pdf_text = pdf_text or ""

        result = {
            "cvr": None,
            "document_type": None,  # 'invoice' or 'credit_note'
            "document_number": None,
            "vendor_id": None,
            "vendor_name": None,
            "is_own_invoice": False  # True if this is an outgoing invoice (BMC's own CVR)
        }

        # Our own CVR numbers are the buyer's, never the vendor's. This uses
        # the same default and the same second CVR as the LLM system prompt,
        # so both code paths agree on what counts as "ours".
        primary_own_cvr = str(getattr(settings, 'OWN_CVR', '29522790'))
        own_cvrs = {primary_own_cvr, '14416285'}

        # 1. FIND CVR NUMBER (8 digits)
        # Look for patterns like "CVR: 12345678", "CVR-nr.: 12345678", "CVR 12345678"
        # Supplier invoices carry BOTH the buyer's and the seller's CVR; we
        # need the SELLER's. Patterns are ordered from most to least specific,
        # so labelled numbers end up first in found_cvrs.
        cvr_patterns = [
            r'CVR[:\-\s]*(\d{8})',
            r'CVR[:\-\s]*nr\.?\s*(\d{8})',
            r'CVR[:\-\s]*nummer\s*(\d{8})',
            r'SE[:\-\s]*(\d{8})',  # "SE" prefix, also used on Danish invoices
            r'\b(\d{8})\b'  # Fallback: any 8-digit number
        ]

        # Find ALL CVR candidates in the document, de-duplicated in order
        found_cvrs = []
        for pattern in cvr_patterns:
            for match in re.finditer(pattern, pdf_text, re.IGNORECASE):
                cvr_candidate = match.group(1)
                # Heuristic filter: only accept numbers starting with 1-4
                if cvr_candidate[0] in '1234' and cvr_candidate not in found_cvrs:
                    found_cvrs.append(cvr_candidate)

        # Split candidates into our own CVRs and possible vendor CVRs
        vendor_cvrs = [cvr for cvr in found_cvrs if cvr not in own_cvrs]
        found_own_cvrs = [cvr for cvr in found_cvrs if cvr in own_cvrs]

        if vendor_cvrs:
            # Use the first CVR that is not ours
            vendor_cvr = vendor_cvrs[0]
            result['cvr'] = vendor_cvr
            if found_own_cvrs:
                # A proper supplier invoice where BMC is the buyer
                logger.info(f"📋 Found vendor CVR: {vendor_cvr} (ignored BMC CVR: {found_own_cvrs[0]})")
            else:
                logger.info(f"📋 Found CVR: {vendor_cvr}")

            # Try to match vendor
            vendor = self.match_vendor_by_cvr(vendor_cvr)
            if vendor:
                result['vendor_id'] = vendor['id']
                result['vendor_name'] = vendor['name']
        elif found_own_cvrs:
            # Only BMC's CVR found = this is an outgoing invoice
            result['is_own_invoice'] = True
            result['cvr'] = found_own_cvrs[0]
            logger.warning(f"⚠️ OUTGOING INVOICE: Only BMC CVR found")

        # 2. DETECT DOCUMENT TYPE (Invoice vs Credit Note)
        credit_keywords = [
            'kreditnota', 'credit note', 'creditnote', 'kreditfaktura',
            'refusion', 'tilbagebetaling', 'godtgørelse', 'tilbageførsel'
        ]

        text_lower = pdf_text.lower()
        is_credit_note = any(keyword in text_lower for keyword in credit_keywords)

        if is_credit_note:
            result['document_type'] = 'credit_note'
            logger.info("📄 Document type: CREDIT NOTE")
        else:
            result['document_type'] = 'invoice'
            logger.info("📄 Document type: INVOICE")

        # 3. EXTRACT DOCUMENT NUMBER
        # For invoices: "Faktura nr.", "Invoice number:", "Fakturanr."
        # For credit notes: "Kreditnota nr.", "Credit note number:"
        if result['document_type'] == 'credit_note':
            number_patterns = [
                r'kreditnota\s*(?:nr\.?|nummer)[:\s]*(\S+)',
                r'credit\s*note\s*(?:no\.?|number)[:\s]*(\S+)',
                r'kreditfaktura\s*(?:nr\.?|nummer)[:\s]*(\S+)',
            ]
        else:
            number_patterns = [
                r'faktura\s*(?:nr\.?|nummer)[:\s]*(\S+)',
                r'invoice\s*(?:no\.?|number)[:\s]*(\S+)',
                r'fakturanr\.?\s*[:\s]*(\S+)',
            ]

        # First matching pattern wins
        for pattern in number_patterns:
            match = re.search(pattern, pdf_text, re.IGNORECASE)
            if match:
                result['document_number'] = match.group(1).strip()
                logger.info(f"🔢 Document number: {result['document_number']}")
                break

        logger.info(f"✅ Quick analysis complete: CVR={result['cvr']}, Type={result['document_type']}, Number={result['document_number']}, Vendor={result['vendor_name']}")
        return result

    def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]:
        """
        Match vendor from database using CVR number.

        The value is normalised to digits only before the lookup. A CVR that
        arrives as a number (an LLM may emit it as a JSON number rather than a
        string) is accepted as well.

        Args:
            vendor_cvr: CVR number from extraction

        Returns:
            Vendor dict if found, None otherwise (also when the value is
            empty or does not contain exactly 8 digits)
        """
        if not vendor_cvr:
            return None

        # A whole-number float such as 12345678.0 would otherwise gain an
        # extra digit from its ".0" when stringified.
        if isinstance(vendor_cvr, float) and vendor_cvr.is_integer():
            vendor_cvr = int(vendor_cvr)

        # Clean CVR (remove spaces, dashes); str() guards against non-string input
        cvr_clean = re.sub(r'[^0-9]', '', str(vendor_cvr))

        if len(cvr_clean) != 8:
            logger.warning(f"⚠️ Invalid CVR format: {vendor_cvr} (cleaned: {cvr_clean})")
            return None

        # Search vendors table (parameterised query)
        vendor = execute_query_single(
            "SELECT * FROM vendors WHERE cvr_number = %s",
            (cvr_clean,))

        if vendor:
            logger.info(f"✅ Matched vendor: {vendor['name']} (CVR: {cvr_clean})")
            return vendor

        logger.info(f"⚠️ No vendor found with CVR: {cvr_clean}")
        return None

    async def generate_summary(self, text: str) -> str:
        """
        Ask Ollama for a short Danish summary (max 50 words) of the text.

        Returns an empty string for empty input. Failures are not raised:
        a non-200 reply or any exception yields a Danish fallback message.
        """
        if not text:
            return ""

        system_prompt = "Du er en hjælpsom assistent, der laver korte, præcise resuméer på dansk."
        user_prompt = f"Lav et kort resumé (max 50 ord) af følgende tekst:\n\n{text}"

        try:
            import aiohttp

            logger.info(f"🧠 Generating summary with Ollama ({self.model})...")

            async with aiohttp.ClientSession() as session:
                # System and user prompt are sent as one combined prompt.
                request_body = {
                    "model": self.model,
                    "prompt": system_prompt + "\n\n" + user_prompt,
                    "stream": False,
                    "options": {"temperature": 0.3}
                }
                async with session.post(f"{self.endpoint}/api/generate", json=request_body, timeout=60.0) as response:
                    # Guard clause: report API errors first
                    if response.status != 200:
                        error_text = await response.text()
                        logger.error(f"❌ Ollama error: {error_text}")
                        return "Kunne ikke generere resumé (API fejl)."

                    body = await response.json()
                    summary = body.get("response", "").strip()
                    logger.info("✅ Summary generated")
                    return summary

        except Exception as e:
            logger.error(f"❌ Ollama summary failed: {e}")
            return f"Ingen resumé (Fejl: {str(e)})"

# Shared module-level instance; constructing it runs OllamaService.__init__ at import time.
ollama_service = OllamaService()