332 lines
12 KiB
Python
332 lines
12 KiB
Python
|
|
"""
|
||
|
|
Ollama Integration Service for BMC Hub
|
||
|
|
Handles supplier invoice extraction using Ollama LLM with CVR matching
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import hashlib
|
||
|
|
import logging
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional, Dict, List, Tuple
|
||
|
|
from datetime import datetime
|
||
|
|
import re
|
||
|
|
|
||
|
|
from app.core.config import settings
|
||
|
|
from app.core.database import execute_insert, execute_query, execute_update
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
class OllamaService:
    """Service for extracting supplier invoice data using Ollama LLM"""

    def __init__(self):
        # Ollama server URL and model name come from application settings,
        # so deployments can switch endpoint/model without code changes.
        self.endpoint = settings.OLLAMA_ENDPOINT
        self.model = settings.OLLAMA_MODEL
        # The extraction prompt is static, so build it once per instance.
        self.system_prompt = self._build_system_prompt()
        logger.info(f"🤖 Initialized OllamaService: {self.endpoint}, model={self.model}")
|
|
    def _build_system_prompt(self) -> str:
        """Build Danish system prompt for invoice extraction with CVR.

        The prompt has three parts: hard rules, the target JSON schema,
        and one worked (few-shot) example to anchor the output format.
        It is a constant — no runtime data is interpolated here.
        """
        return """Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer og leverandørdokumenter.

VIGTIGE REGLER:
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
2. Hvis et felt ikke findes, sæt det til null
3. Beregn confidence baseret på hvor sikker du er på hvert felt (0.0-1.0)
4. Datoer skal være i format YYYY-MM-DD
5. Tal skal være decimaler (brug . som decimalseparator)
6. CVR-nummer skal være 8 cifre uden mellemrum
7. Moms/VAT skal udtrækkes fra hver linje hvis muligt

JSON format skal være:
{
"document_type": "invoice",
"invoice_number": "fakturanummer",
"vendor_name": "leverandør firmanavn",
"vendor_cvr": "12345678",
"invoice_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"currency": "DKK",
"total_amount": 1234.56,
"vat_amount": 123.45,
"lines": [
{
"line_number": 1,
"description": "beskrivelse af varen/ydelsen",
"quantity": antal_som_tal,
"unit_price": pris_per_stk,
"line_total": total_for_linjen,
"vat_rate": 25.00,
"vat_amount": moms_beløb,
"confidence": 0.0_til_1.0
}
],
"confidence": gennemsnits_confidence,
"raw_text_snippet": "første 200 tegn fra dokumentet"
}

EKSEMPEL:
Input: "FAKTURA 2025-001\\nGlobalConnect A/S\\nCVR: 12345678\\n1 stk Fiber 100/100 Mbit @ 299,00 DKK\\nMoms (25%): 74,75 DKK\\nTotal: 373,75 DKK"

Output: {
"document_type": "invoice",
"invoice_number": "2025-001",
"vendor_name": "GlobalConnect A/S",
"vendor_cvr": "12345678",
"total_amount": 373.75,
"vat_amount": 74.75,
"lines": [{
"line_number": 1,
"description": "Fiber 100/100 Mbit",
"quantity": 1,
"unit_price": 299.00,
"line_total": 299.00,
"vat_rate": 25.00,
"vat_amount": 74.75,
"confidence": 0.95
}],
"confidence": 0.95
}"""
||
|
|
async def extract_from_text(self, text: str) -> Dict:
|
||
|
|
"""
|
||
|
|
Extract structured invoice data from text using Ollama
|
||
|
|
|
||
|
|
Args:
|
||
|
|
text: Document text content
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Extracted data as dict with CVR, invoice number, amounts, etc.
|
||
|
|
"""
|
||
|
|
|
||
|
|
# Truncate text if too long (keep first 4000 chars)
|
||
|
|
if len(text) > 4000:
|
||
|
|
text = text[:4000] + "\\n[... tekst afkortet ...]"
|
||
|
|
|
||
|
|
prompt = f"{self.system_prompt}\\n\\nNU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA:\\n{text}\\n\\nReturner kun gyldig JSON:"
|
||
|
|
|
||
|
|
logger.info(f"🤖 Extracting invoice data from text (length: {len(text)})")
|
||
|
|
|
||
|
|
try:
|
||
|
|
import httpx
|
||
|
|
|
||
|
|
async with httpx.AsyncClient(timeout=1000.0) as client:
|
||
|
|
response = await client.post(
|
||
|
|
f"{self.endpoint}/api/generate",
|
||
|
|
json={
|
||
|
|
"model": self.model,
|
||
|
|
"prompt": prompt,
|
||
|
|
"stream": False,
|
||
|
|
"options": {
|
||
|
|
"temperature": 0.1,
|
||
|
|
"top_p": 0.9,
|
||
|
|
"num_predict": 2000
|
||
|
|
}
|
||
|
|
}
|
||
|
|
)
|
||
|
|
|
||
|
|
if response.status_code != 200:
|
||
|
|
raise Exception(f"Ollama returned status {response.status_code}: {response.text}")
|
||
|
|
|
||
|
|
result = response.json()
|
||
|
|
raw_response = result.get("response", "")
|
||
|
|
|
||
|
|
logger.info(f"✅ Ollama extraction completed (response length: {len(raw_response)})")
|
||
|
|
|
||
|
|
# Parse JSON from response
|
||
|
|
extraction = self._parse_json_response(raw_response)
|
||
|
|
|
||
|
|
# Add raw response for debugging
|
||
|
|
extraction['_raw_llm_response'] = raw_response
|
||
|
|
|
||
|
|
return extraction
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
error_msg = f"Ollama extraction failed: {str(e)}"
|
||
|
|
logger.error(f"❌ {error_msg}")
|
||
|
|
|
||
|
|
error_str = str(e).lower()
|
||
|
|
if "timeout" in error_str:
|
||
|
|
return {
|
||
|
|
"error": f"Ollama timeout efter 1000 sekunder",
|
||
|
|
"confidence": 0.0
|
||
|
|
}
|
||
|
|
elif "connection" in error_str or "connect" in error_str:
|
||
|
|
return {
|
||
|
|
"error": f"Kan ikke forbinde til Ollama på {self.endpoint}",
|
||
|
|
"confidence": 0.0
|
||
|
|
}
|
||
|
|
else:
|
||
|
|
return {
|
||
|
|
"error": error_msg,
|
||
|
|
"confidence": 0.0
|
||
|
|
}
|
||
|
|
|
||
|
|
def _parse_json_response(self, response: str) -> Dict:
|
||
|
|
"""Parse JSON from LLM response with improved error handling"""
|
||
|
|
try:
|
||
|
|
# Find JSON in response (between first { and last })
|
||
|
|
start = response.find('{')
|
||
|
|
end = response.rfind('}') + 1
|
||
|
|
|
||
|
|
if start >= 0 and end > start:
|
||
|
|
json_str = response[start:end]
|
||
|
|
|
||
|
|
# Try to fix common JSON issues
|
||
|
|
# Remove trailing commas before } or ]
|
||
|
|
json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
|
||
|
|
# Fix single quotes to double quotes (but not in values)
|
||
|
|
# This is risky, so we only do it if initial parse fails
|
||
|
|
|
||
|
|
try:
|
||
|
|
data = json.loads(json_str)
|
||
|
|
return data
|
||
|
|
except json.JSONDecodeError:
|
||
|
|
# Try to fix common issues
|
||
|
|
# Replace single quotes with double quotes (simple approach)
|
||
|
|
fixed_json = json_str.replace("'", '"')
|
||
|
|
try:
|
||
|
|
data = json.loads(fixed_json)
|
||
|
|
logger.warning("⚠️ Fixed JSON with quote replacement")
|
||
|
|
return data
|
||
|
|
except:
|
||
|
|
pass
|
||
|
|
|
||
|
|
# Last resort: log the problematic JSON
|
||
|
|
logger.error(f"❌ Problematic JSON: {json_str[:300]}")
|
||
|
|
raise
|
||
|
|
else:
|
||
|
|
raise ValueError("No JSON found in response")
|
||
|
|
|
||
|
|
except json.JSONDecodeError as e:
|
||
|
|
logger.error(f"❌ JSON parsing failed: {e}")
|
||
|
|
logger.error(f"Raw response preview: {response[:500]}")
|
||
|
|
return {
|
||
|
|
"error": f"JSON parsing failed: {str(e)}",
|
||
|
|
"confidence": 0.0,
|
||
|
|
"raw_response": response[:500]
|
||
|
|
}
|
||
|
|
|
||
|
|
def calculate_file_checksum(self, file_path: Path) -> str:
|
||
|
|
"""Calculate SHA256 checksum of file for duplicate detection"""
|
||
|
|
sha256 = hashlib.sha256()
|
||
|
|
with open(file_path, 'rb') as f:
|
||
|
|
while chunk := f.read(8192):
|
||
|
|
sha256.update(chunk)
|
||
|
|
checksum = sha256.hexdigest()
|
||
|
|
logger.info(f"📋 Calculated checksum: {checksum[:16]}... for {file_path.name}")
|
||
|
|
return checksum
|
||
|
|
|
||
|
|
async def _extract_text_from_file(self, file_path: Path) -> str:
|
||
|
|
"""Extract text from PDF, image, or text file"""
|
||
|
|
suffix = file_path.suffix.lower()
|
||
|
|
|
||
|
|
try:
|
||
|
|
if suffix == '.pdf':
|
||
|
|
return await self._extract_text_from_pdf(file_path)
|
||
|
|
elif suffix in ['.png', '.jpg', '.jpeg']:
|
||
|
|
return await self._extract_text_from_image(file_path)
|
||
|
|
elif suffix in ['.txt', '.csv']:
|
||
|
|
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
||
|
|
return f.read()
|
||
|
|
else:
|
||
|
|
raise ValueError(f"Unsupported file type: {suffix}")
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"❌ Text extraction failed for {file_path.name}: {e}")
|
||
|
|
raise
|
||
|
|
|
||
|
|
async def _extract_text_from_pdf(self, file_path: Path) -> str:
|
||
|
|
"""Extract text from PDF using PyPDF2"""
|
||
|
|
try:
|
||
|
|
from PyPDF2 import PdfReader
|
||
|
|
|
||
|
|
reader = PdfReader(file_path)
|
||
|
|
text = ""
|
||
|
|
|
||
|
|
for page_num, page in enumerate(reader.pages):
|
||
|
|
page_text = page.extract_text()
|
||
|
|
text += f"\\n--- Side {page_num + 1} ---\\n{page_text}"
|
||
|
|
|
||
|
|
logger.info(f"📄 Extracted {len(text)} chars from PDF with {len(reader.pages)} pages")
|
||
|
|
return text
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"❌ PDF extraction failed: {e}")
|
||
|
|
raise
|
||
|
|
|
||
|
|
async def _extract_text_from_image(self, file_path: Path) -> str:
|
||
|
|
"""Extract text from image using Tesseract OCR"""
|
||
|
|
try:
|
||
|
|
import pytesseract
|
||
|
|
from PIL import Image
|
||
|
|
|
||
|
|
image = Image.open(file_path)
|
||
|
|
|
||
|
|
# Use Danish + English for OCR
|
||
|
|
text = pytesseract.image_to_string(image, lang='dan+eng')
|
||
|
|
|
||
|
|
logger.info(f"🖼️ Extracted {len(text)} chars from image via OCR")
|
||
|
|
return text
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"❌ OCR extraction failed: {e}")
|
||
|
|
# Fallback to English only
|
||
|
|
try:
|
||
|
|
text = pytesseract.image_to_string(Image.open(file_path), lang='eng')
|
||
|
|
logger.warning(f"⚠️ Fallback to English OCR: {len(text)} chars")
|
||
|
|
return text
|
||
|
|
except:
|
||
|
|
raise
|
||
|
|
|
||
|
|
def _get_mime_type(self, file_path: Path) -> str:
|
||
|
|
"""Get MIME type from file extension"""
|
||
|
|
suffix = file_path.suffix.lower()
|
||
|
|
mime_types = {
|
||
|
|
'.pdf': 'application/pdf',
|
||
|
|
'.png': 'image/png',
|
||
|
|
'.jpg': 'image/jpeg',
|
||
|
|
'.jpeg': 'image/jpeg',
|
||
|
|
'.txt': 'text/plain',
|
||
|
|
'.csv': 'text/csv'
|
||
|
|
}
|
||
|
|
return mime_types.get(suffix, 'application/octet-stream')
|
||
|
|
|
||
|
|
def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]:
|
||
|
|
"""
|
||
|
|
Match vendor from database using CVR number
|
||
|
|
|
||
|
|
Args:
|
||
|
|
vendor_cvr: CVR number from extraction
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Vendor dict if found, None otherwise
|
||
|
|
"""
|
||
|
|
if not vendor_cvr:
|
||
|
|
return None
|
||
|
|
|
||
|
|
# Clean CVR (remove spaces, dashes)
|
||
|
|
cvr_clean = re.sub(r'[^0-9]', '', vendor_cvr)
|
||
|
|
|
||
|
|
if len(cvr_clean) != 8:
|
||
|
|
logger.warning(f"⚠️ Invalid CVR format: {vendor_cvr} (cleaned: {cvr_clean})")
|
||
|
|
return None
|
||
|
|
|
||
|
|
# Search vendors table
|
||
|
|
vendor = execute_query(
|
||
|
|
"SELECT * FROM vendors WHERE cvr = %s",
|
||
|
|
(cvr_clean,),
|
||
|
|
fetchone=True
|
||
|
|
)
|
||
|
|
|
||
|
|
if vendor:
|
||
|
|
logger.info(f"✅ Matched vendor: {vendor['name']} (CVR: {cvr_clean})")
|
||
|
|
return vendor
|
||
|
|
else:
|
||
|
|
logger.info(f"⚠️ No vendor found with CVR: {cvr_clean}")
|
||
|
|
return None
|
||
|
|
|
||
|
|
|
||
|
|
# Global instance
# Module-level singleton shared by importers. Constructed at import time:
# __init__ only reads settings, builds the static prompt, and logs — it
# performs no network I/O.
ollama_service = OllamaService()
|