332 lines
12 KiB
Python
332 lines
12 KiB
Python
|
|
"""
|
||
|
|
Ollama Integration Service for BMC Hub
|
||
|
|
Handles supplier invoice extraction using Ollama LLM with CVR matching
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import hashlib
|
||
|
|
import logging
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional, Dict, List, Tuple
|
||
|
|
from datetime import datetime
|
||
|
|
import re
|
||
|
|
|
||
|
|
from app.core.config import settings
|
||
|
|
from app.core.database import execute_insert, execute_query, execute_update
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
class OllamaService:
    """Service for extracting supplier invoice data using Ollama LLM"""

    def __init__(self):
        # Ollama server URL and model name come from application settings,
        # so deployments can switch endpoint/model without code changes.
        self.endpoint = settings.OLLAMA_ENDPOINT
        self.model = settings.OLLAMA_MODEL
        # The extraction prompt is static, so build it once per instance.
        self.system_prompt = self._build_system_prompt()
        logger.info(f"🤖 Initialized OllamaService: {self.endpoint}, model={self.model}")
|
|
    def _build_system_prompt(self) -> str:
        """Build Danish system prompt for invoice extraction with CVR.

        The prompt has three parts: hard rules, the target JSON schema,
        and one worked (few-shot) example to anchor the output format.
        It is a constant — no runtime data is interpolated here.
        """
        return """Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer og leverandørdokumenter.

VIGTIGE REGLER:
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
2. Hvis et felt ikke findes, sæt det til null
3. Beregn confidence baseret på hvor sikker du er på hvert felt (0.0-1.0)
4. Datoer skal være i format YYYY-MM-DD
5. Tal skal være decimaler (brug . som decimalseparator)
6. CVR-nummer skal være 8 cifre uden mellemrum
7. Moms/VAT skal udtrækkes fra hver linje hvis muligt

JSON format skal være:
{
"document_type": "invoice",
"invoice_number": "fakturanummer",
"vendor_name": "leverandør firmanavn",
"vendor_cvr": "12345678",
"invoice_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"currency": "DKK",
"total_amount": 1234.56,
"vat_amount": 123.45,
"lines": [
{
"line_number": 1,
"description": "beskrivelse af varen/ydelsen",
"quantity": antal_som_tal,
"unit_price": pris_per_stk,
"line_total": total_for_linjen,
"vat_rate": 25.00,
"vat_amount": moms_beløb,
"confidence": 0.0_til_1.0
}
],
"confidence": gennemsnits_confidence,
"raw_text_snippet": "første 200 tegn fra dokumentet"
}

EKSEMPEL:
Input: "FAKTURA 2025-001\\nGlobalConnect A/S\\nCVR: 12345678\\n1 stk Fiber 100/100 Mbit @ 299,00 DKK\\nMoms (25%): 74,75 DKK\\nTotal: 373,75 DKK"

Output: {
"document_type": "invoice",
"invoice_number": "2025-001",
"vendor_name": "GlobalConnect A/S",
"vendor_cvr": "12345678",
"total_amount": 373.75,
"vat_amount": 74.75,
"lines": [{
"line_number": 1,
"description": "Fiber 100/100 Mbit",
"quantity": 1,
"unit_price": 299.00,
"line_total": 299.00,
"vat_rate": 25.00,
"vat_amount": 74.75,
"confidence": 0.95
}],
"confidence": 0.95
}"""
||
|
|
async def extract_from_text(self, text: str) -> Dict:
|
||
|
|
"""
|
||
|
|
Extract structured invoice data from text using Ollama
|
||
|
|
|
||
|
|
Args:
|
||
|
|
text: Document text content
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Extracted data as dict with CVR, invoice number, amounts, etc.
|
||
|
|
"""
|
||
|
|
|
||
|
|
# Truncate text if too long (keep first 4000 chars)
|
||
|
|
if len(text) > 4000:
|
||
|
|
text = text[:4000] + "\\n[... tekst afkortet ...]"
|
||
|
|
|
||
|
|
prompt = f"{self.system_prompt}\\n\\nNU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA:\\n{text}\\n\\nReturner kun gyldig JSON:"
|
||
|
|
|
||
|
|
logger.info(f"🤖 Extracting invoice data from text (length: {len(text)})")
|
||
|
|
|
||
|
|
try:
|
||
|
|
import httpx
|
||
|
|
|
||
|
|
async with httpx.AsyncClient(timeout=1000.0) as client:
|
||
|
|
response = await client.post(
|
||
|
|
f"{self.endpoint}/api/generate",
|
||
|
|
json={
|
||
|
|
"model": self.model,
|
||
|
|
"prompt": prompt,
|
||
|
|
"stream": False,
|
||
|
|
"options": {
|
||
|
|
"temperature": 0.1,
|
||
|
|
"top_p": 0.9,
|
||
|
|
"num_predict": 2000
|
||
|
|
}
|
||
|
|
}
|
||
|
|
)
|
||
|
|
|
||
|
|
if response.status_code != 200:
|
||
|
|
raise Exception(f"Ollama returned status {response.status_code}: {response.text}")
|
||
|
|
|
||
|
|
result = response.json()
|
||
|
|
raw_response = result.get("response", "")
|
||
|
|
|
||
|
|
logger.info(f"✅ Ollama extraction completed (response length: {len(raw_response)})")
|
||
|
|
|
||
|
|
# Parse JSON from response
|
||
|
|
extraction = self._parse_json_response(raw_response)
|
||
|
|
|
||
|
|
# Add raw response for debugging
|
||
|
|
extraction['_raw_llm_response'] = raw_response
|
||
|
|
|
||
|
|
return extraction
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
error_msg = f"Ollama extraction failed: {str(e)}"
|
||
|
|
logger.error(f"❌ {error_msg}")
|
||
|
|
|
||
|
|
error_str = str(e).lower()
|
||
|
|
if "timeout" in error_str:
|
||
|
|
return {
|
||
|
|
"error": f"Ollama timeout efter 1000 sekunder",
|
||
|
|
"confidence": 0.0
|
||
|
|
}
|
||
|
|
elif "connection" in error_str or "connect" in error_str:
|
||
|
|
return {
|
||
|
|
"error": f"Kan ikke forbinde til Ollama på {self.endpoint}",
|
||
|
|
"confidence": 0.0
|
||
|
|
}
|
||
|
|
else:
|
||
|
|
return {
|
||
|
|
"error": error_msg,
|
||
|
|
"confidence": 0.0
|
||
|
|
}
|
||
|
|
|
||
|
|
def _parse_json_response(self, response: str) -> Dict:
|
||
|
|
"""Parse JSON from LLM response with improved error handling"""
|
||
|
|
try:
|
||
|
|
# Find JSON in response (between first { and last })
|
||
|
|
start = response.find('{')
|
||
|
|
end = response.rfind('}') + 1
|
||
|
|
|
||
|
|
if start >= 0 and end > start:
|
||
|
|
json_str = response[start:end]
|
||
|
|
|
||
|
|
# Try to fix common JSON issues
|
||
|
|
# Remove trailing commas before } or ]
|
||
|
|
json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
|
||
|
|
# Fix single quotes to double quotes (but not in values)
|
||
|
|
# This is risky, so we only do it if initial parse fails
|
||
|
|
|
||
|
|
try:
|
||
|
|
data = json.loads(json_str)
|
||
|
|
return data
|
||
|
|
except json.JSONDecodeError:
|
||
|
|
# Try to fix common issues
|
||
|
|
# Replace single quotes with double quotes (simple approach)
|
||
|
|
fixed_json = json_str.replace("'", '"')
|
||
|
|
try:
|
||
|
|
data = json.loads(fixed_json)
|
||
|
|
logger.warning("⚠️ Fixed JSON with quote replacement")
|
||
|
|
return data
|
||
|
|
except:
|
||
|
|
pass
|
||
|
|
|
||
|
|
# Last resort: log the problematic JSON
|
||
|
|
logger.error(f"❌ Problematic JSON: {json_str[:300]}")
|
||
|
|
raise
|
||
|
|
else:
|
||
|
|
raise ValueError("No JSON found in response")
|
||
|
|
|
||
|
|
except json.JSONDecodeError as e:
|
||
|
|
logger.error(f"❌ JSON parsing failed: {e}")
|
||
|
|
logger.error(f"Raw response preview: {response[:500]}")
|
||
|
|
return {
|
||
|
|
"error": f"JSON parsing failed: {str(e)}",
|
||
|
|
"confidence": 0.0,
|
||
|
|
"raw_response": response[:500]
|
||
|
|
}
|
||
|
|
|
||
|
|
def calculate_file_checksum(self, file_path: Path) -> str:
|
||
|
|
"""Calculate SHA256 checksum of file for duplicate detection"""
|
||
|
|
sha256 = hashlib.sha256()
|
||
|
|
with open(file_path, 'rb') as f:
|
||
|
|
while chunk := f.read(8192):
|
||
|
|
sha256.update(chunk)
|
||
|
|
checksum = sha256.hexdigest()
|
||
|
|
logger.info(f"📋 Calculated checksum: {checksum[:16]}... for {file_path.name}")
|
||
|
|
return checksum
|
||
|
|
|
||
|
|
async def _extract_text_from_file(self, file_path: Path) -> str:
|
||
|
|
"""Extract text from PDF, image, or text file"""
|
||
|
|
suffix = file_path.suffix.lower()
|
||
|
|
|
||
|
|
try:
|
||
|
|
if suffix == '.pdf':
|
||
|
|
return await self._extract_text_from_pdf(file_path)
|
||
|
|
elif suffix in ['.png', '.jpg', '.jpeg']:
|
||
|
|
return await self._extract_text_from_image(file_path)
|
||
|
|
elif suffix in ['.txt', '.csv']:
|
||
|
|
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
||
|
|
return f.read()
|
||
|
|
else:
|
||
|
|
raise ValueError(f"Unsupported file type: {suffix}")
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"❌ Text extraction failed for {file_path.name}: {e}")
|
||
|
|
raise
|
||
|
|
|
||
|
|
async def _extract_text_from_pdf(self, file_path: Path) -> str:
|
||
|
|
"""Extract text from PDF using PyPDF2"""
|
||
|
|
try:
|
||
|
|
from PyPDF2 import PdfReader
|
||
|
|
|
||
|
|
reader = PdfReader(file_path)
|
||
|
|
text = ""
|
||
|
|
|
||
|
|
for page_num, page in enumerate(reader.pages):
|
||
|
|
page_text = page.extract_text()
|
||
|
|
text += f"\\n--- Side {page_num + 1} ---\\n{page_text}"
|
||
|
|
|
||
|
|
logger.info(f"📄 Extracted {len(text)} chars from PDF with {len(reader.pages)} pages")
|
||
|
|
return text
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"❌ PDF extraction failed: {e}")
|
||
|
|
raise
|
||
|
|
|
||
|
|
async def _extract_text_from_image(self, file_path: Path) -> str:
|
||
|
|
"""Extract text from image using Tesseract OCR"""
|
||
|
|
try:
|
||
|
|
import pytesseract
|
||
|
|
from PIL import Image
|
||
|
|
|
||
|
|
image = Image.open(file_path)
|
||
|
|
|
||
|
|
# Use Danish + English for OCR
|
||
|
|
text = pytesseract.image_to_string(image, lang='dan+eng')
|
||
|
|
|
||
|
|
logger.info(f"🖼️ Extracted {len(text)} chars from image via OCR")
|
||
|
|
return text
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"❌ OCR extraction failed: {e}")
|
||
|
|
# Fallback to English only
|
||
|
|
try:
|
||
|
|
text = pytesseract.image_to_string(Image.open(file_path), lang='eng')
|
||
|
|
logger.warning(f"⚠️ Fallback to English OCR: {len(text)} chars")
|
||
|
|
return text
|
||
|
|
except:
|
||
|
|
raise
|
||
|
|
|
||
|
|
def _get_mime_type(self, file_path: Path) -> str:
|
||
|
|
"""Get MIME type from file extension"""
|
||
|
|
suffix = file_path.suffix.lower()
|
||
|
|
mime_types = {
|
||
|
|
'.pdf': 'application/pdf',
|
||
|
|
'.png': 'image/png',
|
||
|
|
'.jpg': 'image/jpeg',
|
||
|
|
'.jpeg': 'image/jpeg',
|
||
|
|
'.txt': 'text/plain',
|
||
|
|
'.csv': 'text/csv'
|
||
|
|
}
|
||
|
|
return mime_types.get(suffix, 'application/octet-stream')
|
||
|
|
|
||
|
|
def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]:
|
||
|
|
"""
|
||
|
|
Match vendor from database using CVR number
|
||
|
|
|
||
|
|
Args:
|
||
|
|
vendor_cvr: CVR number from extraction
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
Vendor dict if found, None otherwise
|
||
|
|
"""
|
||
|
|
if not vendor_cvr:
|
||
|
|
return None
|
||
|
|
|
||
|
|
# Clean CVR (remove spaces, dashes)
|
||
|
|
cvr_clean = re.sub(r'[^0-9]', '', vendor_cvr)
|
||
|
|
|
||
|
|
if len(cvr_clean) != 8:
|
||
|
|
logger.warning(f"⚠️ Invalid CVR format: {vendor_cvr} (cleaned: {cvr_clean})")
|
||
|
|
return None
|
||
|
|
|
||
|
|
# Search vendors table
|
||
|
|
vendor = execute_query(
|
||
|
|
"SELECT * FROM vendors WHERE cvr = %s",
|
||
|
|
(cvr_clean,),
|
||
|
|
fetchone=True
|
||
|
|
)
|
||
|
|
|
||
|
|
if vendor:
|
||
|
|
logger.info(f"✅ Matched vendor: {vendor['name']} (CVR: {cvr_clean})")
|
||
|
|
return vendor
|
||
|
|
else:
|
||
|
|
logger.info(f"⚠️ No vendor found with CVR: {cvr_clean}")
|
||
|
|
return None
|
||
|
|
|
||
|
|
|
||
|
|
# Global instance
# Module-level singleton shared by importers. Constructed at import time:
# __init__ only reads settings, builds the static prompt, and logs — it
# performs no network I/O.
ollama_service = OllamaService()
|