bmc_hub/app/services/ollama_service.py
Christian 18b0fe9c05 feat: Enhance billing frontend with Jinja2 templates and improve invoice handling
- Updated billing frontend views to use Jinja2 templates for rendering HTML pages.
- Added support for displaying supplier invoices, template builder, and templates list with titles.
- Introduced a new configuration setting for company CVR number.
- Enhanced OllamaService to support credit notes in invoice extraction, including detailed JSON output format.
- Improved PDF text extraction using pdfplumber for better layout handling.
- Added a modal for editing vendor details with comprehensive fields and validation.
- Implemented invoice loading and display functionality in vendor detail view.
- Updated vendor management to remove priority handling and improve error messaging.
- Added tests for AI analyze endpoint and CVR filtering to ensure correct behavior.
- Created migration script to support credit notes in the database schema.
2025-12-08 09:15:52 +01:00

414 lines
16 KiB
Python

"""
Ollama Integration Service for BMC Hub
Handles supplier invoice extraction using Ollama LLM with CVR matching
"""
import json
import hashlib
import logging
from pathlib import Path
from typing import Optional, Dict, List, Tuple
from datetime import datetime
import re
from app.core.config import settings
from app.core.database import execute_insert, execute_query, execute_update
logger = logging.getLogger(__name__)
class OllamaService:
"""Service for extracting supplier invoice data using Ollama LLM"""
def __init__(self):
    """Read the Ollama endpoint and model from settings and cache the system prompt."""
    endpoint = settings.OLLAMA_ENDPOINT
    model = settings.OLLAMA_MODEL
    self.endpoint, self.model = endpoint, model
    # The prompt is built once per instance and reused for every extraction.
    self.system_prompt = self._build_system_prompt()
    logger.info(f"🤖 Initialized OllamaService: {self.endpoint}, model={self.model}")
def _build_system_prompt(self) -> str:
    """Return the Danish system prompt used for invoice and credit note extraction.

    The prompt tells the model to answer with JSON only, to use null for
    missing fields, to format dates as YYYY-MM-DD, to convert Danish number
    formats to dot-decimal numbers, to return the CVR as 8 digits, to set
    ``document_type`` to "invoice" or "credit_note", and to use positive
    amounts for invoices and negative amounts for credit notes. It then
    lists the expected JSON layout and ends with one worked example of each
    document type.

    Returns:
        The complete prompt text as a single string.
    """
    return """Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer, kreditnotaer og leverandørdokumenter.
VIGTIGE REGLER:
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
2. Hvis et felt ikke findes, sæt det til null
3. Beregn confidence baseret på hvor sikker du er på hvert felt (0.0-1.0)
4. Datoer skal være i format YYYY-MM-DD
5. DANSKE PRISFORMATER:
- Tusind-separator kan være . (punkt) eller mellemrum: "5.965,18" eller "5 965,18"
- Decimal-separator er , (komma): "1.234,56 kr"
- I JSON output skal du bruge . (punkt) som decimal: 1234.56
- Eksempel: "5.965,18 kr" → 5965.18 i JSON
- Eksempel: "1.234,56 DKK" → 1234.56 i JSON
6. CVR-nummer skal være 8 cifre uden mellemrum
7. Moms/VAT skal udtrækkes fra hver linje hvis muligt
8. DOKUMENTTYPE DETEKTION:
- "invoice" = Almindelig faktura
- "credit_note" = Kreditnota (refusion, tilbagebetaling, korrektion)
- Kig efter ord som: "Kreditnota", "Credit Note", "Refusion", "Tilbagebetaling", "Godtgørelse"
9. BELØB OG FORTEGN (ABSOLUT KRITISK):
- **ALMINDELIGE FAKTURAER**: Alle beløb skal være POSITIVE tal (total_amount > 0, line_total > 0)
- **KREDITNOTAER**: Alle beløb skal være NEGATIVE tal (total_amount < 0, line_total < 0)
- Hvis dokumentet siger "Faktura" → document_type: "invoice" → POSITIVE beløb
- Hvis dokumentet siger "Kreditnota" → document_type: "credit_note" → NEGATIVE beløb
JSON format skal være:
{
"document_type": "invoice" eller "credit_note",
"invoice_number": "fakturanummer eller kreditnota nummer",
"vendor_name": "leverandør firmanavn",
"vendor_cvr": "12345678",
"invoice_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"currency": "DKK",
"total_amount": 1234.56 (NEGATIVT for kreditnotaer),
"vat_amount": 123.45 (NEGATIVT for kreditnotaer),
"original_invoice_reference": "reference til original faktura (kun for kreditnotaer)",
"lines": [
{
"line_number": 1,
"description": "beskrivelse af varen/ydelsen",
"quantity": antal_som_tal,
"unit_price": pris_per_stk (NEGATIVT for kreditnotaer),
"line_total": total_for_linjen (NEGATIVT for kreditnotaer),
"vat_rate": 25.00,
"vat_amount": moms_beløb (NEGATIVT for kreditnotaer),
"confidence": 0.0_til_1.0
}
],
"confidence": gennemsnits_confidence,
"raw_text_snippet": "første 200 tegn fra dokumentet"
}
EKSEMPEL PÅ FAKTURA (POSITIVE BELØB):
Input: "FAKTURA 2025-001\\nGlobalConnect A/S\\nCVR: 12345678\\n1 stk iPhone 16 @ 5.965,18 DKK\\nMoms (25%): 1.491,30 DKK\\nTotal: 7.456,48 DKK"
Output: {
"document_type": "invoice",
"invoice_number": "2025-001",
"vendor_name": "GlobalConnect A/S",
"vendor_cvr": "12345678",
"total_amount": 7456.48,
"vat_amount": 1491.30,
"lines": [{
"line_number": 1,
"description": "iPhone 16",
"quantity": 1,
"unit_price": 5965.18,
"line_total": 5965.18,
"vat_rate": 25.00,
"vat_amount": 1491.30,
"confidence": 0.95
}],
"confidence": 0.95
}
EKSEMPEL PÅ KREDITNOTA (NEGATIVE BELØB):
Input: "KREDITNOTA CN-2025-042\\nGlobalConnect A/S\\nCVR: 12345678\\nReference: Faktura 2025-001\\nTilbagebetaling:\\n1 stk iPhone 16 returneret @ -5.965,18 DKK\\nMoms (25%): -1.491,30 DKK\\nTotal: -7.456,48 DKK"
Output: {
"document_type": "credit_note",
"invoice_number": "CN-2025-042",
"vendor_name": "GlobalConnect A/S",
"vendor_cvr": "12345678",
"original_invoice_reference": "2025-001",
"total_amount": -7456.48,
"vat_amount": -1491.30,
"lines": [{
"line_number": 1,
"description": "iPhone 16 returneret",
"quantity": 1,
"unit_price": -5965.18,
"line_total": -5965.18,
"vat_rate": 25.00,
"vat_amount": -1491.30,
"confidence": 0.95
}],
"confidence": 0.95
}"""
def _normalize_amount_signs(self, extraction: Dict) -> None:
    """
    Force extracted amounts to the sign implied by ``document_type`` (in place)

    Invoices get non-negative amounts, credit notes get non-positive amounts.
    Any other document type is left untouched. Only real numbers are
    adjusted: missing fields, ``None``, strings and booleans are kept as they
    are, and a ``lines`` value that is not a list (for example ``null``) is
    skipped instead of raising.

    Args:
        extraction: Parsed extraction dict; modified in place
    """
    if not isinstance(extraction, dict):
        return

    document_type = extraction.get('document_type', 'invoice')
    if document_type == 'invoice':
        wrong_sign = -1
    elif document_type == 'credit_note':
        wrong_sign = 1
    else:
        return

    def with_expected_sign(value):
        # bool is a subclass of int, so it is excluded explicitly
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return value
        if value * wrong_sign > 0:
            return -value
        return value

    total = extraction.get('total_amount')
    fixed_total = with_expected_sign(total)
    if fixed_total is not total:
        wrong = 'negative' if wrong_sign < 0 else 'positive'
        logger.warning(f"⚠️ Fixing {wrong} total_amount for {document_type}: {total} → {fixed_total}")
        extraction['total_amount'] = fixed_total

    if 'vat_amount' in extraction:
        extraction['vat_amount'] = with_expected_sign(extraction['vat_amount'])

    # The prompt asks for null when a field is missing, so "lines" may be None
    lines = extraction.get('lines')
    if not isinstance(lines, list):
        return
    for line in lines:
        if not isinstance(line, dict):
            continue
        for key in ('unit_price', 'line_total', 'vat_amount'):
            if key in line:
                line[key] = with_expected_sign(line[key])

async def extract_from_text(self, text: str) -> Dict:
    """
    Extract structured invoice data from text using Ollama

    The complete text (no truncation) is appended to the system prompt and
    posted to ``{endpoint}/api/generate`` with streaming disabled. The answer
    is parsed with ``_parse_json_response`` and the amount signs are then
    normalised to match ``document_type``, because the model sometimes
    returns the wrong sign.

    Args:
        text: Document text content

    Returns:
        Extracted data as dict with CVR, invoice number, amounts, etc. The
        raw model answer is attached under ``_raw_llm_response``. If the
        request or the processing fails, a dict with an ``error`` message and
        ``confidence`` 0.0 is returned instead of raising.
    """
    timeout_seconds = 1000.0
    # No truncation - send full text to AI
    prompt = f"{self.system_prompt}\n\nNU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA:\n{text}\n\nReturner kun gyldig JSON:"
    logger.info(f"🤖 Extracting invoice data from text (length: {len(text)})")

    # Imported separately so the exception classes are bound in the handler below
    try:
        import httpx
    except ImportError as e:
        error_msg = f"Ollama extraction failed: {str(e)}"
        logger.error(error_msg)
        return {"error": error_msg, "confidence": 0.0}

    try:
        async with httpx.AsyncClient(timeout=timeout_seconds) as client:
            response = await client.post(
                f"{self.endpoint}/api/generate",
                json={
                    "model": self.model,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "temperature": 0.1,
                        "top_p": 0.9,
                        "num_predict": 2000
                    }
                }
            )
            if response.status_code != 200:
                raise RuntimeError(f"Ollama returned status {response.status_code}: {response.text}")
            result = response.json()

        raw_response = result.get("response") or ""
        logger.info(f"✅ Ollama extraction completed (response length: {len(raw_response)})")

        # Parse JSON from response
        extraction = self._parse_json_response(raw_response)

        # CRITICAL: Fix amount signs based on document_type
        self._normalize_amount_signs(extraction)

        # Add raw response for debugging
        extraction['_raw_llm_response'] = raw_response
        return extraction
    except Exception as e:
        error_msg = f"Ollama extraction failed: {str(e)}"
        logger.error(error_msg)
        # httpx timeouts frequently carry an empty message, so the exception
        # type is checked first; the substring test covers other errors.
        error_str = str(e).lower()
        if isinstance(e, httpx.TimeoutException) or "timeout" in error_str:
            return {
                "error": f"Ollama timeout efter {timeout_seconds:.0f} sekunder",
                "confidence": 0.0
            }
        if isinstance(e, httpx.ConnectError) or "connect" in error_str:
            return {
                "error": f"Kan ikke forbinde til Ollama på {self.endpoint}",
                "confidence": 0.0
            }
        return {
            "error": error_msg,
            "confidence": 0.0
        }
def _parse_json_response(self, response: str) -> Dict:
    """
    Parse the JSON object contained in an LLM response

    Attempts, in order:
      1. Strict decoding of the object that starts at the first ``{``. Any
         text after that object is ignored, and valid JSON is never
         modified.
      2. The span from the first ``{`` to the last ``}`` with trailing commas
         before ``}`` or ``]`` removed.
      3. The same span with single quotes replaced by double quotes (risky,
         so it is only tried last and logged as a warning).

    Args:
        response: Raw text returned by the model

    Returns:
        The decoded data, or - when nothing could be decoded, including when
        the response holds no JSON at all - a dict with ``error``,
        ``confidence`` 0.0 and ``raw_response`` (first 500 characters).
    """
    if not isinstance(response, str):
        response = "" if response is None else str(response)
    preview = response[:500]

    def failure(message: str) -> Dict:
        logger.error(f"❌ JSON parsing failed: {message}")
        logger.error(f"Raw response preview: {preview}")
        return {
            "error": f"JSON parsing failed: {message}",
            "confidence": 0.0,
            "raw_response": preview
        }

    # Find JSON in response (between first { and last })
    start = response.find('{')
    end = response.rfind('}') + 1
    if start < 0 or end <= start:
        return failure("No JSON found in response")

    # 1. Strict parse first, so well-formed answers are returned untouched
    try:
        data, _ = json.JSONDecoder().raw_decode(response, start)
        return data
    except json.JSONDecodeError:
        pass

    # 2. Remove trailing commas before } or ]
    json_str = re.sub(r',(\s*[}\]])', r'\1', response[start:end])
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        first_error = e

    # 3. Replace single quotes with double quotes (simple approach)
    try:
        data = json.loads(json_str.replace("'", '"'))
    except json.JSONDecodeError:
        # Last resort: log the problematic JSON
        logger.error(f"❌ Problematic JSON: {json_str[:300]}")
        return failure(str(first_error))
    logger.warning("⚠️ Fixed JSON with quote replacement")
    return data
def calculate_file_checksum(self, file_path: Path) -> str:
    """
    Return the SHA256 hex digest of a file (used for duplicate detection)

    Args:
        file_path: File to hash; read in binary mode in 8192-byte pieces

    Returns:
        Hex digest of the file content
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as handle:
        # iter() with a sentinel stops at the first empty read (end of file)
        for piece in iter(lambda: handle.read(8192), b''):
            digest.update(piece)
    hex_digest = digest.hexdigest()
    logger.info(f"📋 Calculated checksum: {hex_digest[:16]}... for {file_path.name}")
    return hex_digest
async def _extract_text_from_file(self, file_path: Path) -> str:
    """
    Return the text content of a PDF, image or plain text file

    The lower-cased file suffix selects the strategy: ``.pdf`` goes to
    ``_extract_text_from_pdf``, ``.png``/``.jpg``/``.jpeg`` go to
    ``_extract_text_from_image``, and ``.txt``/``.csv`` are read as UTF-8
    with undecodable bytes dropped.

    Args:
        file_path: File to read

    Returns:
        Extracted text

    Raises:
        ValueError: If the suffix is not supported
        Exception: Any extraction error is logged and re-raised
    """
    extension = file_path.suffix.lower()
    try:
        if extension == '.pdf':
            return await self._extract_text_from_pdf(file_path)
        if extension in ('.png', '.jpg', '.jpeg'):
            return await self._extract_text_from_image(file_path)
        if extension not in ('.txt', '.csv'):
            raise ValueError(f"Unsupported file type: {extension}")
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            return handle.read()
    except Exception as exc:
        logger.error(f"❌ Text extraction failed for {file_path.name}: {exc}")
        raise
async def _extract_text_from_pdf(self, file_path: Path) -> str:
    """
    Extract text from PDF using pdfplumber (better table/layout support)

    Every page is extracted with ``layout=True`` and pages without text are
    skipped. The page texts are joined with a newline character.

    Args:
        file_path: PDF file to read

    Returns:
        Text of all pages, one page after another

    Raises:
        Exception: Any import or extraction error is logged and re-raised
    """
    try:
        import pdfplumber
        page_texts = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # Strategy: Use regular text extraction (includes tables)
                page_text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
                if page_text:
                    page_texts.append(page_text)
        # Join with a real newline; "\\n" would insert a literal backslash
        # followed by "n" between pages.
        text = "\n".join(page_texts)
        logger.info(f"📄 Extracted {len(text)} chars from PDF with pdfplumber")
        return text
    except Exception as e:
        logger.error(f"❌ PDF extraction failed: {e}")
        raise
async def _extract_text_from_image(self, file_path: Path) -> str:
    """
    Extract text from image using Tesseract OCR

    OCR is first attempted with Danish + English. If that call fails, the
    error is logged and OCR is retried with English only; a failure of the
    retry propagates to the caller.

    Args:
        file_path: Image file to read

    Returns:
        Recognised text

    Raises:
        ImportError: If pytesseract or Pillow is not installed
        Exception: If the image cannot be opened or both OCR attempts fail
    """
    # Imported before the try block: a failed import must surface as
    # ImportError instead of an unbound name inside the fallback.
    import pytesseract
    from PIL import Image

    # The context manager closes the image file when OCR is done
    with Image.open(file_path) as image:
        try:
            # Use Danish + English for OCR
            text = pytesseract.image_to_string(image, lang='dan+eng')
        except Exception as e:
            logger.error(f"❌ OCR extraction failed: {e}")
            # Fallback to English only
            text = pytesseract.image_to_string(image, lang='eng')
            logger.warning(f"⚠️ Fallback to English OCR: {len(text)} chars")
            return text
    logger.info(f"🖼️ Extracted {len(text)} chars from image via OCR")
    return text
def _get_mime_type(self, file_path: Path) -> str:
    """
    Map a file's extension (case-insensitive) to its MIME type

    Args:
        file_path: Path whose suffix is inspected

    Returns:
        MIME type for .pdf, .png, .jpg, .jpeg, .txt and .csv;
        'application/octet-stream' for anything else
    """
    extension = file_path.suffix.lower()
    if extension == '.pdf':
        return 'application/pdf'
    if extension == '.png':
        return 'image/png'
    if extension in ('.jpg', '.jpeg'):
        return 'image/jpeg'
    if extension == '.txt':
        return 'text/plain'
    if extension == '.csv':
        return 'text/csv'
    return 'application/octet-stream'
def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]:
    """
    Match vendor from database using CVR number

    All non-digit characters are removed before the lookup, so values such
    as "12 34 56 78" are accepted. A CVR that arrives as a number instead of
    a string is converted to text first.

    Args:
        vendor_cvr: CVR number from extraction

    Returns:
        Vendor dict if found, None otherwise (also when the CVR is missing
        or does not consist of exactly 8 digits)
    """
    if not vendor_cvr:
        return None

    # Clean CVR (remove spaces, dashes). str() guards against a numeric CVR,
    # which re.sub would reject with a TypeError.
    cvr_clean = re.sub(r'[^0-9]', '', str(vendor_cvr))
    if len(cvr_clean) != 8:
        logger.warning(f"⚠️ Invalid CVR format: {vendor_cvr} (cleaned: {cvr_clean})")
        return None

    # Search vendors table
    vendor = execute_query(
        "SELECT * FROM vendors WHERE cvr = %s",
        (cvr_clean,),
        fetchone=True
    )
    if not vendor:
        logger.info(f"⚠️ No vendor found with CVR: {cvr_clean}")
        return None

    logger.info(f"✅ Matched vendor: {vendor['name']} (CVR: {cvr_clean})")
    return vendor
# Global instance shared by importers of this module; OllamaService() is
# constructed when the module is first imported.
ollama_service = OllamaService()