- Updated billing frontend views to use Jinja2 templates for rendering HTML pages. - Added support for displaying supplier invoices, template builder, and templates list with titles. - Introduced a new configuration setting for company CVR number. - Enhanced OllamaService to support credit notes in invoice extraction, including detailed JSON output format. - Improved PDF text extraction using pdfplumber for better layout handling. - Added a modal for editing vendor details with comprehensive fields and validation. - Implemented invoice loading and display functionality in vendor detail view. - Updated vendor management to remove priority handling and improve error messaging. - Added tests for AI analyze endpoint and CVR filtering to ensure correct behavior. - Created migration script to support credit notes in the database schema.
414 lines
16 KiB
Python
414 lines
16 KiB
Python
"""
|
|
Ollama Integration Service for BMC Hub
|
|
Handles supplier invoice extraction using Ollama LLM with CVR matching
|
|
"""
|
|
|
|
import json
|
|
import hashlib
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, List, Tuple
|
|
from datetime import datetime
|
|
import re
|
|
|
|
from app.core.config import settings
|
|
from app.core.database import execute_insert, execute_query, execute_update
|
|
|
|
# Module-level logger named after this module; OllamaService writes all of its diagnostics through it.
logger = logging.getLogger(__name__)
|
|
|
|
class OllamaService:
    """Service for extracting supplier invoice data using Ollama LLM.

    The service reads text out of PDF, image and plain-text files, sends it to
    an Ollama ``/api/generate`` endpoint together with a Danish system prompt,
    parses the JSON answer, forces amount signs to agree with the detected
    document type (invoice / credit note) and can match the extracted CVR
    number against the ``vendors`` table.
    """

    # Seconds to wait for Ollama before giving up. The timeout error message
    # is derived from this value so the two can never drift apart.
    REQUEST_TIMEOUT_SECONDS = 1000.0

    # File extension -> MIME type for the file types this service handles.
    _MIME_TYPES = {
        '.pdf': 'application/pdf',
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.txt': 'text/plain',
        '.csv': 'text/csv',
    }

    # Monetary fields whose sign must follow the document type.
    _HEADER_AMOUNT_FIELDS = ('total_amount', 'vat_amount')
    _LINE_AMOUNT_FIELDS = ('unit_price', 'line_total', 'vat_amount')

    def __init__(self):
        """Read endpoint and model from settings and build the system prompt."""
        self.endpoint = settings.OLLAMA_ENDPOINT
        self.model = settings.OLLAMA_MODEL
        self.system_prompt = self._build_system_prompt()
        logger.info(f"🤖 Initialized OllamaService: {self.endpoint}, model={self.model}")

    def _build_system_prompt(self) -> str:
        """Build Danish system prompt for invoice extraction with CVR.

        The ``\\\\n`` sequences in the examples are deliberate: the prompt shows
        the model an input string containing a literal backslash-n.
        """
        return """Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer, kreditnotaer og leverandørdokumenter.

VIGTIGE REGLER:
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
2. Hvis et felt ikke findes, sæt det til null
3. Beregn confidence baseret på hvor sikker du er på hvert felt (0.0-1.0)
4. Datoer skal være i format YYYY-MM-DD
5. DANSKE PRISFORMATER:
- Tusind-separator kan være . (punkt) eller mellemrum: "5.965,18" eller "5 965,18"
- Decimal-separator er , (komma): "1.234,56 kr"
- I JSON output skal du bruge . (punkt) som decimal: 1234.56
- Eksempel: "5.965,18 kr" → 5965.18 i JSON
- Eksempel: "1.234,56 DKK" → 1234.56 i JSON
6. CVR-nummer skal være 8 cifre uden mellemrum
7. Moms/VAT skal udtrækkes fra hver linje hvis muligt
8. DOKUMENTTYPE DETEKTION:
- "invoice" = Almindelig faktura
- "credit_note" = Kreditnota (refusion, tilbagebetaling, korrektion)
- Kig efter ord som: "Kreditnota", "Credit Note", "Refusion", "Tilbagebetaling", "Godtgørelse"
9. BELØB OG FORTEGN (ABSOLUT KRITISK):
- **ALMINDELIGE FAKTURAER**: Alle beløb skal være POSITIVE tal (total_amount > 0, line_total > 0)
- **KREDITNOTAER**: Alle beløb skal være NEGATIVE tal (total_amount < 0, line_total < 0)
- Hvis dokumentet siger "Faktura" → document_type: "invoice" → POSITIVE beløb
- Hvis dokumentet siger "Kreditnota" → document_type: "credit_note" → NEGATIVE beløb

JSON format skal være:
{
"document_type": "invoice" eller "credit_note",
"invoice_number": "fakturanummer eller kreditnota nummer",
"vendor_name": "leverandør firmanavn",
"vendor_cvr": "12345678",
"invoice_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"currency": "DKK",
"total_amount": 1234.56 (NEGATIVT for kreditnotaer),
"vat_amount": 123.45 (NEGATIVT for kreditnotaer),
"original_invoice_reference": "reference til original faktura (kun for kreditnotaer)",
"lines": [
{
"line_number": 1,
"description": "beskrivelse af varen/ydelsen",
"quantity": antal_som_tal,
"unit_price": pris_per_stk (NEGATIVT for kreditnotaer),
"line_total": total_for_linjen (NEGATIVT for kreditnotaer),
"vat_rate": 25.00,
"vat_amount": moms_beløb (NEGATIVT for kreditnotaer),
"confidence": 0.0_til_1.0
}
],
"confidence": gennemsnits_confidence,
"raw_text_snippet": "første 200 tegn fra dokumentet"
}

EKSEMPEL PÅ FAKTURA (POSITIVE BELØB):
Input: "FAKTURA 2025-001\\nGlobalConnect A/S\\nCVR: 12345678\\n1 stk iPhone 16 @ 5.965,18 DKK\\nMoms (25%): 1.491,30 DKK\\nTotal: 7.456,48 DKK"

Output: {
"document_type": "invoice",
"invoice_number": "2025-001",
"vendor_name": "GlobalConnect A/S",
"vendor_cvr": "12345678",
"total_amount": 7456.48,
"vat_amount": 1491.30,
"lines": [{
"line_number": 1,
"description": "iPhone 16",
"quantity": 1,
"unit_price": 5965.18,
"line_total": 5965.18,
"vat_rate": 25.00,
"vat_amount": 1491.30,
"confidence": 0.95
}],
"confidence": 0.95
}

EKSEMPEL PÅ KREDITNOTA (NEGATIVE BELØB):
Input: "KREDITNOTA CN-2025-042\\nGlobalConnect A/S\\nCVR: 12345678\\nReference: Faktura 2025-001\\nTilbagebetaling:\\n1 stk iPhone 16 returneret @ -5.965,18 DKK\\nMoms (25%): -1.491,30 DKK\\nTotal: -7.456,48 DKK"

Output: {
"document_type": "credit_note",
"invoice_number": "CN-2025-042",
"vendor_name": "GlobalConnect A/S",
"vendor_cvr": "12345678",
"original_invoice_reference": "2025-001",
"total_amount": -7456.48,
"vat_amount": -1491.30,
"lines": [{
"line_number": 1,
"description": "iPhone 16 returneret",
"quantity": 1,
"unit_price": -5965.18,
"line_total": -5965.18,
"vat_rate": 25.00,
"vat_amount": -1491.30,
"confidence": 0.95
}],
"confidence": 0.95
}"""

    async def extract_from_text(self, text: str) -> Dict:
        """
        Extract structured invoice data from text using Ollama

        Args:
            text: Document text content

        Returns:
            Extracted data as dict with CVR, invoice number, amounts, etc.
            The raw model answer is attached under ``_raw_llm_response``.
            On any failure a dict with ``error`` and ``confidence`` 0.0 is
            returned instead of raising.
        """
        # No truncation - send full text to AI
        prompt = f"{self.system_prompt}\n\nNU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA:\n{text}\n\nReturner kun gyldig JSON:"

        logger.info(f"🤖 Extracting invoice data from text (length: {len(text)})")

        try:
            import httpx

            async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT_SECONDS) as client:
                response = await client.post(
                    f"{self.endpoint}/api/generate",
                    json={
                        "model": self.model,
                        "prompt": prompt,
                        "stream": False,
                        "options": {
                            "temperature": 0.1,
                            "top_p": 0.9,
                            "num_predict": 2000,
                        },
                    },
                )

            if response.status_code != 200:
                raise RuntimeError(f"Ollama returned status {response.status_code}: {response.text}")

            raw_response = response.json().get("response", "")
            logger.info(f"✅ Ollama extraction completed (response length: {len(raw_response)})")

            # Parse JSON from response
            extraction = self._parse_json_response(raw_response)

            # CRITICAL: Fix amount signs based on document_type
            # LLM sometimes returns negative amounts for invoices - fix this!
            self._normalize_amount_signs(extraction)

            # Add raw response for debugging
            extraction['_raw_llm_response'] = raw_response
            return extraction

        except Exception as e:
            # Some exceptions (notably httpx timeouts) stringify to '', so
            # fall back to the class name to keep the message informative.
            detail = str(e) or type(e).__name__
            error_msg = f"Ollama extraction failed: {detail}"
            logger.error(f"❌ {error_msg}")

            # Classify on class name + message; the message alone misses
            # exceptions whose text is empty.
            error_str = f"{type(e).__name__} {e}".lower()
            if "timeout" in error_str:
                error = f"Ollama timeout efter {self.REQUEST_TIMEOUT_SECONDS:g} sekunder"
            elif "connect" in error_str:
                error = f"Kan ikke forbinde til Ollama på {self.endpoint}"
            else:
                error = error_msg
            return {"error": error, "confidence": 0.0}

    @staticmethod
    def _has_wrong_sign(value, sign: int) -> bool:
        """Return True when value is a real number whose sign opposes ``sign``."""
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return False
        return value * sign < 0

    def _normalize_amount_signs(self, extraction: Dict) -> Dict:
        """Force monetary amounts to be positive for invoices, negative for credit notes.

        The dict is modified in place and also returned. Anything the model
        did not return as a number (``null``, strings) is left untouched, and
        a missing/``null``/malformed ``lines`` value is ignored, so a sloppy
        model answer cannot abort the whole extraction.
        """
        document_type = extraction.get('document_type', 'invoice')
        if document_type == 'invoice':
            sign, wrong = 1, 'negative'
        elif document_type == 'credit_note':
            sign, wrong = -1, 'positive'
        else:
            # Unknown document type: nothing to enforce.
            return extraction

        total = extraction.get('total_amount')
        if self._has_wrong_sign(total, sign):
            logger.warning(f"⚠️ Fixing {wrong} total_amount for {document_type}: {total} → {sign * abs(total)}")

        for field in self._HEADER_AMOUNT_FIELDS:
            if self._has_wrong_sign(extraction.get(field), sign):
                extraction[field] = sign * abs(extraction[field])

        # Fix line totals
        lines = extraction.get('lines')
        if isinstance(lines, list):
            for line in lines:
                if not isinstance(line, dict):
                    continue
                for field in self._LINE_AMOUNT_FIELDS:
                    if self._has_wrong_sign(line.get(field), sign):
                        line[field] = sign * abs(line[field])

        return extraction

    def _parse_json_response(self, response: str) -> Dict:
        """Parse JSON from LLM response with improved error handling.

        The text between the first ``{`` and the last ``}`` is parsed after
        trailing commas are removed. If that fails, one retry is made with
        single quotes replaced by double quotes.

        Returns:
            The parsed object, or a dict with ``error``, ``confidence`` 0.0
            and ``raw_response`` when the JSON cannot be decoded.

        Raises:
            ValueError: If the response contains no ``{...}`` span at all.
        """
        # Find JSON in response (between first { and last })
        start = response.find('{')
        end = response.rfind('}') + 1
        if start < 0 or end <= start:
            raise ValueError("No JSON found in response")

        # Remove trailing commas before } or ]
        json_str = re.sub(r',(\s*[}\]])', r'\1', response[start:end])

        try:
            return json.loads(json_str)
        except json.JSONDecodeError as first_error:
            # Quote replacement is risky (it also touches apostrophes inside
            # values), so it is only attempted after the plain parse failed.
            try:
                data = json.loads(json_str.replace("'", '"'))
            except json.JSONDecodeError:
                # Last resort: log the problematic JSON and report the
                # original parse error to the caller.
                logger.error(f"❌ Problematic JSON: {json_str[:300]}")
                logger.error(f"❌ JSON parsing failed: {first_error}")
                logger.error(f"Raw response preview: {response[:500]}")
                return {
                    "error": f"JSON parsing failed: {str(first_error)}",
                    "confidence": 0.0,
                    "raw_response": response[:500],
                }
            logger.warning("⚠️ Fixed JSON with quote replacement")
            return data

    def calculate_file_checksum(self, file_path: Path) -> str:
        """Calculate SHA256 checksum of file for duplicate detection.

        The file is read in 8 KiB chunks so large files are never loaded
        into memory at once. Returns the hex digest.
        """
        sha256 = hashlib.sha256()
        with open(file_path, 'rb') as f:
            while chunk := f.read(8192):
                sha256.update(chunk)
        checksum = sha256.hexdigest()
        logger.info(f"📋 Calculated checksum: {checksum[:16]}... for {file_path.name}")
        return checksum

    async def _extract_text_from_file(self, file_path: Path) -> str:
        """Extract text from PDF, image, or text file.

        Dispatches on the lower-cased file extension.

        Raises:
            ValueError: For an unsupported extension. Any extraction error is
                logged and re-raised.
        """
        suffix = file_path.suffix.lower()

        try:
            if suffix == '.pdf':
                return await self._extract_text_from_pdf(file_path)
            if suffix in ('.png', '.jpg', '.jpeg'):
                return await self._extract_text_from_image(file_path)
            if suffix in ('.txt', '.csv'):
                # Undecodable bytes are dropped rather than failing the read.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    return f.read()
            raise ValueError(f"Unsupported file type: {suffix}")
        except Exception as e:
            logger.error(f"❌ Text extraction failed for {file_path.name}: {e}")
            raise

    async def _extract_text_from_pdf(self, file_path: Path) -> str:
        """Extract text from PDF using pdfplumber (better table/layout support).

        Pages that yield no text are skipped; the remaining page texts are
        joined with a newline. Errors are logged and re-raised.
        """
        try:
            import pdfplumber

            all_text = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # Strategy: Use regular text extraction (includes tables)
                    page_text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
                    if page_text:
                        all_text.append(page_text)

            # A real newline between pages; the two-character sequence
            # backslash + n would leak into the text sent to the LLM.
            text = "\n".join(all_text)
            logger.info(f"📄 Extracted {len(text)} chars from PDF with pdfplumber")
            return text

        except Exception as e:
            logger.error(f"❌ PDF extraction failed: {e}")
            raise

    async def _extract_text_from_image(self, file_path: Path) -> str:
        """Extract text from image using Tesseract OCR.

        Tries Danish + English first; if that fails for any reason, retries
        with English only. A failure of the retry propagates to the caller.
        """
        # Imported outside the try block so a missing dependency surfaces as
        # ImportError instead of a NameError inside the fallback below.
        import pytesseract
        from PIL import Image

        try:
            # Use Danish + English for OCR
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image, lang='dan+eng')
            logger.info(f"🖼️ Extracted {len(text)} chars from image via OCR")
            return text

        except Exception as e:
            logger.error(f"❌ OCR extraction failed: {e}")
            # Fallback to English only
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image, lang='eng')
            logger.warning(f"⚠️ Fallback to English OCR: {len(text)} chars")
            return text

    def _get_mime_type(self, file_path: Path) -> str:
        """Get MIME type from file extension.

        The extension is matched case-insensitively; unknown extensions map
        to ``application/octet-stream``.
        """
        return self._MIME_TYPES.get(file_path.suffix.lower(), 'application/octet-stream')

    def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]:
        """
        Match vendor from database using CVR number

        Args:
            vendor_cvr: CVR number from extraction. A numeric value is
                accepted too, since the LLM may emit the CVR as a JSON number.

        Returns:
            Vendor dict if found, None otherwise (also when the cleaned CVR
            is not exactly 8 digits).
        """
        if not vendor_cvr:
            return None

        # Clean CVR (remove spaces, dashes)
        cvr_clean = re.sub(r'[^0-9]', '', str(vendor_cvr))

        if len(cvr_clean) != 8:
            logger.warning(f"⚠️ Invalid CVR format: {vendor_cvr} (cleaned: {cvr_clean})")
            return None

        # Search vendors table
        vendor = execute_query(
            "SELECT * FROM vendors WHERE cvr = %s",
            (cvr_clean,),
            fetchone=True
        )

        if vendor:
            logger.info(f"✅ Matched vendor: {vendor['name']} (CVR: {cvr_clean})")
            return vendor

        logger.info(f"⚠️ No vendor found with CVR: {cvr_clean}")
        return None
|
|
|
|
|
|
# Global instance
|
|
# Shared OllamaService instance, constructed once when this module is imported.
ollama_service = OllamaService()
|