- Added `check_invoice_number_exists` method in `EconomicService` to verify invoice numbers in e-conomic journals.
- Introduced `quick_analysis_on_upload` method in `OllamaService` for extracting critical fields from uploaded PDFs, including CVR, document type, and document number.
- Created migration script to add new fields for storing detected CVR, vendor ID, document type, and document number in the `incoming_files` table.
- Developed comprehensive tests for the quick analysis functionality, validating CVR detection, document type identification, and invoice number extraction.
601 lines
25 KiB
Python
601 lines
25 KiB
Python
"""
|
|
Ollama Integration Service for BMC Hub
|
|
Handles supplier invoice extraction using Ollama LLM with CVR matching
|
|
"""
|
|
|
|
import json
|
|
import hashlib
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, List, Tuple
|
|
from datetime import datetime
|
|
import re
|
|
|
|
from app.core.config import settings
|
|
from app.core.database import execute_insert, execute_query, execute_update
|
|
|
|
logger = logging.getLogger(__name__)


class OllamaService:
    """Service for extracting supplier invoice data using Ollama LLM.

    The service offers two levels of analysis:

    * ``extract_from_text`` sends the full document text to the configured
      Ollama model and returns the structured JSON it produces.
    * ``quick_analysis_on_upload`` is a regex-only pass that pulls out the CVR
      number, document type and document number without calling the LLM.
    """

    # Timeout (seconds) for a single HTTP request to Ollama.
    REQUEST_TIMEOUT_SECONDS = 1000.0

    # Sampling options sent with every generation request.
    GENERATION_OPTIONS = {
        "temperature": 0.1,
        "top_p": 0.9,
        "num_predict": 2000,
    }

    # Amount fields whose sign is normalised after extraction.
    _HEADER_AMOUNT_FIELDS = ('total_amount', 'vat_amount')
    _LINE_AMOUNT_FIELDS = ('unit_price', 'line_total', 'vat_amount')

    _MIME_TYPES = {
        '.pdf': 'application/pdf',
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.txt': 'text/plain',
        '.csv': 'text/csv',
    }

    # Tried in order; labelled CVR numbers therefore rank before bare numbers.
    # The "nr"/"nummer" variants accept punctuation between the label and the
    # digits so that e.g. "CVR-nr.: 12345678" is recognised as labelled.
    _CVR_PATTERNS = (
        re.compile(r'CVR[:\-\s]*(\d{8})', re.IGNORECASE),
        re.compile(r'CVR[:\-\s]*nr\.?[\s:.\-]*(\d{8})', re.IGNORECASE),
        re.compile(r'CVR[:\-\s]*nummer[\s:.\-]*(\d{8})', re.IGNORECASE),
        re.compile(r'SE[:\-\s]*(\d{8})', re.IGNORECASE),  # SE-prefixed registration numbers
        re.compile(r'\b(\d{8})\b', re.IGNORECASE),  # Fallback: any 8-digit number
    )

    _CREDIT_KEYWORDS = (
        'kreditnota', 'credit note', 'creditnote', 'kreditfaktura',
        'refusion', 'tilbagebetaling', 'godtgørelse', 'tilbageførsel',
    )

    _CREDIT_NUMBER_PATTERNS = (
        re.compile(r'kreditnota\s*(?:nr\.?|nummer)[:\s]*(\S+)', re.IGNORECASE),
        re.compile(r'credit\s*note\s*(?:no\.?|number)[:\s]*(\S+)', re.IGNORECASE),
        re.compile(r'kreditfaktura\s*(?:nr\.?|nummer)[:\s]*(\S+)', re.IGNORECASE),
    )

    _INVOICE_NUMBER_PATTERNS = (
        re.compile(r'faktura\s*(?:nr\.?|nummer)[:\s]*(\S+)', re.IGNORECASE),
        re.compile(r'invoice\s*(?:no\.?|number)[:\s]*(\S+)', re.IGNORECASE),
        re.compile(r'fakturanr\.?\s*[:\s]*(\S+)', re.IGNORECASE),
    )

    def __init__(self):
        """Read endpoint and model from settings and build the system prompt."""
        self.endpoint = settings.OLLAMA_ENDPOINT
        self.model = settings.OLLAMA_MODEL
        self.system_prompt = self._build_system_prompt()
        logger.info(f"🤖 Initialized OllamaService: {self.endpoint}, model={self.model}")

    def _build_system_prompt(self) -> str:
        """Build Danish system prompt for invoice extraction with CVR.

        The prompt text is sent to the model verbatim; it defines the JSON
        schema and the sign convention (positive for invoices, negative for
        credit notes) that ``_normalize_amount_signs`` enforces afterwards.
        """
        return """Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer, kreditnotaer og leverandørdokumenter.

VIGTIGE REGLER:
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
2. Hvis et felt ikke findes, sæt det til null
3. Beregn confidence baseret på hvor sikker du er på hvert felt (0.0-1.0)
4. Datoer skal være i format YYYY-MM-DD
5. DANSKE PRISFORMATER:
- Tusind-separator kan være . (punkt) eller mellemrum: "5.965,18" eller "5 965,18"
- Decimal-separator er , (komma): "1.234,56 kr"
- I JSON output skal du bruge . (punkt) som decimal: 1234.56
- Eksempel: "5.965,18 kr" → 5965.18 i JSON
- Eksempel: "1.234,56 DKK" → 1234.56 i JSON
6. CVR-nummer skal være 8 cifre uden mellemrum
7. Moms/VAT skal udtrækkes fra hver linje hvis muligt
8. DOKUMENTTYPE DETEKTION:
- "invoice" = Almindelig faktura
- "credit_note" = Kreditnota (refusion, tilbagebetaling, korrektion)
- Kig efter ord som: "Kreditnota", "Credit Note", "Refusion", "Tilbagebetaling", "Godtgørelse"
9. BELØB OG FORTEGN (ABSOLUT KRITISK):
- **ALMINDELIGE FAKTURAER**: Alle beløb skal være POSITIVE tal (total_amount > 0, line_total > 0)
- **KREDITNOTAER**: Alle beløb skal være NEGATIVE tal (total_amount < 0, line_total < 0)
- Hvis dokumentet siger "Faktura" → document_type: "invoice" → POSITIVE beløb
- Hvis dokumentet siger "Kreditnota" → document_type: "credit_note" → NEGATIVE beløb

JSON format skal være:
{
"document_type": "invoice" eller "credit_note",
"invoice_number": "fakturanummer eller kreditnota nummer",
"vendor_name": "leverandør firmanavn",
"vendor_cvr": "12345678",
"invoice_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"currency": "DKK",
"total_amount": 1234.56 (NEGATIVT for kreditnotaer),
"vat_amount": 123.45 (NEGATIVT for kreditnotaer),
"original_invoice_reference": "reference til original faktura (kun for kreditnotaer)",
"lines": [
{
"line_number": 1,
"description": "beskrivelse af varen/ydelsen",
"quantity": antal_som_tal,
"unit_price": pris_per_stk (NEGATIVT for kreditnotaer),
"line_total": total_for_linjen (NEGATIVT for kreditnotaer),
"vat_rate": 25.00,
"vat_amount": moms_beløb (NEGATIVT for kreditnotaer),
"confidence": 0.0_til_1.0
}
],
"confidence": gennemsnits_confidence,
"raw_text_snippet": "første 200 tegn fra dokumentet"
}

EKSEMPEL PÅ FAKTURA (POSITIVE BELØB):
Input: "FAKTURA 2025-001\\nGlobalConnect A/S\\nCVR: 12345678\\n1 stk iPhone 16 @ 5.965,18 DKK\\nMoms (25%): 1.491,30 DKK\\nTotal: 7.456,48 DKK"

Output: {
"document_type": "invoice",
"invoice_number": "2025-001",
"vendor_name": "GlobalConnect A/S",
"vendor_cvr": "12345678",
"total_amount": 7456.48,
"vat_amount": 1491.30,
"lines": [{
"line_number": 1,
"description": "iPhone 16",
"quantity": 1,
"unit_price": 5965.18,
"line_total": 5965.18,
"vat_rate": 25.00,
"vat_amount": 1491.30,
"confidence": 0.95
}],
"confidence": 0.95
}

EKSEMPEL PÅ KREDITNOTA (NEGATIVE BELØB):
Input: "KREDITNOTA CN-2025-042\\nGlobalConnect A/S\\nCVR: 12345678\\nReference: Faktura 2025-001\\nTilbagebetaling:\\n1 stk iPhone 16 returneret @ -5.965,18 DKK\\nMoms (25%): -1.491,30 DKK\\nTotal: -7.456,48 DKK"

Output: {
"document_type": "credit_note",
"invoice_number": "CN-2025-042",
"vendor_name": "GlobalConnect A/S",
"vendor_cvr": "12345678",
"original_invoice_reference": "2025-001",
"total_amount": -7456.48,
"vat_amount": -1491.30,
"lines": [{
"line_number": 1,
"description": "iPhone 16 returneret",
"quantity": 1,
"unit_price": -5965.18,
"line_total": -5965.18,
"vat_rate": 25.00,
"vat_amount": -1491.30,
"confidence": 0.95
}],
"confidence": 0.95
}"""

    async def extract_from_text(self, text: str) -> Dict:
        """
        Extract structured invoice data from text using Ollama.

        Models whose name starts with ``qwen3`` are called through the Chat
        API (``/api/chat``); all other models use the Generate API
        (``/api/generate``). The full text is sent without truncation.

        Args:
            text: Document text content

        Returns:
            Extracted data as dict with CVR, invoice number, amounts, etc.
            The raw model output is attached under ``_raw_llm_response``.
            On any failure a dict with ``error`` and ``confidence`` 0.0 is
            returned instead of raising.
        """
        logger.info(f"🤖 Extracting invoice data from text (length: {len(text)})")

        try:
            # Imported lazily so a missing httpx is reported as an error dict.
            import httpx

            # Detect if using qwen3 model (requires Chat API)
            use_chat_api = self.model.startswith('qwen3')

            async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT_SECONDS) as client:
                if use_chat_api:
                    logger.info(f"🤖 Using Chat API for {self.model}")
                    response = await client.post(
                        f"{self.endpoint}/api/chat",
                        json={
                            "model": self.model,
                            "messages": [
                                {"role": "system", "content": self.system_prompt},
                                {
                                    "role": "user",
                                    "content": f"NU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA:\n{text}\n\nVIGTIGT: Dit svar skal STARTE med {{ og SLUTTE med }} - ingen forklaring før eller efter JSON!",
                                },
                            ],
                            "stream": False,
                            "format": "json",
                            "options": dict(self.GENERATION_OPTIONS),
                        },
                    )
                else:
                    # qwen2.5 and other models use Generate API format
                    logger.info(f"🤖 Using Generate API for {self.model}")
                    prompt = f"{self.system_prompt}\n\nNU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA:\n{text}\n\nReturner kun gyldig JSON:"
                    response = await client.post(
                        f"{self.endpoint}/api/generate",
                        json={
                            "model": self.model,
                            "prompt": prompt,
                            "stream": False,
                            "options": dict(self.GENERATION_OPTIONS),
                        },
                    )

                if response.status_code != 200:
                    raise Exception(f"Ollama returned status {response.status_code}: {response.text}")

                result = response.json()

            raw_response = self._extract_raw_response(result, use_chat_api)
            logger.info(f"✅ Ollama extraction completed (response length: {len(raw_response)})")

            extraction = self._parse_json_response(raw_response)

            # The LLM sometimes returns the wrong sign for the document type.
            self._normalize_amount_signs(extraction)

            # Add raw response for debugging
            extraction['_raw_llm_response'] = raw_response
            return extraction

        except Exception as e:
            error_msg = f"Ollama extraction failed: {str(e)}"
            logger.error(f"❌ {error_msg}")

            # Include the exception class name: httpx timeout/connect errors can
            # have an empty message, so the text alone is not reliable.
            error_str = f"{type(e).__name__} {e}".lower()
            if "timeout" in error_str:
                return {
                    "error": f"Ollama timeout efter {self.REQUEST_TIMEOUT_SECONDS:.0f} sekunder",
                    "confidence": 0.0,
                }
            if "connection" in error_str or "connect" in error_str:
                return {
                    "error": f"Kan ikke forbinde til Ollama på {self.endpoint}",
                    "confidence": 0.0,
                }
            return {
                "error": error_msg,
                "confidence": 0.0,
            }

    def _extract_raw_response(self, result: Dict, use_chat_api: bool) -> str:
        """Pull the model's text answer out of a decoded Ollama response.

        For the Chat API the answer is ``message.content``; when that is empty
        but ``message.thinking`` has text, the span from the first ``{`` to the
        last ``}`` of the thinking text is used (or the whole thinking text if
        it contains no such span).
        """
        if not use_chat_api:
            return result.get("response", "") or ""

        message = result.get("message") or {}
        raw_response = message.get("content", "") or ""
        thinking = message.get("thinking", "") or ""

        if not raw_response and thinking:
            logger.info(f"💭 Content empty, attempting to extract JSON from thinking field (length: {len(thinking)})")
            json_start = thinking.find('{')
            json_end = thinking.rfind('}') + 1
            if json_start >= 0 and json_end > json_start:
                raw_response = thinking[json_start:json_end]
                logger.info(f"📦 Found potential JSON in thinking field (length: {len(raw_response)})")
            else:
                logger.warning(f"⚠️ No JSON found in thinking field, using full thinking as fallback")
                raw_response = thinking
        elif thinking:
            logger.info(f"💭 Model thinking (length: {len(thinking)})")

        # Log the result structure to help diagnose unexpected payloads.
        logger.info(f"📊 Chat API result keys: {list(result.keys())}")
        logger.info(f"📊 Message keys: {list(message.keys())}")
        return raw_response

    def _normalize_amount_signs(self, extraction: Dict) -> None:
        """Force amount signs to match ``document_type`` (in place).

        Invoices get non-negative amounts, credit notes non-positive ones.
        Other document types are left untouched. Non-numeric values, a
        missing/``null`` ``lines`` entry and non-dict lines are skipped rather
        than raising, so one malformed field cannot discard the extraction.
        """
        document_type = extraction.get('document_type', 'invoice')
        if document_type == 'invoice':
            expected_sign = 1
        elif document_type == 'credit_note':
            expected_sign = -1
        else:
            return

        def has_wrong_sign(value) -> bool:
            # bool is an int subclass; never treat it as an amount.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                return False
            return value * expected_sign < 0

        total = extraction.get('total_amount')
        if has_wrong_sign(total):
            wrong = 'negative' if expected_sign > 0 else 'positive'
            logger.warning(f"⚠️ Fixing {wrong} total_amount for {document_type}: {total} → {-total}")

        for field in self._HEADER_AMOUNT_FIELDS:
            if has_wrong_sign(extraction.get(field)):
                extraction[field] = -extraction[field]

        lines = extraction.get('lines')
        if not isinstance(lines, list):
            return
        for line in lines:
            if not isinstance(line, dict):
                continue
            for field in self._LINE_AMOUNT_FIELDS:
                if has_wrong_sign(line.get(field)):
                    line[field] = -line[field]

    def _parse_json_response(self, response: str) -> Dict:
        """Parse JSON from LLM response with improved error handling.

        The text between the first ``{`` and the last ``}`` is parsed after
        trailing commas are removed. If that fails, one retry is made with
        single quotes replaced by double quotes.

        Returns:
            The parsed dict, or a dict with ``error``, ``confidence`` 0.0 and
            ``raw_response`` (first 500 chars) when no valid JSON is found.
        """
        if not isinstance(response, str):
            response = "" if response is None else str(response)

        logger.info(f"🔍 Response preview (first 500 chars): {response[:500]}")

        # Find JSON in response (between first { and last })
        start = response.find('{')
        end = response.rfind('}') + 1
        if start < 0 or end <= start:
            logger.error("❌ JSON parsing failed: No JSON found in response")
            return {
                "error": "No JSON found in response",
                "confidence": 0.0,
                "raw_response": response[:500],
            }

        json_str = response[start:end]
        logger.info(f"🔍 Extracted JSON string length: {len(json_str)}, starts at position {start}")

        # Remove trailing commas before } or ]
        json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)

        try:
            return json.loads(json_str)
        except json.JSONDecodeError as first_error:
            # Risky repair (also touches quotes inside values), so it is only
            # attempted after the strict parse has failed.
            try:
                data = json.loads(json_str.replace("'", '"'))
            except ValueError:
                logger.error(f"❌ Problematic JSON: {json_str[:300]}")
                logger.error(f"❌ JSON parsing failed: {first_error}")
                logger.error(f"Raw response preview: {response[:500]}")
                return {
                    "error": f"JSON parsing failed: {str(first_error)}",
                    "confidence": 0.0,
                    "raw_response": response[:500],
                }
            logger.warning("⚠️ Fixed JSON with quote replacement")
            return data

    def calculate_file_checksum(self, file_path: Path) -> str:
        """Calculate SHA256 checksum of file for duplicate detection.

        The file is read in 8 KiB chunks so large files are not loaded into
        memory at once. Returns the hex digest.
        """
        digest = hashlib.sha256()
        with open(file_path, 'rb') as handle:
            while chunk := handle.read(8192):
                digest.update(chunk)
        checksum = digest.hexdigest()
        logger.info(f"📋 Calculated checksum: {checksum[:16]}... for {file_path.name}")
        return checksum

    async def _extract_text_from_file(self, file_path: Path) -> str:
        """Extract text from PDF, image, or text file.

        Dispatches on the lower-cased file suffix.

        Raises:
            ValueError: for unsupported suffixes. Errors from the underlying
            extractor are logged and re-raised.
        """
        suffix = file_path.suffix.lower()

        try:
            if suffix == '.pdf':
                return await self._extract_text_from_pdf(file_path)
            if suffix in ['.png', '.jpg', '.jpeg']:
                return await self._extract_text_from_image(file_path)
            if suffix in ['.txt', '.csv']:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    return f.read()
            raise ValueError(f"Unsupported file type: {suffix}")
        except Exception as e:
            logger.error(f"❌ Text extraction failed for {file_path.name}: {e}")
            raise

    async def _extract_text_from_pdf(self, file_path: Path) -> str:
        """Extract text from PDF using pdfplumber (better table/layout support).

        Pages that yield no text are skipped; the remaining page texts are
        joined with a newline.
        """
        try:
            import pdfplumber

            page_texts = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
                    if page_text:
                        page_texts.append(page_text)

            # Real newline between pages (previously a literal backslash-n).
            text = "\n".join(page_texts)
            logger.info(f"📄 Extracted {len(text)} chars from PDF with pdfplumber")
            return text

        except Exception as e:
            logger.error(f"❌ PDF extraction failed: {e}")
            raise

    async def _extract_text_from_image(self, file_path: Path) -> str:
        """Extract text from image using Tesseract OCR.

        OCR is first attempted with Danish + English; if that fails, it is
        retried with English only. Import errors and failures to open the
        image propagate directly, since the fallback could not succeed either.
        """
        import pytesseract
        from PIL import Image

        # Context manager closes the underlying file handle.
        with Image.open(file_path) as image:
            try:
                text = pytesseract.image_to_string(image, lang='dan+eng')
            except Exception as e:
                logger.error(f"❌ OCR extraction failed: {e}")
                # Fallback to English only
                text = pytesseract.image_to_string(image, lang='eng')
                logger.warning(f"⚠️ Fallback to English OCR: {len(text)} chars")
                return text

        logger.info(f"🖼️ Extracted {len(text)} chars from image via OCR")
        return text

    def _get_mime_type(self, file_path: Path) -> str:
        """Get MIME type from file extension.

        Unknown extensions map to ``application/octet-stream``.
        """
        return self._MIME_TYPES.get(file_path.suffix.lower(), 'application/octet-stream')

    def _find_cvr_candidates(self, text: str) -> List[str]:
        """Return unique 8-digit CVR candidates found in ``text``.

        Patterns are tried in priority order (labelled before bare numbers);
        only numbers whose first digit is 1-4 are kept.
        """
        found: List[str] = []
        for pattern in self._CVR_PATTERNS:
            for match in pattern.finditer(text):
                candidate = match.group(1)
                if candidate[0] in '1234' and candidate not in found:
                    found.append(candidate)
        return found

    def _detect_document_type(self, text: str) -> str:
        """Return 'credit_note' if any credit keyword occurs in ``text``, else 'invoice'."""
        lowered = text.lower()
        if any(keyword in lowered for keyword in self._CREDIT_KEYWORDS):
            return 'credit_note'
        return 'invoice'

    def _extract_document_number(self, text: str, document_type: str) -> Optional[str]:
        """Return the first document number matching the patterns for ``document_type``."""
        if document_type == 'credit_note':
            patterns = self._CREDIT_NUMBER_PATTERNS
        else:
            patterns = self._INVOICE_NUMBER_PATTERNS
        for pattern in patterns:
            match = pattern.search(text)
            if match:
                return match.group(1).strip()
        return None

    async def quick_analysis_on_upload(self, pdf_text: str) -> Dict:
        """
        Quick analysis when file is uploaded - extracts critical fields only:
        - CVR number (to match vendor)
        - Document type (invoice vs credit note)
        - Invoice/credit note number

        This runs BEFORE template matching for early vendor detection and
        uses regular expressions only (no LLM call).

        Supplier invoices usually carry both the buyer's CVR (``settings.OWN_CVR``)
        and the seller's; the first CVR that is not our own is treated as the
        vendor. If only our own CVR is found, the document is flagged as an
        outgoing invoice.

        Args:
            pdf_text: Extracted text from PDF

        Returns:
            Dict with cvr, document_type, document_number, vendor_id, vendor_name, is_own_invoice
        """
        from app.core.config import settings

        logger.info("⚡ Running quick analysis on upload...")

        pdf_text = pdf_text or ""
        result = {
            "cvr": None,
            "document_type": None,  # 'invoice' or 'credit_note'
            "document_number": None,
            "vendor_id": None,
            "vendor_name": None,
            "is_own_invoice": False,  # True if this is an outgoing invoice (own CVR only)
        }

        # 1. FIND CVR NUMBER (8 digits)
        own_cvr = settings.OWN_CVR
        found_cvrs = self._find_cvr_candidates(pdf_text)
        vendor_cvrs = [cvr for cvr in found_cvrs if cvr != own_cvr]
        own_cvr_present = own_cvr in found_cvrs

        if vendor_cvrs:
            vendor_cvr = vendor_cvrs[0]
            result['cvr'] = vendor_cvr
            if own_cvr_present:
                logger.info(f"📋 Found vendor CVR: {vendor_cvr} (ignored BMC CVR: {own_cvr})")
            else:
                logger.info(f"📋 Found CVR: {vendor_cvr}")

            vendor = self.match_vendor_by_cvr(vendor_cvr)
            if vendor:
                result['vendor_id'] = vendor['id']
                result['vendor_name'] = vendor['name']
        elif own_cvr_present:
            # Only our own CVR found = this is an outgoing invoice
            result['is_own_invoice'] = True
            result['cvr'] = own_cvr
            logger.warning(f"⚠️ OUTGOING INVOICE: Only BMC CVR found")

        # 2. DETECT DOCUMENT TYPE (Invoice vs Credit Note)
        result['document_type'] = self._detect_document_type(pdf_text)
        if result['document_type'] == 'credit_note':
            logger.info("📄 Document type: CREDIT NOTE")
        else:
            logger.info("📄 Document type: INVOICE")

        # 3. EXTRACT DOCUMENT NUMBER
        document_number = self._extract_document_number(pdf_text, result['document_type'])
        if document_number is not None:
            result['document_number'] = document_number
            logger.info(f"🔢 Document number: {result['document_number']}")

        logger.info(f"✅ Quick analysis complete: CVR={result['cvr']}, Type={result['document_type']}, Number={result['document_number']}, Vendor={result['vendor_name']}")
        return result

    def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]:
        """
        Match vendor from database using CVR number.

        Non-digit characters are stripped first; the value is converted with
        ``str()`` so a CVR delivered as a JSON number is accepted too.

        Args:
            vendor_cvr: CVR number from extraction

        Returns:
            Vendor dict if found, None otherwise (also when the cleaned CVR
            is not exactly 8 digits)
        """
        if not vendor_cvr:
            return None

        # Clean CVR (remove spaces, dashes)
        cvr_clean = re.sub(r'[^0-9]', '', str(vendor_cvr))

        if len(cvr_clean) != 8:
            logger.warning(f"⚠️ Invalid CVR format: {vendor_cvr} (cleaned: {cvr_clean})")
            return None

        # Search vendors table
        vendor = execute_query(
            "SELECT * FROM vendors WHERE cvr_number = %s",
            (cvr_clean,),
            fetchone=True
        )

        if vendor:
            logger.info(f"✅ Matched vendor: {vendor['name']} (CVR: {cvr_clean})")
            return vendor

        logger.info(f"⚠️ No vendor found with CVR: {cvr_clean}")
        return None
|
|
|
|
|
|
# Global instance, created at import time. Constructing it reads the Ollama
# endpoint and model from settings and logs the resulting configuration.
ollama_service = OllamaService()
|