2025-12-07 03:29:54 +01:00
"""
Ollama Integration Service for BMC Hub
Handles supplier invoice extraction using Ollama LLM with CVR matching
"""
import json
import hashlib
import logging
2026-01-25 03:29:28 +01:00
import os
2025-12-07 03:29:54 +01:00
from pathlib import Path
from typing import Optional , Dict , List , Tuple
from datetime import datetime
import re
from app . core . config import settings
2025-12-16 15:36:11 +01:00
from app . core . database import execute_insert , execute_query , execute_update , execute_query_single
2025-12-07 03:29:54 +01:00
logger = logging . getLogger ( __name__ )
class OllamaService :
""" Service for extracting supplier invoice data using Ollama LLM """
def __init__ ( self ) :
self . endpoint = settings . OLLAMA_ENDPOINT
self . model = settings . OLLAMA_MODEL
self . system_prompt = self . _build_system_prompt ( )
logger . info ( f " 🤖 Initialized OllamaService: { self . endpoint } , model= { self . model } " )
def _build_system_prompt ( self ) - > str :
""" Build Danish system prompt for invoice extraction with CVR """
2026-03-02 09:01:43 +01:00
own_cvr = getattr ( settings , ' OWN_CVR ' , ' 29522790 ' )
own_cvr_rule = (
f " 4b. KRITISK - LEVERANDØR CVR: CVR { own_cvr } er VORES eget CVR (køberen/modtageren). "
f " Sæt ALDRIG vendor_cvr til { own_cvr } ! Leverandørens CVR er CVR-nummeret der hører til "
f " firmaet som har SENDT fakturaen (ikke modtageren). \n "
)
return ( """ Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer, kreditnotaer og leverandørdokumenter.
2025-12-07 03:29:54 +01:00
VIGTIGE REGLER :
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
2. Hvis et felt ikke findes , sæt det til null
3. Beregn confidence baseret på hvor sikker du er på hvert felt ( 0.0 - 1.0 )
4. Datoer skal være i format YYYY - MM - DD
2026-03-02 09:01:43 +01:00
""" + own_cvr_rule + """ 5. DANSKE PRISFORMATER :
2025-12-08 09:15:52 +01:00
- Tusind - separator kan være . ( punkt ) eller mellemrum : " 5.965,18 " eller " 5 965,18 "
- Decimal - separator er , ( komma ) : " 1.234,56 kr "
- I JSON output skal du bruge . ( punkt ) som decimal : 1234.56
- Eksempel : " 5.965,18 kr " → 5965.18 i JSON
- Eksempel : " 1.234,56 DKK " → 1234.56 i JSON
2025-12-07 03:29:54 +01:00
6. CVR - nummer skal være 8 cifre uden mellemrum
7. Moms / VAT skal udtrækkes fra hver linje hvis muligt
2025-12-08 09:15:52 +01:00
8. DOKUMENTTYPE DETEKTION :
- " invoice " = Almindelig faktura
- " credit_note " = Kreditnota ( refusion , tilbagebetaling , korrektion )
- Kig efter ord som : " Kreditnota " , " Credit Note " , " Refusion " , " Tilbagebetaling " , " Godtgørelse "
9. BELØB OG FORTEGN ( ABSOLUT KRITISK ) :
- * * ALMINDELIGE FAKTURAER * * : Alle beløb skal være POSITIVE tal ( total_amount > 0 , line_total > 0 )
- * * KREDITNOTAER * * : Alle beløb skal være NEGATIVE tal ( total_amount < 0 , line_total < 0 )
- Hvis dokumentet siger " Faktura " → document_type : " invoice " → POSITIVE beløb
- Hvis dokumentet siger " Kreditnota " → document_type : " credit_note " → NEGATIVE beløb
2025-12-07 03:29:54 +01:00
JSON format skal være :
{
2025-12-08 09:15:52 +01:00
" document_type " : " invoice " eller " credit_note " ,
" invoice_number " : " fakturanummer eller kreditnota nummer " ,
2025-12-07 03:29:54 +01:00
" vendor_name " : " leverandør firmanavn " ,
" vendor_cvr " : " 12345678 " ,
" invoice_date " : " YYYY-MM-DD " ,
" due_date " : " YYYY-MM-DD " ,
" currency " : " DKK " ,
2025-12-08 09:15:52 +01:00
" total_amount " : 1234.56 ( NEGATIVT for kreditnotaer ) ,
" vat_amount " : 123.45 ( NEGATIVT for kreditnotaer ) ,
" original_invoice_reference " : " reference til original faktura (kun for kreditnotaer) " ,
2025-12-07 03:29:54 +01:00
" lines " : [
{
" line_number " : 1 ,
" description " : " beskrivelse af varen/ydelsen " ,
" quantity " : antal_som_tal ,
2025-12-08 09:15:52 +01:00
" unit_price " : pris_per_stk ( NEGATIVT for kreditnotaer ) ,
" line_total " : total_for_linjen ( NEGATIVT for kreditnotaer ) ,
2025-12-07 03:29:54 +01:00
" vat_rate " : 25.00 ,
2025-12-08 09:15:52 +01:00
" vat_amount " : moms_beløb ( NEGATIVT for kreditnotaer ) ,
2025-12-07 03:29:54 +01:00
" confidence " : 0.0 _til_1 .0
}
] ,
" confidence " : gennemsnits_confidence ,
" raw_text_snippet " : " første 200 tegn fra dokumentet "
}
2025-12-08 09:15:52 +01:00
EKSEMPEL PÅ FAKTURA ( POSITIVE BELØB ) :
Input : " FAKTURA 2025-001 \\ nGlobalConnect A/S \\ nCVR: 12345678 \\ n1 stk iPhone 16 @ 5.965,18 DKK \\ nMoms (25 % ): 1.491,30 DKK \\ nTotal: 7.456,48 DKK "
2025-12-07 03:29:54 +01:00
Output : {
" document_type " : " invoice " ,
" invoice_number " : " 2025-001 " ,
" vendor_name " : " GlobalConnect A/S " ,
" vendor_cvr " : " 12345678 " ,
2025-12-08 09:15:52 +01:00
" total_amount " : 7456.48 ,
" vat_amount " : 1491.30 ,
" lines " : [ {
" line_number " : 1 ,
" description " : " iPhone 16 " ,
" quantity " : 1 ,
" unit_price " : 5965.18 ,
" line_total " : 5965.18 ,
" vat_rate " : 25.00 ,
" vat_amount " : 1491.30 ,
" confidence " : 0.95
} ] ,
" confidence " : 0.95
}
EKSEMPEL PÅ KREDITNOTA ( NEGATIVE BELØB ) :
Input : " KREDITNOTA CN-2025-042 \\ nGlobalConnect A/S \\ nCVR: 12345678 \\ nReference: Faktura 2025-001 \\ nTilbagebetaling: \\ n1 stk iPhone 16 returneret @ -5.965,18 DKK \\ nMoms (25 % ): -1.491,30 DKK \\ nTotal: -7.456,48 DKK "
Output : {
" document_type " : " credit_note " ,
" invoice_number " : " CN-2025-042 " ,
" vendor_name " : " GlobalConnect A/S " ,
" vendor_cvr " : " 12345678 " ,
" original_invoice_reference " : " 2025-001 " ,
" total_amount " : - 7456.48 ,
" vat_amount " : - 1491.30 ,
2025-12-07 03:29:54 +01:00
" lines " : [ {
" line_number " : 1 ,
2025-12-08 09:15:52 +01:00
" description " : " iPhone 16 returneret " ,
2025-12-07 03:29:54 +01:00
" quantity " : 1 ,
2025-12-08 09:15:52 +01:00
" unit_price " : - 5965.18 ,
" line_total " : - 5965.18 ,
2025-12-07 03:29:54 +01:00
" vat_rate " : 25.00 ,
2025-12-08 09:15:52 +01:00
" vat_amount " : - 1491.30 ,
2025-12-07 03:29:54 +01:00
" confidence " : 0.95
} ] ,
" confidence " : 0.95
2026-03-02 09:01:43 +01:00
} """ )
2025-12-07 03:29:54 +01:00
async def extract_from_text ( self , text : str ) - > Dict :
"""
Extract structured invoice data from text using Ollama
Args :
text : Document text content
Returns :
Extracted data as dict with CVR , invoice number , amounts , etc .
"""
2025-12-08 09:15:52 +01:00
# No truncation - send full text to AI
prompt = f " { self . system_prompt } \n \n NU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA: \n { text } \n \n Returner kun gyldig JSON: "
2025-12-07 03:29:54 +01:00
logger . info ( f " 🤖 Extracting invoice data from text (length: { len ( text ) } ) " )
try :
import httpx
2025-12-08 23:46:18 +01:00
# Detect if using qwen3 model (requires Chat API)
use_chat_api = self . model . startswith ( ' qwen3 ' )
2025-12-07 03:29:54 +01:00
async with httpx . AsyncClient ( timeout = 1000.0 ) as client :
2025-12-08 23:46:18 +01:00
if use_chat_api :
# qwen3 models use Chat API format
logger . info ( f " 🤖 Using Chat API for { self . model } " )
response = await client . post (
f " { self . endpoint } /api/chat " ,
json = {
" model " : self . model ,
" messages " : [
{
" role " : " system " ,
" content " : self . system_prompt
} ,
{
" role " : " user " ,
" content " : f " NU SKAL DU UDTRÆKKE DATA FRA DENNE FAKTURA: \n { text } \n \n VIGTIGT: Dit svar skal STARTE med {{ og SLUTTE med }} - ingen forklaring før eller efter JSON! "
}
] ,
" stream " : False ,
" format " : " json " ,
" options " : {
" temperature " : 0.1 ,
" top_p " : 0.9 ,
" num_predict " : 2000
}
}
)
else :
# qwen2.5 and other models use Generate API format
logger . info ( f " 🤖 Using Generate API for { self . model } " )
response = await client . post (
f " { self . endpoint } /api/generate " ,
json = {
" model " : self . model ,
" prompt " : prompt ,
" stream " : False ,
" options " : {
" temperature " : 0.1 ,
" top_p " : 0.9 ,
" num_predict " : 2000
}
2025-12-07 03:29:54 +01:00
}
2025-12-08 23:46:18 +01:00
)
2025-12-07 03:29:54 +01:00
if response . status_code != 200 :
raise Exception ( f " Ollama returned status { response . status_code } : { response . text } " )
result = response . json ( )
2025-12-08 23:46:18 +01:00
# Extract response based on API type
if use_chat_api :
# qwen3 models sometimes put the actual response in "thinking" field
raw_response = result . get ( " message " , { } ) . get ( " content " , " " )
thinking = result . get ( " message " , { } ) . get ( " thinking " , " " )
# If content is empty but thinking has data, try to extract JSON from thinking
if not raw_response and thinking :
logger . info ( f " 💭 Content empty, attempting to extract JSON from thinking field (length: { len ( thinking ) } ) " )
# Try to find JSON block in thinking text
json_start = thinking . find ( ' { ' )
json_end = thinking . rfind ( ' } ' ) + 1
if json_start > = 0 and json_end > json_start :
potential_json = thinking [ json_start : json_end ]
logger . info ( f " 📦 Found potential JSON in thinking field (length: { len ( potential_json ) } ) " )
raw_response = potential_json
else :
logger . warning ( f " ⚠️ No JSON found in thinking field, using full thinking as fallback " )
raw_response = thinking
elif thinking :
logger . info ( f " 💭 Model thinking (length: { len ( thinking ) } ) " )
# DEBUG: Log full result structure
logger . info ( f " 📊 Chat API result keys: { list ( result . keys ( ) ) } " )
logger . info ( f " 📊 Message keys: { list ( result . get ( ' message ' , { } ) . keys ( ) ) } " )
else :
raw_response = result . get ( " response " , " " )
2025-12-07 03:29:54 +01:00
logger . info ( f " ✅ Ollama extraction completed (response length: { len ( raw_response ) } ) " )
# Parse JSON from response
extraction = self . _parse_json_response ( raw_response )
2025-12-08 09:15:52 +01:00
# CRITICAL: Fix amount signs based on document_type
# LLM sometimes returns negative amounts for invoices - fix this!
document_type = extraction . get ( ' document_type ' , ' invoice ' )
if document_type == ' invoice ' :
# Normal invoices should have POSITIVE amounts
if extraction . get ( ' total_amount ' ) and extraction [ ' total_amount ' ] < 0 :
logger . warning ( f " ⚠️ Fixing negative total_amount for invoice: { extraction [ ' total_amount ' ] } → { abs ( extraction [ ' total_amount ' ] ) } " )
extraction [ ' total_amount ' ] = abs ( extraction [ ' total_amount ' ] )
if extraction . get ( ' vat_amount ' ) and extraction [ ' vat_amount ' ] < 0 :
extraction [ ' vat_amount ' ] = abs ( extraction [ ' vat_amount ' ] )
# Fix line totals
if ' lines ' in extraction :
for line in extraction [ ' lines ' ] :
if line . get ( ' unit_price ' ) and line [ ' unit_price ' ] < 0 :
line [ ' unit_price ' ] = abs ( line [ ' unit_price ' ] )
if line . get ( ' line_total ' ) and line [ ' line_total ' ] < 0 :
line [ ' line_total ' ] = abs ( line [ ' line_total ' ] )
if line . get ( ' vat_amount ' ) and line [ ' vat_amount ' ] < 0 :
line [ ' vat_amount ' ] = abs ( line [ ' vat_amount ' ] )
elif document_type == ' credit_note ' :
# Credit notes should have NEGATIVE amounts
if extraction . get ( ' total_amount ' ) and extraction [ ' total_amount ' ] > 0 :
logger . warning ( f " ⚠️ Fixing positive total_amount for credit_note: { extraction [ ' total_amount ' ] } → { - abs ( extraction [ ' total_amount ' ] ) } " )
extraction [ ' total_amount ' ] = - abs ( extraction [ ' total_amount ' ] )
if extraction . get ( ' vat_amount ' ) and extraction [ ' vat_amount ' ] > 0 :
extraction [ ' vat_amount ' ] = - abs ( extraction [ ' vat_amount ' ] )
# Fix line totals
if ' lines ' in extraction :
for line in extraction [ ' lines ' ] :
if line . get ( ' unit_price ' ) and line [ ' unit_price ' ] > 0 :
line [ ' unit_price ' ] = - abs ( line [ ' unit_price ' ] )
if line . get ( ' line_total ' ) and line [ ' line_total ' ] > 0 :
line [ ' line_total ' ] = - abs ( line [ ' line_total ' ] )
if line . get ( ' vat_amount ' ) and line [ ' vat_amount ' ] > 0 :
line [ ' vat_amount ' ] = - abs ( line [ ' vat_amount ' ] )
2025-12-07 03:29:54 +01:00
# Add raw response for debugging
extraction [ ' _raw_llm_response ' ] = raw_response
return extraction
except Exception as e :
error_msg = f " Ollama extraction failed: { str ( e ) } "
logger . error ( f " ❌ { error_msg } " )
error_str = str ( e ) . lower ( )
if " timeout " in error_str :
return {
" error " : f " Ollama timeout efter 1000 sekunder " ,
" confidence " : 0.0
}
elif " connection " in error_str or " connect " in error_str :
return {
" error " : f " Kan ikke forbinde til Ollama på { self . endpoint } " ,
" confidence " : 0.0
}
else :
return {
" error " : error_msg ,
" confidence " : 0.0
}
def _parse_json_response ( self , response : str ) - > Dict :
""" Parse JSON from LLM response with improved error handling """
try :
2025-12-08 23:46:18 +01:00
# Log preview of response for debugging
logger . info ( f " 🔍 Response preview (first 500 chars): { response [ : 500 ] } " )
2025-12-07 03:29:54 +01:00
# Find JSON in response (between first { and last })
start = response . find ( ' { ' )
end = response . rfind ( ' } ' ) + 1
if start > = 0 and end > start :
json_str = response [ start : end ]
2025-12-08 23:46:18 +01:00
logger . info ( f " 🔍 Extracted JSON string length: { len ( json_str ) } , starts at position { start } " )
2025-12-07 03:29:54 +01:00
# Try to fix common JSON issues
# Remove trailing commas before } or ]
json_str = re . sub ( r ' ,( \ s*[} \ ]]) ' , r ' \ 1 ' , json_str )
# Fix single quotes to double quotes (but not in values)
# This is risky, so we only do it if initial parse fails
try :
data = json . loads ( json_str )
return data
except json . JSONDecodeError :
# Try to fix common issues
# Replace single quotes with double quotes (simple approach)
fixed_json = json_str . replace ( " ' " , ' " ' )
try :
data = json . loads ( fixed_json )
logger . warning ( " ⚠️ Fixed JSON with quote replacement " )
return data
except :
pass
# Last resort: log the problematic JSON
logger . error ( f " ❌ Problematic JSON: { json_str [ : 300 ] } " )
raise
else :
raise ValueError ( " No JSON found in response " )
except json . JSONDecodeError as e :
logger . error ( f " ❌ JSON parsing failed: { e } " )
logger . error ( f " Raw response preview: { response [ : 500 ] } " )
return {
" error " : f " JSON parsing failed: { str ( e ) } " ,
" confidence " : 0.0 ,
" raw_response " : response [ : 500 ]
}
def calculate_file_checksum ( self , file_path : Path ) - > str :
""" Calculate SHA256 checksum of file for duplicate detection """
sha256 = hashlib . sha256 ( )
with open ( file_path , ' rb ' ) as f :
while chunk := f . read ( 8192 ) :
sha256 . update ( chunk )
checksum = sha256 . hexdigest ( )
logger . info ( f " 📋 Calculated checksum: { checksum [ : 16 ] } ... for { file_path . name } " )
return checksum
async def _extract_text_from_file ( self , file_path : Path ) - > str :
""" Extract text from PDF, image, or text file """
suffix = file_path . suffix . lower ( )
try :
if suffix == ' .pdf ' :
return await self . _extract_text_from_pdf ( file_path )
elif suffix in [ ' .png ' , ' .jpg ' , ' .jpeg ' ] :
return await self . _extract_text_from_image ( file_path )
elif suffix in [ ' .txt ' , ' .csv ' ] :
with open ( file_path , ' r ' , encoding = ' utf-8 ' , errors = ' ignore ' ) as f :
return f . read ( )
else :
raise ValueError ( f " Unsupported file type: { suffix } " )
except Exception as e :
logger . error ( f " ❌ Text extraction failed for { file_path . name } : { e } " )
raise
async def _extract_text_from_pdf ( self , file_path : Path ) - > str :
2025-12-08 09:15:52 +01:00
""" Extract text from PDF using pdfplumber (better table/layout support) """
2025-12-07 03:29:54 +01:00
try :
2025-12-08 09:15:52 +01:00
import pdfplumber
2025-12-07 03:29:54 +01:00
2025-12-08 09:15:52 +01:00
all_text = [ ]
with pdfplumber . open ( file_path ) as pdf :
for page_num , page in enumerate ( pdf . pages ) :
# Strategy: Use regular text extraction (includes tables)
# pdfplumber's extract_text() handles tables better than PyPDF2
page_text = page . extract_text ( layout = True , x_tolerance = 2 , y_tolerance = 2 )
if page_text :
all_text . append ( page_text )
2025-12-07 03:29:54 +01:00
2025-12-08 09:15:52 +01:00
text = " \\ n " . join ( all_text )
logger . info ( f " 📄 Extracted { len ( text ) } chars from PDF with pdfplumber " )
2025-12-07 03:29:54 +01:00
return text
except Exception as e :
logger . error ( f " ❌ PDF extraction failed: { e } " )
raise
async def _extract_text_from_image ( self , file_path : Path ) - > str :
""" Extract text from image using Tesseract OCR """
try :
import pytesseract
from PIL import Image
image = Image . open ( file_path )
# Use Danish + English for OCR
text = pytesseract . image_to_string ( image , lang = ' dan+eng ' )
logger . info ( f " 🖼️ Extracted { len ( text ) } chars from image via OCR " )
return text
except Exception as e :
logger . error ( f " ❌ OCR extraction failed: { e } " )
# Fallback to English only
try :
text = pytesseract . image_to_string ( Image . open ( file_path ) , lang = ' eng ' )
logger . warning ( f " ⚠️ Fallback to English OCR: { len ( text ) } chars " )
return text
except :
raise
def _get_mime_type ( self , file_path : Path ) - > str :
""" Get MIME type from file extension """
suffix = file_path . suffix . lower ( )
mime_types = {
' .pdf ' : ' application/pdf ' ,
' .png ' : ' image/png ' ,
' .jpg ' : ' image/jpeg ' ,
' .jpeg ' : ' image/jpeg ' ,
' .txt ' : ' text/plain ' ,
' .csv ' : ' text/csv '
}
return mime_types . get ( suffix , ' application/octet-stream ' )
feat: Implement quick analysis on PDF upload for CVR, document type, and number extraction
- Added `check_invoice_number_exists` method in `EconomicService` to verify invoice numbers in e-conomic journals.
- Introduced `quick_analysis_on_upload` method in `OllamaService` for extracting critical fields from uploaded PDFs, including CVR, document type, and document number.
- Created migration script to add new fields for storing detected CVR, vendor ID, document type, and document number in the `incoming_files` table.
- Developed comprehensive tests for the quick analysis functionality, validating CVR detection, document type identification, and invoice number extraction.
2025-12-09 14:54:33 +01:00
async def quick_analysis_on_upload ( self , pdf_text : str ) - > Dict :
"""
Quick analysis when file is uploaded - extracts critical fields only :
- CVR number ( to match vendor )
- Document type ( invoice vs credit note )
- Invoice / credit note number
This runs BEFORE template matching for early vendor detection .
Args :
pdf_text : Extracted text from PDF
Returns :
Dict with cvr , document_type , document_number , vendor_id , vendor_name , is_own_invoice
"""
from app . core . config import settings
logger . info ( " ⚡ Running quick analysis on upload... " )
result = {
" cvr " : None ,
" document_type " : None , # 'invoice' or 'credit_note'
" document_number " : None ,
" vendor_id " : None ,
" vendor_name " : None ,
" is_own_invoice " : False # True if this is an outgoing invoice (BMC's own CVR)
}
# 1. FIND CVR NUMBER (8 digits)
# Look for patterns like "CVR: 12345678", "CVR-nr.: 12345678", "CVR 12345678"
# Important: Supplier invoices have BOTH buyer (BMC=29522790) and seller CVR
# We need the SELLER's CVR (not BMC's own)
cvr_patterns = [
r ' CVR[: \ - \ s]*( \ d {8} ) ' ,
r ' CVR[: \ - \ s]*nr \ .? \ s*( \ d {8} ) ' ,
r ' CVR[: \ - \ s]*nummer \ s*( \ d {8} ) ' ,
r ' SE[: \ - \ s]*( \ d {8} ) ' , # SE = Svensk CVR, men også brugt i DK
r ' \ b( \ d {8} ) \ b ' # Fallback: any 8-digit number
]
# Find ALL CVR numbers in document
found_cvrs = [ ]
for pattern in cvr_patterns :
matches = re . finditer ( pattern , pdf_text , re . IGNORECASE )
for match in matches :
cvr_candidate = match . group ( 1 )
# Validate it's a real CVR (starts with 1-4, not a random number)
if cvr_candidate [ 0 ] in ' 1234 ' and cvr_candidate not in found_cvrs :
found_cvrs . append ( cvr_candidate )
# Remove BMC's own CVR from list (buyer CVR, not seller)
vendor_cvrs = [ cvr for cvr in found_cvrs if cvr != settings . OWN_CVR ]
if settings . OWN_CVR in found_cvrs :
# This is a proper invoice where BMC is the buyer
if len ( vendor_cvrs ) > 0 :
# Found vendor CVR - use the first non-BMC CVR
result [ ' cvr ' ] = vendor_cvrs [ 0 ]
logger . info ( f " 📋 Found vendor CVR: { vendor_cvrs [ 0 ] } (ignored BMC CVR: { settings . OWN_CVR } ) " )
# Try to match vendor
vendor = self . match_vendor_by_cvr ( vendor_cvrs [ 0 ] )
if vendor :
result [ ' vendor_id ' ] = vendor [ ' id ' ]
result [ ' vendor_name ' ] = vendor [ ' name ' ]
else :
# Only BMC's CVR found = this is an outgoing invoice
result [ ' is_own_invoice ' ] = True
result [ ' cvr ' ] = settings . OWN_CVR
logger . warning ( f " ⚠️ OUTGOING INVOICE: Only BMC CVR found " )
elif len ( vendor_cvrs ) > 0 :
# No BMC CVR, but other CVR found - use first one
result [ ' cvr ' ] = vendor_cvrs [ 0 ]
logger . info ( f " 📋 Found CVR: { vendor_cvrs [ 0 ] } " )
vendor = self . match_vendor_by_cvr ( vendor_cvrs [ 0 ] )
if vendor :
result [ ' vendor_id ' ] = vendor [ ' id ' ]
result [ ' vendor_name ' ] = vendor [ ' name ' ]
# 2. DETECT DOCUMENT TYPE (Invoice vs Credit Note)
credit_keywords = [
' kreditnota ' , ' credit note ' , ' creditnote ' , ' kreditfaktura ' ,
' refusion ' , ' tilbagebetaling ' , ' godtgørelse ' , ' tilbageførsel '
]
text_lower = pdf_text . lower ( )
is_credit_note = any ( keyword in text_lower for keyword in credit_keywords )
if is_credit_note :
result [ ' document_type ' ] = ' credit_note '
logger . info ( " 📄 Document type: CREDIT NOTE " )
else :
result [ ' document_type ' ] = ' invoice '
logger . info ( " 📄 Document type: INVOICE " )
# 3. EXTRACT DOCUMENT NUMBER
# For invoices: "Faktura nr.", "Invoice number:", "Fakturanr."
# For credit notes: "Kreditnota nr.", "Credit note number:"
if result [ ' document_type ' ] == ' credit_note ' :
number_patterns = [
r ' kreditnota \ s*(?:nr \ .?|nummer)[: \ s]*( \ S+) ' ,
r ' credit \ s*note \ s*(?:no \ .?|number)[: \ s]*( \ S+) ' ,
r ' kreditfaktura \ s*(?:nr \ .?|nummer)[: \ s]*( \ S+) ' ,
]
else :
number_patterns = [
r ' faktura \ s*(?:nr \ .?|nummer)[: \ s]*( \ S+) ' ,
r ' invoice \ s*(?:no \ .?|number)[: \ s]*( \ S+) ' ,
r ' fakturanr \ .? \ s*[: \ s]*( \ S+) ' ,
]
for pattern in number_patterns :
match = re . search ( pattern , pdf_text , re . IGNORECASE )
if match :
result [ ' document_number ' ] = match . group ( 1 ) . strip ( )
logger . info ( f " 🔢 Document number: { result [ ' document_number ' ] } " )
break
logger . info ( f " ✅ Quick analysis complete: CVR= { result [ ' cvr ' ] } , Type= { result [ ' document_type ' ] } , Number= { result [ ' document_number ' ] } , Vendor= { result [ ' vendor_name ' ] } " )
return result
2025-12-07 03:29:54 +01:00
def match_vendor_by_cvr ( self , vendor_cvr : Optional [ str ] ) - > Optional [ Dict ] :
"""
Match vendor from database using CVR number
Args :
vendor_cvr : CVR number from extraction
Returns :
Vendor dict if found , None otherwise
"""
if not vendor_cvr :
return None
# Clean CVR (remove spaces, dashes)
cvr_clean = re . sub ( r ' [^0-9] ' , ' ' , vendor_cvr )
if len ( cvr_clean ) != 8 :
logger . warning ( f " ⚠️ Invalid CVR format: { vendor_cvr } (cleaned: { cvr_clean } ) " )
return None
# Search vendors table
2025-12-16 15:36:11 +01:00
vendor = execute_query_single (
feat: Implement quick analysis on PDF upload for CVR, document type, and number extraction
- Added `check_invoice_number_exists` method in `EconomicService` to verify invoice numbers in e-conomic journals.
- Introduced `quick_analysis_on_upload` method in `OllamaService` for extracting critical fields from uploaded PDFs, including CVR, document type, and document number.
- Created migration script to add new fields for storing detected CVR, vendor ID, document type, and document number in the `incoming_files` table.
- Developed comprehensive tests for the quick analysis functionality, validating CVR detection, document type identification, and invoice number extraction.
2025-12-09 14:54:33 +01:00
" SELECT * FROM vendors WHERE cvr_number = %s " ,
2025-12-16 15:36:11 +01:00
( cvr_clean , ) )
2025-12-07 03:29:54 +01:00
if vendor :
logger . info ( f " ✅ Matched vendor: { vendor [ ' name ' ] } (CVR: { cvr_clean } ) " )
return vendor
else :
logger . info ( f " ⚠️ No vendor found with CVR: { cvr_clean } " )
return None
2026-01-25 03:29:28 +01:00
async def generate_summary ( self , text : str ) - > str :
"""
Generate a short summary of the text using Ollama
"""
if not text :
return " "
system_prompt = " Du er en hjælpsom assistent, der laver korte, præcise resuméer på dansk. "
user_prompt = f " Lav et kort resumé (max 50 ord) af følgende tekst: \n \n { text } "
try :
import aiohttp
logger . info ( f " 🧠 Generating summary with Ollama ( { self . model } )... " )
async with aiohttp . ClientSession ( ) as session :
payload = {
" model " : self . model ,
" prompt " : system_prompt + " \n \n " + user_prompt ,
" stream " : False ,
" options " : { " temperature " : 0.3 }
}
async with session . post ( f " { self . endpoint } /api/generate " , json = payload , timeout = 60.0 ) as response :
if response . status == 200 :
data = await response . json ( )
summary = data . get ( " response " , " " ) . strip ( )
logger . info ( " ✅ Summary generated " )
return summary
else :
error_text = await response . text ( )
logger . error ( f " ❌ Ollama error: { error_text } " )
return " Kunne ikke generere resumé (API fejl). "
except Exception as e :
logger . error ( f " ❌ Ollama summary failed: { e } " )
return f " Ingen resumé (Fejl: { str ( e ) } ) "
2025-12-07 03:29:54 +01:00
# Global instance
ollama_service = OllamaService ( )