feat: Implement quick analysis on PDF upload for CVR, document type, and number extraction

- Added `check_invoice_number_exists` method in `EconomicService` to verify invoice numbers in e-conomic journals.
- Introduced `quick_analysis_on_upload` method in `OllamaService` for extracting critical fields from uploaded PDFs, including CVR, document type, and document number.
- Created migration script to add new fields for storing detected CVR, vendor ID, document type, and document number in the `incoming_files` table.
- Developed comprehensive tests for the quick analysis functionality, validating CVR detection, document type identification, and invoice number extraction.
This commit is contained in:
Christian 2025-12-09 14:54:33 +01:00
parent 890bd6245d
commit 3a8288f5a1
16 changed files with 2731 additions and 205 deletions

View File

@ -13,6 +13,7 @@ from app.core.config import settings
from app.services.economic_service import get_economic_service from app.services.economic_service import get_economic_service
from app.services.ollama_service import ollama_service from app.services.ollama_service import ollama_service
from app.services.template_service import template_service from app.services.template_service import template_service
from app.services.invoice2data_service import get_invoice2data_service
import logging import logging
import os import os
import re import re
@ -232,15 +233,25 @@ async def get_pending_files():
f.error_message, f.error_message,
f.template_id, f.template_id,
f.file_path, f.file_path,
-- Quick analysis results (available immediately on upload)
f.detected_cvr,
f.detected_vendor_id,
f.detected_document_type,
f.detected_document_number,
f.is_own_invoice,
v_detected.name as detected_vendor_name,
v_detected.cvr_number as detected_vendor_cvr,
-- Get vendor info from latest extraction -- Get vendor info from latest extraction
ext.vendor_name, ext.vendor_name,
ext.vendor_cvr, ext.vendor_cvr,
ext.vendor_matched_id, ext.vendor_matched_id,
v.name as matched_vendor_name, v.name as matched_vendor_name,
v.cvr_number as matched_vendor_cvr_number,
-- Check if already has invoice via latest extraction only -- Check if already has invoice via latest extraction only
si.id as existing_invoice_id, si.id as existing_invoice_id,
si.invoice_number as existing_invoice_number si.invoice_number as existing_invoice_number
FROM incoming_files f FROM incoming_files f
LEFT JOIN vendors v_detected ON v_detected.id = f.detected_vendor_id
LEFT JOIN LATERAL ( LEFT JOIN LATERAL (
SELECT extraction_id, file_id, vendor_name, vendor_cvr, vendor_matched_id SELECT extraction_id, file_id, vendor_name, vendor_cvr, vendor_matched_id
FROM extractions FROM extractions
@ -250,16 +261,82 @@ async def get_pending_files():
) ext ON true ) ext ON true
LEFT JOIN vendors v ON v.id = ext.vendor_matched_id LEFT JOIN vendors v ON v.id = ext.vendor_matched_id
LEFT JOIN supplier_invoices si ON si.extraction_id = ext.extraction_id LEFT JOIN supplier_invoices si ON si.extraction_id = ext.extraction_id
WHERE f.status IN ('pending', 'processing', 'failed', 'ai_extracted', 'processed') WHERE f.status IN ('pending', 'processing', 'failed', 'ai_extracted', 'processed', 'duplicate')
AND si.id IS NULL -- Only show files without invoice yet AND si.id IS NULL -- Only show files without invoice yet
ORDER BY f.file_id, f.uploaded_at DESC""" ORDER BY f.file_id, f.uploaded_at DESC"""
) )
# Convert to regular dicts so we can add new keys
files = [dict(file) for file in files] if files else []
# Check for invoice2data templates for each file
try:
from app.services.invoice2data_service import get_invoice2data_service
invoice2data = get_invoice2data_service()
logger.info(f"📋 Checking invoice2data templates: {len(invoice2data.templates)} loaded")
for file in files:
# Check if there's an invoice2data template for this vendor's CVR
vendor_cvr = file.get('matched_vendor_cvr_number') or file.get('detected_vendor_cvr') or file.get('vendor_cvr')
file['has_invoice2data_template'] = False
logger.debug(f" File {file['file_id']}: CVR={vendor_cvr}")
if vendor_cvr:
# Check all templates for this CVR in keywords
for template_name, template_data in invoice2data.templates.items():
keywords = template_data.get('keywords', [])
logger.debug(f" Template {template_name}: keywords={keywords}")
if str(vendor_cvr) in [str(k) for k in keywords]:
file['has_invoice2data_template'] = True
file['invoice2data_template_name'] = template_name
logger.info(f" ✅ File {file['file_id']} matched template: {template_name}")
break
except Exception as e:
logger.error(f"❌ Failed to check invoice2data templates: {e}", exc_info=True)
# Continue without invoice2data info
return {"files": files if files else [], "count": len(files) if files else 0} return {"files": files if files else [], "count": len(files) if files else 0}
except Exception as e: except Exception as e:
logger.error(f"❌ Failed to get pending files: {e}") logger.error(f"❌ Failed to get pending files: {e}")
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@router.get("/supplier-invoices/files/{file_id}/pdf-text")
async def get_file_pdf_text(file_id: int):
    """Return the full extracted text of an uploaded file (used by the template builder).

    Args:
        file_id: Primary key of the row in ``incoming_files``.

    Returns:
        A dict with ``file_id``, ``filename`` and ``pdf_text``.

    Raises:
        HTTPException: 404 when the row does not exist, has no stored path,
            or the stored path is not a regular file on disk; 500 on any
            other failure.
    """
    from pathlib import Path

    try:
        file_info = execute_query(
            "SELECT file_path, filename FROM incoming_files WHERE file_id = %s",
            (file_id,),
            fetchone=True
        )
        if not file_info:
            raise HTTPException(status_code=404, detail="Fil ikke fundet")

        # A NULL/empty path would make Path() raise TypeError and surface as
        # an opaque 500; report it as "not found on disk" instead.
        stored_path = file_info['file_path']
        if not stored_path:
            raise HTTPException(status_code=404, detail="Fil ikke fundet på disk")

        file_path = Path(stored_path)
        # is_file() rather than exists(): a directory is not a readable document.
        if not file_path.is_file():
            raise HTTPException(status_code=404, detail=f"Fil ikke fundet på disk: {file_path}")

        pdf_text = await ollama_service._extract_text_from_file(file_path)

        return {
            "file_id": file_id,
            "filename": file_info['filename'],
            "pdf_text": pdf_text
        }
    except HTTPException:
        # Deliberate 4xx responses pass through untouched.
        raise
    except Exception as e:
        # exc_info keeps the traceback, matching the other handlers in this router.
        logger.error(f"❌ Failed to get PDF text: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/supplier-invoices/files/{file_id}/extracted-data") @router.get("/supplier-invoices/files/{file_id}/extracted-data")
async def get_file_extracted_data(file_id: int): async def get_file_extracted_data(file_id: int):
"""Hent AI-extracted data fra en uploaded fil""" """Hent AI-extracted data fra en uploaded fil"""
@ -758,8 +835,9 @@ async def create_invoice_from_extraction(file_id: int):
@router.get("/supplier-invoices/templates") @router.get("/supplier-invoices/templates")
async def list_templates(): async def list_templates():
"""Hent alle templates""" """Hent alle templates (både database og invoice2data YAML)"""
try: try:
# Get database templates
query = """ query = """
SELECT t.*, v.name as vendor_name SELECT t.*, v.name as vendor_name
FROM supplier_invoice_templates t FROM supplier_invoice_templates t
@ -767,9 +845,55 @@ async def list_templates():
WHERE t.is_active = true WHERE t.is_active = true
ORDER BY t.created_at DESC ORDER BY t.created_at DESC
""" """
templates = execute_query(query) db_templates = execute_query(query) or []
return templates if templates else [] # Get invoice2data templates
invoice2data_service = get_invoice2data_service()
invoice2data_templates = []
for template_name, template_data in invoice2data_service.templates.items():
# Extract vendor CVR from keywords
vendor_cvr = None
keywords = template_data.get('keywords', [])
for keyword in keywords:
if isinstance(keyword, str) and keyword.isdigit() and len(keyword) == 8:
vendor_cvr = keyword
break
# Get vendor info from database if CVR found
vendor_name = template_data.get('issuer', 'Ukendt')
vendor_id = None
if vendor_cvr:
vendor = execute_query(
"SELECT id, name FROM vendors WHERE cvr_number = %s",
(vendor_cvr,),
fetchone=True
)
if vendor:
vendor_id = vendor['id']
vendor_name = vendor['name']
invoice2data_templates.append({
'template_id': -1, # Negative ID to distinguish from DB templates
'template_name': f"Invoice2Data: {template_name}",
'template_type': 'invoice2data',
'yaml_filename': template_name,
'vendor_id': vendor_id,
'vendor_name': vendor_name,
'vendor_cvr': vendor_cvr,
'default_product_category': template_data.get('default_product_category', 'varesalg'),
'default_product_group_number': template_data.get('default_product_group_number', 1),
'usage_count': 0, # Could track this separately
'is_active': True,
'detection_patterns': keywords,
'field_mappings': template_data.get('fields', {}),
'created_at': None
})
# Combine both types
all_templates = db_templates + invoice2data_templates
return all_templates
except Exception as e: except Exception as e:
logger.error(f"❌ Failed to list templates: {e}") logger.error(f"❌ Failed to list templates: {e}")
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@ -978,6 +1102,7 @@ async def create_template(request: Dict):
template_name = request.get('template_name') template_name = request.get('template_name')
detection_patterns = request.get('detection_patterns', []) detection_patterns = request.get('detection_patterns', [])
field_mappings = request.get('field_mappings', {}) field_mappings = request.get('field_mappings', {})
default_product_category = request.get('default_product_category', 'varesalg')
if not vendor_id or not template_name: if not vendor_id or not template_name:
raise HTTPException(status_code=400, detail="vendor_id og template_name er påkrævet") raise HTTPException(status_code=400, detail="vendor_id og template_name er påkrævet")
@ -996,11 +1121,11 @@ async def create_template(request: Dict):
# Insert template and get template_id # Insert template and get template_id
query = """ query = """
INSERT INTO supplier_invoice_templates INSERT INTO supplier_invoice_templates
(vendor_id, template_name, detection_patterns, field_mappings) (vendor_id, template_name, detection_patterns, field_mappings, default_product_category)
VALUES (%s, %s, %s, %s) VALUES (%s, %s, %s, %s, %s)
RETURNING template_id RETURNING template_id
""" """
result = execute_query(query, (vendor_id, template_name, json.dumps(detection_patterns), json.dumps(field_mappings))) result = execute_query(query, (vendor_id, template_name, json.dumps(detection_patterns), json.dumps(field_mappings), default_product_category))
template_id = result[0]['template_id'] if result else None template_id = result[0]['template_id'] if result else None
if not template_id: if not template_id:
@ -1657,6 +1782,97 @@ async def upload_supplier_invoice(file: UploadFile = File(...)):
logger.info(f"📄 Extracting text from {final_path.suffix}...") logger.info(f"📄 Extracting text from {final_path.suffix}...")
text = await ollama_service._extract_text_from_file(final_path) text = await ollama_service._extract_text_from_file(final_path)
# QUICK ANALYSIS: Extract CVR, document type, invoice number IMMEDIATELY
logger.info(f"⚡ Running quick analysis...")
quick_result = await ollama_service.quick_analysis_on_upload(text)
# Update file record with quick analysis results
execute_update(
"""UPDATE incoming_files
SET detected_cvr = %s,
detected_vendor_id = %s,
detected_document_type = %s,
detected_document_number = %s,
is_own_invoice = %s
WHERE file_id = %s""",
(quick_result.get('cvr'),
quick_result.get('vendor_id'),
quick_result.get('document_type'),
quick_result.get('document_number'),
quick_result.get('is_own_invoice', False),
file_id)
)
logger.info(f"📋 Quick analysis saved: CVR={quick_result.get('cvr')}, "
f"Vendor={quick_result.get('vendor_name')}, "
f"Type={quick_result.get('document_type')}, "
f"Number={quick_result.get('document_number')}")
# DUPLICATE CHECK: Check if invoice number already exists
document_number = quick_result.get('document_number')
if document_number:
logger.info(f"🔍 Checking for duplicate invoice number: {document_number}")
# Check 1: Search in local database (supplier_invoices table)
existing_invoice = execute_query(
"""SELECT si.id, si.invoice_number, si.created_at, v.name as vendor_name
FROM supplier_invoices si
LEFT JOIN vendors v ON v.id = si.vendor_id
WHERE si.invoice_number = %s
ORDER BY si.created_at DESC
LIMIT 1""",
(document_number,),
fetchone=True
)
if existing_invoice:
# DUPLICATE FOUND IN DATABASE
logger.error(f"🚫 DUPLICATE: Invoice {document_number} already exists in database (ID: {existing_invoice['id']})")
# Mark file as duplicate
execute_update(
"""UPDATE incoming_files
SET status = 'duplicate',
error_message = %s,
processed_at = CURRENT_TIMESTAMP
WHERE file_id = %s""",
(f"DUBLET: Fakturanummer {document_number} findes allerede i systemet (Faktura #{existing_invoice['id']}, {existing_invoice['vendor_name'] or 'Ukendt leverandør'})",
file_id)
)
raise HTTPException(
status_code=409, # 409 Conflict
detail=f"🚫 DUBLET: Fakturanummer {document_number} findes allerede i systemet (Faktura #{existing_invoice['id']}, oprettet {existing_invoice['created_at'].strftime('%d-%m-%Y')})"
)
# Check 2: Search in e-conomic (if configured)
from app.services.economic_service import economic_service
if hasattr(economic_service, 'app_secret_token') and economic_service.app_secret_token:
logger.info(f"🔍 Checking e-conomic for invoice number: {document_number}")
economic_duplicate = await economic_service.check_invoice_number_exists(document_number)
if economic_duplicate:
# DUPLICATE FOUND IN E-CONOMIC
logger.error(f"🚫 DUPLICATE: Invoice {document_number} found in e-conomic (Voucher #{economic_duplicate.get('voucher_number')})")
# Mark file as duplicate
execute_update(
"""UPDATE incoming_files
SET status = 'duplicate',
error_message = %s,
processed_at = CURRENT_TIMESTAMP
WHERE file_id = %s""",
(f"DUBLET: Fakturanummer {document_number} findes i e-conomic (Bilag #{economic_duplicate.get('voucher_number')})",
file_id)
)
raise HTTPException(
status_code=409, # 409 Conflict
detail=f"🚫 DUBLET: Fakturanummer {document_number} findes i e-conomic (Bilag #{economic_duplicate.get('voucher_number')}, {economic_duplicate.get('date')})"
)
logger.info(f"✅ No duplicate found for invoice {document_number}")
# Try template matching # Try template matching
logger.info(f"📋 Matching template...") logger.info(f"📋 Matching template...")
template_id, confidence = template_service.match_template(text) template_id, confidence = template_service.match_template(text)
@ -1699,7 +1915,8 @@ async def upload_supplier_invoice(file: UploadFile = File(...)):
"""INSERT INTO extraction_lines """INSERT INTO extraction_lines
(extraction_id, line_number, description, quantity, unit_price, (extraction_id, line_number, description, quantity, unit_price,
line_total, vat_rate, vat_note, confidence) line_total, vat_rate, vat_note, confidence)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)""", VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
RETURNING line_id""",
(extraction_id, idx, line.get('description'), (extraction_id, idx, line.get('description'),
line.get('quantity'), line.get('unit_price'), line.get('quantity'), line.get('unit_price'),
line.get('line_total'), line.get('vat_rate'), line.get('line_total'), line.get('vat_rate'),
@ -1744,13 +1961,41 @@ async def upload_supplier_invoice(file: UploadFile = File(...)):
"confidence": confidence, "confidence": confidence,
"extracted_fields": extracted_fields, "extracted_fields": extracted_fields,
"pdf_text": text[:500], # First 500 chars for reference "pdf_text": text[:500], # First 500 chars for reference
# Quick analysis results (available IMMEDIATELY on upload)
"quick_analysis": {
"cvr": quick_result.get('cvr'),
"vendor_id": quick_result.get('vendor_id'),
"vendor_name": quick_result.get('vendor_name'),
"document_type": quick_result.get('document_type'),
"document_number": quick_result.get('document_number')
},
"message": "Upload gennemført - gennemgå og bekræft data" "message": "Upload gennemført - gennemgå og bekræft data"
} }
except HTTPException: except HTTPException as he:
# Mark file as failed if we have file_id
if 'file_id' in locals():
execute_update(
"""UPDATE incoming_files
SET status = 'failed',
error_message = %s,
processed_at = CURRENT_TIMESTAMP
WHERE file_id = %s""",
(str(he.detail), file_id)
)
raise raise
except Exception as e: except Exception as e:
logger.error(f"❌ Upload failed (inner): {e}", exc_info=True) logger.error(f"❌ Upload failed (inner): {e}", exc_info=True)
# Mark file as failed if we have file_id
if 'file_id' in locals():
execute_update(
"""UPDATE incoming_files
SET status = 'failed',
error_message = %s,
processed_at = CURRENT_TIMESTAMP
WHERE file_id = %s""",
(str(e), file_id)
)
raise HTTPException(status_code=500, detail=f"Upload fejlede: {str(e)}") raise HTTPException(status_code=500, detail=f"Upload fejlede: {str(e)}")
except HTTPException: except HTTPException:
@ -1809,51 +2054,174 @@ async def reprocess_uploaded_file(file_id: int):
logger.info(f"✅ Matched template {template_id} ({confidence:.0%})") logger.info(f"✅ Matched template {template_id} ({confidence:.0%})")
extracted_fields = template_service.extract_fields(text, template_id) extracted_fields = template_service.extract_fields(text, template_id)
template = template_service.templates_cache.get(template_id) # Check if this is an invoice2data template (ID -1)
if template: is_invoice2data = (template_id == -1)
vendor_id = template.get('vendor_id')
template_service.log_usage(template_id, file_id, True, confidence, extracted_fields) if is_invoice2data:
# Invoice2data doesn't have vendor in cache
logger.info(f"📋 Using invoice2data template")
# Try to find vendor from extracted CVR
if extracted_fields.get('vendor_vat'):
vendor = execute_query(
"SELECT id FROM vendors WHERE cvr_number = %s",
(extracted_fields['vendor_vat'],),
fetchone=True
)
if vendor:
vendor_id = vendor['id']
# Store invoice2data extraction in database
extraction_id = execute_insert(
"""INSERT INTO extractions
(file_id, vendor_matched_id, vendor_name, vendor_cvr,
document_id, document_date, due_date, document_type, document_type_detected,
total_amount, currency, confidence, llm_response_json, status)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
RETURNING extraction_id""",
(file_id, vendor_id,
extracted_fields.get('issuer'), # vendor_name
extracted_fields.get('vendor_vat'), # vendor_cvr
str(extracted_fields.get('invoice_number')), # document_id
extracted_fields.get('invoice_date'), # document_date
extracted_fields.get('due_date'),
'invoice', # document_type
'invoice', # document_type_detected
extracted_fields.get('amount_total'),
extracted_fields.get('currency', 'DKK'),
1.0, # invoice2data always 100% confidence
json.dumps(extracted_fields), # llm_response_json
'extracted') # status
)
# Insert line items if extracted
if extracted_fields.get('lines'):
for idx, line in enumerate(extracted_fields['lines'], start=1):
execute_insert(
"""INSERT INTO extraction_lines
(extraction_id, line_number, description, quantity, unit_price,
line_total, vat_rate, vat_note, confidence,
ip_address, contract_number, location_street, location_zip, location_city)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
RETURNING line_id""",
(extraction_id, idx, line.get('description'),
line.get('quantity'), line.get('unit_price'),
line.get('line_total'), None, None, 1.0,
line.get('ip_address'), line.get('contract_number'),
line.get('location_street'), line.get('location_zip'), line.get('location_city'))
)
logger.info(f"✅ Saved {len(extracted_fields['lines'])} line items")
else:
# Custom template from database
template = template_service.templates_cache.get(template_id)
if template:
vendor_id = template.get('vendor_id')
template_service.log_usage(template_id, file_id, True, confidence, extracted_fields)
# Update file - use NULL for invoice2data templates to avoid FK constraint
db_template_id = None if is_invoice2data else template_id
execute_update( execute_update(
"""UPDATE incoming_files """UPDATE incoming_files
SET status = 'processed', template_id = %s, processed_at = CURRENT_TIMESTAMP SET status = 'processed', template_id = %s, processed_at = CURRENT_TIMESTAMP
WHERE file_id = %s""", WHERE file_id = %s""",
(template_id, file_id) (db_template_id, file_id)
) )
else: else:
# NO AI FALLBACK - Require template matching # FALLBACK TO AI EXTRACTION
logger.warning(f"⚠️ Ingen template match (confidence: {confidence:.0%}) - afviser fil") logger.info(f"⚠️ Ingen template match (confidence: {confidence:.0%}) - bruger AI extraction")
# Use detected vendor from quick analysis if available
vendor_id = file_record.get('detected_vendor_id')
# Call Ollama for full extraction
logger.info(f"🤖 Calling Ollama for AI extraction...")
llm_result = await ollama_service.extract_from_text(text)
if not llm_result or 'error' in llm_result:
error_msg = llm_result.get('error') if llm_result else 'AI extraction fejlede'
logger.error(f"❌ AI extraction failed: {error_msg}")
execute_update(
"""UPDATE incoming_files
SET status = 'failed',
error_message = %s,
processed_at = CURRENT_TIMESTAMP
WHERE file_id = %s""",
(f"AI extraction fejlede: {error_msg}", file_id)
)
raise HTTPException(status_code=500, detail=f"AI extraction fejlede: {error_msg}")
extracted_fields = llm_result
confidence = llm_result.get('confidence', 0.75)
# Store AI extracted data in extractions table
extraction_id = execute_insert(
"""INSERT INTO supplier_invoice_extractions
(file_id, vendor_id, invoice_number, invoice_date, due_date,
total_amount, currency, document_type, confidence, llm_data)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING extraction_id""",
(file_id, vendor_id,
llm_result.get('invoice_number'),
llm_result.get('invoice_date'),
llm_result.get('due_date'),
llm_result.get('total_amount'),
llm_result.get('currency', 'DKK'),
llm_result.get('document_type'),
confidence,
json.dumps(llm_result))
)
# Insert line items if extracted
if llm_result.get('lines'):
for idx, line in enumerate(llm_result['lines'], start=1):
execute_insert(
"""INSERT INTO extraction_lines
(extraction_id, line_number, description, quantity, unit_price,
line_total, vat_rate, vat_note, confidence)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
RETURNING line_id""",
(extraction_id, idx, line.get('description'),
line.get('quantity'), line.get('unit_price'),
line.get('line_total'), line.get('vat_rate'),
line.get('vat_note'), confidence)
)
# Update file status to ai_extracted
execute_update( execute_update(
"""UPDATE incoming_files """UPDATE incoming_files
SET status = 'failed', SET status = 'ai_extracted', processed_at = CURRENT_TIMESTAMP
error_message = 'Ingen template match - opret template for denne leverandør',
processed_at = CURRENT_TIMESTAMP
WHERE file_id = %s""", WHERE file_id = %s""",
(file_id,) (file_id,)
) )
return { logger.info(f"✅ AI extraction completed for file {file_id}")
"status": "failed",
"file_id": file_id,
"error": "Ingen template match - opret template for denne leverandør",
"confidence": confidence
}
# Return success with template data # Return success with template data or AI extraction result
return { result = {
"status": "success", "status": "success",
"file_id": file_id, "file_id": file_id,
"filename": file_record['filename'], "filename": file_record['filename'],
"template_matched": template_id is not None, "template_matched": template_id is not None,
"template_id": template_id, "template_id": template_id,
"vendor_id": vendor_id, "vendor_id": vendor_id,
"confidence": confidence if template_id else 0.8, "confidence": confidence if template_id else llm_result.get('confidence', 0.75),
"extracted_fields": extracted_fields, "extracted_fields": extracted_fields,
"pdf_text": text[:1000] if not template_id else text "pdf_text": text[:1000] if not template_id else text
} }
# Add warning if no template exists
if not template_id and vendor_id:
vendor = execute_query(
"SELECT name FROM vendors WHERE id = %s",
(vendor_id,),
fetchone=True
)
if vendor:
result["warning"] = f"⚠️ Ingen template fundet for {vendor['name']} - brugte AI extraction (langsommere)"
return result
except HTTPException: except HTTPException:
raise raise
except Exception as e: except Exception as e:
@ -1866,6 +2234,7 @@ async def update_template(
template_name: Optional[str] = None, template_name: Optional[str] = None,
detection_patterns: Optional[List[Dict]] = None, detection_patterns: Optional[List[Dict]] = None,
field_mappings: Optional[Dict] = None, field_mappings: Optional[Dict] = None,
default_product_category: Optional[str] = None,
is_active: Optional[bool] = None is_active: Optional[bool] = None
): ):
"""Opdater eksisterende template""" """Opdater eksisterende template"""
@ -1884,6 +2253,9 @@ async def update_template(
if field_mappings is not None: if field_mappings is not None:
updates.append("field_mappings = %s") updates.append("field_mappings = %s")
params.append(json.dumps(field_mappings)) params.append(json.dumps(field_mappings))
if default_product_category is not None:
updates.append("default_product_category = %s")
params.append(default_product_category)
if is_active is not None: if is_active is not None:
updates.append("is_active = %s") updates.append("is_active = %s")
params.append(is_active) params.append(is_active)
@ -1911,6 +2283,114 @@ async def update_template(
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
def _invoice2data_keyword_detection(keywords, pdf_text):
    """Report, per template keyword, whether it occurs (case-insensitively) in the text.

    Returns a ``(detection_results, matched_count)`` tuple where
    ``detection_results`` is a list of dicts with the keys ``pattern``,
    ``type``, ``found`` and ``weight``.
    """
    haystack = pdf_text.lower()  # lower-case once instead of once per keyword
    detection_results = []
    matched_count = 0
    for keyword in keywords:
        pattern = str(keyword)
        found = pattern.lower() in haystack
        if found:
            matched_count += 1
        detection_results.append({
            "pattern": pattern,
            "type": "keyword",
            "found": found,
            "weight": 0.5
        })
    return detection_results, matched_count


@router.post("/supplier-invoices/templates/invoice2data/{template_name}/test")
async def test_invoice2data_template(template_name: str, request: Dict):
    """
    Test an invoice2data YAML template against PDF text.

    Request body:
    {
        "pdf_text": "Full PDF text content..."
    }

    Returns a dict with ``matched``, ``confidence``, ``extracted_fields``,
    ``line_items``, ``detection_results`` and ``template_name`` (plus
    ``error`` when the template did not match).

    Raises:
        HTTPException: 400 when ``pdf_text`` is missing/empty, 404 when the
            template name is not loaded, 500 on any other failure.
    """
    try:
        pdf_text = request.get('pdf_text', '')
        if not pdf_text:
            raise HTTPException(status_code=400, detail="pdf_text er påkrævet")

        invoice2data_service = get_invoice2data_service()

        if template_name not in invoice2data_service.templates:
            raise HTTPException(status_code=404, detail=f"Template '{template_name}' ikke fundet")

        template_data = invoice2data_service.templates[template_name]

        # `or []` also covers a YAML `keywords:` key that is present but empty (None).
        keywords = template_data.get('keywords') or []
        detection_results, matched_count = _invoice2data_keyword_detection(keywords, pdf_text)

        result = invoice2data_service.extract_with_template(pdf_text, template_name)

        if not result:
            # Template didn't match: still report which keywords were seen.
            return {
                "matched": False,
                "confidence": 0.0,
                "extracted_fields": {},
                "line_items": [],
                "detection_results": detection_results,
                "template_name": template_name,
                "error": "Template matchede ikke PDF'en"
            }

        # Normalise line items; a missing or None `lines` entry yields no items.
        line_items = [
            {
                "line_number": line.get('line_number', ''),
                "item_number": line.get('item_number', ''),
                "description": line.get('description_raw', '') or line.get('description', ''),
                "quantity": line.get('quantity', ''),
                "unit_price": line.get('unit_price', ''),
                "line_total": line.get('line_total', ''),
                # Context fields (circuit/location info)
                "circuit_id": line.get('circuit_id', ''),
                "ip_address": line.get('ip_address', ''),
                "contract_number": line.get('contract_number', ''),
                "location_street": line.get('location_street', ''),
                "location_zip": line.get('location_zip', ''),
                "location_city": line.get('location_city', ''),
            }
            for line in (result.get('lines') or [])
        ]

        # Share of keywords found; a template without keywords counts as a full match.
        confidence = matched_count / len(keywords) if keywords else 1.0

        # Line items are returned separately, so drop them from the field dict.
        extracted_fields = {k: v for k, v in result.items() if k != 'lines'}

        return {
            "matched": True,
            "confidence": confidence,
            "extracted_fields": extracted_fields,
            "line_items": line_items,
            "detection_results": detection_results,
            "template_name": template_name
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Invoice2data template test failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/supplier-invoices/templates/{template_id}/test") @router.post("/supplier-invoices/templates/{template_id}/test")
async def test_template(template_id: int, request: Dict): async def test_template(template_id: int, request: Dict):
""" """
@ -2076,6 +2556,102 @@ async def test_template(template_id: int, request: Dict):
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@router.put("/supplier-invoices/templates/invoice2data/{template_name}/category")
async def update_yaml_category(template_name: str, request: Dict):
    """
    Update ``default_product_category`` in an invoice2data YAML template file.

    Request body:
    {
        "category": "drift"  // varesalg, drift, anlæg, abonnement, lager, udlejning
    }

    Returns a dict with ``message``, ``template_name`` and ``new_category``.

    Raises:
        HTTPException: 400 for a missing/invalid category, an invalid template
            name, or a YAML file whose top level is not a mapping; 404 when
            the YAML file does not exist; 500 on any other failure.
    """
    try:
        import yaml
        from pathlib import Path

        new_category = request.get('category')
        if not new_category:
            raise HTTPException(status_code=400, detail="category er påkrævet")

        valid_categories = ['varesalg', 'drift', 'anlæg', 'abonnement', 'lager', 'udlejning']
        if new_category not in valid_categories:
            raise HTTPException(status_code=400, detail=f"Ugyldig kategori. Skal være en af: {', '.join(valid_categories)}")

        # The name comes straight from the URL and ends up in a path we write
        # to, so refuse anything that could step outside the templates folder.
        if template_name in ('.', '..') or '/' in template_name or '\\' in template_name:
            raise HTTPException(status_code=400, detail="Ugyldigt template navn")

        templates_dir = Path(__file__).parent.parent.parent.parent / 'data' / 'invoice_templates'
        yaml_file = templates_dir / f"{template_name}.yml"

        if not yaml_file.exists():
            raise HTTPException(status_code=404, detail=f"YAML fil ikke fundet: {template_name}.yml")

        with open(yaml_file, 'r', encoding='utf-8') as f:
            template_data = yaml.safe_load(f)

        # safe_load returns None for an empty document and may return a list or
        # scalar; only a mapping can carry the category key.
        if not isinstance(template_data, dict):
            raise HTTPException(status_code=400, detail=f"YAML fil har ugyldigt format: {template_name}.yml")

        template_data['default_product_category'] = new_category

        # Serialise before opening the file for writing so a dump failure
        # cannot leave a truncated template behind.
        # NOTE: yaml.dump keeps key order (sort_keys=False) but does not keep
        # comments or the original formatting of the file.
        serialized = yaml.dump(template_data, default_flow_style=False, allow_unicode=True, sort_keys=False)
        with open(yaml_file, 'w', encoding='utf-8') as f:
            f.write(serialized)

        # Re-run the initialiser on the service object so the in-memory
        # templates are rebuilt from disk.
        # NOTE(review): presumably get_invoice2data_service() returns a shared
        # instance; prefer a dedicated reload method if the service has one.
        invoice2data_service = get_invoice2data_service()
        invoice2data_service.__init__()

        logger.info(f"✅ Updated category for {template_name}.yml to {new_category}")

        return {
            "message": "Kategori opdateret",
            "template_name": template_name,
            "new_category": new_category
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Failed to update YAML category: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/supplier-invoices/templates/invoice2data/{template_name}/content")
async def get_yaml_content(template_name: str):
    """
    Return the raw YAML content of an invoice2data template file.

    Returns:
    {
        "template_name": "...",
        "filename": "<template_name>.yml",
        "content": "issuer: DCS ApS\\nkeywords: ..."
    }

    Raises:
        HTTPException: 400 for an invalid template name, 404 when the file
            does not exist, 500 on any other failure.
    """
    try:
        from pathlib import Path

        # The name comes straight from the URL; refuse anything that could
        # address a file outside the templates folder.
        if template_name in ('.', '..') or '/' in template_name or '\\' in template_name:
            raise HTTPException(status_code=400, detail="Ugyldigt template navn")

        filename = f"{template_name}.yml"

        # Look next to the project first (the same location the category
        # update endpoint writes to), then fall back to the working-directory
        # relative folder this endpoint originally used.
        candidate_dirs = [
            Path(__file__).parent.parent.parent.parent / 'data' / 'invoice_templates',
            Path("data/invoice_templates"),
        ]
        template_file = next(
            (folder / filename for folder in candidate_dirs if (folder / filename).is_file()),
            None
        )

        if template_file is None:
            raise HTTPException(status_code=404, detail=f"Template fil ikke fundet: {template_name}.yml")

        content = template_file.read_text(encoding='utf-8')

        return {
            "template_name": template_name,
            "filename": filename,
            "content": content
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Failed to read YAML content: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@router.delete("/supplier-invoices/templates/{template_id}") @router.delete("/supplier-invoices/templates/{template_id}")
async def delete_template(template_id: int): async def delete_template(template_id: int):
"""Slet template (soft delete - sæt is_active=false)""" """Slet template (soft delete - sæt is_active=false)"""

File diff suppressed because it is too large Load Diff

View File

@ -127,6 +127,11 @@
<div class="row" id="filesList"> <div class="row" id="filesList">
<!-- Files loaded dynamically --> <!-- Files loaded dynamically -->
</div> </div>
<div class="mt-3 text-end">
<button class="btn btn-outline-secondary" onclick="skipFileSelection()">
Spring over <i class="bi bi-arrow-right ms-2"></i>
</button>
</div>
</div> </div>
</div> </div>
@ -156,6 +161,18 @@
<input type="text" class="form-control" id="templateName" placeholder="F.eks. 'BMC Standard Faktura'" required> <input type="text" class="form-control" id="templateName" placeholder="F.eks. 'BMC Standard Faktura'" required>
<small class="text-muted">Navn på templaten, f.eks. leverandør + "Standard" eller "Email faktura"</small> <small class="text-muted">Navn på templaten, f.eks. leverandør + "Standard" eller "Email faktura"</small>
</div> </div>
<div class="mb-3">
<label class="form-label">Produktkategori <span class="text-danger">*</span></label>
<select class="form-select" id="productCategory" required>
<option value="varesalg">🛒 Varesalg (videresalg af hardware)</option>
<option value="drift">🔧 Drift (internet, hosting, cloud services)</option>
<option value="anlæg">🏗️ Anlæg (investeringer, infrastruktur)</option>
<option value="abonnement">📅 Abonnement (løbende services)</option>
<option value="lager">📦 Lager (lagervarer)</option>
<option value="udlejning">🏪 Udlejning</option>
</select>
<small class="text-muted">Standardkategori for varelinjer fra denne leverandør</small>
</div>
<button class="btn btn-primary" onclick="validateAndNextStep(3)"> <button class="btn btn-primary" onclick="validateAndNextStep(3)">
Næste <i class="bi bi-arrow-right ms-2"></i> Næste <i class="bi bi-arrow-right ms-2"></i>
</button> </button>
@ -462,6 +479,137 @@ document.addEventListener('DOMContentLoaded', async () => {
} else { } else {
await loadPendingFiles(); await loadPendingFiles();
await loadVendors(); await loadVendors();
// Check if we're creating a template for a specific vendor/file
const vendorIdParam = urlParams.get('vendor');
const fileIdParam = urlParams.get('file');
// Check for sessionStorage data (from supplier invoices page)
const storedData = sessionStorage.getItem('templateCreateData');
let targetFileId = fileIdParam;
let targetVendorId = vendorIdParam;
let targetFileName = null;
let targetPdfText = null;
if (storedData) {
try {
const data = JSON.parse(storedData);
console.log('🔄 Loaded template creation data from sessionStorage:', data);
// Override with sessionStorage if available
if (data.fileId) targetFileId = data.fileId;
if (data.vendorId) targetVendorId = data.vendorId;
if (data.pdfText) targetPdfText = data.pdfText;
targetFileName = data.fileName || data.vendorName || targetFileName;
// Clear sessionStorage after use
sessionStorage.removeItem('templateCreateData');
} catch (error) {
console.error('Failed to parse template creation data:', error);
}
}
// If we have PDF text from sessionStorage, skip file selection
if (targetPdfText && targetVendorId && targetFileId) {
console.log('🚀 Fast-track: Using PDF text from sessionStorage');
// Set up the file data directly
currentFile = {
file_id: targetFileId,
filename: targetFileName || `File ${targetFileId}`,
text: targetPdfText
};
pdfText = targetPdfText;
// Wait for vendors to load
setTimeout(() => {
// Pre-select vendor
const vendorSelect = document.getElementById('vendorSelect');
if (vendorSelect) {
vendorSelect.value = targetVendorId;
console.log('✅ Vendor pre-selected:', targetVendorId);
}
// Auto-generate template name
const templateNameInput = document.getElementById('templateName');
if (templateNameInput && !templateNameInput.value) {
const vendorName = vendorSelect?.options[vendorSelect.selectedIndex]?.text || 'Template';
templateNameInput.value = `${vendorName} Standard Template`;
console.log('✅ Template name generated:', templateNameInput.value);
}
// Show PDF preview in step 2
document.getElementById('pdfPreview2').textContent = pdfText;
// Go directly to step 2
console.log('🎯 Jumping to step 2 (vendor & template name)');
nextStep(2);
// After a moment, auto-advance to step 3
setTimeout(() => {
console.log('🚀 Auto-advancing to step 3 (pattern definition)');
validateAndNextStep(3);
}, 500);
}, 500);
}
// If we have a target file but no PDF text, try to select from pending list
else if (targetFileId) {
console.log(`🎯 Auto-selecting file ${targetFileId} (${targetFileName || 'unknown'})`);
// Wait for files to load, then auto-select
setTimeout(async () => {
try {
// First check if file exists in the loaded files
const filesList = document.getElementById('filesList');
console.log('📋 Files list HTML:', filesList.innerHTML.substring(0, 200));
// Try to select the file
console.log('🔄 Calling selectFile...');
await selectFile(parseInt(targetFileId), targetFileName || `File ${targetFileId}`);
console.log('✅ selectFile completed');
// After file is selected, pre-select vendor if available
if (targetVendorId) {
console.log(`🎯 Pre-selecting vendor ${targetVendorId}`);
// Wait a bit for step 2 to render
setTimeout(() => {
const vendorSelect = document.getElementById('vendorSelect');
if (!vendorSelect) {
console.error('❌ vendorSelect not found!');
return;
}
vendorSelect.value = targetVendorId;
console.log('✅ Vendor selected:', vendorSelect.value);
// If both file and vendor are set, auto-advance to step 3
setTimeout(() => {
const templateNameInput = document.getElementById('templateName');
if (!templateNameInput) {
console.error('❌ templateName input not found!');
return;
}
if (!templateNameInput.value) {
// Auto-generate template name if empty
const vendorName = vendorSelect.options[vendorSelect.selectedIndex]?.text || 'Template';
templateNameInput.value = `${vendorName} Standard Template`;
console.log('✅ Template name set:', templateNameInput.value);
}
console.log('🚀 Auto-advancing to step 3 (pattern definition)');
validateAndNextStep(3);
}, 300);
}, 300);
}
} catch (error) {
console.error('❌ Failed to auto-select file:', error);
alert('Kunne ikke auto-vælge fil: ' + error.message);
}
}, 1000); // Increased timeout to 1 second
}
} }
}); });
@ -498,6 +646,11 @@ async function loadExistingTemplate(templateId) {
await loadVendors(); await loadVendors();
document.getElementById('vendorSelect').value = template.vendor_id; document.getElementById('vendorSelect').value = template.vendor_id;
// Set product category
if (template.default_product_category) {
document.getElementById('productCategory').value = template.default_product_category;
}
// Load detection patterns // Load detection patterns
detectionPatterns = template.detection_patterns || []; detectionPatterns = template.detection_patterns || [];
@ -727,30 +880,63 @@ async function loadVendors() {
async function selectFile(fileId, filename) { async function selectFile(fileId, filename) {
try { try {
// Reprocess file to get PDF text console.log(`🔄 Selecting file: ${fileId} (${filename})`);
const response = await fetch(`/api/v1/supplier-invoices/reprocess/${fileId}`, {
method: 'POST' // Get PDF text directly (fast endpoint, no AI processing)
}); console.log(`📡 Fetching: /api/v1/supplier-invoices/files/${fileId}/pdf-text`);
const response = await fetch(`/api/v1/supplier-invoices/files/${fileId}/pdf-text`);
console.log(`📥 Response status: ${response.status}`);
if (!response.ok) {
const errorText = await response.text();
console.error(`❌ HTTP error: ${response.status} - ${errorText}`);
throw new Error(`HTTP ${response.status}: ${errorText}`);
}
const data = await response.json(); const data = await response.json();
console.log('📦 Response data:', data);
if (!data.pdf_text) {
console.warn('⚠️ No PDF text in response');
}
currentFile = { currentFile = {
file_id: fileId, file_id: fileId,
filename: filename, filename: filename,
text: data.pdf_text text: data.pdf_text || ''
}; };
pdfText = data.pdf_text; pdfText = data.pdf_text || '';
console.log(`✅ File loaded, PDF text length: ${pdfText.length} chars`);
// Show PDF preview // Show PDF preview
document.getElementById('pdfPreview').textContent = pdfText; const pdfPreview = document.getElementById('pdfPreview');
if (pdfPreview) {
pdfPreview.textContent = pdfText;
}
console.log('🚀 Advancing to step 2');
nextStep(2); nextStep(2);
} catch (error) { } catch (error) {
console.error('Failed to load file:', error); console.error('Failed to load file:', error);
alert('Kunne ikke hente fil'); alert('Kunne ikke hente fil: ' + error.message);
} }
} }
/**
 * Continue to step 2 of the wizard without a selected file.
 * Resets the current file and the extracted PDF text to their empty
 * values before advancing.
 */
function skipFileSelection() {
    console.log('⏭️ Skipping file selection');

    // No file chosen: clear both pieces of per-file state
    pdfText = '';
    currentFile = null;

    nextStep(2);
}
function validateAndNextStep(targetStep) { function validateAndNextStep(targetStep) {
// Validate step 2 fields // Validate step 2 fields
if (targetStep === 3) { if (targetStep === 3) {
@ -1289,8 +1475,9 @@ async function autoGenerateTemplate() {
async function saveTemplate() { async function saveTemplate() {
const vendorId = document.getElementById('vendorSelect').value; const vendorId = document.getElementById('vendorSelect').value;
const templateName = document.getElementById('templateName').value; const templateName = document.getElementById('templateName').value;
const productCategory = document.getElementById('productCategory').value;
console.log('Saving template...', { vendorId, templateName, editingTemplateId }); console.log('Saving template...', { vendorId, templateName, productCategory, editingTemplateId });
console.log('Detection patterns:', detectionPatterns); console.log('Detection patterns:', detectionPatterns);
console.log('Field patterns:', fieldPatterns); console.log('Field patterns:', fieldPatterns);
@ -1299,6 +1486,11 @@ async function saveTemplate() {
return; return;
} }
if (!productCategory) {
alert('Vælg produktkategori');
return;
}
if (detectionPatterns.length === 0) { if (detectionPatterns.length === 0) {
alert('Tilføj mindst ét detektionsmønster'); alert('Tilføj mindst ét detektionsmønster');
return; return;
@ -1378,6 +1570,7 @@ async function saveTemplate() {
body: JSON.stringify({ body: JSON.stringify({
vendor_id: parseInt(vendorId), vendor_id: parseInt(vendorId),
template_name: templateName, template_name: templateName,
default_product_category: productCategory,
detection_patterns: detectionPatternsData, detection_patterns: detectionPatternsData,
field_mappings: fieldMappings field_mappings: fieldMappings
}) })

View File

@ -56,12 +56,9 @@
<div class="container mt-4"> <div class="container mt-4">
<div class="d-flex justify-content-between align-items-center mb-4"> <div class="d-flex justify-content-between align-items-center mb-4">
<div> <div>
<h2><i class="bi bi-grid-3x3 me-2"></i>Faktura Templates</h2> <h2><i class="bi bi-file-earmark-code me-2"></i>Invoice2Data Templates (YAML)</h2>
<p class="text-muted">Administrer templates til automatisk faktura-udtrækning</p> <p class="text-muted">YAML-baserede templates til automatisk faktura-udtrækning</p>
</div> </div>
<a href="/billing/template-builder" class="btn btn-primary">
<i class="bi bi-plus-circle me-2"></i>Ny Template
</a>
</div> </div>
<div id="templatesList" class="row"> <div id="templatesList" class="row">
@ -69,6 +66,63 @@
</div> </div>
</div> </div>
<!-- Edit YAML Category Modal -->
<div class="modal fade" id="editYamlCategoryModal" tabindex="-1">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title">
<i class="bi bi-pencil me-2"></i>Rediger Kategori: <span id="yamlTemplateName"></span>
</h5>
<button type="button" class="btn-close" data-bs-dismiss="modal"></button>
</div>
<div class="modal-body">
<div class="mb-3">
<label class="form-label">Produkt Kategori</label>
<select class="form-select" id="yamlCategorySelect">
<option value="varesalg">🛒 Varesalg</option>
<option value="drift">🔧 Drift</option>
<option value="anlæg">🏗️ Anlæg</option>
<option value="abonnement">📅 Abonnement</option>
<option value="lager">📦 Lager</option>
<option value="udlejning">🏪 Udlejning</option>
</select>
</div>
<div class="alert alert-info">
<i class="bi bi-info-circle me-2"></i>
<small>Dette ændrer default_product_category i YAML filen. Filen bliver opdateret på serveren.</small>
</div>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Annuller</button>
<button type="button" class="btn btn-primary" onclick="saveYamlCategory()">
<i class="bi bi-save me-2"></i>Gem Kategori
</button>
</div>
</div>
</div>
</div>
<!-- View YAML Content Modal -->
<div class="modal fade" id="viewYamlModal" tabindex="-1">
<div class="modal-dialog modal-lg">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title">
<i class="bi bi-file-earmark-code me-2"></i>YAML Indhold: <span id="viewYamlTemplateName"></span>
</h5>
<button type="button" class="btn-close" data-bs-dismiss="modal"></button>
</div>
<div class="modal-body">
<pre id="yamlContent" style="background: #f8f9fa; padding: 15px; border-radius: 8px; max-height: 600px; overflow-y: auto;"><code></code></pre>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Luk</button>
</div>
</div>
</div>
</div>
<!-- Test Modal --> <!-- Test Modal -->
<div class="modal fade test-modal" id="testModal" tabindex="-1"> <div class="modal fade test-modal" id="testModal" tabindex="-1">
<div class="modal-dialog modal-xl"> <div class="modal-dialog modal-xl">
@ -116,6 +170,7 @@
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js"></script> <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js"></script>
<script> <script>
let currentTemplateId = null; let currentTemplateId = null;
let currentTemplateIsInvoice2data = false;
document.addEventListener('DOMContentLoaded', async () => { document.addEventListener('DOMContentLoaded', async () => {
await loadTemplates(); await loadTemplates();
@ -142,36 +197,65 @@ async function loadTemplates() {
return; return;
} }
templates.forEach(template => { // Filter to only show invoice2data templates
const invoice2dataTemplates = templates.filter(t => t.template_type === 'invoice2data');
if (invoice2dataTemplates.length === 0) {
container.innerHTML = `
<div class="col-12">
<div class="alert alert-info">
<i class="bi bi-info-circle me-2"></i>
Ingen YAML templates endnu. Opret .yml filer i <code>data/invoice_templates/</code>
</div>
</div>
`;
return;
}
invoice2dataTemplates.forEach(template => {
const detectionPatterns = template.detection_patterns || []; const detectionPatterns = template.detection_patterns || [];
const fieldMappings = template.field_mappings || {}; const fieldMappings = template.field_mappings || {};
const fieldCount = Object.keys(fieldMappings).filter(k => !['lines_start', 'lines_end', 'line_item'].includes(k)).length; const fieldCount = Object.keys(fieldMappings).filter(k => !['lines_start', 'lines_end', 'line_item'].includes(k)).length;
const category = template.default_product_category || 'varesalg';
const categoryIcons = {
'varesalg': '🛒',
'drift': '🔧',
'anlæg': '🏗️',
'abonnement': '📅',
'lager': '📦',
'udlejning': '🏪'
};
const categoryIcon = categoryIcons[category] || '📦';
container.innerHTML += ` container.innerHTML += `
<div class="col-md-4 mb-3"> <div class="col-md-4 mb-3">
<div class="card template-card"> <div class="card template-card">
<div class="card-body"> <div class="card-body">
<h5 class="card-title"> <div class="d-flex justify-content-between align-items-start mb-2">
<i class="bi bi-file-text me-2"></i>${template.template_name} <h5 class="card-title mb-0">
</h5> <i class="bi bi-file-earmark-code me-2"></i>${template.template_name}
</h5>
<span class="badge bg-success">YAML</span>
</div>
<p class="card-text text-muted mb-2"> <p class="card-text text-muted mb-2">
<small> <small>
<i class="bi bi-building me-1"></i>${template.vendor_name || 'Ingen leverandør'}<br> <i class="bi bi-building me-1"></i>${template.vendor_name || 'Ingen leverandør'}
<i class="bi bi-check-circle me-1"></i>${detectionPatterns.length} detektionsmønstre<br> ${template.vendor_cvr ? `<br><i class="bi bi-hash me-1"></i>CVR: ${template.vendor_cvr}` : ''}
<i class="bi bi-input-cursor me-1"></i>${fieldCount} felter<br> <br><i class="bi bi-check-circle me-1"></i>${detectionPatterns.length} detektionsmønstre
<i class="bi bi-graph-up me-1"></i>${template.usage_count || 0} gange brugt <br><i class="bi bi-input-cursor me-1"></i>${fieldCount} felter
<br><strong>${categoryIcon} Kategori: ${category}</strong>
</small> </small>
</p> </p>
<div class="d-flex gap-2"> <div class="d-flex gap-2 flex-wrap">
<button class="btn btn-sm btn-primary" onclick="editTemplate(${template.template_id})"> <button class="btn btn-sm btn-primary" onclick="viewYamlContent('${template.yaml_filename}')" title="Vis YAML indhold">
<i class="bi bi-pencil"></i> Rediger <i class="bi bi-file-earmark-code"></i> Vis YAML
</button> </button>
<button class="btn btn-sm btn-info" onclick="openTestModal(${template.template_id}, '${template.template_name}')"> <button class="btn btn-sm btn-warning" onclick="editYamlCategory('${template.yaml_filename}', '${category}')" title="Rediger kategori">
<i class="bi bi-pencil"></i> Kategori
</button>
<button class="btn btn-sm btn-info" onclick="openTestModal('${template.yaml_filename}', '${template.template_name}', true, ${template.vendor_id || 'null'})">
<i class="bi bi-flask"></i> Test <i class="bi bi-flask"></i> Test
</button> </button>
<button class="btn btn-sm btn-danger" onclick="deleteTemplate(${template.template_id})">
<i class="bi bi-trash"></i>
</button>
</div> </div>
</div> </div>
</div> </div>
@ -211,22 +295,31 @@ async function loadPendingFiles(vendorId = null) {
} }
} }
async function openTestModal(templateId, templateName) { async function openTestModal(templateId, templateName, isInvoice2data = false, vendorId = null) {
currentTemplateId = templateId; currentTemplateId = templateId;
currentTemplateIsInvoice2data = isInvoice2data;
document.getElementById('modalTemplateName').textContent = templateName; document.getElementById('modalTemplateName').textContent = templateName;
document.getElementById('testResultsContainer').classList.add('d-none'); document.getElementById('testResultsContainer').classList.add('d-none');
document.getElementById('testFileSelect').value = ''; document.getElementById('testFileSelect').value = '';
// Load template to get vendor_id // For invoice2data templates, use vendorId if provided
try { if (isInvoice2data && vendorId) {
const response = await fetch(`/api/v1/supplier-invoices/templates/${templateId}`); await loadPendingFiles(vendorId);
const template = await response.json(); } else if (!isInvoice2data) {
// Load database template to get vendor_id
try {
const response = await fetch(`/api/v1/supplier-invoices/templates/${templateId}`);
const template = await response.json();
// Reload files filtered by this template's vendor // Reload files filtered by this template's vendor
await loadPendingFiles(template.vendor_id); await loadPendingFiles(template.vendor_id);
} catch (error) { } catch (error) {
console.error('Failed to load template:', error); console.error('Failed to load template:', error);
await loadPendingFiles(); // Fallback to all files await loadPendingFiles(); // Fallback to all files
}
} else {
// No vendor - load all files
await loadPendingFiles();
} }
const modal = new bootstrap.Modal(document.getElementById('testModal')); const modal = new bootstrap.Modal(document.getElementById('testModal'));
@ -258,8 +351,15 @@ async function runTest() {
document.getElementById('testPdfPreview').textContent = pdfText; document.getElementById('testPdfPreview').textContent = pdfText;
document.getElementById('testResultsContainer').classList.remove('d-none'); document.getElementById('testResultsContainer').classList.remove('d-none');
// Test template // Test template - use different endpoint based on type
const testResponse = await fetch(`/api/v1/supplier-invoices/templates/${currentTemplateId}/test`, { let testUrl;
if (currentTemplateIsInvoice2data) {
testUrl = `/api/v1/supplier-invoices/templates/invoice2data/${currentTemplateId}/test`;
} else {
testUrl = `/api/v1/supplier-invoices/templates/${currentTemplateId}/test`;
}
const testResponse = await fetch(testUrl, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ pdf_text: pdfText }) body: JSON.stringify({ pdf_text: pdfText })
@ -303,21 +403,26 @@ async function runTest() {
<thead> <thead>
<tr> <tr>
<th>#</th> <th>#</th>
${lineItems[0].item_number ? '<th>Varenr</th>' : ''}
${lineItems[0].description ? '<th>Beskrivelse</th>' : ''} ${lineItems[0].description ? '<th>Beskrivelse</th>' : ''}
${lineItems[0].quantity ? '<th>Antal</th>' : ''} ${lineItems[0].quantity ? '<th>Antal</th>' : ''}
${lineItems[0].unit_price ? '<th>Pris</th>' : ''} ${lineItems[0].unit_price ? '<th>Pris</th>' : ''}
${lineItems.some(l => l.circuit_id || l.ip_address) ? '<th>Kredsløb/IP</th>' : ''}
${lineItems.some(l => l.location_street) ? '<th>Adresse</th>' : ''}
</tr> </tr>
</thead> </thead>
<tbody>`; <tbody>`;
lineItems.forEach(line => { lineItems.forEach((line, idx) => {
const locationText = [line.location_street, line.location_zip, line.location_city].filter(x => x).join(' ');
const circuitText = line.circuit_id || line.ip_address || '';
linesHtml += `<tr> linesHtml += `<tr>
<td>${line.line_number}</td> <td>${idx + 1}</td>
${line.item_number ? `<td>${line.item_number}</td>` : ''}
${line.description ? `<td>${line.description}</td>` : ''} ${line.description ? `<td>${line.description}</td>` : ''}
${line.quantity ? `<td>${line.quantity}</td>` : ''} ${line.quantity ? `<td>${line.quantity}</td>` : ''}
${line.unit_price ? `<td>${line.unit_price}</td>` : ''} ${line.unit_price ? `<td>${line.unit_price}</td>` : ''}
${lineItems.some(l => l.circuit_id || l.ip_address) ? `<td><small>${circuitText}</small></td>` : ''}
${lineItems.some(l => l.location_street) ? `<td><small>${locationText}</small></td>` : ''}
</tr>`; </tr>`;
}); });
@ -362,32 +467,65 @@ async function runTest() {
} }
} }
async function deleteTemplate(templateId) { let currentYamlTemplate = null;
if (!confirm('Er du sikker på at du vil slette denne template?')) {
/**
 * Fetch the raw YAML of an invoice2data template and show it in the
 * "viewYamlModal" dialog.
 *
 * @param {string} yamlFilename Template file name without the ".yml" extension.
 */
async function viewYamlContent(yamlFilename) {
    try {
        // Encode the name so characters such as spaces, "#" or "?" cannot
        // change the shape of the request URL.
        const response = await fetch(`/api/v1/supplier-invoices/templates/invoice2data/${encodeURIComponent(yamlFilename)}/content`);

        if (!response.ok) {
            throw new Error('Kunne ikke hente YAML indhold');
        }

        const data = await response.json();

        document.getElementById('viewYamlTemplateName').textContent = yamlFilename + '.yml';
        // textContent (not innerHTML) keeps the file content from being parsed as markup
        document.getElementById('yamlContent').querySelector('code').textContent = data.content;

        const modal = new bootstrap.Modal(document.getElementById('viewYamlModal'));
        modal.show();
    } catch (error) {
        console.error('Failed to load YAML content:', error);
        alert('❌ Kunne ikke hente YAML indhold');
    }
}
/**
 * Open the "edit category" dialog for one YAML template.
 * Remembers the template in `currentYamlTemplate` so saveYamlCategory()
 * knows which file to update.
 *
 * @param {string} yamlFilename    Template file name without ".yml".
 * @param {string} currentCategory Category value to pre-select in the dropdown.
 */
function editYamlCategory(yamlFilename, currentCategory) {
    currentYamlTemplate = yamlFilename;

    const nameLabel = document.getElementById('yamlTemplateName');
    const categorySelect = document.getElementById('yamlCategorySelect');
    const dialogElement = document.getElementById('editYamlCategoryModal');

    nameLabel.textContent = `${yamlFilename}.yml`;
    categorySelect.value = currentCategory;

    new bootstrap.Modal(dialogElement).show();
}
async function saveYamlCategory() {
const newCategory = document.getElementById('yamlCategorySelect').value;
if (!currentYamlTemplate) {
alert('Ingen template valgt');
return; return;
} }
try { try {
const response = await fetch(`/api/v1/supplier-invoices/templates/${templateId}`, { const response = await fetch(`/api/v1/supplier-invoices/templates/invoice2data/${currentYamlTemplate}/category`, {
method: 'DELETE' method: 'PUT',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ category: newCategory })
}); });
if (response.ok) { if (response.ok) {
alert('✅ Template slettet'); alert('✅ Kategori opdateret i YAML fil');
await loadTemplates(); bootstrap.Modal.getInstance(document.getElementById('editYamlCategoryModal')).hide();
await loadTemplates(); // Reload to show new category
} else { } else {
throw new Error('Sletning fejlede'); const error = await response.json();
throw new Error(error.detail || 'Opdatering fejlede');
} }
} catch (error) { } catch (error) {
console.error('Delete failed:', error); console.error('Category update failed:', error);
alert('❌ Kunne ikke slette template'); alert('❌ Kunne ikke opdatere kategori: ' + error.message);
} }
} }
function editTemplate(templateId) {
// Redirect to template builder with template ID
window.location.href = `/billing/template-builder?id=${templateId}`;
}
</script> </script>
</body> </body>

View File

@ -271,6 +271,54 @@ class EconomicService:
# ========== KASSEKLADDE (JOURNALS/VOUCHERS) ========== # ========== KASSEKLADDE (JOURNALS/VOUCHERS) ==========
async def check_invoice_number_exists(self, invoice_number: str, journal_number: Optional[int] = None) -> Optional[Dict]:
    """
    Look for an invoice number among e-conomic vouchers.

    Sends a single GET to ``{api_url}/vouchers`` with a ``voucherNumber``
    filter and treats the first returned voucher whose string form contains
    ``invoice_number`` as a hit. The check is best-effort: every failure is
    logged and reported as "not found" so callers are never blocked.

    Args:
        invoice_number: Invoice number to look for.
        journal_number: Accepted by the signature but not used by this
            implementation; the request is never narrowed to one journal.

    Returns:
        Dict with ``found_in``, ``voucher_number``, ``date`` and ``journal``
        for the first matching voucher; None when nothing matches, when the
        API answers with a non-200 status, or when any error occurs.
    """
    try:
        endpoint = f"{self.api_url}/vouchers"
        query = {
            # NOTE(review): e-conomic's documented filter form uses an operator
            # (e.g. "$eq:") - confirm that this expression is accepted.
            'filter': f'voucherNumber${invoice_number}',  # e-conomic filter syntax
            'pagesize': 100
        }

        async with aiohttp.ClientSession() as session:
            async with session.get(endpoint, headers=self._get_headers(), params=query) as response:
                if response.status != 200:
                    logger.warning(f"⚠️ Failed to search vouchers: {response.status}")
                    return None

                payload = await response.json()

                # Substring test against the whole voucher, so the number may
                # sit in any field of the returned entry.
                hit = next(
                    (entry for entry in payload.get('collection', []) if invoice_number in str(entry)),
                    None,
                )

                if hit is None:
                    logger.info(f"✅ Invoice number {invoice_number} not found in e-conomic")
                    return None

                logger.warning(f"⚠️ Invoice number {invoice_number} found in e-conomic voucher #{hit.get('voucherNumber')}")
                return {
                    'found_in': 'e-conomic',
                    'voucher_number': hit.get('voucherNumber'),
                    'date': hit.get('date'),
                    'journal': hit.get('journal', {}).get('journalNumber')
                }

    except Exception as e:
        logger.error(f"❌ Error checking invoice number in e-conomic: {e}")
        # Don't block on e-conomic errors - assume not found
        return None
async def get_supplier_invoice_journals(self) -> list: async def get_supplier_invoice_journals(self) -> list:
""" """
Get all available journals for supplier invoices (kassekladde) Get all available journals for supplier invoices (kassekladde)

View File

@ -0,0 +1,337 @@
"""
Invoice2Data Service
Wrapper around invoice2data library for template-based invoice extraction
"""
import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
import yaml
logger = logging.getLogger(__name__)
class Invoice2DataService:
"""Service for extracting invoice data using invoice2data templates"""
def __init__(self):
    """Locate the template directory and load every YAML template found in it."""
    # The directory sits three levels above this module: <root>/data/invoice_templates
    project_root = Path(__file__).parent.parent.parent
    self.template_dir = project_root / "data" / "invoice_templates"
    self.templates = self._load_templates()
    logger.info(f"📋 Loaded {len(self.templates)} invoice2data templates")
def _load_templates(self) -> Dict[str, Dict]:
"""Load all YAML templates from template directory"""
templates = {}
if not self.template_dir.exists():
logger.warning(f"Template directory not found: {self.template_dir}")
return templates
for template_file in self.template_dir.glob("*.yml"):
try:
with open(template_file, 'r', encoding='utf-8') as f:
template_data = yaml.safe_load(f)
template_name = template_file.stem
templates[template_name] = template_data
logger.debug(f" ✓ Loaded template: {template_name}")
except Exception as e:
logger.error(f" ✗ Failed to load template {template_file}: {e}")
return templates
def match_template(self, text: str) -> Optional[str]:
    """
    Find the first loaded template whose keywords occur in ``text``.

    Matching is case-insensitive substring search. A template matches when at
    least 70% of its keywords are present. Templates without any keywords are
    skipped: an empty keyword list would otherwise satisfy the threshold
    trivially and match every document.

    Args:
        text: Full text of the invoice.

    Returns:
        Name of the first matching template, or None when none matches.
    """
    text_lower = text.lower()

    for template_name, template_data in self.templates.items():
        if not isinstance(template_data, dict):
            continue

        keywords = template_data.get('keywords') or []
        # Tolerate a single scalar keyword (``keywords: Foo``) in the YAML
        if not isinstance(keywords, (list, tuple)):
            keywords = [keywords]

        if not keywords:
            logger.debug(f"Skipping template without keywords: {template_name}")
            continue

        matches = sum(1 for keyword in keywords if str(keyword).lower() in text_lower)

        if matches >= len(keywords) * 0.7:  # 70% of keywords must match
            logger.info(f"✅ Matched template: {template_name} ({matches}/{len(keywords)} keywords)")
            return template_name

    logger.warning("⚠️ No template matched")
    return None
def extract_with_template(self, text: str, template_name: str) -> Dict[str, Any]:
    """
    Extract invoice data from ``text`` using one specific template.

    Every field whose config is a mapping with ``parser: regex`` is searched
    in ``text`` (case-insensitive, multiline) and converted according to its
    ``type`` (``float``, ``int``, ``date``; anything else stays a string).
    A field that does not match or fails to convert is logged and left out;
    it never aborts the extraction of the remaining fields.

    Args:
        text: Full text of the invoice.
        template_name: Key of a template in ``self.templates``.

    Returns:
        Dict with ``template``, ``issuer``, ``country`` and ``currency``, one
        entry per extracted field, and ``lines`` when the template defines a
        ``lines`` section.

    Raises:
        ValueError: If ``template_name`` is not a loaded template.
    """
    if template_name not in self.templates:
        raise ValueError(f"Template not found: {template_name}")

    template = self.templates[template_name]
    # ``fields:`` / ``options:`` with no value parse to None in YAML
    fields = template.get('fields') or {}
    options = template.get('options') or {}

    extracted = {
        'template': template_name,
        'issuer': template.get('issuer'),
        'country': template.get('country'),
        'currency': options.get('currency', 'DKK')
    }

    # Danish month names mapped to the English names strptime's %B expects
    danish_months = {
        'januar': 'January', 'februar': 'February', 'marts': 'March',
        'april': 'April', 'maj': 'May', 'juni': 'June',
        'juli': 'July', 'august': 'August', 'september': 'September',
        'oktober': 'October', 'november': 'November', 'december': 'December',
    }
    # Not preceded/followed by a letter, so English "January" is left alone
    # while "9januar2025" and "9. Januar 2025" are both translated.
    month_pattern = re.compile(
        r'(?<![a-z])(' + '|'.join(danish_months) + r')(?![a-z])',
        re.IGNORECASE,
    )

    # Extract each field using its regex
    for field_name, field_config in fields.items():
        # A bare-string field definition has no ``parser`` key to inspect;
        # skip it instead of failing the whole extraction.
        if not isinstance(field_config, dict):
            logger.debug(f"{field_name}: Skipped (field config is not a mapping)")
            continue

        if field_config.get('parser') != 'regex':
            continue

        pattern = field_config.get('regex')
        field_type = field_config.get('type', 'string')
        group = field_config.get('group', 1)

        try:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match:
                value = match.group(group).strip()
                logger.debug(f" 🔍 Extracted raw value for {field_name}: '{value}' (type: {field_type})")

                # Handle CVR filtering (avoid customer CVR)
                if field_name == 'vendor_vat':
                    # Find ALL CVR numbers
                    all_cvr_matches = re.finditer(r'SE/CVR-nr\.\s+(\d{8})', text, re.IGNORECASE)
                    cvr_numbers = [m.group(1) for m in all_cvr_matches]

                    # Filter out BMC's CVR (29522790)
                    vendor_cvrs = [cvr for cvr in cvr_numbers if cvr != '29522790']

                    if vendor_cvrs:
                        value = vendor_cvrs[0]
                        logger.debug(f"{field_name}: {value} (filtered from {cvr_numbers})")
                    else:
                        logger.warning(f" ⚠️ Only customer CVR found, no vendor CVR")
                        continue

                # Convert type
                if field_type == 'float':
                    # Handle Danish number format (1.234,56 → 1234.56)
                    # OR (148,587.98 → 148587.98) depending on the template options
                    decimal_sep = options.get('decimal_separator', ',')
                    thousands_sep = options.get('thousands_separator', '.')

                    # Remove all spaces first
                    value = value.replace(' ', '')

                    if thousands_sep in value and decimal_sep in value:
                        # Remove thousands separator, then convert decimal separator to .
                        value = value.replace(thousands_sep, '').replace(decimal_sep, '.')
                    elif thousands_sep in value:
                        # Only thousands separator present - just remove it
                        value = value.replace(thousands_sep, '')
                    elif decimal_sep in value and decimal_sep == ',':
                        # Only decimal separator and it's Danish comma - convert to .
                        value = value.replace(',', '.')

                    value = float(value)
                elif field_type == 'int':
                    value = int(value)
                elif field_type == 'date':
                    date_formats = options.get('date_formats', ['%B %d, %Y', '%d-%m-%Y'])

                    # Translate Danish month names regardless of capitalisation
                    value = month_pattern.sub(
                        lambda m: danish_months[m.group(1).lower()], value
                    )

                    # First format that parses wins; if none does, the
                    # (month-translated) string is kept as-is.
                    for date_format in date_formats:
                        try:
                            parsed_date = datetime.strptime(value, date_format)
                            value = parsed_date.strftime('%Y-%m-%d')
                            break
                        except ValueError:
                            continue

                extracted[field_name] = value
                logger.debug(f"{field_name}: {value}")
            else:
                logger.debug(f"{field_name}: No match")
        except Exception as e:
            # Best-effort per field: a bad pattern or conversion only drops this field
            logger.warning(f" ✗ Failed to extract {field_name}: {e}")

    # Extract line items if defined in template
    lines_config = template.get('lines', [])
    if lines_config:
        extracted['lines'] = self._extract_lines(text, lines_config, options)

    return extracted
def _extract_lines(self, text: str, lines_configs: List[Dict], options: Dict) -> List[Dict]:
    """Extract line items from invoice text.

    Each entry of ``lines_configs`` describes one table of line items:

    * ``start`` / ``end`` - regexes delimiting the section. ``end`` is
      optional; without it the section runs to the end of ``text``. The
      section search is case-insensitive and spans newlines.
    * ``line`` - per-row settings: ``regex`` (applied to every section line
      separately), ``fields`` (names mapped in order to capture groups
      1..n), optional ``types`` ('float' or 'int'; any other value keeps the
      stripped string) and optional ``context_before``.
    * ``context_before`` - ``max_lines`` (default 5) and ``patterns``; for
      each pattern the nearest preceding section line that matches
      contributes its capture groups as extra string fields on the row.

    Number parsing reads ``decimal_separator`` / ``thousands_separator``
    from ``options``. The defaults are ',' and '.', the same defaults the
    header-field float conversion in this class uses, so a template without
    explicit options parses "1.234,56" as 1234.56 for both header fields
    and line items.

    Args:
        text: Full invoice text.
        lines_configs: List of line-section configurations (see above).
        options: Template options; ``None`` is treated as an empty dict.

    Returns:
        One dict per matched row that yielded at least one field. A field
        that fails to convert is logged at debug level and left out of its
        row. A configuration that raises is logged as a warning and skipped;
        rows collected before the failure are kept.
    """
    options = options or {}
    # Looked up once: the separators do not change between rows.
    decimal_sep = options.get('decimal_separator', ',')
    thousands_sep = options.get('thousands_separator', '.')

    def to_float(raw: str) -> float:
        """Normalise a localised number string and convert it to float."""
        raw = raw.replace(' ', '')
        if thousands_sep in raw and decimal_sep in raw:
            # Both present: drop thousands separators, then use '.' as decimal point
            raw = raw.replace(thousands_sep, '').replace(decimal_sep, '.')
        elif thousands_sep in raw:
            # Only the thousands separator is present - just remove it
            raw = raw.replace(thousands_sep, '')
        elif decimal_sep in raw and decimal_sep == ',':
            # Only a decimal comma is present - convert it to '.'
            raw = raw.replace(',', '.')
        return float(raw)

    all_lines = []
    logger.debug(f"🔍 Extracting lines with {len(lines_configs)} configurations")

    for lines_config in lines_configs:
        start_pattern = lines_config.get('start')
        end_pattern = lines_config.get('end')
        line_config = lines_config.get('line', {})
        if not start_pattern or not line_config:
            continue

        try:
            # The section body is captured in a *named* group so that capture
            # groups inside the template's own start pattern cannot shift it.
            if end_pattern:
                section_pattern = f"{start_pattern}(?P<line_section>.*?){end_pattern}"
            else:
                section_pattern = f"{start_pattern}(?P<line_section>.*?)$"
            section_match = re.search(section_pattern, text, re.DOTALL | re.IGNORECASE)
            if not section_match:
                logger.debug(f" ✗ Line section not found (start: {start_pattern[:50]}, end: {end_pattern[:50] if end_pattern else 'None'})")
                continue

            section_text = section_match.group('line_section')
            logger.debug(f" ✓ Found line section ({len(section_text)} chars)")

            line_pattern = line_config.get('regex')
            field_names = line_config.get('fields', [])
            field_types = line_config.get('types', {})
            context_config = line_config.get('context_before', {})
            if not line_pattern or not field_names:
                continue

            # Compile once instead of once per section line.
            line_regex = re.compile(line_pattern, re.MULTILINE)

            # Keep the section split into lines; the indices are needed for
            # the context_before lookup below.
            section_lines = section_text.split('\n')
            line_matches = []
            for line_idx, line_text in enumerate(section_lines):
                match = line_regex.search(line_text)
                if match:
                    line_matches.append((line_idx, match))
            logger.debug(f" ✓ Found {len(line_matches)} matching lines")

            for line_idx, match in line_matches:
                line_data = {}

                # Main row fields: capture group N feeds the N-th field name.
                for idx, field_name in enumerate(field_names, start=1):
                    try:
                        value = match.group(idx).strip()
                        field_type = field_types.get(field_name, 'string')
                        if field_type == 'float':
                            value = to_float(value)
                        elif field_type == 'int':
                            value = int(value)
                        line_data[field_name] = value
                    except Exception as e:
                        # Missing/unmatched group or unparsable number: skip the field only
                        logger.debug(f" ✗ Failed to extract line field {field_name}: {e}")

                # Optional context taken from the lines preceding this row
                if context_config and line_idx > 0:
                    max_lines = context_config.get('max_lines', 5)
                    patterns = context_config.get('patterns', [])
                    start_idx = max(0, line_idx - max_lines)
                    context_lines = section_lines[start_idx:line_idx]

                    for pattern_config in patterns:
                        pattern_regex = pattern_config.get('regex')
                        pattern_fields = pattern_config.get('fields', [])
                        if not pattern_regex or not pattern_fields:
                            continue

                        # Nearest preceding line wins
                        for ctx_line in reversed(context_lines):
                            ctx_match = re.search(pattern_regex, ctx_line)
                            if ctx_match:
                                for ctx_idx, ctx_field_name in enumerate(pattern_fields, start=1):
                                    try:
                                        ctx_value = ctx_match.group(ctx_idx).strip()
                                        line_data[ctx_field_name] = ctx_value
                                    except Exception as e:
                                        logger.debug(f" ✗ Failed to extract context field {ctx_field_name}: {e}")
                                break  # Stop after first match for this pattern

                if line_data:
                    all_lines.append(line_data)

            logger.info(f" ✓ Extracted {len(all_lines)} line items")
        except Exception as e:
            logger.warning(f" ✗ Failed to extract lines: {e}")

    return all_lines
def extract(self, text: str, template_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Extract invoice data from ``text``.

    Args:
        text: Invoice text to parse.
        template_name: Template to apply. When ``None`` the template is
            auto-detected with ``self.match_template(text)``.

    Returns:
        The dict produced by ``self.extract_with_template``, or ``None``
        when no template could be detected or any step raised (the error is
        logged, not propagated).
    """
    try:
        # Only fall back to auto-detection when the caller named no template
        chosen = self.match_template(text) if template_name is None else template_name
        if chosen is None:
            return None

        fields = self.extract_with_template(text, chosen)
        logger.info(f"✅ Extracted {len(fields)} fields using template: {chosen}")
    except Exception as e:
        logger.error(f"❌ Extraction failed: {e}")
        return None
    return fields
def get_template_list(self) -> List[Dict[str, str]]:
    """Return a summary entry for every loaded template.

    Returns:
        One dict per item of ``self.templates`` with the keys ``name`` (the
        template's key), ``issuer`` and ``country`` (``None`` when the
        template does not define them), in the mapping's iteration order.
    """
    summaries = []
    for name, template in self.templates.items():
        summary = {
            'name': name,
            'issuer': template.get('issuer'),
            'country': template.get('country'),
        }
        summaries.append(summary)
    return summaries
# Module-level cache for the shared service; created on first request
_invoice2data_service = None


def get_invoice2data_service() -> Invoice2DataService:
    """Return the shared Invoice2DataService, creating it on the first call."""
    global _invoice2data_service
    service = _invoice2data_service
    if service is None:
        service = Invoice2DataService()
        _invoice2data_service = service
    return service

View File

@ -437,6 +437,130 @@ Output: {
} }
return mime_types.get(suffix, 'application/octet-stream') return mime_types.get(suffix, 'application/octet-stream')
async def quick_analysis_on_upload(self, pdf_text: str) -> Dict:
    """Quick analysis when a file is uploaded - extracts critical fields only.

    Runs BEFORE template matching for early vendor detection and fills in:

    * CVR number (used to match the vendor)
    * Document type (invoice vs credit note)
    * Invoice / credit note number

    CVR selection: every candidate in the text is collected, labelled
    matches ("CVR: ...", "CVR-nr.: ...", "CVR nummer ...", "SE ...") first
    and bare 8-digit numbers last. The company's own CVR
    (``settings.OWN_CVR``) is the buyer on supplier invoices and is
    therefore ignored when any other candidate exists; a document that
    contains only the own CVR is flagged as an outgoing invoice.

    Args:
        pdf_text: Extracted text from the PDF. ``None`` is treated as an
            empty string.

    Returns:
        Dict with the keys ``cvr``, ``document_type`` ('invoice' or
        'credit_note'), ``document_number``, ``vendor_id``, ``vendor_name``
        and ``is_own_invoice``. Fields that could not be detected stay
        ``None`` (``is_own_invoice`` stays ``False``).
    """
    from app.core.config import settings

    logger.info("⚡ Running quick analysis on upload...")

    # The regex and .lower() calls below would raise on None
    pdf_text = pdf_text or ""

    result = {
        "cvr": None,
        "document_type": None,  # 'invoice' or 'credit_note'
        "document_number": None,
        "vendor_id": None,
        "vendor_name": None,
        "is_own_invoice": False,  # True if this is an outgoing invoice (BMC's own CVR)
    }

    # 1. FIND CVR NUMBER (8 digits)
    # Supplier invoices carry BOTH the buyer's (our own) and the seller's
    # CVR; we need the SELLER's. Pattern order matters: candidates are
    # collected in this order and the first non-own one is used.
    cvr_patterns = [
        r'CVR[:\-\s]*(\d{8})',
        # "nr." / "nummer" may be followed by a colon ("CVR-nr.: 12345678"),
        # so allow ':' as well as whitespace before the digits.
        r'CVR[:\-\s]*nr\.?[:\s]*(\d{8})',
        r'CVR[:\-\s]*nummer[:\s]*(\d{8})',
        r'SE[:\-\s]*(\d{8})',  # SE = Swedish CVR, but also used in DK
        r'\b(\d{8})\b',  # Fallback: any 8-digit number
    ]

    found_cvrs = []
    for pattern in cvr_patterns:
        for match in re.finditer(pattern, pdf_text, re.IGNORECASE):
            cvr_candidate = match.group(1)
            # Only accept candidates starting with 1-4 (filters random numbers)
            if cvr_candidate[0] in '1234' and cvr_candidate not in found_cvrs:
                found_cvrs.append(cvr_candidate)

    own_cvr = settings.OWN_CVR
    own_cvr_present = own_cvr in found_cvrs
    # Everything except our own CVR is a potential seller
    vendor_cvrs = [cvr for cvr in found_cvrs if cvr != own_cvr]

    if vendor_cvrs:
        vendor_cvr = vendor_cvrs[0]
        result['cvr'] = vendor_cvr
        if own_cvr_present:
            # Proper supplier invoice where BMC is the buyer
            logger.info(f"📋 Found vendor CVR: {vendor_cvr} (ignored BMC CVR: {own_cvr})")
        else:
            logger.info(f"📋 Found CVR: {vendor_cvr}")

        vendor = self.match_vendor_by_cvr(vendor_cvr)
        if vendor:
            result['vendor_id'] = vendor['id']
            result['vendor_name'] = vendor['name']
    elif own_cvr_present:
        # Only BMC's CVR found = this is an outgoing invoice
        result['is_own_invoice'] = True
        result['cvr'] = own_cvr
        logger.warning("⚠️ OUTGOING INVOICE: Only BMC CVR found")

    # 2. DETECT DOCUMENT TYPE (Invoice vs Credit Note)
    credit_keywords = [
        'kreditnota', 'credit note', 'creditnote', 'kreditfaktura',
        'refusion', 'tilbagebetaling', 'godtgørelse', 'tilbageførsel'
    ]
    text_lower = pdf_text.lower()
    if any(keyword in text_lower for keyword in credit_keywords):
        result['document_type'] = 'credit_note'
        logger.info("📄 Document type: CREDIT NOTE")
    else:
        result['document_type'] = 'invoice'
        logger.info("📄 Document type: INVOICE")

    # 3. EXTRACT DOCUMENT NUMBER
    # Invoices: "Faktura nr.", "Invoice number:", "Fakturanr."
    # Credit notes: "Kreditnota nr.", "Credit note number:"
    if result['document_type'] == 'credit_note':
        number_patterns = [
            r'kreditnota\s*(?:nr\.?|nummer)[:\s]*(\S+)',
            r'credit\s*note\s*(?:no\.?|number)[:\s]*(\S+)',
            r'kreditfaktura\s*(?:nr\.?|nummer)[:\s]*(\S+)',
        ]
    else:
        number_patterns = [
            r'faktura\s*(?:nr\.?|nummer)[:\s]*(\S+)',
            r'invoice\s*(?:no\.?|number)[:\s]*(\S+)',
            r'fakturanr\.?\s*[:\s]*(\S+)',
        ]

    for pattern in number_patterns:
        match = re.search(pattern, pdf_text, re.IGNORECASE)
        if match:
            result['document_number'] = match.group(1).strip()
            logger.info(f"🔢 Document number: {result['document_number']}")
            break

    logger.info(f"✅ Quick analysis complete: CVR={result['cvr']}, Type={result['document_type']}, Number={result['document_number']}, Vendor={result['vendor_name']}")
    return result
def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]: def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]:
""" """
Match vendor from database using CVR number Match vendor from database using CVR number
@ -459,7 +583,7 @@ Output: {
# Search vendors table # Search vendors table
vendor = execute_query( vendor = execute_query(
"SELECT * FROM vendors WHERE cvr = %s", "SELECT * FROM vendors WHERE cvr_number = %s",
(cvr_clean,), (cvr_clean,),
fetchone=True fetchone=True
) )

View File

@ -1,6 +1,6 @@
""" """
Supplier Invoice Template Service Supplier Invoice Template Service
Simple template-based invoice field extraction (no AI) Hybrid approach: invoice2data templates + custom regex templates
Inspired by OmniSync's invoice template system Inspired by OmniSync's invoice template system
""" """
@ -11,6 +11,7 @@ from datetime import datetime
from pathlib import Path from pathlib import Path
from app.core.database import execute_query, execute_insert, execute_update from app.core.database import execute_query, execute_insert, execute_update
from app.services.invoice2data_service import get_invoice2data_service
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -21,12 +22,19 @@ class TemplateService:
def __init__(self):
    """Create an empty service; nothing is loaded until first use."""
    # Set by _ensure_loaded(); stays None when the invoice2data service is unavailable
    self.invoice2data = None
    # Flipped to True by _ensure_loaded() after the first load attempt
    self._initialized = False
    # Custom templates, looked up by template id
    self.templates_cache = dict()
def _ensure_loaded(self):
    """Load templates the first time they are needed; later calls do nothing.

    Loads the custom templates via ``_load_templates()`` and then tries to
    attach the invoice2data service. A failure to obtain that service is
    logged as a warning and leaves ``self.invoice2data`` untouched.
    """
    if self._initialized:
        return

    logger.info("🔄 Lazy loading templates...")
    self._load_templates()

    # Also load invoice2data templates (best effort)
    try:
        service = get_invoice2data_service()
    except Exception as e:
        logger.warning(f"⚠️ Failed to load invoice2data: {e}")
    else:
        self.invoice2data = service
        logger.info(f"✅ Invoice2Data service initialized")

    self._initialized = True
def _load_templates(self): def _load_templates(self):
@ -51,11 +59,24 @@ class TemplateService:
def match_template(self, pdf_text: str) -> Tuple[Optional[int], float]: def match_template(self, pdf_text: str) -> Tuple[Optional[int], float]:
""" """
Find best matching template for PDF text Find best matching template for PDF text
First tries invoice2data templates, then falls back to custom templates
Returns: (template_id, confidence_score) Returns: (template_id, confidence_score)
""" """
self._ensure_loaded() # Lazy load templates self._ensure_loaded() # Lazy load templates
logger.info(f"🔍 Matching against {len(self.templates_cache)} templates") # Try invoice2data templates first
if self.invoice2data:
try:
template_name = self.invoice2data.match_template(pdf_text)
if template_name:
logger.info(f"✅ Matched invoice2data template: {template_name}")
# Return special ID to indicate invoice2data template
return (-1, 1.0) # -1 = invoice2data, 100% confidence
except Exception as e:
logger.warning(f"⚠️ Invoice2data matching failed: {e}")
# Fallback to custom templates
logger.info(f"🔍 Matching against {len(self.templates_cache)} custom templates")
best_match = None best_match = None
best_score = 0.0 best_score = 0.0
@ -112,6 +133,19 @@ class TemplateService:
"""Extract invoice fields using template's regex patterns""" """Extract invoice fields using template's regex patterns"""
self._ensure_loaded() # Lazy load templates self._ensure_loaded() # Lazy load templates
# Check if this is an invoice2data template
if template_id == -1:
if self.invoice2data:
try:
result = self.invoice2data.extract(pdf_text)
if result:
logger.info(f"✅ Extracted fields using invoice2data")
return result
except Exception as e:
logger.error(f"❌ Invoice2data extraction failed: {e}")
return {}
# Use custom template
template = self.templates_cache.get(template_id) template = self.templates_cache.get(template_id)
if not template: if not template:
logger.warning(f"⚠️ Template {template_id} not found in cache") logger.warning(f"⚠️ Template {template_id} not found in cache")
@ -128,11 +162,36 @@ class TemplateService:
continue continue
try: try:
match = re.search(pattern, pdf_text, re.IGNORECASE | re.MULTILINE) # Special handling for CVR to avoid extracting own CVR
if match and len(match.groups()) >= group: if field_name == 'vendor_cvr':
value = match.group(group).strip() from app.core.config import settings
extracted[field_name] = value own_cvr = getattr(settings, 'OWN_CVR', '29522790')
logger.debug(f"{field_name}: {value}")
# Find ALL CVR matches
all_matches = list(re.finditer(pattern, pdf_text, re.IGNORECASE | re.MULTILINE))
found_cvrs = []
for match in all_matches:
if len(match.groups()) >= group:
cvr = match.group(group).strip()
found_cvrs.append(cvr)
# Filter out own CVR
vendor_cvrs = [cvr for cvr in found_cvrs if cvr != own_cvr]
if vendor_cvrs:
# Use first non-own CVR as vendor CVR
extracted[field_name] = vendor_cvrs[0]
logger.debug(f"{field_name}: {vendor_cvrs[0]} (filtered out own CVR: {own_cvr})")
else:
logger.warning(f" ⚠️ Only found own CVR ({own_cvr}), no vendor CVR found")
else:
# Normal extraction for other fields
match = re.search(pattern, pdf_text, re.IGNORECASE | re.MULTILINE)
if match and len(match.groups()) >= group:
value = match.group(group).strip()
extracted[field_name] = value
logger.debug(f"{field_name}: {value}")
except Exception as e: except Exception as e:
logger.warning(f" ✗ Failed to extract {field_name}: {e}") logger.warning(f" ✗ Failed to extract {field_name}: {e}")

View File

@ -0,0 +1,18 @@
-- Migration 011: Add context fields to extraction_lines
-- These fields capture additional context information from invoice line items
-- NOTE(review): the quick-analysis migration on incoming_files is also labelled
-- "Migration 011" - confirm the intended numbering / run order.
-- ADD COLUMN and CREATE INDEX use IF NOT EXISTS, so re-running this script does
-- not fail on columns or indexes that are already present.
-- All new columns are nullable and have no default.
ALTER TABLE extraction_lines
ADD COLUMN IF NOT EXISTS ip_address VARCHAR(50),
ADD COLUMN IF NOT EXISTS contract_number VARCHAR(100),
ADD COLUMN IF NOT EXISTS location_street VARCHAR(255),
ADD COLUMN IF NOT EXISTS location_zip VARCHAR(10),
ADD COLUMN IF NOT EXISTS location_city VARCHAR(100);
-- Add index for contract number lookups
CREATE INDEX IF NOT EXISTS idx_extraction_lines_contract_number ON extraction_lines(contract_number);
-- Column descriptions stored in the database catalog
COMMENT ON COLUMN extraction_lines.ip_address IS 'IP address/subnet from line context (e.g., 152.115.56.192/27)';
COMMENT ON COLUMN extraction_lines.contract_number IS 'Contract number from line context (e.g., NKA-008225)';
COMMENT ON COLUMN extraction_lines.location_street IS 'Street address from line context';
COMMENT ON COLUMN extraction_lines.location_zip IS 'Zip code from line context';
COMMENT ON COLUMN extraction_lines.location_city IS 'City from line context';

View File

@ -0,0 +1,19 @@
-- Migration 011: Quick Analysis on Upload
-- Adds fields to store automatic CVR, document type, and document number detection
-- NOTE(review): the extraction_lines context-fields migration is also labelled
-- "Migration 011" - confirm the intended numbering / run order.
-- ADD COLUMN and CREATE INDEX use IF NOT EXISTS, so re-running this script does
-- not fail on columns or indexes that are already present.
-- Add quick analysis fields to incoming_files
-- detected_cvr is sized for an 8-digit CVR number; detected_vendor_id is a
-- nullable foreign key to vendors(id).
ALTER TABLE incoming_files
ADD COLUMN IF NOT EXISTS detected_cvr VARCHAR(8),
ADD COLUMN IF NOT EXISTS detected_vendor_id INTEGER REFERENCES vendors(id),
ADD COLUMN IF NOT EXISTS detected_document_type VARCHAR(20), -- 'invoice' or 'credit_note'
ADD COLUMN IF NOT EXISTS detected_document_number VARCHAR(100);
-- Add index for CVR lookups
CREATE INDEX IF NOT EXISTS idx_incoming_files_detected_cvr ON incoming_files(detected_cvr);
-- Add index for lookups by matched vendor
CREATE INDEX IF NOT EXISTS idx_incoming_files_detected_vendor ON incoming_files(detected_vendor_id);
-- Add comments
COMMENT ON COLUMN incoming_files.detected_cvr IS 'Automatically detected CVR number from PDF text';
COMMENT ON COLUMN incoming_files.detected_vendor_id IS 'Vendor matched by CVR on upload';
COMMENT ON COLUMN incoming_files.detected_document_type IS 'Auto-detected: invoice or credit_note';
COMMENT ON COLUMN incoming_files.detected_document_number IS 'Automatically extracted invoice/credit note number';

View File

@ -0,0 +1,20 @@
-- Migration 012: Add is_own_invoice flag to filter outgoing invoices
-- BMC's own CVR: 29522790
-- NOTE(review): the template default-category migration is also labelled
-- "Migration 012" - confirm the intended numbering / run order.
-- Add column to track outgoing invoices (BMC's own invoices to customers)
ALTER TABLE incoming_files
ADD COLUMN IF NOT EXISTS is_own_invoice BOOLEAN DEFAULT FALSE;
-- Mark existing files with BMC's CVR as outgoing invoices
-- Reads incoming_files.detected_cvr, so the migration that adds that column
-- must have been applied first. The CVR is hard-coded here.
UPDATE incoming_files
SET is_own_invoice = TRUE
WHERE detected_cvr = '29522790';
-- Add index for faster filtering
-- Partial index: only rows with is_own_invoice = TRUE are indexed.
CREATE INDEX IF NOT EXISTS idx_incoming_files_is_own_invoice
ON incoming_files(is_own_invoice)
WHERE is_own_invoice = TRUE;
-- Add comment
-- (The stored text is Danish: "TRUE if the file is an outgoing invoice from
-- BMC (CVR 29522790), FALSE if supplier invoice".)
COMMENT ON COLUMN incoming_files.is_own_invoice IS
'TRUE hvis filen er en udgående faktura fra BMC (CVR 29522790), FALSE hvis leverandør faktura';

View File

@ -0,0 +1,13 @@
-- Migration 012: Add default product category to templates
-- Allows templates to specify default category for line items (varesalg, drift, etc.)
-- NOTE(review): the is_own_invoice migration is also labelled "Migration 012" -
-- confirm the intended numbering / run order.
-- default_product_category defaults to 'varesalg'; default_product_group_number
-- is nullable with no default.
ALTER TABLE supplier_invoice_templates
ADD COLUMN IF NOT EXISTS default_product_category VARCHAR(50) DEFAULT 'varesalg',
ADD COLUMN IF NOT EXISTS default_product_group_number INTEGER;
-- Valid categories: varesalg, drift, anlæg, abonnement, lager, udlejning
-- (listed for documentation only; this script adds no constraint enforcing them)
-- The stored comment texts below are Danish: "Default category for line items"
-- and "Default e-conomic product group number".
COMMENT ON COLUMN supplier_invoice_templates.default_product_category IS 'Default kategori for varelinjer: varesalg, drift, anlæg, abonnement, lager, udlejning';
COMMENT ON COLUMN supplier_invoice_templates.default_product_group_number IS 'Default e-conomic produktgruppe nummer';
-- Add index for category lookups
CREATE INDEX IF NOT EXISTS idx_supplier_invoice_templates_category ON supplier_invoice_templates(default_product_category);

View File

@ -15,3 +15,5 @@ PyPDF2==3.0.1
pdfplumber==0.11.4 pdfplumber==0.11.4
pytesseract==0.3.13 pytesseract==0.3.13
Pillow==11.0.0 Pillow==11.0.0
invoice2data==0.4.4
pyyaml==6.0.2

View File

@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""
Backfill quick analysis for existing files
"""
import sys
import asyncio
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.core.database import execute_query, execute_update, init_db
from app.services.ollama_service import ollama_service
async def backfill_quick_analysis():
    """Run quick analysis on all files that don't have it.

    Selects every non-duplicate row of ``incoming_files`` that has a
    ``file_path`` but lacks a detected CVR or document number, extracts the
    file's text, runs ``ollama_service.quick_analysis_on_upload`` and stores
    the detected CVR, vendor id, document type, document number and the
    ``is_own_invoice`` flag on the row.

    Files that are missing on disk or fail to process are counted as
    failures and skipped; a summary is printed at the end.

    Raises:
        Exception: Any error outside the per-file handling (e.g. the initial
            query failing) is printed and re-raised.
    """
    # Initialize database
    init_db()

    try:
        # Get files without quick analysis; tolerate a None result from the helper
        files = execute_query(
            """SELECT file_id, filename, file_path
            FROM incoming_files
            WHERE (detected_cvr IS NULL OR detected_document_number IS NULL)
            AND status NOT IN ('duplicate')
            AND file_path IS NOT NULL
            ORDER BY file_id DESC"""
        ) or []
        print(f"📋 Found {len(files)} files without quick analysis")

        success_count = 0
        fail_count = 0

        for file in files:
            try:
                file_path = Path(file['file_path'])
                if not file_path.exists():
                    print(f"⚠️ File not found: {file_path}")
                    fail_count += 1
                    continue

                print(f"\n🔍 Processing: {file['filename']} (ID: {file['file_id']})")

                # Extract text
                text = await ollama_service._extract_text_from_file(file_path)

                # Run quick analysis
                quick_result = await ollama_service.quick_analysis_on_upload(text)

                # Update database. is_own_invoice is stored as well so that
                # outgoing invoices found by the backfill are flagged, not
                # just given the own CVR in detected_cvr.
                execute_update(
                    """UPDATE incoming_files
                    SET detected_cvr = %s,
                    detected_vendor_id = %s,
                    detected_document_type = %s,
                    detected_document_number = %s,
                    is_own_invoice = %s
                    WHERE file_id = %s""",
                    (quick_result.get('cvr'),
                     quick_result.get('vendor_id'),
                     quick_result.get('document_type'),
                     quick_result.get('document_number'),
                     bool(quick_result.get('is_own_invoice', False)),
                     file['file_id'])
                )

                print(f"✅ Updated: CVR={quick_result.get('cvr')}, "
                      f"Type={quick_result.get('document_type')}, "
                      f"Number={quick_result.get('document_number')}, "
                      f"Vendor={quick_result.get('vendor_name')}")
                success_count += 1
            except Exception as e:
                # Best effort: one bad file must not stop the whole backfill
                print(f"❌ Error processing {file['filename']}: {e}")
                fail_count += 1

        print(f"\n📊 Summary: {success_count} successful, {fail_count} failed")
    except Exception as e:
        print(f"❌ Fatal error: {e}")
        raise
if __name__ == "__main__":
asyncio.run(backfill_quick_analysis())

View File

@ -1,4 +1,4 @@
<!DOCTYPE html> <!DOCTYPE html>
<html lang="en" data-bs-theme="dark"> <html lang="en" data-bs-theme="dark">
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">

86
test_quick_analysis.py Normal file
View File

@ -0,0 +1,86 @@
#!/usr/bin/env python3
"""
Test Quick Analysis on Upload
Tests CVR detection, document type, and invoice number extraction
"""
import asyncio
import sys
from pathlib import Path
# Add app directory to path
sys.path.insert(0, str(Path(__file__).parent / "app"))
from app.services.ollama_service import ollama_service
async def test_quick_analysis():
    """Run quick analysis on two sample texts and check the detected fields.

    Case 1 is an invoice, case 2 a credit note. For each case the detected
    values are printed and the CVR, document type and document number are
    asserted against the expected values.
    """
    # Sample invoice text with CVR
    invoice_text = """
ALSO Danmark A/S
Jupitervej 4
6000 Kolding
CVR-nr.: 35812428
FAKTURA
Faktura nr.: INV-2024-12345
Dato: 2024-12-08
Beløb i alt: 5.965,18 DKK
"""
    # Sample credit note text
    credit_note_text = """
Test Leverandør A/S
CVR: 12345678
KREDITNOTA
Kreditnota nr.: CN-2024-5678
Original faktura: INV-2024-1000
Beløb: -1.234,56 DKK
"""
    # (test number, title, input text, expected CVR, expected type, expected number)
    cases = [
        (1, "Invoice with CVR", invoice_text, '35812428', 'invoice', 'INV-2024-12345'),
        (2, "Credit Note", credit_note_text, '12345678', 'credit_note', 'CN-2024-5678'),
    ]

    print("🧪 Testing Quick Analysis\n")
    print("=" * 60)

    for number, title, sample, expected_cvr, expected_type, expected_number in cases:
        print(f"\n📄 TEST {number}: {title}")
        print("-" * 60)
        outcome = await ollama_service.quick_analysis_on_upload(sample)

        print(f"CVR: {outcome['cvr']}")
        print(f"Document Type: {outcome['document_type']}")
        print(f"Document Number: {outcome['document_number']}")
        print(f"Vendor ID: {outcome['vendor_id']}")
        print(f"Vendor Name: {outcome['vendor_name']}")

        assert outcome['cvr'] == expected_cvr, f"Expected CVR {expected_cvr}, got {outcome['cvr']}"
        assert outcome['document_type'] == expected_type, f"Expected {expected_type}, got {outcome['document_type']}"
        assert outcome['document_number'] == expected_number, f"Expected {expected_number}, got {outcome['document_number']}"
        print(f"✅ Test {number} PASSED")

    print("\n" + "=" * 60)
    print("✅ ALL TESTS PASSED!")
    print("=" * 60)
if __name__ == "__main__":
asyncio.run(test_quick_analysis())