feat: Implement quick analysis on PDF upload for CVR, document type, and number extraction
- Added `check_invoice_number_exists` method in `EconomicService` to verify invoice numbers in e-conomic journals.
- Introduced `quick_analysis_on_upload` method in `OllamaService` for extracting critical fields from uploaded PDFs, including CVR, document type, and document number.
- Created migration script to add new fields for storing detected CVR, vendor ID, document type, and document number in the `incoming_files` table.
- Developed comprehensive tests for the quick analysis functionality, validating CVR detection, document type identification, and invoice number extraction.
This commit is contained in:
parent
890bd6245d
commit
3a8288f5a1
@ -13,6 +13,7 @@ from app.core.config import settings
|
||||
from app.services.economic_service import get_economic_service
|
||||
from app.services.ollama_service import ollama_service
|
||||
from app.services.template_service import template_service
|
||||
from app.services.invoice2data_service import get_invoice2data_service
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
@ -232,15 +233,25 @@ async def get_pending_files():
|
||||
f.error_message,
|
||||
f.template_id,
|
||||
f.file_path,
|
||||
-- Quick analysis results (available immediately on upload)
|
||||
f.detected_cvr,
|
||||
f.detected_vendor_id,
|
||||
f.detected_document_type,
|
||||
f.detected_document_number,
|
||||
f.is_own_invoice,
|
||||
v_detected.name as detected_vendor_name,
|
||||
v_detected.cvr_number as detected_vendor_cvr,
|
||||
-- Get vendor info from latest extraction
|
||||
ext.vendor_name,
|
||||
ext.vendor_cvr,
|
||||
ext.vendor_matched_id,
|
||||
v.name as matched_vendor_name,
|
||||
v.cvr_number as matched_vendor_cvr_number,
|
||||
-- Check if already has invoice via latest extraction only
|
||||
si.id as existing_invoice_id,
|
||||
si.invoice_number as existing_invoice_number
|
||||
FROM incoming_files f
|
||||
LEFT JOIN vendors v_detected ON v_detected.id = f.detected_vendor_id
|
||||
LEFT JOIN LATERAL (
|
||||
SELECT extraction_id, file_id, vendor_name, vendor_cvr, vendor_matched_id
|
||||
FROM extractions
|
||||
@ -250,16 +261,82 @@ async def get_pending_files():
|
||||
) ext ON true
|
||||
LEFT JOIN vendors v ON v.id = ext.vendor_matched_id
|
||||
LEFT JOIN supplier_invoices si ON si.extraction_id = ext.extraction_id
|
||||
WHERE f.status IN ('pending', 'processing', 'failed', 'ai_extracted', 'processed')
|
||||
WHERE f.status IN ('pending', 'processing', 'failed', 'ai_extracted', 'processed', 'duplicate')
|
||||
AND si.id IS NULL -- Only show files without invoice yet
|
||||
ORDER BY f.file_id, f.uploaded_at DESC"""
|
||||
)
|
||||
|
||||
# Convert to regular dicts so we can add new keys
|
||||
files = [dict(file) for file in files] if files else []
|
||||
|
||||
# Check for invoice2data templates for each file
|
||||
try:
|
||||
from app.services.invoice2data_service import get_invoice2data_service
|
||||
invoice2data = get_invoice2data_service()
|
||||
logger.info(f"📋 Checking invoice2data templates: {len(invoice2data.templates)} loaded")
|
||||
|
||||
for file in files:
|
||||
# Check if there's an invoice2data template for this vendor's CVR
|
||||
vendor_cvr = file.get('matched_vendor_cvr_number') or file.get('detected_vendor_cvr') or file.get('vendor_cvr')
|
||||
file['has_invoice2data_template'] = False
|
||||
|
||||
logger.debug(f" File {file['file_id']}: CVR={vendor_cvr}")
|
||||
|
||||
if vendor_cvr:
|
||||
# Check all templates for this CVR in keywords
|
||||
for template_name, template_data in invoice2data.templates.items():
|
||||
keywords = template_data.get('keywords', [])
|
||||
logger.debug(f" Template {template_name}: keywords={keywords}")
|
||||
if str(vendor_cvr) in [str(k) for k in keywords]:
|
||||
file['has_invoice2data_template'] = True
|
||||
file['invoice2data_template_name'] = template_name
|
||||
logger.info(f" ✅ File {file['file_id']} matched template: {template_name}")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to check invoice2data templates: {e}", exc_info=True)
|
||||
# Continue without invoice2data info
|
||||
|
||||
return {"files": files if files else [], "count": len(files) if files else 0}
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to get pending files: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/supplier-invoices/files/{file_id}/pdf-text")
async def get_file_pdf_text(file_id: int):
    """Return the full extracted text of an uploaded file (for the template builder).

    The file's path and name are looked up in ``incoming_files``; the file
    must exist on disk, after which its text is extracted through
    ``ollama_service._extract_text_from_file``.

    Returns:
        A dict with ``file_id``, ``filename`` and ``pdf_text``.

    Raises:
        HTTPException: 404 when the database record or the file on disk is
            missing, 500 for any other failure.
    """
    try:
        from pathlib import Path

        # Look up where the upload was stored.
        record = execute_query(
            "SELECT file_path, filename FROM incoming_files WHERE file_id = %s",
            (file_id,),
            fetchone=True
        )
        if not record:
            raise HTTPException(status_code=404, detail="Fil ikke fundet")

        # The database row can outlive the file itself, so verify it on disk.
        file_path = Path(record['file_path'])
        if not file_path.exists():
            raise HTTPException(status_code=404, detail=f"Fil ikke fundet på disk: {file_path}")

        extracted_text = await ollama_service._extract_text_from_file(file_path)

        response = {
            "file_id": file_id,
            "filename": record['filename'],
            "pdf_text": extracted_text
        }
        return response

    except HTTPException:
        # Deliberate 404s above must reach the client unchanged.
        raise
    except Exception as e:
        logger.error(f"❌ Failed to get PDF text: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/supplier-invoices/files/{file_id}/extracted-data")
|
||||
async def get_file_extracted_data(file_id: int):
|
||||
"""Hent AI-extracted data fra en uploaded fil"""
|
||||
@ -758,8 +835,9 @@ async def create_invoice_from_extraction(file_id: int):
|
||||
|
||||
@router.get("/supplier-invoices/templates")
|
||||
async def list_templates():
|
||||
"""Hent alle templates"""
|
||||
"""Hent alle templates (både database og invoice2data YAML)"""
|
||||
try:
|
||||
# Get database templates
|
||||
query = """
|
||||
SELECT t.*, v.name as vendor_name
|
||||
FROM supplier_invoice_templates t
|
||||
@ -767,9 +845,55 @@ async def list_templates():
|
||||
WHERE t.is_active = true
|
||||
ORDER BY t.created_at DESC
|
||||
"""
|
||||
templates = execute_query(query)
|
||||
db_templates = execute_query(query) or []
|
||||
|
||||
return templates if templates else []
|
||||
# Get invoice2data templates
|
||||
invoice2data_service = get_invoice2data_service()
|
||||
invoice2data_templates = []
|
||||
|
||||
for template_name, template_data in invoice2data_service.templates.items():
|
||||
# Extract vendor CVR from keywords
|
||||
vendor_cvr = None
|
||||
keywords = template_data.get('keywords', [])
|
||||
for keyword in keywords:
|
||||
if isinstance(keyword, str) and keyword.isdigit() and len(keyword) == 8:
|
||||
vendor_cvr = keyword
|
||||
break
|
||||
|
||||
# Get vendor info from database if CVR found
|
||||
vendor_name = template_data.get('issuer', 'Ukendt')
|
||||
vendor_id = None
|
||||
if vendor_cvr:
|
||||
vendor = execute_query(
|
||||
"SELECT id, name FROM vendors WHERE cvr_number = %s",
|
||||
(vendor_cvr,),
|
||||
fetchone=True
|
||||
)
|
||||
if vendor:
|
||||
vendor_id = vendor['id']
|
||||
vendor_name = vendor['name']
|
||||
|
||||
invoice2data_templates.append({
|
||||
'template_id': -1, # Negative ID to distinguish from DB templates
|
||||
'template_name': f"Invoice2Data: {template_name}",
|
||||
'template_type': 'invoice2data',
|
||||
'yaml_filename': template_name,
|
||||
'vendor_id': vendor_id,
|
||||
'vendor_name': vendor_name,
|
||||
'vendor_cvr': vendor_cvr,
|
||||
'default_product_category': template_data.get('default_product_category', 'varesalg'),
|
||||
'default_product_group_number': template_data.get('default_product_group_number', 1),
|
||||
'usage_count': 0, # Could track this separately
|
||||
'is_active': True,
|
||||
'detection_patterns': keywords,
|
||||
'field_mappings': template_data.get('fields', {}),
|
||||
'created_at': None
|
||||
})
|
||||
|
||||
# Combine both types
|
||||
all_templates = db_templates + invoice2data_templates
|
||||
|
||||
return all_templates
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to list templates: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@ -978,6 +1102,7 @@ async def create_template(request: Dict):
|
||||
template_name = request.get('template_name')
|
||||
detection_patterns = request.get('detection_patterns', [])
|
||||
field_mappings = request.get('field_mappings', {})
|
||||
default_product_category = request.get('default_product_category', 'varesalg')
|
||||
|
||||
if not vendor_id or not template_name:
|
||||
raise HTTPException(status_code=400, detail="vendor_id og template_name er påkrævet")
|
||||
@ -996,11 +1121,11 @@ async def create_template(request: Dict):
|
||||
# Insert template and get template_id
|
||||
query = """
|
||||
INSERT INTO supplier_invoice_templates
|
||||
(vendor_id, template_name, detection_patterns, field_mappings)
|
||||
VALUES (%s, %s, %s, %s)
|
||||
(vendor_id, template_name, detection_patterns, field_mappings, default_product_category)
|
||||
VALUES (%s, %s, %s, %s, %s)
|
||||
RETURNING template_id
|
||||
"""
|
||||
result = execute_query(query, (vendor_id, template_name, json.dumps(detection_patterns), json.dumps(field_mappings)))
|
||||
result = execute_query(query, (vendor_id, template_name, json.dumps(detection_patterns), json.dumps(field_mappings), default_product_category))
|
||||
template_id = result[0]['template_id'] if result else None
|
||||
|
||||
if not template_id:
|
||||
@ -1657,6 +1782,97 @@ async def upload_supplier_invoice(file: UploadFile = File(...)):
|
||||
logger.info(f"📄 Extracting text from {final_path.suffix}...")
|
||||
text = await ollama_service._extract_text_from_file(final_path)
|
||||
|
||||
# QUICK ANALYSIS: Extract CVR, document type, invoice number IMMEDIATELY
|
||||
logger.info(f"⚡ Running quick analysis...")
|
||||
quick_result = await ollama_service.quick_analysis_on_upload(text)
|
||||
|
||||
# Update file record with quick analysis results
|
||||
execute_update(
|
||||
"""UPDATE incoming_files
|
||||
SET detected_cvr = %s,
|
||||
detected_vendor_id = %s,
|
||||
detected_document_type = %s,
|
||||
detected_document_number = %s,
|
||||
is_own_invoice = %s
|
||||
WHERE file_id = %s""",
|
||||
(quick_result.get('cvr'),
|
||||
quick_result.get('vendor_id'),
|
||||
quick_result.get('document_type'),
|
||||
quick_result.get('document_number'),
|
||||
quick_result.get('is_own_invoice', False),
|
||||
file_id)
|
||||
)
|
||||
|
||||
logger.info(f"📋 Quick analysis saved: CVR={quick_result.get('cvr')}, "
|
||||
f"Vendor={quick_result.get('vendor_name')}, "
|
||||
f"Type={quick_result.get('document_type')}, "
|
||||
f"Number={quick_result.get('document_number')}")
|
||||
|
||||
# DUPLICATE CHECK: Check if invoice number already exists
|
||||
document_number = quick_result.get('document_number')
|
||||
if document_number:
|
||||
logger.info(f"🔍 Checking for duplicate invoice number: {document_number}")
|
||||
|
||||
# Check 1: Search in local database (supplier_invoices table)
|
||||
existing_invoice = execute_query(
|
||||
"""SELECT si.id, si.invoice_number, si.created_at, v.name as vendor_name
|
||||
FROM supplier_invoices si
|
||||
LEFT JOIN vendors v ON v.id = si.vendor_id
|
||||
WHERE si.invoice_number = %s
|
||||
ORDER BY si.created_at DESC
|
||||
LIMIT 1""",
|
||||
(document_number,),
|
||||
fetchone=True
|
||||
)
|
||||
|
||||
if existing_invoice:
|
||||
# DUPLICATE FOUND IN DATABASE
|
||||
logger.error(f"🚫 DUPLICATE: Invoice {document_number} already exists in database (ID: {existing_invoice['id']})")
|
||||
|
||||
# Mark file as duplicate
|
||||
execute_update(
|
||||
"""UPDATE incoming_files
|
||||
SET status = 'duplicate',
|
||||
error_message = %s,
|
||||
processed_at = CURRENT_TIMESTAMP
|
||||
WHERE file_id = %s""",
|
||||
(f"DUBLET: Fakturanummer {document_number} findes allerede i systemet (Faktura #{existing_invoice['id']}, {existing_invoice['vendor_name'] or 'Ukendt leverandør'})",
|
||||
file_id)
|
||||
)
|
||||
|
||||
raise HTTPException(
|
||||
status_code=409, # 409 Conflict
|
||||
detail=f"🚫 DUBLET: Fakturanummer {document_number} findes allerede i systemet (Faktura #{existing_invoice['id']}, oprettet {existing_invoice['created_at'].strftime('%d-%m-%Y')})"
|
||||
)
|
||||
|
||||
# Check 2: Search in e-conomic (if configured)
|
||||
from app.services.economic_service import economic_service
|
||||
if hasattr(economic_service, 'app_secret_token') and economic_service.app_secret_token:
|
||||
logger.info(f"🔍 Checking e-conomic for invoice number: {document_number}")
|
||||
economic_duplicate = await economic_service.check_invoice_number_exists(document_number)
|
||||
|
||||
if economic_duplicate:
|
||||
# DUPLICATE FOUND IN E-CONOMIC
|
||||
logger.error(f"🚫 DUPLICATE: Invoice {document_number} found in e-conomic (Voucher #{economic_duplicate.get('voucher_number')})")
|
||||
|
||||
# Mark file as duplicate
|
||||
execute_update(
|
||||
"""UPDATE incoming_files
|
||||
SET status = 'duplicate',
|
||||
error_message = %s,
|
||||
processed_at = CURRENT_TIMESTAMP
|
||||
WHERE file_id = %s""",
|
||||
(f"DUBLET: Fakturanummer {document_number} findes i e-conomic (Bilag #{economic_duplicate.get('voucher_number')})",
|
||||
file_id)
|
||||
)
|
||||
|
||||
raise HTTPException(
|
||||
status_code=409, # 409 Conflict
|
||||
detail=f"🚫 DUBLET: Fakturanummer {document_number} findes i e-conomic (Bilag #{economic_duplicate.get('voucher_number')}, {economic_duplicate.get('date')})"
|
||||
)
|
||||
|
||||
logger.info(f"✅ No duplicate found for invoice {document_number}")
|
||||
|
||||
# Try template matching
|
||||
logger.info(f"📋 Matching template...")
|
||||
template_id, confidence = template_service.match_template(text)
|
||||
@ -1699,7 +1915,8 @@ async def upload_supplier_invoice(file: UploadFile = File(...)):
|
||||
"""INSERT INTO extraction_lines
|
||||
(extraction_id, line_number, description, quantity, unit_price,
|
||||
line_total, vat_rate, vat_note, confidence)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||
RETURNING line_id""",
|
||||
(extraction_id, idx, line.get('description'),
|
||||
line.get('quantity'), line.get('unit_price'),
|
||||
line.get('line_total'), line.get('vat_rate'),
|
||||
@ -1744,13 +1961,41 @@ async def upload_supplier_invoice(file: UploadFile = File(...)):
|
||||
"confidence": confidence,
|
||||
"extracted_fields": extracted_fields,
|
||||
"pdf_text": text[:500], # First 500 chars for reference
|
||||
# Quick analysis results (available IMMEDIATELY on upload)
|
||||
"quick_analysis": {
|
||||
"cvr": quick_result.get('cvr'),
|
||||
"vendor_id": quick_result.get('vendor_id'),
|
||||
"vendor_name": quick_result.get('vendor_name'),
|
||||
"document_type": quick_result.get('document_type'),
|
||||
"document_number": quick_result.get('document_number')
|
||||
},
|
||||
"message": "Upload gennemført - gennemgå og bekræft data"
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
except HTTPException as he:
|
||||
# Mark file as failed if we have file_id
|
||||
if 'file_id' in locals():
|
||||
execute_update(
|
||||
"""UPDATE incoming_files
|
||||
SET status = 'failed',
|
||||
error_message = %s,
|
||||
processed_at = CURRENT_TIMESTAMP
|
||||
WHERE file_id = %s""",
|
||||
(str(he.detail), file_id)
|
||||
)
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Upload failed (inner): {e}", exc_info=True)
|
||||
# Mark file as failed if we have file_id
|
||||
if 'file_id' in locals():
|
||||
execute_update(
|
||||
"""UPDATE incoming_files
|
||||
SET status = 'failed',
|
||||
error_message = %s,
|
||||
processed_at = CURRENT_TIMESTAMP
|
||||
WHERE file_id = %s""",
|
||||
(str(e), file_id)
|
||||
)
|
||||
raise HTTPException(status_code=500, detail=f"Upload fejlede: {str(e)}")
|
||||
|
||||
except HTTPException:
|
||||
@ -1809,51 +2054,174 @@ async def reprocess_uploaded_file(file_id: int):
|
||||
logger.info(f"✅ Matched template {template_id} ({confidence:.0%})")
|
||||
extracted_fields = template_service.extract_fields(text, template_id)
|
||||
|
||||
# Check if this is an invoice2data template (ID -1)
|
||||
is_invoice2data = (template_id == -1)
|
||||
|
||||
if is_invoice2data:
|
||||
# Invoice2data doesn't have vendor in cache
|
||||
logger.info(f"📋 Using invoice2data template")
|
||||
# Try to find vendor from extracted CVR
|
||||
if extracted_fields.get('vendor_vat'):
|
||||
vendor = execute_query(
|
||||
"SELECT id FROM vendors WHERE cvr_number = %s",
|
||||
(extracted_fields['vendor_vat'],),
|
||||
fetchone=True
|
||||
)
|
||||
if vendor:
|
||||
vendor_id = vendor['id']
|
||||
|
||||
# Store invoice2data extraction in database
|
||||
extraction_id = execute_insert(
|
||||
"""INSERT INTO extractions
|
||||
(file_id, vendor_matched_id, vendor_name, vendor_cvr,
|
||||
document_id, document_date, due_date, document_type, document_type_detected,
|
||||
total_amount, currency, confidence, llm_response_json, status)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||
RETURNING extraction_id""",
|
||||
(file_id, vendor_id,
|
||||
extracted_fields.get('issuer'), # vendor_name
|
||||
extracted_fields.get('vendor_vat'), # vendor_cvr
|
||||
str(extracted_fields.get('invoice_number')), # document_id
|
||||
extracted_fields.get('invoice_date'), # document_date
|
||||
extracted_fields.get('due_date'),
|
||||
'invoice', # document_type
|
||||
'invoice', # document_type_detected
|
||||
extracted_fields.get('amount_total'),
|
||||
extracted_fields.get('currency', 'DKK'),
|
||||
1.0, # invoice2data always 100% confidence
|
||||
json.dumps(extracted_fields), # llm_response_json
|
||||
'extracted') # status
|
||||
)
|
||||
|
||||
# Insert line items if extracted
|
||||
if extracted_fields.get('lines'):
|
||||
for idx, line in enumerate(extracted_fields['lines'], start=1):
|
||||
execute_insert(
|
||||
"""INSERT INTO extraction_lines
|
||||
(extraction_id, line_number, description, quantity, unit_price,
|
||||
line_total, vat_rate, vat_note, confidence,
|
||||
ip_address, contract_number, location_street, location_zip, location_city)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||
RETURNING line_id""",
|
||||
(extraction_id, idx, line.get('description'),
|
||||
line.get('quantity'), line.get('unit_price'),
|
||||
line.get('line_total'), None, None, 1.0,
|
||||
line.get('ip_address'), line.get('contract_number'),
|
||||
line.get('location_street'), line.get('location_zip'), line.get('location_city'))
|
||||
)
|
||||
logger.info(f"✅ Saved {len(extracted_fields['lines'])} line items")
|
||||
else:
|
||||
# Custom template from database
|
||||
template = template_service.templates_cache.get(template_id)
|
||||
if template:
|
||||
vendor_id = template.get('vendor_id')
|
||||
|
||||
template_service.log_usage(template_id, file_id, True, confidence, extracted_fields)
|
||||
|
||||
# Update file - use NULL for invoice2data templates to avoid FK constraint
|
||||
db_template_id = None if is_invoice2data else template_id
|
||||
execute_update(
|
||||
"""UPDATE incoming_files
|
||||
SET status = 'processed', template_id = %s, processed_at = CURRENT_TIMESTAMP
|
||||
WHERE file_id = %s""",
|
||||
(template_id, file_id)
|
||||
(db_template_id, file_id)
|
||||
)
|
||||
else:
|
||||
# NO AI FALLBACK - Require template matching
|
||||
logger.warning(f"⚠️ Ingen template match (confidence: {confidence:.0%}) - afviser fil")
|
||||
# FALLBACK TO AI EXTRACTION
|
||||
logger.info(f"⚠️ Ingen template match (confidence: {confidence:.0%}) - bruger AI extraction")
|
||||
|
||||
# Use detected vendor from quick analysis if available
|
||||
vendor_id = file_record.get('detected_vendor_id')
|
||||
|
||||
# Call Ollama for full extraction
|
||||
logger.info(f"🤖 Calling Ollama for AI extraction...")
|
||||
llm_result = await ollama_service.extract_from_text(text)
|
||||
|
||||
if not llm_result or 'error' in llm_result:
|
||||
error_msg = llm_result.get('error') if llm_result else 'AI extraction fejlede'
|
||||
logger.error(f"❌ AI extraction failed: {error_msg}")
|
||||
|
||||
execute_update(
|
||||
"""UPDATE incoming_files
|
||||
SET status = 'failed',
|
||||
error_message = 'Ingen template match - opret template for denne leverandør',
|
||||
error_message = %s,
|
||||
processed_at = CURRENT_TIMESTAMP
|
||||
WHERE file_id = %s""",
|
||||
(f"AI extraction fejlede: {error_msg}", file_id)
|
||||
)
|
||||
|
||||
raise HTTPException(status_code=500, detail=f"AI extraction fejlede: {error_msg}")
|
||||
|
||||
extracted_fields = llm_result
|
||||
confidence = llm_result.get('confidence', 0.75)
|
||||
|
||||
# Store AI extracted data in extractions table
|
||||
extraction_id = execute_insert(
|
||||
"""INSERT INTO supplier_invoice_extractions
|
||||
(file_id, vendor_id, invoice_number, invoice_date, due_date,
|
||||
total_amount, currency, document_type, confidence, llm_data)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING extraction_id""",
|
||||
(file_id, vendor_id,
|
||||
llm_result.get('invoice_number'),
|
||||
llm_result.get('invoice_date'),
|
||||
llm_result.get('due_date'),
|
||||
llm_result.get('total_amount'),
|
||||
llm_result.get('currency', 'DKK'),
|
||||
llm_result.get('document_type'),
|
||||
confidence,
|
||||
json.dumps(llm_result))
|
||||
)
|
||||
|
||||
# Insert line items if extracted
|
||||
if llm_result.get('lines'):
|
||||
for idx, line in enumerate(llm_result['lines'], start=1):
|
||||
execute_insert(
|
||||
"""INSERT INTO extraction_lines
|
||||
(extraction_id, line_number, description, quantity, unit_price,
|
||||
line_total, vat_rate, vat_note, confidence)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||
RETURNING line_id""",
|
||||
(extraction_id, idx, line.get('description'),
|
||||
line.get('quantity'), line.get('unit_price'),
|
||||
line.get('line_total'), line.get('vat_rate'),
|
||||
line.get('vat_note'), confidence)
|
||||
)
|
||||
|
||||
# Update file status to ai_extracted
|
||||
execute_update(
|
||||
"""UPDATE incoming_files
|
||||
SET status = 'ai_extracted', processed_at = CURRENT_TIMESTAMP
|
||||
WHERE file_id = %s""",
|
||||
(file_id,)
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "failed",
|
||||
"file_id": file_id,
|
||||
"error": "Ingen template match - opret template for denne leverandør",
|
||||
"confidence": confidence
|
||||
}
|
||||
logger.info(f"✅ AI extraction completed for file {file_id}")
|
||||
|
||||
# Return success with template data
|
||||
return {
|
||||
# Return success with template data or AI extraction result
|
||||
result = {
|
||||
"status": "success",
|
||||
"file_id": file_id,
|
||||
"filename": file_record['filename'],
|
||||
"template_matched": template_id is not None,
|
||||
"template_id": template_id,
|
||||
"vendor_id": vendor_id,
|
||||
"confidence": confidence if template_id else 0.8,
|
||||
"confidence": confidence if template_id else llm_result.get('confidence', 0.75),
|
||||
"extracted_fields": extracted_fields,
|
||||
"pdf_text": text[:1000] if not template_id else text
|
||||
}
|
||||
|
||||
# Add warning if no template exists
|
||||
if not template_id and vendor_id:
|
||||
vendor = execute_query(
|
||||
"SELECT name FROM vendors WHERE id = %s",
|
||||
(vendor_id,),
|
||||
fetchone=True
|
||||
)
|
||||
if vendor:
|
||||
result["warning"] = f"⚠️ Ingen template fundet for {vendor['name']} - brugte AI extraction (langsommere)"
|
||||
|
||||
return result
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
@ -1866,6 +2234,7 @@ async def update_template(
|
||||
template_name: Optional[str] = None,
|
||||
detection_patterns: Optional[List[Dict]] = None,
|
||||
field_mappings: Optional[Dict] = None,
|
||||
default_product_category: Optional[str] = None,
|
||||
is_active: Optional[bool] = None
|
||||
):
|
||||
"""Opdater eksisterende template"""
|
||||
@ -1884,6 +2253,9 @@ async def update_template(
|
||||
if field_mappings is not None:
|
||||
updates.append("field_mappings = %s")
|
||||
params.append(json.dumps(field_mappings))
|
||||
if default_product_category is not None:
|
||||
updates.append("default_product_category = %s")
|
||||
params.append(default_product_category)
|
||||
if is_active is not None:
|
||||
updates.append("is_active = %s")
|
||||
params.append(is_active)
|
||||
@ -1911,6 +2283,114 @@ async def update_template(
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/supplier-invoices/templates/invoice2data/{template_name}/test")
async def test_invoice2data_template(template_name: str, request: Dict):
    """
    Test an invoice2data YAML template against PDF text.

    Request body:
    {
        "pdf_text": "Full PDF text content..."
    }

    Returns a dict with ``matched``, ``confidence``, ``extracted_fields``,
    ``line_items``, ``detection_results`` and ``template_name`` (plus
    ``error`` when the template did not match) - intended to mirror the
    response of the test_template endpoint.

    Raises:
        HTTPException: 400 when ``pdf_text`` is missing, 404 when the
            template is unknown, 500 for any other failure.
    """

    def _detect_keywords(keywords, text):
        # One entry per template keyword: is it present (case-insensitive) in the text?
        return [
            {
                "pattern": str(keyword),
                "type": "keyword",
                "found": str(keyword).lower() in text.lower(),
                "weight": 0.5
            }
            for keyword in keywords
        ]

    def _to_line_item(line):
        # Normalise one extracted line; missing fields become empty strings.
        item = {key: line.get(key, '') for key in ("line_number", "item_number")}
        # Prefer the raw description and fall back to the cleaned one.
        item["description"] = line.get('description_raw', '') or line.get('description', '')
        for key in ("quantity", "unit_price", "line_total"):
            item[key] = line.get(key, '')
        # Context fields (circuit/location info)
        for key in ("circuit_id", "ip_address", "contract_number",
                    "location_street", "location_zip", "location_city"):
            item[key] = line.get(key, '')
        return item

    try:
        pdf_text = request.get('pdf_text', '')
        if not pdf_text:
            raise HTTPException(status_code=400, detail="pdf_text er påkrævet")

        service = get_invoice2data_service()
        if template_name not in service.templates:
            raise HTTPException(status_code=404, detail=f"Template '{template_name}' ikke fundet")

        template_data = service.templates[template_name]

        # Run the actual extraction with the requested template.
        result = service.extract_with_template(pdf_text, template_name)

        if not result:
            # No match: still report which keywords were found to help debugging.
            return {
                "matched": False,
                "confidence": 0.0,
                "extracted_fields": {},
                "line_items": [],
                "detection_results": _detect_keywords(template_data.get('keywords', []), pdf_text),
                "template_name": template_name,
                "error": "Template matchede ikke PDF'en"
            }

        line_items = [_to_line_item(line) for line in result['lines']] if 'lines' in result else []

        keywords = template_data.get('keywords', [])
        detection_results = _detect_keywords(keywords, pdf_text)
        hits = sum(1 for entry in detection_results if entry["found"])

        # Share of keywords found; a template without keywords counts as fully matched.
        confidence = hits / len(keywords) if keywords else 1.0

        # 'lines' is reported separately as line_items, so leave it out here.
        extracted_fields = {name: value for name, value in result.items() if name != 'lines'}

        return {
            "matched": True,
            "confidence": confidence,
            "extracted_fields": extracted_fields,
            "line_items": line_items,
            "detection_results": detection_results,
            "template_name": template_name
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Invoice2data template test failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/supplier-invoices/templates/{template_id}/test")
|
||||
async def test_template(template_id: int, request: Dict):
|
||||
"""
|
||||
@ -2076,6 +2556,102 @@ async def test_template(template_id: int, request: Dict):
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.put("/supplier-invoices/templates/invoice2data/{template_name}/category")
async def update_yaml_category(template_name: str, request: Dict):
    """
    Update ``default_product_category`` in an invoice2data YAML template file.

    Request body:
    {
        "category": "drift"  // varesalg, drift, anlæg, abonnement, lager, udlejning
    }

    Returns:
        A dict with ``message``, ``template_name`` and ``new_category``.

    Raises:
        HTTPException: 400 when the category is missing/invalid or the
            template name is not a plain file name, 404 when the YAML file
            does not exist, 500 when the YAML content is not a mapping or
            on any other failure.
    """
    try:
        import yaml
        from pathlib import Path

        new_category = request.get('category')
        if not new_category:
            raise HTTPException(status_code=400, detail="category er påkrævet")

        # Validate category
        valid_categories = ['varesalg', 'drift', 'anlæg', 'abonnement', 'lager', 'udlejning']
        if new_category not in valid_categories:
            raise HTTPException(status_code=400, detail=f"Ugyldig kategori. Skal være en af: {', '.join(valid_categories)}")

        # template_name comes from the URL and ends up in a path we WRITE to,
        # so only accept a plain file name (no directory separators / NUL).
        if any(ch in template_name for ch in ('/', '\\', '\0')):
            raise HTTPException(status_code=400, detail=f"Ugyldigt template navn: {template_name}")

        # Find YAML file
        templates_dir = Path(__file__).parent.parent.parent.parent / 'data' / 'invoice_templates'
        yaml_file = templates_dir / f"{template_name}.yml"

        if not yaml_file.exists():
            raise HTTPException(status_code=404, detail=f"YAML fil ikke fundet: {template_name}.yml")

        # Load YAML
        with open(yaml_file, 'r', encoding='utf-8') as f:
            template_data = yaml.safe_load(f)

        # An empty file loads as None and a scalar/list cannot take a key;
        # fail with a clear message instead of an opaque TypeError.
        if not isinstance(template_data, dict):
            raise HTTPException(status_code=500, detail=f"YAML fil har ugyldigt format: {template_name}.yml")

        # Update category
        template_data['default_product_category'] = new_category

        # Serialize BEFORE opening the file for writing, so a serialization
        # error cannot leave a truncated template behind.
        # NOTE: yaml.dump rewrites the whole document; key order is kept
        # (sort_keys=False) but comments and original layout are not.
        serialized = yaml.dump(template_data, default_flow_style=False, allow_unicode=True, sort_keys=False)
        with open(yaml_file, 'w', encoding='utf-8') as f:
            f.write(serialized)

        # Reload invoice2data service to pick up changes
        # NOTE(review): re-running __init__ on the instance is assumed to
        # reload the templates - confirm against the service implementation.
        invoice2data_service = get_invoice2data_service()
        invoice2data_service.__init__()  # Reinitialize to reload templates

        logger.info(f"✅ Updated category for {template_name}.yml to {new_category}")

        return {
            "message": "Kategori opdateret",
            "template_name": template_name,
            "new_category": new_category
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Failed to update YAML category: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/supplier-invoices/templates/invoice2data/{template_name}/content")
async def get_yaml_content(template_name: str):
    """
    Return the raw YAML content of an invoice2data template file.

    Returns:
    {
        "template_name": "...",
        "filename": "<template_name>.yml",
        "content": "issuer: DCS ApS\\nkeywords: ..."
    }

    Raises:
        HTTPException: 400 when the template name is not a plain file name,
            404 when the template file does not exist, 500 for any other
            failure.
    """
    try:
        from pathlib import Path

        # template_name comes from the URL and is used to build a file path,
        # so only accept a plain file name (no directory separators / NUL).
        if any(ch in template_name for ch in ('/', '\\', '\0')):
            raise HTTPException(status_code=400, detail=f"Ugyldigt template navn: {template_name}")

        # Find template file
        # NOTE(review): this path is relative to the process working
        # directory, while update_yaml_category anchors the same directory
        # on __file__ - confirm both resolve to the same location.
        template_dir = Path("data/invoice_templates")
        template_file = template_dir / f"{template_name}.yml"

        if not template_file.exists():
            raise HTTPException(status_code=404, detail=f"Template fil ikke fundet: {template_name}.yml")

        # Read file content
        content = template_file.read_text(encoding='utf-8')

        return {
            "template_name": template_name,
            "filename": f"{template_name}.yml",
            "content": content
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Failed to read YAML content: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.delete("/supplier-invoices/templates/{template_id}")
|
||||
async def delete_template(template_id: int):
|
||||
"""Slet template (soft delete - sæt is_active=false)"""
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -127,6 +127,11 @@
|
||||
<div class="row" id="filesList">
|
||||
<!-- Files loaded dynamically -->
|
||||
</div>
|
||||
<div class="mt-3 text-end">
|
||||
<button class="btn btn-outline-secondary" onclick="skipFileSelection()">
|
||||
Spring over <i class="bi bi-arrow-right ms-2"></i>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@ -156,6 +161,18 @@
|
||||
<input type="text" class="form-control" id="templateName" placeholder="F.eks. 'BMC Standard Faktura'" required>
|
||||
<small class="text-muted">Navn på templaten, f.eks. leverandør + "Standard" eller "Email faktura"</small>
|
||||
</div>
|
||||
<div class="mb-3">
|
||||
<label class="form-label">Produktkategori <span class="text-danger">*</span></label>
|
||||
<select class="form-select" id="productCategory" required>
|
||||
<option value="varesalg">🛒 Varesalg (videresalg af hardware)</option>
|
||||
<option value="drift">🔧 Drift (internet, hosting, cloud services)</option>
|
||||
<option value="anlæg">🏗️ Anlæg (investeringer, infrastruktur)</option>
|
||||
<option value="abonnement">📅 Abonnement (løbende services)</option>
|
||||
<option value="lager">📦 Lager (lagervarer)</option>
|
||||
<option value="udlejning">🏪 Udlejning</option>
|
||||
</select>
|
||||
<small class="text-muted">Standardkategori for varelinjer fra denne leverandør</small>
|
||||
</div>
|
||||
<button class="btn btn-primary" onclick="validateAndNextStep(3)">
|
||||
Næste <i class="bi bi-arrow-right ms-2"></i>
|
||||
</button>
|
||||
@ -462,6 +479,137 @@ document.addEventListener('DOMContentLoaded', async () => {
|
||||
} else {
|
||||
await loadPendingFiles();
|
||||
await loadVendors();
|
||||
|
||||
// Check if we're creating a template for a specific vendor/file
|
||||
const vendorIdParam = urlParams.get('vendor');
|
||||
const fileIdParam = urlParams.get('file');
|
||||
|
||||
// Check for sessionStorage data (from supplier invoices page)
|
||||
const storedData = sessionStorage.getItem('templateCreateData');
|
||||
let targetFileId = fileIdParam;
|
||||
let targetVendorId = vendorIdParam;
|
||||
let targetFileName = null;
|
||||
let targetPdfText = null;
|
||||
|
||||
if (storedData) {
|
||||
try {
|
||||
const data = JSON.parse(storedData);
|
||||
console.log('🔄 Loaded template creation data from sessionStorage:', data);
|
||||
|
||||
// Override with sessionStorage if available
|
||||
if (data.fileId) targetFileId = data.fileId;
|
||||
if (data.vendorId) targetVendorId = data.vendorId;
|
||||
if (data.pdfText) targetPdfText = data.pdfText;
|
||||
targetFileName = data.fileName || data.vendorName || targetFileName;
|
||||
|
||||
// Clear sessionStorage after use
|
||||
sessionStorage.removeItem('templateCreateData');
|
||||
} catch (error) {
|
||||
console.error('Failed to parse template creation data:', error);
|
||||
}
|
||||
}
|
||||
|
||||
// If we have PDF text from sessionStorage, skip file selection
|
||||
if (targetPdfText && targetVendorId && targetFileId) {
|
||||
console.log('🚀 Fast-track: Using PDF text from sessionStorage');
|
||||
|
||||
// Set up the file data directly
|
||||
currentFile = {
|
||||
file_id: targetFileId,
|
||||
filename: targetFileName || `File ${targetFileId}`,
|
||||
text: targetPdfText
|
||||
};
|
||||
pdfText = targetPdfText;
|
||||
|
||||
// Wait for vendors to load
|
||||
setTimeout(() => {
|
||||
// Pre-select vendor
|
||||
const vendorSelect = document.getElementById('vendorSelect');
|
||||
if (vendorSelect) {
|
||||
vendorSelect.value = targetVendorId;
|
||||
console.log('✅ Vendor pre-selected:', targetVendorId);
|
||||
}
|
||||
|
||||
// Auto-generate template name
|
||||
const templateNameInput = document.getElementById('templateName');
|
||||
if (templateNameInput && !templateNameInput.value) {
|
||||
const vendorName = vendorSelect?.options[vendorSelect.selectedIndex]?.text || 'Template';
|
||||
templateNameInput.value = `${vendorName} Standard Template`;
|
||||
console.log('✅ Template name generated:', templateNameInput.value);
|
||||
}
|
||||
|
||||
// Show PDF preview in step 2
|
||||
document.getElementById('pdfPreview2').textContent = pdfText;
|
||||
|
||||
// Go directly to step 2
|
||||
console.log('🎯 Jumping to step 2 (vendor & template name)');
|
||||
nextStep(2);
|
||||
|
||||
// After a moment, auto-advance to step 3
|
||||
setTimeout(() => {
|
||||
console.log('🚀 Auto-advancing to step 3 (pattern definition)');
|
||||
validateAndNextStep(3);
|
||||
}, 500);
|
||||
|
||||
}, 500);
|
||||
}
|
||||
// If we have a target file but no PDF text, try to select from pending list
|
||||
else if (targetFileId) {
|
||||
console.log(`🎯 Auto-selecting file ${targetFileId} (${targetFileName || 'unknown'})`);
|
||||
|
||||
// Wait for files to load, then auto-select
|
||||
setTimeout(async () => {
|
||||
try {
|
||||
// First check if file exists in the loaded files
|
||||
const filesList = document.getElementById('filesList');
|
||||
console.log('📋 Files list HTML:', filesList.innerHTML.substring(0, 200));
|
||||
|
||||
// Try to select the file
|
||||
console.log('🔄 Calling selectFile...');
|
||||
await selectFile(parseInt(targetFileId), targetFileName || `File ${targetFileId}`);
|
||||
console.log('✅ selectFile completed');
|
||||
|
||||
// After file is selected, pre-select vendor if available
|
||||
if (targetVendorId) {
|
||||
console.log(`🎯 Pre-selecting vendor ${targetVendorId}`);
|
||||
|
||||
// Wait a bit for step 2 to render
|
||||
setTimeout(() => {
|
||||
const vendorSelect = document.getElementById('vendorSelect');
|
||||
if (!vendorSelect) {
|
||||
console.error('❌ vendorSelect not found!');
|
||||
return;
|
||||
}
|
||||
|
||||
vendorSelect.value = targetVendorId;
|
||||
console.log('✅ Vendor selected:', vendorSelect.value);
|
||||
|
||||
// If both file and vendor are set, auto-advance to step 3
|
||||
setTimeout(() => {
|
||||
const templateNameInput = document.getElementById('templateName');
|
||||
if (!templateNameInput) {
|
||||
console.error('❌ templateName input not found!');
|
||||
return;
|
||||
}
|
||||
|
||||
if (!templateNameInput.value) {
|
||||
// Auto-generate template name if empty
|
||||
const vendorName = vendorSelect.options[vendorSelect.selectedIndex]?.text || 'Template';
|
||||
templateNameInput.value = `${vendorName} Standard Template`;
|
||||
console.log('✅ Template name set:', templateNameInput.value);
|
||||
}
|
||||
|
||||
console.log('🚀 Auto-advancing to step 3 (pattern definition)');
|
||||
validateAndNextStep(3);
|
||||
}, 300);
|
||||
}, 300);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('❌ Failed to auto-select file:', error);
|
||||
alert('Kunne ikke auto-vælge fil: ' + error.message);
|
||||
}
|
||||
}, 1000); // Increased timeout to 1 second
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@ -498,6 +646,11 @@ async function loadExistingTemplate(templateId) {
|
||||
await loadVendors();
|
||||
document.getElementById('vendorSelect').value = template.vendor_id;
|
||||
|
||||
// Set product category
|
||||
if (template.default_product_category) {
|
||||
document.getElementById('productCategory').value = template.default_product_category;
|
||||
}
|
||||
|
||||
// Load detection patterns
|
||||
detectionPatterns = template.detection_patterns || [];
|
||||
|
||||
@ -727,28 +880,61 @@ async function loadVendors() {
|
||||
|
||||
async function selectFile(fileId, filename) {
|
||||
try {
|
||||
// Reprocess file to get PDF text
|
||||
const response = await fetch(`/api/v1/supplier-invoices/reprocess/${fileId}`, {
|
||||
method: 'POST'
|
||||
});
|
||||
console.log(`🔄 Selecting file: ${fileId} (${filename})`);
|
||||
|
||||
// Get PDF text directly (fast endpoint, no AI processing)
|
||||
console.log(`📡 Fetching: /api/v1/supplier-invoices/files/${fileId}/pdf-text`);
|
||||
const response = await fetch(`/api/v1/supplier-invoices/files/${fileId}/pdf-text`);
|
||||
|
||||
console.log(`📥 Response status: ${response.status}`);
|
||||
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text();
|
||||
console.error(`❌ HTTP error: ${response.status} - ${errorText}`);
|
||||
throw new Error(`HTTP ${response.status}: ${errorText}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
console.log('📦 Response data:', data);
|
||||
|
||||
if (!data.pdf_text) {
|
||||
console.warn('⚠️ No PDF text in response');
|
||||
}
|
||||
|
||||
currentFile = {
|
||||
file_id: fileId,
|
||||
filename: filename,
|
||||
text: data.pdf_text
|
||||
text: data.pdf_text || ''
|
||||
};
|
||||
|
||||
pdfText = data.pdf_text;
|
||||
pdfText = data.pdf_text || '';
|
||||
|
||||
console.log(`✅ File loaded, PDF text length: ${pdfText.length} chars`);
|
||||
|
||||
// Show PDF preview
|
||||
document.getElementById('pdfPreview').textContent = pdfText;
|
||||
const pdfPreview = document.getElementById('pdfPreview');
|
||||
if (pdfPreview) {
|
||||
pdfPreview.textContent = pdfText;
|
||||
}
|
||||
|
||||
console.log('🚀 Advancing to step 2');
|
||||
nextStep(2);
|
||||
|
||||
} catch (error) {
|
||||
console.error('❌ Failed to load file:', error);
|
||||
alert('Kunne ikke hente fil: ' + error.message);
|
||||
}
|
||||
}
|
||||
|
||||
function skipFileSelection() {
|
||||
// Allow user to proceed without selecting a file
|
||||
// They can upload/paste PDF text later
|
||||
console.log('⏭️ Skipping file selection');
|
||||
|
||||
currentFile = null;
|
||||
pdfText = '';
|
||||
|
||||
nextStep(2);
|
||||
} catch (error) {
|
||||
console.error('Failed to load file:', error);
|
||||
alert('Kunne ikke hente fil');
|
||||
}
|
||||
}
|
||||
|
||||
function validateAndNextStep(targetStep) {
|
||||
@ -1289,8 +1475,9 @@ async function autoGenerateTemplate() {
|
||||
async function saveTemplate() {
|
||||
const vendorId = document.getElementById('vendorSelect').value;
|
||||
const templateName = document.getElementById('templateName').value;
|
||||
const productCategory = document.getElementById('productCategory').value;
|
||||
|
||||
console.log('Saving template...', { vendorId, templateName, editingTemplateId });
|
||||
console.log('Saving template...', { vendorId, templateName, productCategory, editingTemplateId });
|
||||
console.log('Detection patterns:', detectionPatterns);
|
||||
console.log('Field patterns:', fieldPatterns);
|
||||
|
||||
@ -1299,6 +1486,11 @@ async function saveTemplate() {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!productCategory) {
|
||||
alert('Vælg produktkategori');
|
||||
return;
|
||||
}
|
||||
|
||||
if (detectionPatterns.length === 0) {
|
||||
alert('Tilføj mindst ét detektionsmønster');
|
||||
return;
|
||||
@ -1378,6 +1570,7 @@ async function saveTemplate() {
|
||||
body: JSON.stringify({
|
||||
vendor_id: parseInt(vendorId),
|
||||
template_name: templateName,
|
||||
default_product_category: productCategory,
|
||||
detection_patterns: detectionPatternsData,
|
||||
field_mappings: fieldMappings
|
||||
})
|
||||
|
||||
@ -56,12 +56,9 @@
|
||||
<div class="container mt-4">
|
||||
<div class="d-flex justify-content-between align-items-center mb-4">
|
||||
<div>
|
||||
<h2><i class="bi bi-grid-3x3 me-2"></i>Faktura Templates</h2>
|
||||
<p class="text-muted">Administrer templates til automatisk faktura-udtrækning</p>
|
||||
<h2><i class="bi bi-file-earmark-code me-2"></i>Invoice2Data Templates (YAML)</h2>
|
||||
<p class="text-muted">YAML-baserede templates til automatisk faktura-udtrækning</p>
|
||||
</div>
|
||||
<a href="/billing/template-builder" class="btn btn-primary">
|
||||
<i class="bi bi-plus-circle me-2"></i>Ny Template
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div id="templatesList" class="row">
|
||||
@ -69,6 +66,63 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Edit YAML Category Modal -->
|
||||
<div class="modal fade" id="editYamlCategoryModal" tabindex="-1">
|
||||
<div class="modal-dialog">
|
||||
<div class="modal-content">
|
||||
<div class="modal-header">
|
||||
<h5 class="modal-title">
|
||||
<i class="bi bi-pencil me-2"></i>Rediger Kategori: <span id="yamlTemplateName"></span>
|
||||
</h5>
|
||||
<button type="button" class="btn-close" data-bs-dismiss="modal"></button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
<div class="mb-3">
|
||||
<label class="form-label">Produkt Kategori</label>
|
||||
<select class="form-select" id="yamlCategorySelect">
|
||||
<option value="varesalg">🛒 Varesalg</option>
|
||||
<option value="drift">🔧 Drift</option>
|
||||
<option value="anlæg">🏗️ Anlæg</option>
|
||||
<option value="abonnement">📅 Abonnement</option>
|
||||
<option value="lager">📦 Lager</option>
|
||||
<option value="udlejning">🏪 Udlejning</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="alert alert-info">
|
||||
<i class="bi bi-info-circle me-2"></i>
|
||||
<small>Dette ændrer default_product_category i YAML filen. Filen bliver opdateret på serveren.</small>
|
||||
</div>
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Annuller</button>
|
||||
<button type="button" class="btn btn-primary" onclick="saveYamlCategory()">
|
||||
<i class="bi bi-save me-2"></i>Gem Kategori
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- View YAML Content Modal -->
|
||||
<div class="modal fade" id="viewYamlModal" tabindex="-1">
|
||||
<div class="modal-dialog modal-lg">
|
||||
<div class="modal-content">
|
||||
<div class="modal-header">
|
||||
<h5 class="modal-title">
|
||||
<i class="bi bi-file-earmark-code me-2"></i>YAML Indhold: <span id="viewYamlTemplateName"></span>
|
||||
</h5>
|
||||
<button type="button" class="btn-close" data-bs-dismiss="modal"></button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
<pre id="yamlContent" style="background: #f8f9fa; padding: 15px; border-radius: 8px; max-height: 600px; overflow-y: auto;"><code></code></pre>
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Luk</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Test Modal -->
|
||||
<div class="modal fade test-modal" id="testModal" tabindex="-1">
|
||||
<div class="modal-dialog modal-xl">
|
||||
@ -116,6 +170,7 @@
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js"></script>
|
||||
<script>
|
||||
let currentTemplateId = null;
|
||||
let currentTemplateIsInvoice2data = false;
|
||||
|
||||
document.addEventListener('DOMContentLoaded', async () => {
|
||||
await loadTemplates();
|
||||
@ -142,36 +197,65 @@ async function loadTemplates() {
|
||||
return;
|
||||
}
|
||||
|
||||
templates.forEach(template => {
|
||||
// Filter to only show invoice2data templates
|
||||
const invoice2dataTemplates = templates.filter(t => t.template_type === 'invoice2data');
|
||||
|
||||
if (invoice2dataTemplates.length === 0) {
|
||||
container.innerHTML = `
|
||||
<div class="col-12">
|
||||
<div class="alert alert-info">
|
||||
<i class="bi bi-info-circle me-2"></i>
|
||||
Ingen YAML templates endnu. Opret .yml filer i <code>data/invoice_templates/</code>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
return;
|
||||
}
|
||||
|
||||
invoice2dataTemplates.forEach(template => {
|
||||
const detectionPatterns = template.detection_patterns || [];
|
||||
const fieldMappings = template.field_mappings || {};
|
||||
const fieldCount = Object.keys(fieldMappings).filter(k => !['lines_start', 'lines_end', 'line_item'].includes(k)).length;
|
||||
const category = template.default_product_category || 'varesalg';
|
||||
const categoryIcons = {
|
||||
'varesalg': '🛒',
|
||||
'drift': '🔧',
|
||||
'anlæg': '🏗️',
|
||||
'abonnement': '📅',
|
||||
'lager': '📦',
|
||||
'udlejning': '🏪'
|
||||
};
|
||||
const categoryIcon = categoryIcons[category] || '📦';
|
||||
|
||||
container.innerHTML += `
|
||||
<div class="col-md-4 mb-3">
|
||||
<div class="card template-card">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">
|
||||
<i class="bi bi-file-text me-2"></i>${template.template_name}
|
||||
<div class="d-flex justify-content-between align-items-start mb-2">
|
||||
<h5 class="card-title mb-0">
|
||||
<i class="bi bi-file-earmark-code me-2"></i>${template.template_name}
|
||||
</h5>
|
||||
<span class="badge bg-success">YAML</span>
|
||||
</div>
|
||||
<p class="card-text text-muted mb-2">
|
||||
<small>
|
||||
<i class="bi bi-building me-1"></i>${template.vendor_name || 'Ingen leverandør'}<br>
|
||||
<i class="bi bi-check-circle me-1"></i>${detectionPatterns.length} detektionsmønstre<br>
|
||||
<i class="bi bi-input-cursor me-1"></i>${fieldCount} felter<br>
|
||||
<i class="bi bi-graph-up me-1"></i>${template.usage_count || 0} gange brugt
|
||||
<i class="bi bi-building me-1"></i>${template.vendor_name || 'Ingen leverandør'}
|
||||
${template.vendor_cvr ? `<br><i class="bi bi-hash me-1"></i>CVR: ${template.vendor_cvr}` : ''}
|
||||
<br><i class="bi bi-check-circle me-1"></i>${detectionPatterns.length} detektionsmønstre
|
||||
<br><i class="bi bi-input-cursor me-1"></i>${fieldCount} felter
|
||||
<br><strong>${categoryIcon} Kategori: ${category}</strong>
|
||||
</small>
|
||||
</p>
|
||||
<div class="d-flex gap-2">
|
||||
<button class="btn btn-sm btn-primary" onclick="editTemplate(${template.template_id})">
|
||||
<i class="bi bi-pencil"></i> Rediger
|
||||
<div class="d-flex gap-2 flex-wrap">
|
||||
<button class="btn btn-sm btn-primary" onclick="viewYamlContent('${template.yaml_filename}')" title="Vis YAML indhold">
|
||||
<i class="bi bi-file-earmark-code"></i> Vis YAML
|
||||
</button>
|
||||
<button class="btn btn-sm btn-info" onclick="openTestModal(${template.template_id}, '${template.template_name}')">
|
||||
<button class="btn btn-sm btn-warning" onclick="editYamlCategory('${template.yaml_filename}', '${category}')" title="Rediger kategori">
|
||||
<i class="bi bi-pencil"></i> Kategori
|
||||
</button>
|
||||
<button class="btn btn-sm btn-info" onclick="openTestModal('${template.yaml_filename}', '${template.template_name}', true, ${template.vendor_id || 'null'})">
|
||||
<i class="bi bi-flask"></i> Test
|
||||
</button>
|
||||
<button class="btn btn-sm btn-danger" onclick="deleteTemplate(${template.template_id})">
|
||||
<i class="bi bi-trash"></i>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@ -211,13 +295,18 @@ async function loadPendingFiles(vendorId = null) {
|
||||
}
|
||||
}
|
||||
|
||||
async function openTestModal(templateId, templateName) {
|
||||
async function openTestModal(templateId, templateName, isInvoice2data = false, vendorId = null) {
|
||||
currentTemplateId = templateId;
|
||||
currentTemplateIsInvoice2data = isInvoice2data;
|
||||
document.getElementById('modalTemplateName').textContent = templateName;
|
||||
document.getElementById('testResultsContainer').classList.add('d-none');
|
||||
document.getElementById('testFileSelect').value = '';
|
||||
|
||||
// Load template to get vendor_id
|
||||
// For invoice2data templates, use vendorId if provided
|
||||
if (isInvoice2data && vendorId) {
|
||||
await loadPendingFiles(vendorId);
|
||||
} else if (!isInvoice2data) {
|
||||
// Load database template to get vendor_id
|
||||
try {
|
||||
const response = await fetch(`/api/v1/supplier-invoices/templates/${templateId}`);
|
||||
const template = await response.json();
|
||||
@ -228,6 +317,10 @@ async function openTestModal(templateId, templateName) {
|
||||
console.error('Failed to load template:', error);
|
||||
await loadPendingFiles(); // Fallback to all files
|
||||
}
|
||||
} else {
|
||||
// No vendor - load all files
|
||||
await loadPendingFiles();
|
||||
}
|
||||
|
||||
const modal = new bootstrap.Modal(document.getElementById('testModal'));
|
||||
modal.show();
|
||||
@ -258,8 +351,15 @@ async function runTest() {
|
||||
document.getElementById('testPdfPreview').textContent = pdfText;
|
||||
document.getElementById('testResultsContainer').classList.remove('d-none');
|
||||
|
||||
// Test template
|
||||
const testResponse = await fetch(`/api/v1/supplier-invoices/templates/${currentTemplateId}/test`, {
|
||||
// Test template - use different endpoint based on type
|
||||
let testUrl;
|
||||
if (currentTemplateIsInvoice2data) {
|
||||
testUrl = `/api/v1/supplier-invoices/templates/invoice2data/${currentTemplateId}/test`;
|
||||
} else {
|
||||
testUrl = `/api/v1/supplier-invoices/templates/${currentTemplateId}/test`;
|
||||
}
|
||||
|
||||
const testResponse = await fetch(testUrl, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ pdf_text: pdfText })
|
||||
@ -303,21 +403,26 @@ async function runTest() {
|
||||
<thead>
|
||||
<tr>
|
||||
<th>#</th>
|
||||
${lineItems[0].item_number ? '<th>Varenr</th>' : ''}
|
||||
${lineItems[0].description ? '<th>Beskrivelse</th>' : ''}
|
||||
${lineItems[0].quantity ? '<th>Antal</th>' : ''}
|
||||
${lineItems[0].unit_price ? '<th>Pris</th>' : ''}
|
||||
${lineItems.some(l => l.circuit_id || l.ip_address) ? '<th>Kredsløb/IP</th>' : ''}
|
||||
${lineItems.some(l => l.location_street) ? '<th>Adresse</th>' : ''}
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>`;
|
||||
|
||||
lineItems.forEach(line => {
|
||||
lineItems.forEach((line, idx) => {
|
||||
const locationText = [line.location_street, line.location_zip, line.location_city].filter(x => x).join(' ');
|
||||
const circuitText = line.circuit_id || line.ip_address || '';
|
||||
|
||||
linesHtml += `<tr>
|
||||
<td>${line.line_number}</td>
|
||||
${line.item_number ? `<td>${line.item_number}</td>` : ''}
|
||||
<td>${idx + 1}</td>
|
||||
${line.description ? `<td>${line.description}</td>` : ''}
|
||||
${line.quantity ? `<td>${line.quantity}</td>` : ''}
|
||||
${line.unit_price ? `<td>${line.unit_price}</td>` : ''}
|
||||
${lineItems.some(l => l.circuit_id || l.ip_address) ? `<td><small>${circuitText}</small></td>` : ''}
|
||||
${lineItems.some(l => l.location_street) ? `<td><small>${locationText}</small></td>` : ''}
|
||||
</tr>`;
|
||||
});
|
||||
|
||||
@ -362,32 +467,65 @@ async function runTest() {
|
||||
}
|
||||
}
|
||||
|
||||
async function deleteTemplate(templateId) {
|
||||
if (!confirm('Er du sikker på at du vil slette denne template?')) {
|
||||
let currentYamlTemplate = null;
|
||||
|
||||
/**
 * Fetch the raw YAML of an invoice2data template and show it in the
 * "viewYamlModal" Bootstrap modal.
 *
 * On any failure the error is logged to the console and the user gets an alert.
 *
 * @param {string} yamlFilename - Template file name without the ".yml" extension.
 */
async function viewYamlContent(yamlFilename) {
    try {
        // The file name becomes one URL path segment: encode it so characters
        // such as "#", "?" or "%" in a file name cannot change the request.
        const encodedName = encodeURIComponent(yamlFilename);
        const response = await fetch(`/api/v1/supplier-invoices/templates/invoice2data/${encodedName}/content`);

        if (!response.ok) {
            throw new Error('Kunne ikke hente YAML indhold');
        }

        const data = await response.json();
        document.getElementById('viewYamlTemplateName').textContent = yamlFilename + '.yml';
        // textContent (not innerHTML) so the YAML is shown verbatim and never parsed as markup.
        document.getElementById('yamlContent').querySelector('code').textContent = data.content;

        const modal = new bootstrap.Modal(document.getElementById('viewYamlModal'));
        modal.show();
    } catch (error) {
        console.error('Failed to load YAML content:', error);
        alert('❌ Kunne ikke hente YAML indhold');
    }
}
|
||||
|
||||
/**
 * Open the "edit category" modal for a YAML template.
 *
 * Stores the template in `currentYamlTemplate` (read by saveYamlCategory),
 * fills in the modal title and pre-selects the current category.
 *
 * @param {string} yamlFilename - Template file name without the ".yml" extension.
 * @param {string} currentCategory - Category value to pre-select in the dropdown.
 */
function editYamlCategory(yamlFilename, currentCategory) {
    currentYamlTemplate = yamlFilename;

    const titleLabel = document.getElementById('yamlTemplateName');
    titleLabel.textContent = yamlFilename + '.yml';

    const categoryDropdown = document.getElementById('yamlCategorySelect');
    categoryDropdown.value = currentCategory;

    const modalElement = document.getElementById('editYamlCategoryModal');
    new bootstrap.Modal(modalElement).show();
}
|
||||
|
||||
async function saveYamlCategory() {
|
||||
const newCategory = document.getElementById('yamlCategorySelect').value;
|
||||
|
||||
if (!currentYamlTemplate) {
|
||||
alert('Ingen template valgt');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(`/api/v1/supplier-invoices/templates/${templateId}`, {
|
||||
method: 'DELETE'
|
||||
const response = await fetch(`/api/v1/supplier-invoices/templates/invoice2data/${currentYamlTemplate}/category`, {
|
||||
method: 'PUT',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ category: newCategory })
|
||||
});
|
||||
|
||||
if (response.ok) {
|
||||
alert('✅ Template slettet');
|
||||
await loadTemplates();
|
||||
alert('✅ Kategori opdateret i YAML fil');
|
||||
bootstrap.Modal.getInstance(document.getElementById('editYamlCategoryModal')).hide();
|
||||
await loadTemplates(); // Reload to show new category
|
||||
} else {
|
||||
throw new Error('Sletning fejlede');
|
||||
const error = await response.json();
|
||||
throw new Error(error.detail || 'Opdatering fejlede');
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Delete failed:', error);
|
||||
alert('❌ Kunne ikke slette template');
|
||||
console.error('Category update failed:', error);
|
||||
alert('❌ Kunne ikke opdatere kategori: ' + error.message);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Navigate to the template builder page for the given template.
 *
 * @param {number|string} templateId - Id passed on as the `id` query parameter.
 */
function editTemplate(templateId) {
    const builderUrl = `/billing/template-builder?id=${templateId}`;
    window.location.href = builderUrl;
}
|
||||
</script>
|
||||
|
||||
</body>
|
||||
|
||||
@ -271,6 +271,54 @@ class EconomicService:
|
||||
|
||||
# ========== KASSEKLADDE (JOURNALS/VOUCHERS) ==========
|
||||
|
||||
async def check_invoice_number_exists(self, invoice_number: str, journal_number: Optional[int] = None) -> Optional[Dict]:
    """
    Check if an invoice number already exists in e-conomic vouchers.

    Sends one GET request to ``{api_url}/vouchers`` and reports the first
    returned voucher whose string representation contains ``invoice_number``.
    This is a best-effort check: a non-200 response or any exception is
    logged and treated as "not found", so e-conomic problems never block
    the caller.

    Args:
        invoice_number: Invoice number to check. An empty value returns
            None without calling the API.
        journal_number: Optional journal to restrict matches to. Vouchers
            that report a different journal number are ignored; vouchers
            without journal information are still considered. If None,
            every returned voucher is considered.

    Returns:
        Dict with keys 'found_in', 'voucher_number', 'date' and 'journal'
        for the first matching voucher, None otherwise.
    """
    # An empty string is a substring of everything and would "match" the
    # first voucher returned, so there is nothing meaningful to look up.
    if not invoice_number:
        return None

    try:
        # Search in vouchers (posted journal entries)
        url = f"{self.api_url}/vouchers"
        # NOTE(review): e-conomic REST filters are normally written
        # "property$operator:value" (e.g. "$eq:"); this expression has no
        # operator. Confirm against the API before relying on this check.
        params = {
            'filter': f'voucherNumber${invoice_number}',  # e-conomic filter syntax
            'pagesize': 100
        }

        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=self._get_headers(), params=params) as response:
                if response.status != 200:
                    logger.warning(f"⚠️ Failed to search vouchers: {response.status}")
                    return None

                data = await response.json()

        # `or []` / `or {}` also cover keys that are present but null.
        vouchers = data.get('collection') or []

        for voucher in vouchers:
            # NOTE(review): this is a substring test on the whole voucher
            # repr, so short numbers can match unrelated fields.
            if invoice_number not in str(voucher):
                continue

            voucher_journal = (voucher.get('journal') or {}).get('journalNumber')

            # Honour the documented journal restriction, but only reject
            # vouchers that positively belong to another journal.
            if (
                journal_number is not None
                and voucher_journal is not None
                and voucher_journal != journal_number
            ):
                continue

            logger.warning(f"⚠️ Invoice number {invoice_number} found in e-conomic voucher #{voucher.get('voucherNumber')}")
            return {
                'found_in': 'e-conomic',
                'voucher_number': voucher.get('voucherNumber'),
                'date': voucher.get('date'),
                'journal': voucher_journal
            }

        logger.info(f"✅ Invoice number {invoice_number} not found in e-conomic")
        return None

    except Exception as e:
        logger.error(f"❌ Error checking invoice number in e-conomic: {e}", exc_info=True)
        # Don't block on e-conomic errors - assume not found
        return None
|
||||
|
||||
async def get_supplier_invoice_journals(self) -> list:
|
||||
"""
|
||||
Get all available journals for supplier invoices (kassekladde)
|
||||
|
||||
337
app/services/invoice2data_service.py
Normal file
337
app/services/invoice2data_service.py
Normal file
@ -0,0 +1,337 @@
|
||||
"""
|
||||
Invoice2Data Service
|
||||
Wrapper around invoice2data library for template-based invoice extraction
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class Invoice2DataService:
|
||||
"""Service for extracting invoice data using invoice2data templates"""
|
||||
|
||||
def __init__(self):
    """Resolve the template directory and load every template found in it."""
    # data/invoice_templates sits three directory levels above this module.
    module_path = Path(__file__)
    base_dir = module_path.parent.parent.parent
    self.template_dir = base_dir / "data" / "invoice_templates"

    loaded = self._load_templates()
    self.templates = loaded
    logger.info(f"📋 Loaded {len(self.templates)} invoice2data templates")
|
||||
|
||||
def _load_templates(self) -> Dict[str, Dict]:
    """
    Load all ``*.yml`` templates from ``self.template_dir``.

    Files that cannot be read or parsed, and files whose top-level YAML
    value is not a mapping (for example an empty file, which parses to
    None), are logged and skipped so that every stored template can be
    used with ``.get(...)`` by the other methods.

    Returns:
        Mapping of template name (file name without extension) to the
        parsed template data. Empty if the directory does not exist.
    """
    templates: Dict[str, Dict] = {}

    if not self.template_dir.exists():
        logger.warning(f"Template directory not found: {self.template_dir}")
        return templates

    # glob() order depends on the filesystem; sorting keeps the dict order,
    # and therefore first-match template selection, stable between runs.
    for template_file in sorted(self.template_dir.glob("*.yml")):
        try:
            with open(template_file, 'r', encoding='utf-8') as f:
                template_data = yaml.safe_load(f)
        except Exception as e:
            logger.error(f" ✗ Failed to load template {template_file}: {e}")
            continue

        if not isinstance(template_data, dict):
            logger.error(
                f" ✗ Failed to load template {template_file}: "
                f"expected a YAML mapping, got {type(template_data).__name__}"
            )
            continue

        template_name = template_file.stem
        templates[template_name] = template_data
        logger.debug(f" ✓ Loaded template: {template_name}")

    return templates
|
||||
|
||||
def match_template(self, text: str) -> Optional[str]:
    """
    Find the first loaded template whose keywords match the text.

    Matching is a case-insensitive substring test. A template matches when
    at least 70% of its keywords occur in ``text``. Templates without any
    keywords are skipped, since they cannot identify a document.

    Args:
        text: Text to match templates against.

    Returns:
        Name of the first matching template, or None if none matches.
    """
    text_lower = text.lower()

    for template_name, template_data in self.templates.items():
        if not isinstance(template_data, dict):
            # e.g. an empty YAML file parses to None
            continue

        keywords = template_data.get('keywords') or []
        if isinstance(keywords, str):
            # "keywords: Foo" is one keyword, not a sequence of characters
            keywords = [keywords]

        if not keywords:
            # Without this guard 0 >= 0 * 0.7 holds and the template would
            # match every document.
            logger.debug(f" ⏭️ Template {template_name} has no keywords, skipping")
            continue

        # Count how many keywords are present in the text
        matches = sum(1 for keyword in keywords if str(keyword).lower() in text_lower)

        if matches >= len(keywords) * 0.7:  # 70% of keywords must match
            logger.info(f"✅ Matched template: {template_name} ({matches}/{len(keywords)} keywords)")
            return template_name

    logger.warning("⚠️ No template matched")
    return None
|
||||
|
||||
def extract_with_template(self, text: str, template_name: str) -> Dict[str, Any]:
    """Extract invoice fields from *text* using one loaded template.

    Only fields whose config is a mapping with ``parser: regex`` are
    processed.  Each regex is searched case-insensitively in multiline mode
    and the configured ``group`` (default 1) is converted according to
    ``type`` (``string``, ``float``, ``int`` or ``date``).  A field whose
    regex does not match, or whose conversion raises, is logged and left out
    of the result.

    Args:
        text: Full text extracted from the document.
        template_name: Key into ``self.templates``.

    Returns:
        Dict with ``template``, ``issuer``, ``country`` and ``currency``, one
        entry per extracted field, and ``lines`` when the template defines
        line-item extraction.

    Raises:
        ValueError: If *template_name* is not a loaded template.
    """
    if template_name not in self.templates:
        raise ValueError(f"Template not found: {template_name}")

    template = self.templates[template_name]
    fields = template.get('fields') or {}
    options = template.get('options') or {}

    extracted = {
        'template': template_name,
        'issuer': template.get('issuer'),
        'country': template.get('country'),
        'currency': options.get('currency', 'DKK'),
    }

    danish_months = {
        'januar': 'January', 'februar': 'February', 'marts': 'March',
        'april': 'April', 'maj': 'May', 'juni': 'June',
        'juli': 'July', 'august': 'August', 'september': 'September',
        'oktober': 'October', 'november': 'November', 'december': 'December',
    }
    # Whole-word, case-insensitive: a plain str.replace('januar', 'January')
    # would turn an already-English "january" into "Januaryy" and would miss
    # a capitalised "Januar".
    month_re = re.compile(r'\b(' + '|'.join(danish_months) + r')\b', re.IGNORECASE)

    def to_float(raw: str) -> float:
        """Parse a number written with the template's separators.

        Handles Danish ``1.234,56`` and English ``148,587.98`` depending on
        the ``decimal_separator`` / ``thousands_separator`` options.
        """
        decimal_sep = options.get('decimal_separator', ',')
        thousands_sep = options.get('thousands_separator', '.')
        raw = raw.replace(' ', '')
        if thousands_sep in raw and decimal_sep in raw:
            # Both present: drop the thousands separator, then normalise the decimal one
            raw = raw.replace(thousands_sep, '').replace(decimal_sep, '.')
        elif thousands_sep in raw:
            # Only the thousands separator is present - just remove it
            raw = raw.replace(thousands_sep, '')
        elif decimal_sep in raw and decimal_sep == ',':
            # Only a Danish decimal comma is present
            raw = raw.replace(',', '.')
        return float(raw)

    def to_iso_date(raw: str) -> str:
        """Return ``YYYY-MM-DD`` when a configured format matches.

        When no format matches, the text is returned with Danish month names
        translated but otherwise unchanged.
        """
        date_formats = options.get('date_formats', ['%B %d, %Y', '%d-%m-%Y'])
        normalised = month_re.sub(lambda m: danish_months[m.group(1).lower()], raw)
        for date_format in date_formats:
            try:
                return datetime.strptime(normalised, date_format).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return normalised

    # Extract each field using its regex
    for field_name, field_config in fields.items():
        if not isinstance(field_config, dict):
            # e.g. a bare-string regex shorthand; only mapping configs are supported here
            logger.debug(f" ✗ {field_name}: unsupported field config, skipped")
            continue
        if field_config.get('parser') != 'regex':
            continue

        pattern = field_config.get('regex')
        field_type = field_config.get('type', 'string')
        group = field_config.get('group', 1)

        try:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if not match:
                logger.debug(f" ✗ {field_name}: No match")
                continue

            value = match.group(group).strip()
            logger.debug(f" 🔍 Extracted raw value for {field_name}: '{value}' (type: {field_type})")

            # Handle CVR filtering (avoid customer CVR)
            if field_name == 'vendor_vat':
                # Collect ALL CVR numbers, then drop BMC's own CVR (29522790),
                # which appears on supplier invoices as the buyer.
                # NOTE(review): both the label regex and the CVR are hard-coded
                # here; other code in this project reads settings.OWN_CVR.
                cvr_numbers = [
                    m.group(1)
                    for m in re.finditer(r'SE/CVR-nr\.\s+(\d{8})', text, re.IGNORECASE)
                ]
                vendor_cvrs = [cvr for cvr in cvr_numbers if cvr != '29522790']

                if vendor_cvrs:
                    value = vendor_cvrs[0]
                    logger.debug(f" ✓ {field_name}: {value} (filtered from {cvr_numbers})")
                else:
                    logger.warning(f" ⚠️ Only customer CVR found, no vendor CVR")
                    continue

            # Convert type
            if field_type == 'float':
                value = to_float(value)
            elif field_type == 'int':
                value = int(value)
            elif field_type == 'date':
                value = to_iso_date(value)

            extracted[field_name] = value
            logger.debug(f" ✓ {field_name}: {value}")

        except Exception as e:
            logger.warning(f" ✗ Failed to extract {field_name}: {e}")

    # Extract line items if defined in template
    lines_config = template.get('lines', [])
    if isinstance(lines_config, dict):
        # A single line-section mapping is treated as a one-element list
        lines_config = [lines_config]
    if lines_config:
        extracted['lines'] = self._extract_lines(text, lines_config, options)

    return extracted
|
||||
|
||||
def _extract_lines(self, text: str, lines_configs: List[Dict], options: Dict) -> List[Dict]:
    """Extract line items from invoice text.

    For every configuration the text between its ``start`` and ``end``
    patterns (or the end of the text when ``end`` is missing) is split into
    lines.  Each line matching ``line.regex`` becomes one dict whose keys are
    ``line.fields``, filled from capture groups 1..n and converted per
    ``line.types``.  When ``line.context_before`` is configured, up to
    ``max_lines`` preceding lines of the section are searched, nearest first,
    for extra fields.

    Args:
        text: Full text extracted from the document.
        lines_configs: Line-section configurations; a single mapping is
            accepted and treated as a one-element list.
        options: Template options; ``decimal_separator`` defaults to ``,``
            and ``thousands_separator`` to ``.``, the same defaults the
            header-field extraction in ``extract_with_template`` uses.

    Returns:
        One dict per extracted line, in document order per configuration.
    """
    if isinstance(lines_configs, dict):
        lines_configs = [lines_configs]
    lines_configs = lines_configs or []
    options = options or {}
    all_lines = []

    logger.debug(f"🔍 Extracting lines with {len(lines_configs)} configurations")

    def to_float(raw: str) -> float:
        """Parse a number written with the template's separators."""
        thousands_sep = options.get('thousands_separator', '.')
        decimal_sep = options.get('decimal_separator', ',')
        raw = raw.replace(' ', '')
        if thousands_sep in raw and decimal_sep in raw:
            raw = raw.replace(thousands_sep, '').replace(decimal_sep, '.')
        elif thousands_sep in raw:
            raw = raw.replace(thousands_sep, '')
        elif decimal_sep in raw and decimal_sep == ',':
            raw = raw.replace(',', '.')
        return float(raw)

    for lines_config in lines_configs:
        if not isinstance(lines_config, dict):
            continue

        start_pattern = lines_config.get('start')
        end_pattern = lines_config.get('end')
        line_config = lines_config.get('line', {})

        if not start_pattern or not line_config:
            continue

        try:
            # Find the section between the start and end patterns.  The
            # section is captured by NAME: a positional group(1) would be
            # wrong as soon as the start pattern has its own capture group.
            section_end = end_pattern if end_pattern else '$'
            section_pattern = f"{start_pattern}(?P<__line_section>.*?){section_end}"
            section_match = re.search(section_pattern, text, re.DOTALL | re.IGNORECASE)

            if not section_match:
                logger.debug(f" ✗ Line section not found (start: {start_pattern[:50]}, end: {end_pattern[:50] if end_pattern else 'None'})")
                continue

            section_text = section_match.group('__line_section')
            logger.debug(f" ✓ Found line section ({len(section_text)} chars)")

            # Per-line extraction settings
            line_pattern = line_config.get('regex')
            field_names = line_config.get('fields', [])
            field_types = line_config.get('types', {})
            context_config = line_config.get('context_before', {})

            if not line_pattern or not field_names:
                continue

            # Keep the line index of every match so context lines can be
            # looked up relative to it.
            section_lines = section_text.split('\n')
            line_matches = []
            for line_idx, line_text in enumerate(section_lines):
                match = re.search(line_pattern, line_text, re.MULTILINE)
                if match:
                    line_matches.append((line_idx, line_text, match))

            logger.debug(f" ✓ Found {len(line_matches)} matching lines")

            for line_idx, line_text, match in line_matches:
                line_data = {}

                # Main line fields: capture group i feeds field_names[i - 1]
                for idx, field_name in enumerate(field_names, start=1):
                    try:
                        value = match.group(idx).strip()
                        field_type = field_types.get(field_name, 'string')

                        if field_type == 'float':
                            value = to_float(value)
                        elif field_type == 'int':
                            value = int(value)

                        line_data[field_name] = value
                    except Exception as e:
                        logger.debug(f" ✗ Failed to extract line field {field_name}: {e}")

                # Optional fields taken from the lines BEFORE this one
                if context_config and line_idx > 0:
                    max_lines = context_config.get('max_lines', 5)
                    patterns = context_config.get('patterns', [])

                    start_idx = max(0, line_idx - max_lines)
                    context_lines = section_lines[start_idx:line_idx]

                    for pattern_config in patterns:
                        pattern_regex = pattern_config.get('regex')
                        pattern_fields = pattern_config.get('fields', [])

                        if not pattern_regex or not pattern_fields:
                            continue

                        # Nearest preceding line first
                        for ctx_line in reversed(context_lines):
                            ctx_match = re.search(pattern_regex, ctx_line)
                            if ctx_match:
                                for ctx_idx, ctx_field_name in enumerate(pattern_fields, start=1):
                                    try:
                                        ctx_value = ctx_match.group(ctx_idx).strip()
                                        line_data[ctx_field_name] = ctx_value
                                    except Exception as e:
                                        logger.debug(f" ✗ Failed to extract context field {ctx_field_name}: {e}")
                                break  # Stop after first match for this pattern

                if line_data:
                    all_lines.append(line_data)

            logger.info(f" ✓ Extracted {len(all_lines)} line items")

        except Exception as e:
            logger.warning(f" ✗ Failed to extract lines: {e}")

    return all_lines
|
||||
|
||||
def extract(self, text: str, template_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Run template extraction on *text*, never raising.

    Args:
        text: Full text extracted from the document.
        template_name: Template to use; when ``None`` the template is
            auto-detected with ``match_template``.

    Returns:
        The dict produced by ``extract_with_template``, or ``None`` when no
        template matches or any step raises (the error is logged).
    """
    try:
        chosen = template_name
        if chosen is None:
            # No explicit template: fall back to keyword-based detection
            chosen = self.match_template(text)
        if chosen is None:
            return None

        fields = self.extract_with_template(text, chosen)
        logger.info(f"✅ Extracted {len(fields)} fields using template: {chosen}")
        return fields
    except Exception as e:
        logger.error(f"❌ Extraction failed: {e}")
        return None
|
||||
|
||||
def get_template_list(self) -> List[Dict[str, str]]:
    """Summarise the loaded templates.

    Returns:
        One dict per entry of ``self.templates`` with the keys ``name``,
        ``issuer`` and ``country`` (the latter two are ``None`` when the
        template does not define them).
    """
    summaries = []
    for name, template in self.templates.items():
        summary = {
            'name': name,
            'issuer': template.get('issuer'),
            'country': template.get('country'),
        }
        summaries.append(summary)
    return summaries
|
||||
|
||||
|
||||
# Singleton instance
|
||||
_invoice2data_service = None


def get_invoice2data_service() -> Invoice2DataService:
    """Return the module-wide ``Invoice2DataService``, creating it on first use."""
    global _invoice2data_service
    if _invoice2data_service is not None:
        return _invoice2data_service

    # First call: build the instance and keep it for later callers
    _invoice2data_service = Invoice2DataService()
    return _invoice2data_service
|
||||
@ -437,6 +437,130 @@ Output: {
|
||||
}
|
||||
return mime_types.get(suffix, 'application/octet-stream')
|
||||
|
||||
async def quick_analysis_on_upload(self, pdf_text: str) -> Dict:
    """Extract the critical fields from a freshly uploaded document.

    Runs before template matching and detects three things with regular
    expressions only:

    - the vendor's CVR number (the buyer's own CVR, ``settings.OWN_CVR``,
      is ignored when another CVR is present),
    - the document type (``'credit_note'`` when a credit keyword occurs
      anywhere in the text, otherwise ``'invoice'``),
    - the invoice / credit note number.

    Args:
        pdf_text: Text extracted from the PDF.

    Returns:
        Dict with the keys ``cvr``, ``document_type``, ``document_number``,
        ``vendor_id``, ``vendor_name`` and ``is_own_invoice``.
        ``is_own_invoice`` is True when the only CVR found is the own CVR.
    """
    from app.core.config import settings

    logger.info("⚡ Running quick analysis on upload...")

    result = {
        "cvr": None,
        "document_type": None,  # 'invoice' or 'credit_note'
        "document_number": None,
        "vendor_id": None,
        "vendor_name": None,
        "is_own_invoice": False,  # True if this is an outgoing invoice (own CVR only)
    }

    # 1. FIND CVR NUMBER (8 digits)
    # Supplier invoices carry BOTH the buyer's (own) and the seller's CVR;
    # the seller's is the one wanted here.

    def passes_cvr_checksum(candidate: str) -> bool:
        """Modulus-11 check used by Danish CVR numbers (weights 2,7,6,5,4,3,2,1)."""
        if candidate[0] == '0':
            return False
        weights = (2, 7, 6, 5, 4, 3, 2, 1)
        return sum(int(digit) * weight for digit, weight in zip(candidate, weights)) % 11 == 0

    # Explicitly labelled numbers are trusted as-is.  ``(?!\d)`` stops a
    # pattern from taking the first 8 digits of a longer number, and
    # ``[:\s]*`` after "nr."/"nummer" accepts the common "CVR-nr.: 12345678".
    labelled_patterns = [
        r'CVR[:\-\s]*(\d{8})(?!\d)',
        r'CVR[:\-\s]*nr\.?[:\s]*(\d{8})(?!\d)',
        r'CVR[:\-\s]*nummer[:\s]*(\d{8})(?!\d)',
        r'\bSE[:\-\s]*(\d{8})(?!\d)',  # "SE" number label, also used in DK
    ]

    found_cvrs = []
    for pattern in labelled_patterns:
        for match in re.finditer(pattern, pdf_text, re.IGNORECASE):
            cvr_candidate = match.group(1)
            if cvr_candidate not in found_cvrs:
                found_cvrs.append(cvr_candidate)

    # Fallback: any free-standing 8-digit number.  Phone numbers and dates
    # are also 8 digits, so these must pass the CVR checksum.  (A
    # leading-digit test is not a valid filter: real CVR numbers also start
    # with 5-9.)
    for match in re.finditer(r'\b(\d{8})\b', pdf_text):
        cvr_candidate = match.group(1)
        if cvr_candidate not in found_cvrs and passes_cvr_checksum(cvr_candidate):
            found_cvrs.append(cvr_candidate)

    # Remove the own CVR from the list (buyer CVR, not seller)
    own_cvr = settings.OWN_CVR
    own_cvr_found = own_cvr in found_cvrs
    vendor_cvrs = [cvr for cvr in found_cvrs if cvr != own_cvr]

    if vendor_cvrs:
        # Use the first CVR that is not our own
        vendor_cvr = vendor_cvrs[0]
        result['cvr'] = vendor_cvr
        if own_cvr_found:
            logger.info(f"📋 Found vendor CVR: {vendor_cvr} (ignored BMC CVR: {own_cvr})")
        else:
            logger.info(f"📋 Found CVR: {vendor_cvr}")

        # Try to match vendor
        vendor = self.match_vendor_by_cvr(vendor_cvr)
        if vendor:
            result['vendor_id'] = vendor['id']
            result['vendor_name'] = vendor['name']
    elif own_cvr_found:
        # Only the own CVR found = this is an outgoing invoice
        result['is_own_invoice'] = True
        result['cvr'] = own_cvr
        logger.warning(f"⚠️ OUTGOING INVOICE: Only BMC CVR found")

    # 2. DETECT DOCUMENT TYPE (Invoice vs Credit Note)
    credit_keywords = [
        'kreditnota', 'credit note', 'creditnote', 'kreditfaktura',
        'refusion', 'tilbagebetaling', 'godtgørelse', 'tilbageførsel',
    ]

    text_lower = pdf_text.lower()
    if any(keyword in text_lower for keyword in credit_keywords):
        result['document_type'] = 'credit_note'
        logger.info("📄 Document type: CREDIT NOTE")
    else:
        result['document_type'] = 'invoice'
        logger.info("📄 Document type: INVOICE")

    # 3. EXTRACT DOCUMENT NUMBER
    # ``no(?![a-z])`` keeps "Invoice no." / "Invoice No123" working while no
    # longer treating "invoice note ..." as a number label.
    if result['document_type'] == 'credit_note':
        number_patterns = [
            r'kreditnota\s*(?:nr\.?|nummer)[:\s]*(\S+)',
            r'credit\s*note\s*(?:no(?![a-z])\.?|number)[:\s]*(\S+)',
            r'kreditfaktura\s*(?:nr\.?|nummer)[:\s]*(\S+)',
        ]
    else:
        number_patterns = [
            r'faktura\s*(?:nr\.?|nummer)[:\s]*(\S+)',
            r'invoice\s*(?:no(?![a-z])\.?|number)[:\s]*(\S+)',
            r'fakturanr\.?\s*[:\s]*(\S+)',
        ]

    for pattern in number_patterns:
        match = re.search(pattern, pdf_text, re.IGNORECASE)
        if match:
            result['document_number'] = match.group(1).strip()
            logger.info(f"🔢 Document number: {result['document_number']}")
            break

    logger.info(f"✅ Quick analysis complete: CVR={result['cvr']}, Type={result['document_type']}, Number={result['document_number']}, Vendor={result['vendor_name']}")
    return result
|
||||
|
||||
def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]:
|
||||
"""
|
||||
Match vendor from database using CVR number
|
||||
@ -459,7 +583,7 @@ Output: {
|
||||
|
||||
# Search vendors table
|
||||
vendor = execute_query(
|
||||
"SELECT * FROM vendors WHERE cvr = %s",
|
||||
"SELECT * FROM vendors WHERE cvr_number = %s",
|
||||
(cvr_clean,),
|
||||
fetchone=True
|
||||
)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
"""
|
||||
Supplier Invoice Template Service
|
||||
Simple template-based invoice field extraction (no AI)
|
||||
Hybrid approach: invoice2data templates + custom regex templates
|
||||
Inspired by OmniSync's invoice template system
|
||||
"""
|
||||
|
||||
@ -11,6 +11,7 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from app.core.database import execute_query, execute_insert, execute_update
|
||||
from app.services.invoice2data_service import get_invoice2data_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -21,12 +22,19 @@ class TemplateService:
|
||||
def __init__(self):
|
||||
self.templates_cache = {}
|
||||
self._initialized = False
|
||||
self.invoice2data = None
|
||||
|
||||
def _ensure_loaded(self):
    """Load templates on first use; later calls return immediately.

    Besides the service's own templates this also tries to obtain the
    invoice2data service.  A failure there is only logged, and
    ``self.invoice2data`` is left as it was.
    """
    if self._initialized:
        return

    logger.info("🔄 Lazy loading templates...")
    self._load_templates()

    # invoice2data templates are optional
    try:
        self.invoice2data = get_invoice2data_service()
    except Exception as e:
        logger.warning(f"⚠️ Failed to load invoice2data: {e}")
    else:
        logger.info(f"✅ Invoice2Data service initialized")

    self._initialized = True
|
||||
|
||||
def _load_templates(self):
|
||||
@ -51,11 +59,24 @@ class TemplateService:
|
||||
def match_template(self, pdf_text: str) -> Tuple[Optional[int], float]:
|
||||
"""
|
||||
Find best matching template for PDF text
|
||||
First tries invoice2data templates, then falls back to custom templates
|
||||
Returns: (template_id, confidence_score)
|
||||
"""
|
||||
self._ensure_loaded() # Lazy load templates
|
||||
|
||||
logger.info(f"🔍 Matching against {len(self.templates_cache)} templates")
|
||||
# Try invoice2data templates first
|
||||
if self.invoice2data:
|
||||
try:
|
||||
template_name = self.invoice2data.match_template(pdf_text)
|
||||
if template_name:
|
||||
logger.info(f"✅ Matched invoice2data template: {template_name}")
|
||||
# Return special ID to indicate invoice2data template
|
||||
return (-1, 1.0) # -1 = invoice2data, 100% confidence
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Invoice2data matching failed: {e}")
|
||||
|
||||
# Fallback to custom templates
|
||||
logger.info(f"🔍 Matching against {len(self.templates_cache)} custom templates")
|
||||
|
||||
best_match = None
|
||||
best_score = 0.0
|
||||
@ -112,6 +133,19 @@ class TemplateService:
|
||||
"""Extract invoice fields using template's regex patterns"""
|
||||
self._ensure_loaded() # Lazy load templates
|
||||
|
||||
# Check if this is an invoice2data template
|
||||
if template_id == -1:
|
||||
if self.invoice2data:
|
||||
try:
|
||||
result = self.invoice2data.extract(pdf_text)
|
||||
if result:
|
||||
logger.info(f"✅ Extracted fields using invoice2data")
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Invoice2data extraction failed: {e}")
|
||||
return {}
|
||||
|
||||
# Use custom template
|
||||
template = self.templates_cache.get(template_id)
|
||||
if not template:
|
||||
logger.warning(f"⚠️ Template {template_id} not found in cache")
|
||||
@ -128,6 +162,31 @@ class TemplateService:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Special handling for CVR to avoid extracting own CVR
|
||||
if field_name == 'vendor_cvr':
|
||||
from app.core.config import settings
|
||||
own_cvr = getattr(settings, 'OWN_CVR', '29522790')
|
||||
|
||||
# Find ALL CVR matches
|
||||
all_matches = list(re.finditer(pattern, pdf_text, re.IGNORECASE | re.MULTILINE))
|
||||
found_cvrs = []
|
||||
|
||||
for match in all_matches:
|
||||
if len(match.groups()) >= group:
|
||||
cvr = match.group(group).strip()
|
||||
found_cvrs.append(cvr)
|
||||
|
||||
# Filter out own CVR
|
||||
vendor_cvrs = [cvr for cvr in found_cvrs if cvr != own_cvr]
|
||||
|
||||
if vendor_cvrs:
|
||||
# Use first non-own CVR as vendor CVR
|
||||
extracted[field_name] = vendor_cvrs[0]
|
||||
logger.debug(f" ✓ {field_name}: {vendor_cvrs[0]} (filtered out own CVR: {own_cvr})")
|
||||
else:
|
||||
logger.warning(f" ⚠️ Only found own CVR ({own_cvr}), no vendor CVR found")
|
||||
else:
|
||||
# Normal extraction for other fields
|
||||
match = re.search(pattern, pdf_text, re.IGNORECASE | re.MULTILINE)
|
||||
if match and len(match.groups()) >= group:
|
||||
value = match.group(group).strip()
|
||||
|
||||
18
migrations/011_extraction_lines_context.sql
Normal file
18
migrations/011_extraction_lines_context.sql
Normal file
@ -0,0 +1,18 @@
|
||||
-- Migration 011: Add context fields to extraction_lines
|
||||
-- These fields capture additional context information from invoice line items
|
||||
|
||||
ALTER TABLE extraction_lines
|
||||
ADD COLUMN IF NOT EXISTS ip_address VARCHAR(50),
|
||||
ADD COLUMN IF NOT EXISTS contract_number VARCHAR(100),
|
||||
ADD COLUMN IF NOT EXISTS location_street VARCHAR(255),
|
||||
ADD COLUMN IF NOT EXISTS location_zip VARCHAR(10),
|
||||
ADD COLUMN IF NOT EXISTS location_city VARCHAR(100);
|
||||
|
||||
-- Add index for contract number lookups
|
||||
CREATE INDEX IF NOT EXISTS idx_extraction_lines_contract_number ON extraction_lines(contract_number);
|
||||
|
||||
COMMENT ON COLUMN extraction_lines.ip_address IS 'IP address/subnet from line context (e.g., 152.115.56.192/27)';
|
||||
COMMENT ON COLUMN extraction_lines.contract_number IS 'Contract number from line context (e.g., NKA-008225)';
|
||||
COMMENT ON COLUMN extraction_lines.location_street IS 'Street address from line context';
|
||||
COMMENT ON COLUMN extraction_lines.location_zip IS 'Zip code from line context';
|
||||
COMMENT ON COLUMN extraction_lines.location_city IS 'City from line context';
|
||||
19
migrations/011_quick_analysis.sql
Normal file
19
migrations/011_quick_analysis.sql
Normal file
@ -0,0 +1,19 @@
|
||||
-- Migration 011: Quick Analysis on Upload
|
||||
-- Adds fields to store automatic CVR, document type, and document number detection
|
||||
|
||||
-- Add quick analysis fields to incoming_files
|
||||
ALTER TABLE incoming_files
|
||||
ADD COLUMN IF NOT EXISTS detected_cvr VARCHAR(8),
|
||||
ADD COLUMN IF NOT EXISTS detected_vendor_id INTEGER REFERENCES vendors(id),
|
||||
ADD COLUMN IF NOT EXISTS detected_document_type VARCHAR(20), -- 'invoice' or 'credit_note'
|
||||
ADD COLUMN IF NOT EXISTS detected_document_number VARCHAR(100);
|
||||
|
||||
-- Add index for CVR lookups
|
||||
CREATE INDEX IF NOT EXISTS idx_incoming_files_detected_cvr ON incoming_files(detected_cvr);
|
||||
CREATE INDEX IF NOT EXISTS idx_incoming_files_detected_vendor ON incoming_files(detected_vendor_id);
|
||||
|
||||
-- Add comments
|
||||
COMMENT ON COLUMN incoming_files.detected_cvr IS 'Automatically detected CVR number from PDF text';
|
||||
COMMENT ON COLUMN incoming_files.detected_vendor_id IS 'Vendor matched by CVR on upload';
|
||||
COMMENT ON COLUMN incoming_files.detected_document_type IS 'Auto-detected: invoice or credit_note';
|
||||
COMMENT ON COLUMN incoming_files.detected_document_number IS 'Automatically extracted invoice/credit note number';
|
||||
20
migrations/012_own_invoice_filter.sql
Normal file
20
migrations/012_own_invoice_filter.sql
Normal file
@ -0,0 +1,20 @@
|
||||
-- Migration 012: Add is_own_invoice flag to filter outgoing invoices
|
||||
-- BMC's own CVR: 29522790
|
||||
|
||||
-- Add column to track outgoing invoices (BMC's own invoices to customers)
|
||||
ALTER TABLE incoming_files
|
||||
ADD COLUMN IF NOT EXISTS is_own_invoice BOOLEAN DEFAULT FALSE;
|
||||
|
||||
-- Mark existing files with BMC's CVR as outgoing invoices
|
||||
UPDATE incoming_files
|
||||
SET is_own_invoice = TRUE
|
||||
WHERE detected_cvr = '29522790';
|
||||
|
||||
-- Add index for faster filtering
|
||||
CREATE INDEX IF NOT EXISTS idx_incoming_files_is_own_invoice
|
||||
ON incoming_files(is_own_invoice)
|
||||
WHERE is_own_invoice = TRUE;
|
||||
|
||||
-- Add comment
|
||||
COMMENT ON COLUMN incoming_files.is_own_invoice IS
|
||||
'TRUE hvis filen er en udgående faktura fra BMC (CVR 29522790), FALSE hvis leverandør faktura';
|
||||
13
migrations/012_template_default_category.sql
Normal file
13
migrations/012_template_default_category.sql
Normal file
@ -0,0 +1,13 @@
|
||||
-- Migration 012: Add default product category to templates
|
||||
-- Allows templates to specify default category for line items (varesalg, drift, etc.)
|
||||
|
||||
ALTER TABLE supplier_invoice_templates
|
||||
ADD COLUMN IF NOT EXISTS default_product_category VARCHAR(50) DEFAULT 'varesalg',
|
||||
ADD COLUMN IF NOT EXISTS default_product_group_number INTEGER;
|
||||
|
||||
-- Valid categories: varesalg, drift, anlæg, abonnement, lager, udlejning
|
||||
COMMENT ON COLUMN supplier_invoice_templates.default_product_category IS 'Default kategori for varelinjer: varesalg, drift, anlæg, abonnement, lager, udlejning';
|
||||
COMMENT ON COLUMN supplier_invoice_templates.default_product_group_number IS 'Default e-conomic produktgruppe nummer';
|
||||
|
||||
-- Add index for category lookups
|
||||
CREATE INDEX IF NOT EXISTS idx_supplier_invoice_templates_category ON supplier_invoice_templates(default_product_category);
|
||||
@ -15,3 +15,5 @@ PyPDF2==3.0.1
|
||||
pdfplumber==0.11.4
|
||||
pytesseract==0.3.13
|
||||
Pillow==11.0.0
|
||||
invoice2data==0.4.4
|
||||
pyyaml==6.0.2
|
||||
|
||||
89
scripts/backfill_quick_analysis.py
Normal file
89
scripts/backfill_quick_analysis.py
Normal file
@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill quick analysis for existing files
|
||||
"""
|
||||
import sys
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from app.core.database import execute_query, execute_update, init_db
|
||||
from app.services.ollama_service import ollama_service
|
||||
|
||||
|
||||
async def backfill_quick_analysis():
    """Run quick analysis on all stored files that do not have it yet.

    Selects every non-duplicate ``incoming_files`` row with a file path whose
    ``detected_cvr`` or ``detected_document_number`` is NULL, re-extracts the
    file's text, runs ``quick_analysis_on_upload`` and writes the detected
    values back.  ``is_own_invoice`` is written too, so files processed here
    are flagged the same way the analysis result reports them.

    A file that is missing on disk or fails to process is counted as failed
    and the run continues.  An error outside the per-file loop is printed
    and re-raised.
    """
    # Initialize database
    init_db()

    try:
        # Get files without quick analysis
        files = execute_query(
            """SELECT file_id, filename, file_path
            FROM incoming_files
            WHERE (detected_cvr IS NULL OR detected_document_number IS NULL)
            AND status NOT IN ('duplicate')
            AND file_path IS NOT NULL
            ORDER BY file_id DESC"""
        ) or []  # guard against a None result

        print(f"📋 Found {len(files)} files without quick analysis")

        success_count = 0
        fail_count = 0

        for file in files:
            try:
                file_path = Path(file['file_path'])

                if not file_path.exists():
                    print(f"⚠️ File not found: {file_path}")
                    fail_count += 1
                    continue

                print(f"\n🔍 Processing: {file['filename']} (ID: {file['file_id']})")

                # Extract text
                text = await ollama_service._extract_text_from_file(file_path)

                # Run quick analysis
                quick_result = await ollama_service.quick_analysis_on_upload(text)

                # Update database
                execute_update(
                    """UPDATE incoming_files
                    SET detected_cvr = %s,
                    detected_vendor_id = %s,
                    detected_document_type = %s,
                    detected_document_number = %s,
                    is_own_invoice = %s
                    WHERE file_id = %s""",
                    (
                        quick_result.get('cvr'),
                        quick_result.get('vendor_id'),
                        quick_result.get('document_type'),
                        quick_result.get('document_number'),
                        bool(quick_result.get('is_own_invoice', False)),
                        file['file_id'],
                    ),
                )

                print(f"✅ Updated: CVR={quick_result.get('cvr')}, "
                      f"Type={quick_result.get('document_type')}, "
                      f"Number={quick_result.get('document_number')}, "
                      f"Vendor={quick_result.get('vendor_name')}")

                success_count += 1

            except Exception as e:
                print(f"❌ Error processing {file['filename']}: {e}")
                fail_count += 1

        print(f"\n📊 Summary: {success_count} successful, {fail_count} failed")

    except Exception as e:
        print(f"❌ Fatal error: {e}")
        raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(backfill_quick_analysis())
|
||||
@ -1,4 +1,4 @@
|
||||
<!DOCTYPE html>
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" data-bs-theme="dark">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
|
||||
86
test_quick_analysis.py
Normal file
86
test_quick_analysis.py
Normal file
@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test Quick Analysis on Upload
|
||||
Tests CVR detection, document type, and invoice number extraction
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add app directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent / "app"))
|
||||
|
||||
from app.services.ollama_service import ollama_service
|
||||
|
||||
async def test_quick_analysis():
|
||||
"""Test quick analysis with sample text"""
|
||||
|
||||
# Sample invoice text with CVR
|
||||
sample_invoice = """
|
||||
ALSO Danmark A/S
|
||||
Jupitervej 4
|
||||
6000 Kolding
|
||||
|
||||
CVR-nr.: 35812428
|
||||
|
||||
FAKTURA
|
||||
|
||||
Faktura nr.: INV-2024-12345
|
||||
Dato: 2024-12-08
|
||||
|
||||
Beløb i alt: 5.965,18 DKK
|
||||
"""
|
||||
|
||||
# Sample credit note text
|
||||
sample_credit_note = """
|
||||
Test Leverandør A/S
|
||||
CVR: 12345678
|
||||
|
||||
KREDITNOTA
|
||||
|
||||
Kreditnota nr.: CN-2024-5678
|
||||
Original faktura: INV-2024-1000
|
||||
|
||||
Beløb: -1.234,56 DKK
|
||||
"""
|
||||
|
||||
print("🧪 Testing Quick Analysis\n")
|
||||
print("=" * 60)
|
||||
|
||||
# Test 1: Invoice with CVR
|
||||
print("\n📄 TEST 1: Invoice with CVR")
|
||||
print("-" * 60)
|
||||
result1 = await ollama_service.quick_analysis_on_upload(sample_invoice)
|
||||
print(f"CVR: {result1['cvr']}")
|
||||
print(f"Document Type: {result1['document_type']}")
|
||||
print(f"Document Number: {result1['document_number']}")
|
||||
print(f"Vendor ID: {result1['vendor_id']}")
|
||||
print(f"Vendor Name: {result1['vendor_name']}")
|
||||
|
||||
assert result1['cvr'] == '35812428', f"Expected CVR 35812428, got {result1['cvr']}"
|
||||
assert result1['document_type'] == 'invoice', f"Expected invoice, got {result1['document_type']}"
|
||||
assert result1['document_number'] == 'INV-2024-12345', f"Expected INV-2024-12345, got {result1['document_number']}"
|
||||
print("✅ Test 1 PASSED")
|
||||
|
||||
# Test 2: Credit Note
|
||||
print("\n📄 TEST 2: Credit Note")
|
||||
print("-" * 60)
|
||||
result2 = await ollama_service.quick_analysis_on_upload(sample_credit_note)
|
||||
print(f"CVR: {result2['cvr']}")
|
||||
print(f"Document Type: {result2['document_type']}")
|
||||
print(f"Document Number: {result2['document_number']}")
|
||||
print(f"Vendor ID: {result2['vendor_id']}")
|
||||
print(f"Vendor Name: {result2['vendor_name']}")
|
||||
|
||||
assert result2['cvr'] == '12345678', f"Expected CVR 12345678, got {result2['cvr']}"
|
||||
assert result2['document_type'] == 'credit_note', f"Expected credit_note, got {result2['document_type']}"
|
||||
assert result2['document_number'] == 'CN-2024-5678', f"Expected CN-2024-5678, got {result2['document_number']}"
|
||||
print("✅ Test 2 PASSED")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("✅ ALL TESTS PASSED!")
|
||||
print("=" * 60)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_quick_analysis())
|
||||
Loading…
Reference in New Issue
Block a user