From 3a8288f5a137587a7aba25bce261e69047fe03d4 Mon Sep 17 00:00:00 2001
From: Christian
+
+
+
Fakturanr.
Leverandør
Fakturadato
@@ -217,7 +240,7 @@
-
+
📁 Uploadede filer afventer behandling
+ ⏳ Filer der mangler behandling
+
+
+
Filnavn
Upload Dato
Status
+ Quick Analysis
Leverandør
Template
Handlinger
@@ -257,7 +305,7 @@
-
+
diff --git a/app/services/economic_service.py b/app/services/economic_service.py
index f7cb6b2..c5d6375 100644
--- a/app/services/economic_service.py
+++ b/app/services/economic_service.py
@@ -271,6 +271,54 @@ class EconomicService:
 
     # ========== KASSEKLADDE (JOURNALS/VOUCHERS) ==========
 
+    async def check_invoice_number_exists(self, invoice_number: str, journal_number: Optional[int] = None) -> Optional[Dict]:
+        """
+        Check if an invoice number already exists in e-conomic journals
+
+        Args:
+            invoice_number: Invoice number to check
+            journal_number: Optional specific journal to search (if None, searches all)
+
+        Returns:
+            Dict with voucher info if found, None otherwise
+        """
+        try:
+            # Search in vouchers (posted journal entries)
+            url = f"{self.api_url}/vouchers"
+            params = {
+                'filter': f'voucherNumber$eq:{invoice_number}',  # e-conomic filter grammar: field$operator:value
+                'pagesize': 100
+            }
+
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, headers=self._get_headers(), params=params) as response:
+                    if response.status != 200:
+                        logger.warning(f"⚠️ Failed to search vouchers: {response.status}")
+                        return None
+
+                    data = await response.json()
+                    vouchers = data.get('collection', [])
+
+                    # Check if any voucher matches the invoice number
+                    for voucher in vouchers:
+                        # Check if invoice number appears in voucher text or entries
+                        if invoice_number in str(voucher):
+                            logger.warning(f"⚠️ Invoice number {invoice_number} found in e-conomic voucher #{voucher.get('voucherNumber')}")
+                            return {
+                                'found_in': 'e-conomic',
+                                'voucher_number': voucher.get('voucherNumber'),
+                                'date': voucher.get('date'),
+                                'journal': voucher.get('journal', {}).get('journalNumber')
+                            }
+
+                    logger.info(f"✅ Invoice number {invoice_number} not found in e-conomic")
+                    return None
+
+        except Exception as e:
+            logger.error(f"❌ Error checking invoice number in e-conomic: {e}")
+            # Don't block on e-conomic errors - assume not found
+            return None
+
     async def get_supplier_invoice_journals(self) -> list:
         """
         Get all available journals for supplier invoices (kassekladde)
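A note on the filter above: e-conomic's REST API expects filters in the form field$operator:value, so an operator-less f-string such as voucherNumber$12345 would be rejected by the API. A minimal sketch of the grammar, under that assumption (build_economic_filter is an illustrative helper, not part of this patch):

    def build_economic_filter(field: str, operator: str, value) -> str:
        # e-conomic REST filters use field$operator:value, e.g. voucherNumber$eq:12345
        # (operators include eq, ne, gt, lt, like per the public API docs)
        return f"{field}${operator}:{value}"

    # Example usage matching the params dict above:
    params = {
        'filter': build_economic_filter('voucherNumber', 'eq', 12345),
        'pagesize': 100,
    }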
diff --git a/app/services/invoice2data_service.py b/app/services/invoice2data_service.py
new file mode 100644
index 0000000..8123364
--- /dev/null
+++ b/app/services/invoice2data_service.py
@@ -0,0 +1,337 @@
+"""
+Invoice2Data Service
+Wrapper around invoice2data library for template-based invoice extraction
+"""
+
+import logging
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+import yaml
+
+logger = logging.getLogger(__name__)
+
+class Invoice2DataService:
+    """Service for extracting invoice data using invoice2data templates"""
+
+    def __init__(self):
+        self.template_dir = Path(__file__).parent.parent.parent / "data" / "invoice_templates"
+        self.templates = self._load_templates()
+        logger.info(f"📋 Loaded {len(self.templates)} invoice2data templates")
+
+    def _load_templates(self) -> Dict[str, Dict]:
+        """Load all YAML templates from template directory"""
+        templates = {}
+
+        if not self.template_dir.exists():
+            logger.warning(f"Template directory not found: {self.template_dir}")
+            return templates
+
+        for template_file in self.template_dir.glob("*.yml"):
+            try:
+                with open(template_file, 'r', encoding='utf-8') as f:
+                    template_data = yaml.safe_load(f)
+                template_name = template_file.stem
+                templates[template_name] = template_data
+                logger.debug(f" ✓ Loaded template: {template_name}")
+            except Exception as e:
+                logger.error(f" ✗ Failed to load template {template_file}: {e}")
+
+        return templates
+
+    def match_template(self, text: str) -> Optional[str]:
+        """
+        Find matching template based on keywords
+        Returns template name or None
+        """
+        text_lower = text.lower()
+
+        for template_name, template_data in self.templates.items():
+            keywords = template_data.get('keywords', [])
+            if not keywords:
+                continue  # a template without keywords would otherwise match everything
+
+            # Count how many of the template's keywords are present
+            matches = sum(1 for keyword in keywords if str(keyword).lower() in text_lower)
+
+            if matches >= len(keywords) * 0.7:  # at least 70% of keywords must match
+                logger.info(f"✅ Matched template: {template_name} ({matches}/{len(keywords)} keywords)")
+                return template_name
+
+        logger.warning("⚠️ No template matched")
+        return None
+
+    def extract_with_template(self, text: str, template_name: str) -> Dict[str, Any]:
+        """
+        Extract invoice data using specific template
+        """
+        if template_name not in self.templates:
+            raise ValueError(f"Template not found: {template_name}")
+
+        template = self.templates[template_name]
+        fields = template.get('fields', {})
+        options = template.get('options', {})
+
+        extracted = {
+            'template': template_name,
+            'issuer': template.get('issuer'),
+            'country': template.get('country'),
+            'currency': options.get('currency', 'DKK')
+        }
+
+        # Extract each field using its regex
+        for field_name, field_config in fields.items():
+            if field_config.get('parser') != 'regex':
+                continue
+
+            pattern = field_config.get('regex')
+            field_type = field_config.get('type', 'string')
+            group = field_config.get('group', 1)
+
+            try:
+                match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
+
+                if match:
+                    value = match.group(group).strip()
+
+                    logger.debug(f" 🔍 Extracted raw value for {field_name}: '{value}' (type: {field_type})")
+
+                    # Handle CVR filtering (avoid customer CVR)
+                    if field_name == 'vendor_vat':
+                        # Find ALL CVR numbers
+                        all_cvr_matches = re.finditer(r'SE/CVR-nr\.\s+(\d{8})', text, re.IGNORECASE)
+                        cvr_numbers = [m.group(1) for m in all_cvr_matches]
+
+                        # Filter out BMC's CVR (29522790)
+                        vendor_cvrs = [cvr for cvr in cvr_numbers if cvr != '29522790']
+
+                        if vendor_cvrs:
+                            value = vendor_cvrs[0]
+                            logger.debug(f" ✓ {field_name}: {value} (filtered from {cvr_numbers})")
+                        else:
+                            logger.warning(f" ⚠️ Only customer CVR found, no vendor CVR")
+                            continue
+
+                    # Convert type
+                    if field_type == 'float':
+                        # Handle Danish number format (1.234,56 → 1234.56)
+                        # and English format (148,587.98 → 148587.98)
+                        decimal_sep = options.get('decimal_separator', ',')
+                        thousands_sep = options.get('thousands_separator', '.')
+
+                        # Remove all spaces first
+                        value = value.replace(' ', '')
+
+                        # If both separators are present, the format is unambiguous:
+                        # Danish:  148.587,98 (thousands=., decimal=,)
+                        # English: 148,587.98 (thousands=,, decimal=.)
+                        if thousands_sep in value and decimal_sep in value:
+                            # Remove thousands separator, then convert decimal separator to .
+                            value = value.replace(thousands_sep, '').replace(decimal_sep, '.')
+                        elif thousands_sep in value:
+                            # Only thousands separator present - just remove it
+                            value = value.replace(thousands_sep, '')
+                        elif decimal_sep in value and decimal_sep == ',':
+                            # Only decimal separator and it's Danish comma - convert to .
+                            value = value.replace(',', '.')
+
+                        value = float(value)
+                    elif field_type == 'int':
+                        value = int(value)
+                    elif field_type == 'date':
+                        # Try to parse Danish dates
+                        date_formats = options.get('date_formats', ['%B %d, %Y', '%d-%m-%Y'])
+
+                        # Translate Danish month names to English for strptime
+                        value = value.replace('januar', 'January').replace('februar', 'February')
+                        value = value.replace('marts', 'March').replace('april', 'April')
+                        value = value.replace('maj', 'May').replace('juni', 'June')
+                        value = value.replace('juli', 'July').replace('august', 'August')
+                        value = value.replace('september', 'September').replace('oktober', 'October')
+                        value = value.replace('november', 'November').replace('december', 'December')
+
+                        for date_format in date_formats:
+                            try:
+                                parsed_date = datetime.strptime(value, date_format)
+                                value = parsed_date.strftime('%Y-%m-%d')
+                                break
+                            except ValueError:
+                                continue
+
+                    extracted[field_name] = value
+                    logger.debug(f" ✓ {field_name}: {value}")
+                else:
+                    logger.debug(f" ✗ {field_name}: No match")
+
+            except Exception as e:
+                logger.warning(f" ✗ Failed to extract {field_name}: {e}")
+
+        # Extract line items if defined in template
+        lines_config = template.get('lines', [])
+        if lines_config:
+            extracted['lines'] = self._extract_lines(text, lines_config, options)
+
+        return extracted
+
+    def _extract_lines(self, text: str, lines_configs: List[Dict], options: Dict) -> List[Dict]:
+        """Extract line items from invoice text"""
+        all_lines = []
+
+        logger.debug(f"🔍 Extracting lines with {len(lines_configs)} configurations")
+
+        for lines_config in lines_configs:
+            start_pattern = lines_config.get('start')
+            end_pattern = lines_config.get('end')
+            line_config = lines_config.get('line', {})
+
+            if not start_pattern or not line_config:
+                continue
+
+            try:
+                # Find section between start and end patterns
+                if end_pattern:
+                    section_pattern = f"{start_pattern}(.*?){end_pattern}"
+                else:
+                    section_pattern = f"{start_pattern}(.*?)$"
+                section_match = re.search(section_pattern, text, re.DOTALL | re.IGNORECASE)
+
+                if not section_match:
+                    logger.debug(f" ✗ Line section not found (start: {start_pattern[:50]}, end: {end_pattern[:50] if end_pattern else 'None'})")
+                    continue
+
+                section_text = section_match.group(1)
+                logger.debug(f" ✓ Found line section ({len(section_text)} chars)")
+
+                # Extract individual lines
+                line_pattern = line_config.get('regex')
+                field_names = line_config.get('fields', [])
+                field_types = line_config.get('types', {})
+                context_config = line_config.get('context_before', {})
+
+                if not line_pattern or not field_names:
+                    continue
+
+                # Split section into lines for context processing
+                section_lines = section_text.split('\n')
+                line_matches = []
+
+                # Find all matching lines with their indices
+                for line_idx, line_text in enumerate(section_lines):
+                    match = re.search(line_pattern, line_text, re.MULTILINE)
+                    if match:
+                        line_matches.append((line_idx, line_text, match))
+
+                logger.debug(f" ✓ Found {len(line_matches)} matching lines")
+
+                for line_idx, line_text, match in line_matches:
+                    line_data = {}
+
+                    # Extract main line fields
+                    for idx, field_name in enumerate(field_names, start=1):
+                        try:
+                            value = match.group(idx).strip()
+                            field_type = field_types.get(field_name, 'string')
+
+                            # Convert type (note: line items default to English separators)
+                            if field_type == 'float':
+                                thousands_sep = options.get('thousands_separator', ',')
+                                decimal_sep = options.get('decimal_separator', '.')
+                                value = value.replace(' ', '')
+
+                                if thousands_sep in value and decimal_sep in value:
+                                    value = value.replace(thousands_sep, '').replace(decimal_sep, '.')
+                                elif thousands_sep in value:
+                                    value = value.replace(thousands_sep, '')
+                                elif decimal_sep in value and decimal_sep == ',':
+                                    value = value.replace(',', '.')
+
+                                value = float(value)
+                            elif field_type == 'int':
+                                value = int(value)
+
+                            line_data[field_name] = value
+                        except Exception as e:
+                            logger.debug(f" ✗ Failed to extract line field {field_name}: {e}")
+
+                    # Extract context_before if configured
+                    if context_config and line_idx > 0:
+                        max_lines = context_config.get('max_lines', 5)
+                        patterns = context_config.get('patterns', [])
+
+                        # Look at lines BEFORE this line
+                        start_idx = max(0, line_idx - max_lines)
+                        context_lines = section_lines[start_idx:line_idx]
+
+                        for pattern_config in patterns:
+                            pattern_regex = pattern_config.get('regex')
+                            pattern_fields = pattern_config.get('fields', [])
+
+                            if not pattern_regex or not pattern_fields:
+                                continue
+
+                            # Try to match against context lines (most recent first)
+                            for ctx_line in reversed(context_lines):
+                                ctx_match = re.search(pattern_regex, ctx_line)
+                                if ctx_match:
+                                    # Extract fields from context
+                                    for ctx_idx, ctx_field_name in enumerate(pattern_fields, start=1):
+                                        try:
+                                            ctx_value = ctx_match.group(ctx_idx).strip()
+                                            line_data[ctx_field_name] = ctx_value
+                                        except Exception as e:
+                                            logger.debug(f" ✗ Failed to extract context field {ctx_field_name}: {e}")
+                                    break  # Stop after first match for this pattern
+
+                    if line_data:
+                        all_lines.append(line_data)
+
+                logger.info(f" ✓ Extracted {len(all_lines)} line items")
+
+            except Exception as e:
+                logger.warning(f" ✗ Failed to extract lines: {e}")
+
+        return all_lines
+
+    def extract(self, text: str, template_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
+        """
+        Extract invoice data from text
+        If template_name is None, auto-detect template
+        """
+        try:
+            # Auto-detect template if not specified
+            if template_name is None:
+                template_name = self.match_template(text)
+                if template_name is None:
+                    return None
+
+            # Extract with template
+            result = self.extract_with_template(text, template_name)
+
+            logger.info(f"✅ Extracted {len(result)} fields using template: {template_name}")
+            return result
+
+        except Exception as e:
+            logger.error(f"❌ Extraction failed: {e}")
+            return None
+
+    def get_template_list(self) -> List[Dict[str, str]]:
+        """Get list of available templates"""
+        return [
+            {
+                'name': name,
+                'issuer': template.get('issuer'),
+                'country': template.get('country')
+            }
+            for name, template in self.templates.items()
+        ]
+
+
+# Singleton instance
+_invoice2data_service = None
+
+def get_invoice2data_service() -> Invoice2DataService:
+    """Get singleton instance of Invoice2Data service"""
+    global _invoice2data_service
+    if _invoice2data_service is None:
+        _invoice2data_service = Invoice2DataService()
+    return _invoice2data_service
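The separator handling above is easy to get wrong, so a standalone sketch may help. normalize_amount below is an illustrative helper (not part of this patch) that mirrors the same three-branch logic; note that a bare "1.234" remains ambiguous without knowing the locale:

    def normalize_amount(value: str, thousands_sep: str = '.', decimal_sep: str = ',') -> float:
        """Normalize a Danish- or English-formatted amount string to float."""
        value = value.replace(' ', '')
        if thousands_sep in value and decimal_sep in value:
            # Both present: strip thousands separator, convert decimal separator to '.'
            value = value.replace(thousands_sep, '').replace(decimal_sep, '.')
        elif thousands_sep in value:
            # Only thousands separator: just remove it
            value = value.replace(thousands_sep, '')
        elif decimal_sep in value and decimal_sep == ',':
            # Only a Danish decimal comma: convert to '.'
            value = value.replace(',', '.')
        return float(value)

    assert normalize_amount('1.234,56') == 1234.56  # Danish defaults
    assert normalize_amount('148,587.98', thousands_sep=',', decimal_sep='.') == 148587.98  # English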
diff --git a/app/services/ollama_service.py b/app/services/ollama_service.py
index ef6abff..66808c9 100644
--- a/app/services/ollama_service.py
+++ b/app/services/ollama_service.py
@@ -437,6 +437,130 @@ Output: {
         }
         return mime_types.get(suffix, 'application/octet-stream')
 
+    async def quick_analysis_on_upload(self, pdf_text: str) -> Dict:
+        """
+        Quick analysis when a file is uploaded - extracts critical fields only:
+        - CVR number (to match vendor)
+        - Document type (invoice vs credit note)
+        - Invoice/credit note number
+
+        This runs BEFORE template matching for early vendor detection.
+
+        Args:
+            pdf_text: Extracted text from PDF
+
+        Returns:
+            Dict with cvr, document_type, document_number, vendor_id, vendor_name, is_own_invoice
+        """
+        from app.core.config import settings
+
+        logger.info("⚡ Running quick analysis on upload...")
+
+        result = {
+            "cvr": None,
+            "document_type": None,  # 'invoice' or 'credit_note'
+            "document_number": None,
+            "vendor_id": None,
+            "vendor_name": None,
+            "is_own_invoice": False  # True if this is an outgoing invoice (BMC's own CVR)
+        }
+
+        # 1. FIND CVR NUMBER (8 digits)
+        # Look for patterns like "CVR: 12345678", "CVR-nr.: 12345678", "CVR 12345678"
+        # Important: supplier invoices carry BOTH the buyer's CVR (BMC=29522790) and the seller's.
+        # We need the SELLER's CVR (not BMC's own).
+
+        cvr_patterns = [
+            r'CVR[:\-\s]*(\d{8})',
+            r'CVR[:\-\s]*nr\.?\s*(\d{8})',
+            r'CVR[:\-\s]*nummer\s*(\d{8})',
+            r'SE[:\-\s]*(\d{8})',  # SE = Swedish CVR prefix, but also used in DK
+            r'\b(\d{8})\b'  # Fallback: any 8-digit number
+        ]
+
+        # Find ALL CVR numbers in the document
+        found_cvrs = []
+        for pattern in cvr_patterns:
+            matches = re.finditer(pattern, pdf_text, re.IGNORECASE)
+            for match in matches:
+                cvr_candidate = match.group(1)
+                # Validate it's a plausible CVR (starts with 1-4, not a random number)
+                if cvr_candidate[0] in '1234' and cvr_candidate not in found_cvrs:
+                    found_cvrs.append(cvr_candidate)
+
+        # Remove BMC's own CVR from the list (buyer CVR, not seller)
+        vendor_cvrs = [cvr for cvr in found_cvrs if cvr != settings.OWN_CVR]
+
+        if settings.OWN_CVR in found_cvrs:
+            # This is a proper invoice where BMC is the buyer
+            if len(vendor_cvrs) > 0:
+                # Found vendor CVR - use the first non-BMC CVR
+                result['cvr'] = vendor_cvrs[0]
+                logger.info(f"📋 Found vendor CVR: {vendor_cvrs[0]} (ignored BMC CVR: {settings.OWN_CVR})")
+
+                # Try to match vendor
+                vendor = self.match_vendor_by_cvr(vendor_cvrs[0])
+                if vendor:
+                    result['vendor_id'] = vendor['id']
+                    result['vendor_name'] = vendor['name']
+            else:
+                # Only BMC's CVR found = this is an outgoing invoice
+                result['is_own_invoice'] = True
+                result['cvr'] = settings.OWN_CVR
+                logger.warning("⚠️ OUTGOING INVOICE: Only BMC CVR found")
+        elif len(vendor_cvrs) > 0:
+            # No BMC CVR, but other CVRs found - use the first one
+            result['cvr'] = vendor_cvrs[0]
+            logger.info(f"📋 Found CVR: {vendor_cvrs[0]}")
+
+            vendor = self.match_vendor_by_cvr(vendor_cvrs[0])
+            if vendor:
+                result['vendor_id'] = vendor['id']
+                result['vendor_name'] = vendor['name']
+
+        # 2. DETECT DOCUMENT TYPE (Invoice vs Credit Note)
+        credit_keywords = [
+            'kreditnota', 'credit note', 'creditnote', 'kreditfaktura',
+            'refusion', 'tilbagebetaling', 'godtgørelse', 'tilbageførsel'
+        ]
+
+        text_lower = pdf_text.lower()
+        is_credit_note = any(keyword in text_lower for keyword in credit_keywords)
+
+        if is_credit_note:
+            result['document_type'] = 'credit_note'
+            logger.info("📄 Document type: CREDIT NOTE")
+        else:
+            result['document_type'] = 'invoice'
+            logger.info("📄 Document type: INVOICE")
+
+        # 3. EXTRACT DOCUMENT NUMBER
+        # For invoices: "Faktura nr.", "Invoice number:", "Fakturanr."
+        # For credit notes: "Kreditnota nr.", "Credit note number:"
+
+        if result['document_type'] == 'credit_note':
+            number_patterns = [
+                r'kreditnota\s*(?:nr\.?|nummer)[:\s]*(\S+)',
+                r'credit\s*note\s*(?:no\.?|number)[:\s]*(\S+)',
+                r'kreditfaktura\s*(?:nr\.?|nummer)[:\s]*(\S+)',
+            ]
+        else:
+            number_patterns = [
+                r'faktura\s*(?:nr\.?|nummer)[:\s]*(\S+)',
+                r'invoice\s*(?:no\.?|number)[:\s]*(\S+)',
+                r'fakturanr\.?\s*[:\s]*(\S+)',
+            ]
+
+        for pattern in number_patterns:
+            match = re.search(pattern, pdf_text, re.IGNORECASE)
+            if match:
+                result['document_number'] = match.group(1).strip()
+                logger.info(f"🔢 Document number: {result['document_number']}")
+                break
+
+        logger.info(f"✅ Quick analysis complete: CVR={result['cvr']}, Type={result['document_type']}, Number={result['document_number']}, Vendor={result['vendor_name']}")
+        return result
+
     def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]:
         """
         Match vendor from database using CVR number
@@ -459,7 +583,7 @@ Output: {
 
         # Search vendors table
         vendor = execute_query(
-            "SELECT * FROM vendors WHERE cvr = %s",
+            "SELECT * FROM vendors WHERE cvr_number = %s",
             (cvr_clean,),
            fetchone=True
         )
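To illustrate the buyer/seller disambiguation above, here is a self-contained sketch of the same filtering on a toy document. The single combined regex is a simplification of the pattern list in the real method, and OWN_CVR is hard-coded for the example (the real code reads it from settings):

    import re

    OWN_CVR = '29522790'  # BMC's own CVR; normally settings.OWN_CVR

    text = """
    ALSO Danmark A/S
    CVR-nr.: 35812428
    Faktureres til: BMC, CVR: 29522790
    """

    found = []
    for m in re.finditer(r'CVR[:\-\s]*(?:nr\.?\s*)?[:\s]*(\d{8})', text, re.IGNORECASE):
        cvr = m.group(1)
        if cvr[0] in '1234' and cvr not in found:
            found.append(cvr)

    vendor_cvrs = [c for c in found if c != OWN_CVR]
    print(found, '->', vendor_cvrs)  # ['35812428', '29522790'] -> ['35812428']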
diff --git a/app/services/template_service.py b/app/services/template_service.py
index 9e99b06..e8db2dd 100644
--- a/app/services/template_service.py
+++ b/app/services/template_service.py
@@ -1,6 +1,6 @@
 """
 Supplier Invoice Template Service
-Simple template-based invoice field extraction (no AI)
+Hybrid approach: invoice2data templates + custom regex templates
 Inspired by OmniSync's invoice template system
 """
 
@@ -11,6 +11,7 @@ from datetime import datetime
 from pathlib import Path
 
 from app.core.database import execute_query, execute_insert, execute_update
+from app.services.invoice2data_service import get_invoice2data_service
 
 logger = logging.getLogger(__name__)
 
@@ -21,12 +22,19 @@ class TemplateService:
     def __init__(self):
         self.templates_cache = {}
         self._initialized = False
+        self.invoice2data = None
 
     def _ensure_loaded(self):
         """Lazy load templates on first use"""
         if not self._initialized:
             logger.info("🔄 Lazy loading templates...")
             self._load_templates()
+            # Also load invoice2data templates
+            try:
+                self.invoice2data = get_invoice2data_service()
+                logger.info("✅ Invoice2Data service initialized")
+            except Exception as e:
+                logger.warning(f"⚠️ Failed to load invoice2data: {e}")
             self._initialized = True
 
     def _load_templates(self):
@@ -51,11 +59,24 @@ class TemplateService:
     def match_template(self, pdf_text: str) -> Tuple[Optional[int], float]:
         """
         Find best matching template for PDF text
+        First tries invoice2data templates, then falls back to custom templates
         Returns: (template_id, confidence_score)
         """
         self._ensure_loaded()  # Lazy load templates
 
-        logger.info(f"🔍 Matching against {len(self.templates_cache)} templates")
+        # Try invoice2data templates first
+        if self.invoice2data:
+            try:
+                template_name = self.invoice2data.match_template(pdf_text)
+                if template_name:
+                    logger.info(f"✅ Matched invoice2data template: {template_name}")
+                    # Return sentinel ID to indicate an invoice2data template
+                    return (-1, 1.0)  # -1 = invoice2data, 100% confidence
+            except Exception as e:
+                logger.warning(f"⚠️ Invoice2data matching failed: {e}")
+
+        # Fall back to custom templates
+        logger.info(f"🔍 Matching against {len(self.templates_cache)} custom templates")
 
         best_match = None
         best_score = 0.0
@@ -112,6 +133,19 @@
         """Extract invoice fields using template's regex patterns"""
         self._ensure_loaded()  # Lazy load templates
 
+        # Check if this is an invoice2data template
+        if template_id == -1:
+            if self.invoice2data:
+                try:
+                    result = self.invoice2data.extract(pdf_text)
+                    if result:
+                        logger.info("✅ Extracted fields using invoice2data")
+                        return result
+                except Exception as e:
+                    logger.error(f"❌ Invoice2data extraction failed: {e}")
+            return {}
+
+        # Use custom template
         template = self.templates_cache.get(template_id)
         if not template:
             logger.warning(f"⚠️ Template {template_id} not found in cache")
@@ -128,11 +162,36 @@
                 continue
 
             try:
-                match = re.search(pattern, pdf_text, re.IGNORECASE | re.MULTILINE)
-                if match and len(match.groups()) >= group:
-                    value = match.group(group).strip()
-                    extracted[field_name] = value
-                    logger.debug(f" ✓ {field_name}: {value}")
+                # Special handling for CVR to avoid extracting own CVR
+                if field_name == 'vendor_cvr':
+                    from app.core.config import settings
+                    own_cvr = getattr(settings, 'OWN_CVR', '29522790')
+
+                    # Find ALL CVR matches
+                    all_matches = list(re.finditer(pattern, pdf_text, re.IGNORECASE | re.MULTILINE))
+                    found_cvrs = []
+
+                    for match in all_matches:
+                        if len(match.groups()) >= group:
+                            cvr = match.group(group).strip()
+                            found_cvrs.append(cvr)
+
+                    # Filter out own CVR
+                    vendor_cvrs = [cvr for cvr in found_cvrs if cvr != own_cvr]
+
+                    if vendor_cvrs:
+                        # Use first non-own CVR as vendor CVR
+                        extracted[field_name] = vendor_cvrs[0]
+                        logger.debug(f" ✓ {field_name}: {vendor_cvrs[0]} (filtered out own CVR: {own_cvr})")
+                    else:
+                        logger.warning(f" ⚠️ Only found own CVR ({own_cvr}), no vendor CVR found")
+                else:
+                    # Normal extraction for other fields
+                    match = re.search(pattern, pdf_text, re.IGNORECASE | re.MULTILINE)
+                    if match and len(match.groups()) >= group:
+                        value = match.group(group).strip()
+                        extracted[field_name] = value
+                        logger.debug(f" ✓ {field_name}: {value}")
             except Exception as e:
                 logger.warning(f" ✗ Failed to extract {field_name}: {e}")
diff --git a/migrations/011_extraction_lines_context.sql b/migrations/011_extraction_lines_context.sql
new file mode 100644
index 0000000..f8c7d6d
--- /dev/null
+++ b/migrations/011_extraction_lines_context.sql
@@ -0,0 +1,18 @@
+-- Migration 011: Add context fields to extraction_lines
+-- These fields capture additional context information from invoice line items
+
+ALTER TABLE extraction_lines
+ADD COLUMN IF NOT EXISTS ip_address VARCHAR(50),
+ADD COLUMN IF NOT EXISTS contract_number VARCHAR(100),
+ADD COLUMN IF NOT EXISTS location_street VARCHAR(255),
+ADD COLUMN IF NOT EXISTS location_zip VARCHAR(10),
+ADD COLUMN IF NOT EXISTS location_city VARCHAR(100);
+
+-- Add index for contract number lookups
+CREATE INDEX IF NOT EXISTS idx_extraction_lines_contract_number ON extraction_lines(contract_number);
+
+COMMENT ON COLUMN extraction_lines.ip_address IS 'IP address/subnet from line context (e.g., 152.115.56.192/27)';
+COMMENT ON COLUMN extraction_lines.contract_number IS 'Contract number from line context (e.g., NKA-008225)';
+COMMENT ON COLUMN extraction_lines.location_street IS 'Street address from line context';
+COMMENT ON COLUMN extraction_lines.location_zip IS 'Zip code from line context';
+COMMENT ON COLUMN extraction_lines.location_city IS 'City from line context';
diff --git a/migrations/011_quick_analysis.sql b/migrations/011_quick_analysis.sql
new file mode 100644
index 0000000..38fb45a
--- /dev/null
+++ b/migrations/011_quick_analysis.sql
@@ -0,0 +1,19 @@
+-- Migration 011: Quick Analysis on Upload
+-- Adds fields to store automatic CVR, document type, and document number detection
+
+-- Add quick analysis fields to incoming_files
+ALTER TABLE incoming_files
+ADD COLUMN IF NOT EXISTS detected_cvr VARCHAR(8),
+ADD COLUMN IF NOT EXISTS detected_vendor_id INTEGER REFERENCES vendors(id),
+ADD COLUMN IF NOT EXISTS detected_document_type VARCHAR(20),  -- 'invoice' or 'credit_note'
+ADD COLUMN IF NOT EXISTS detected_document_number VARCHAR(100);
+
+-- Add indexes for CVR and vendor lookups
+CREATE INDEX IF NOT EXISTS idx_incoming_files_detected_cvr ON incoming_files(detected_cvr);
+CREATE INDEX IF NOT EXISTS idx_incoming_files_detected_vendor ON incoming_files(detected_vendor_id);
+
+-- Add comments
+COMMENT ON COLUMN incoming_files.detected_cvr IS 'Automatically detected CVR number from PDF text';
+COMMENT ON COLUMN incoming_files.detected_vendor_id IS 'Vendor matched by CVR on upload';
+COMMENT ON COLUMN incoming_files.detected_document_type IS 'Auto-detected: invoice or credit_note';
+COMMENT ON COLUMN incoming_files.detected_document_number IS 'Automatically extracted invoice/credit note number';
diff --git a/migrations/012_own_invoice_filter.sql b/migrations/012_own_invoice_filter.sql
new file mode 100644
index 0000000..5aab3b3
--- /dev/null
+++ b/migrations/012_own_invoice_filter.sql
@@ -0,0 +1,20 @@
+-- Migration 012: Add is_own_invoice flag to filter outgoing invoices
+-- BMC's own CVR: 29522790
+
+-- Add column to track outgoing invoices (BMC's own invoices to customers)
+ALTER TABLE incoming_files
+ADD COLUMN IF NOT EXISTS is_own_invoice BOOLEAN DEFAULT FALSE;
+
+-- Mark existing files with BMC's CVR as outgoing invoices
+UPDATE incoming_files
+SET is_own_invoice = TRUE
+WHERE detected_cvr = '29522790';
+
+-- Add partial index for faster filtering
+CREATE INDEX IF NOT EXISTS idx_incoming_files_is_own_invoice
+ON incoming_files(is_own_invoice)
+WHERE is_own_invoice = TRUE;
+
+-- Add comment
+COMMENT ON COLUMN incoming_files.is_own_invoice IS
+'TRUE if the file is an outgoing invoice from BMC (CVR 29522790), FALSE if it is a supplier invoice';
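Together with check_invoice_number_exists, the detected_document_number column enables a cheap local duplicate check before calling out to e-conomic. A minimal sketch under that assumption (find_local_duplicate is an illustrative helper, not part of this patch; it reuses the execute_query helper from app.core.database shown elsewhere in this diff):

    from app.core.database import execute_query

    def find_local_duplicate(document_number: str, exclude_file_id: int = None):
        """Return an earlier upload with the same detected document number, if any."""
        return execute_query(
            """SELECT file_id, filename FROM incoming_files
               WHERE detected_document_number = %s
                 AND (%s IS NULL OR file_id != %s)
               ORDER BY file_id LIMIT 1""",
            (document_number, exclude_file_id, exclude_file_id),
            fetchone=True
        )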
diff --git a/migrations/012_template_default_category.sql b/migrations/012_template_default_category.sql
new file mode 100644
index 0000000..35e32db
--- /dev/null
+++ b/migrations/012_template_default_category.sql
@@ -0,0 +1,13 @@
+-- Migration 012: Add default product category to templates
+-- Allows templates to specify a default category for line items (varesalg, drift, etc.)
+
+ALTER TABLE supplier_invoice_templates
+ADD COLUMN IF NOT EXISTS default_product_category VARCHAR(50) DEFAULT 'varesalg',
+ADD COLUMN IF NOT EXISTS default_product_group_number INTEGER;
+
+-- Valid categories: varesalg, drift, anlæg, abonnement, lager, udlejning
+COMMENT ON COLUMN supplier_invoice_templates.default_product_category IS 'Default category for line items: varesalg, drift, anlæg, abonnement, lager, udlejning';
+COMMENT ON COLUMN supplier_invoice_templates.default_product_group_number IS 'Default e-conomic product group number';
+
+-- Add index for category lookups
+CREATE INDEX IF NOT EXISTS idx_supplier_invoice_templates_category ON supplier_invoice_templates(default_product_category);
diff --git a/requirements.txt b/requirements.txt
index cc52be7..59aad77 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,5 @@ PyPDF2==3.0.1
 pdfplumber==0.11.4
 pytesseract==0.3.13
 Pillow==11.0.0
+invoice2data==0.4.4
+pyyaml==6.0.2
diff --git a/scripts/backfill_quick_analysis.py b/scripts/backfill_quick_analysis.py
new file mode 100644
index 0000000..1248381
--- /dev/null
+++ b/scripts/backfill_quick_analysis.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Backfill quick analysis for existing files
+"""
+import sys
+import asyncio
+from pathlib import Path
+
+# Add project root (parent of scripts/) to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from app.core.database import execute_query, execute_update, init_db
+from app.services.ollama_service import ollama_service
+
+
+async def backfill_quick_analysis():
+    """Run quick analysis on all files that don't have it"""
+
+    # Initialize database
+    init_db()
+
+    try:
+        # Get files without quick analysis
+        files = execute_query(
+            """SELECT file_id, filename, file_path
+               FROM incoming_files
+               WHERE (detected_cvr IS NULL OR detected_document_number IS NULL)
+               AND status NOT IN ('duplicate')
+               AND file_path IS NOT NULL
+               ORDER BY file_id DESC"""
+        )
+
+        print(f"📋 Found {len(files)} files without quick analysis")
+
+        success_count = 0
+        fail_count = 0
+
+        for file in files:
+            try:
+                file_path = Path(file['file_path'])
+
+                if not file_path.exists():
+                    print(f"⚠️ File not found: {file_path}")
+                    fail_count += 1
+                    continue
+
+                print(f"\n🔍 Processing: {file['filename']} (ID: {file['file_id']})")
+
+                # Extract text
+                text = await ollama_service._extract_text_from_file(file_path)
+
+                # Run quick analysis
+                quick_result = await ollama_service.quick_analysis_on_upload(text)
+
+                # Update database
+                execute_update(
+                    """UPDATE incoming_files
+                       SET detected_cvr = %s,
+                           detected_vendor_id = %s,
+                           detected_document_type = %s,
+                           detected_document_number = %s
+                       WHERE file_id = %s""",
+                    (quick_result.get('cvr'),
+                     quick_result.get('vendor_id'),
+                     quick_result.get('document_type'),
+                     quick_result.get('document_number'),
+                     file['file_id'])
+                )
+
+                print(f"✅ Updated: CVR={quick_result.get('cvr')}, "
+                      f"Type={quick_result.get('document_type')}, "
+                      f"Number={quick_result.get('document_number')}, "
+                      f"Vendor={quick_result.get('vendor_name')}")
+
+                success_count += 1
+
+            except Exception as e:
+                print(f"❌ Error processing {file['filename']}: {e}")
+                fail_count += 1
+
+        print(f"\n📊 Summary: {success_count} successful, {fail_count} failed")
+
+    except Exception as e:
+        print(f"❌ Fatal error: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    asyncio.run(backfill_quick_analysis())
diff --git a/static/design_templates/09_horizontal_dark/index.html b/static/design_templates/09_horizontal_dark/index.html
index 108e663..61403f7 100644
--- a/static/design_templates/09_horizontal_dark/index.html
+++ b/static/design_templates/09_horizontal_dark/index.html
@@ -1,4 +1,4 @@
-
+
diff --git a/test_quick_analysis.py b/test_quick_analysis.py
new file mode 100644
index 0000000..8a50437
--- /dev/null
+++ b/test_quick_analysis.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+"""
+Test Quick Analysis on Upload
+Tests CVR detection, document type, and invoice number extraction
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+# Add project root to path (this script lives in the repo root)
+sys.path.insert(0, str(Path(__file__).parent))
+
+from app.services.ollama_service import ollama_service
+
+async def test_quick_analysis():
+    """Test quick analysis with sample text"""
+
+    # Sample invoice text with CVR
+    sample_invoice = """
+    ALSO Danmark A/S
+    Jupitervej 4
+    6000 Kolding
+
+    CVR-nr.: 35812428
+
+    FAKTURA
+
+    Faktura nr.: INV-2024-12345
+    Dato: 2024-12-08
+
+    Beløb i alt: 5.965,18 DKK
+    """
+
+    # Sample credit note text
+    sample_credit_note = """
+    Test Leverandør A/S
+    CVR: 12345678
+
+    KREDITNOTA
+
+    Kreditnota nr.: CN-2024-5678
+    Original faktura: INV-2024-1000
+
+    Beløb: -1.234,56 DKK
+    """
+
+    print("🧪 Testing Quick Analysis\n")
+    print("=" * 60)
+
+    # Test 1: Invoice with CVR
+    print("\n📄 TEST 1: Invoice with CVR")
+    print("-" * 60)
+    result1 = await ollama_service.quick_analysis_on_upload(sample_invoice)
+    print(f"CVR: {result1['cvr']}")
+    print(f"Document Type: {result1['document_type']}")
+    print(f"Document Number: {result1['document_number']}")
+    print(f"Vendor ID: {result1['vendor_id']}")
+    print(f"Vendor Name: {result1['vendor_name']}")
+
+    assert result1['cvr'] == '35812428', f"Expected CVR 35812428, got {result1['cvr']}"
+    assert result1['document_type'] == 'invoice', f"Expected invoice, got {result1['document_type']}"
+    assert result1['document_number'] == 'INV-2024-12345', f"Expected INV-2024-12345, got {result1['document_number']}"
+    print("✅ Test 1 PASSED")
+
+    # Test 2: Credit Note
+    print("\n📄 TEST 2: Credit Note")
+    print("-" * 60)
+    result2 = await ollama_service.quick_analysis_on_upload(sample_credit_note)
+    print(f"CVR: {result2['cvr']}")
+    print(f"Document Type: {result2['document_type']}")
+    print(f"Document Number: {result2['document_number']}")
+    print(f"Vendor ID: {result2['vendor_id']}")
+    print(f"Vendor Name: {result2['vendor_name']}")
+
+    assert result2['cvr'] == '12345678', f"Expected CVR 12345678, got {result2['cvr']}"
+    assert result2['document_type'] == 'credit_note', f"Expected credit_note, got {result2['document_type']}"
+    assert result2['document_number'] == 'CN-2024-5678', f"Expected CN-2024-5678, got {result2['document_number']}"
+    print("✅ Test 2 PASSED")
+
+    print("\n" + "=" * 60)
+    print("✅ ALL TESTS PASSED!")
+    print("=" * 60)
+
+if __name__ == "__main__":
+    asyncio.run(test_quick_analysis())
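For completeness, the intended end-to-end flow of the new services can be sketched as follows (illustrative wiring only, using the APIs introduced in this patch; process_upload is a hypothetical entry point, and pdf_text is assumed to come from the project's existing PDF text extraction):

    from app.services.invoice2data_service import get_invoice2data_service
    from app.services.ollama_service import ollama_service

    async def process_upload(pdf_text: str):
        """Illustrative pipeline: quick analysis first, then template extraction."""
        # 1. Early CVR / document-type detection (runs before template matching)
        quick = await ollama_service.quick_analysis_on_upload(pdf_text)
        if quick['is_own_invoice']:
            return {'skipped': 'outgoing invoice (own CVR)'}

        # 2. Template-based field extraction (auto-detects the template)
        service = get_invoice2data_service()
        extracted = service.extract(pdf_text)  # None if no template matched

        return {'quick': quick, 'extracted': extracted}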