diff --git a/app/billing/backend/supplier_invoices.py b/app/billing/backend/supplier_invoices.py index e4a4b25..d6e3888 100644 --- a/app/billing/backend/supplier_invoices.py +++ b/app/billing/backend/supplier_invoices.py @@ -13,6 +13,7 @@ from app.core.config import settings from app.services.economic_service import get_economic_service from app.services.ollama_service import ollama_service from app.services.template_service import template_service +from app.services.invoice2data_service import get_invoice2data_service import logging import os import re @@ -232,15 +233,25 @@ async def get_pending_files(): f.error_message, f.template_id, f.file_path, + -- Quick analysis results (available immediately on upload) + f.detected_cvr, + f.detected_vendor_id, + f.detected_document_type, + f.detected_document_number, + f.is_own_invoice, + v_detected.name as detected_vendor_name, + v_detected.cvr_number as detected_vendor_cvr, -- Get vendor info from latest extraction ext.vendor_name, ext.vendor_cvr, ext.vendor_matched_id, v.name as matched_vendor_name, + v.cvr_number as matched_vendor_cvr_number, -- Check if already has invoice via latest extraction only si.id as existing_invoice_id, si.invoice_number as existing_invoice_number FROM incoming_files f + LEFT JOIN vendors v_detected ON v_detected.id = f.detected_vendor_id LEFT JOIN LATERAL ( SELECT extraction_id, file_id, vendor_name, vendor_cvr, vendor_matched_id FROM extractions @@ -250,16 +261,82 @@ async def get_pending_files(): ) ext ON true LEFT JOIN vendors v ON v.id = ext.vendor_matched_id LEFT JOIN supplier_invoices si ON si.extraction_id = ext.extraction_id - WHERE f.status IN ('pending', 'processing', 'failed', 'ai_extracted', 'processed') + WHERE f.status IN ('pending', 'processing', 'failed', 'ai_extracted', 'processed', 'duplicate') AND si.id IS NULL -- Only show files without invoice yet ORDER BY f.file_id, f.uploaded_at DESC""" ) + + # Convert to regular dicts so we can add new keys + files = [dict(file) for 
file in files] if files else [] + + # Check for invoice2data templates for each file + try: + from app.services.invoice2data_service import get_invoice2data_service + invoice2data = get_invoice2data_service() + logger.info(f"📋 Checking invoice2data templates: {len(invoice2data.templates)} loaded") + + for file in files: + # Check if there's an invoice2data template for this vendor's CVR + vendor_cvr = file.get('matched_vendor_cvr_number') or file.get('detected_vendor_cvr') or file.get('vendor_cvr') + file['has_invoice2data_template'] = False + + logger.debug(f" File {file['file_id']}: CVR={vendor_cvr}") + + if vendor_cvr: + # Check all templates for this CVR in keywords + for template_name, template_data in invoice2data.templates.items(): + keywords = template_data.get('keywords', []) + logger.debug(f" Template {template_name}: keywords={keywords}") + if str(vendor_cvr) in [str(k) for k in keywords]: + file['has_invoice2data_template'] = True + file['invoice2data_template_name'] = template_name + logger.info(f" ✅ File {file['file_id']} matched template: {template_name}") + break + except Exception as e: + logger.error(f"❌ Failed to check invoice2data templates: {e}", exc_info=True) + # Continue without invoice2data info + return {"files": files if files else [], "count": len(files) if files else 0} except Exception as e: logger.error(f"❌ Failed to get pending files: {e}") raise HTTPException(status_code=500, detail=str(e)) +@router.get("/supplier-invoices/files/{file_id}/pdf-text") +async def get_file_pdf_text(file_id: int): + """Hent fuld PDF tekst fra en uploaded fil (til template builder)""" + try: + # Get file info + file_info = execute_query( + "SELECT file_path, filename FROM incoming_files WHERE file_id = %s", + (file_id,), + fetchone=True + ) + + if not file_info: + raise HTTPException(status_code=404, detail="Fil ikke fundet") + + # Read PDF text + from pathlib import Path + file_path = Path(file_info['file_path']) + if not file_path.exists(): + raise 
HTTPException(status_code=404, detail=f"Fil ikke fundet på disk: {file_path}") + + pdf_text = await ollama_service._extract_text_from_file(file_path) + + return { + "file_id": file_id, + "filename": file_info['filename'], + "pdf_text": pdf_text + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"❌ Failed to get PDF text: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + @router.get("/supplier-invoices/files/{file_id}/extracted-data") async def get_file_extracted_data(file_id: int): """Hent AI-extracted data fra en uploaded fil""" @@ -758,8 +835,9 @@ async def create_invoice_from_extraction(file_id: int): @router.get("/supplier-invoices/templates") async def list_templates(): - """Hent alle templates""" + """Hent alle templates (både database og invoice2data YAML)""" try: + # Get database templates query = """ SELECT t.*, v.name as vendor_name FROM supplier_invoice_templates t @@ -767,9 +845,55 @@ async def list_templates(): WHERE t.is_active = true ORDER BY t.created_at DESC """ - templates = execute_query(query) + db_templates = execute_query(query) or [] - return templates if templates else [] + # Get invoice2data templates + invoice2data_service = get_invoice2data_service() + invoice2data_templates = [] + + for template_name, template_data in invoice2data_service.templates.items(): + # Extract vendor CVR from keywords + vendor_cvr = None + keywords = template_data.get('keywords', []) + for keyword in keywords: + if isinstance(keyword, str) and keyword.isdigit() and len(keyword) == 8: + vendor_cvr = keyword + break + + # Get vendor info from database if CVR found + vendor_name = template_data.get('issuer', 'Ukendt') + vendor_id = None + if vendor_cvr: + vendor = execute_query( + "SELECT id, name FROM vendors WHERE cvr_number = %s", + (vendor_cvr,), + fetchone=True + ) + if vendor: + vendor_id = vendor['id'] + vendor_name = vendor['name'] + + invoice2data_templates.append({ + 'template_id': -1, # Negative ID to distinguish 
from DB templates + 'template_name': f"Invoice2Data: {template_name}", + 'template_type': 'invoice2data', + 'yaml_filename': template_name, + 'vendor_id': vendor_id, + 'vendor_name': vendor_name, + 'vendor_cvr': vendor_cvr, + 'default_product_category': template_data.get('default_product_category', 'varesalg'), + 'default_product_group_number': template_data.get('default_product_group_number', 1), + 'usage_count': 0, # Could track this separately + 'is_active': True, + 'detection_patterns': keywords, + 'field_mappings': template_data.get('fields', {}), + 'created_at': None + }) + + # Combine both types + all_templates = db_templates + invoice2data_templates + + return all_templates except Exception as e: logger.error(f"❌ Failed to list templates: {e}") raise HTTPException(status_code=500, detail=str(e)) @@ -978,6 +1102,7 @@ async def create_template(request: Dict): template_name = request.get('template_name') detection_patterns = request.get('detection_patterns', []) field_mappings = request.get('field_mappings', {}) + default_product_category = request.get('default_product_category', 'varesalg') if not vendor_id or not template_name: raise HTTPException(status_code=400, detail="vendor_id og template_name er påkrævet") @@ -996,11 +1121,11 @@ async def create_template(request: Dict): # Insert template and get template_id query = """ INSERT INTO supplier_invoice_templates - (vendor_id, template_name, detection_patterns, field_mappings) - VALUES (%s, %s, %s, %s) + (vendor_id, template_name, detection_patterns, field_mappings, default_product_category) + VALUES (%s, %s, %s, %s, %s) RETURNING template_id """ - result = execute_query(query, (vendor_id, template_name, json.dumps(detection_patterns), json.dumps(field_mappings))) + result = execute_query(query, (vendor_id, template_name, json.dumps(detection_patterns), json.dumps(field_mappings), default_product_category)) template_id = result[0]['template_id'] if result else None if not template_id: @@ -1657,6 +1782,97 @@ 
async def upload_supplier_invoice(file: UploadFile = File(...)): logger.info(f"📄 Extracting text from {final_path.suffix}...") text = await ollama_service._extract_text_from_file(final_path) + # QUICK ANALYSIS: Extract CVR, document type, invoice number IMMEDIATELY + logger.info(f"⚡ Running quick analysis...") + quick_result = await ollama_service.quick_analysis_on_upload(text) + + # Update file record with quick analysis results + execute_update( + """UPDATE incoming_files + SET detected_cvr = %s, + detected_vendor_id = %s, + detected_document_type = %s, + detected_document_number = %s, + is_own_invoice = %s + WHERE file_id = %s""", + (quick_result.get('cvr'), + quick_result.get('vendor_id'), + quick_result.get('document_type'), + quick_result.get('document_number'), + quick_result.get('is_own_invoice', False), + file_id) + ) + + logger.info(f"📋 Quick analysis saved: CVR={quick_result.get('cvr')}, " + f"Vendor={quick_result.get('vendor_name')}, " + f"Type={quick_result.get('document_type')}, " + f"Number={quick_result.get('document_number')}") + + # DUPLICATE CHECK: Check if invoice number already exists + document_number = quick_result.get('document_number') + if document_number: + logger.info(f"🔍 Checking for duplicate invoice number: {document_number}") + + # Check 1: Search in local database (supplier_invoices table) + existing_invoice = execute_query( + """SELECT si.id, si.invoice_number, si.created_at, v.name as vendor_name + FROM supplier_invoices si + LEFT JOIN vendors v ON v.id = si.vendor_id + WHERE si.invoice_number = %s + ORDER BY si.created_at DESC + LIMIT 1""", + (document_number,), + fetchone=True + ) + + if existing_invoice: + # DUPLICATE FOUND IN DATABASE + logger.error(f"🚫 DUPLICATE: Invoice {document_number} already exists in database (ID: {existing_invoice['id']})") + + # Mark file as duplicate + execute_update( + """UPDATE incoming_files + SET status = 'duplicate', + error_message = %s, + processed_at = CURRENT_TIMESTAMP + WHERE file_id = 
%s""", + (f"DUBLET: Fakturanummer {document_number} findes allerede i systemet (Faktura #{existing_invoice['id']}, {existing_invoice['vendor_name'] or 'Ukendt leverandør'})", + file_id) + ) + + raise HTTPException( + status_code=409, # 409 Conflict + detail=f"🚫 DUBLET: Fakturanummer {document_number} findes allerede i systemet (Faktura #{existing_invoice['id']}, oprettet {existing_invoice['created_at'].strftime('%d-%m-%Y')})" + ) + + # Check 2: Search in e-conomic (if configured) + from app.services.economic_service import economic_service + if hasattr(economic_service, 'app_secret_token') and economic_service.app_secret_token: + logger.info(f"🔍 Checking e-conomic for invoice number: {document_number}") + economic_duplicate = await economic_service.check_invoice_number_exists(document_number) + + if economic_duplicate: + # DUPLICATE FOUND IN E-CONOMIC + logger.error(f"🚫 DUPLICATE: Invoice {document_number} found in e-conomic (Voucher #{economic_duplicate.get('voucher_number')})") + + # Mark file as duplicate + execute_update( + """UPDATE incoming_files + SET status = 'duplicate', + error_message = %s, + processed_at = CURRENT_TIMESTAMP + WHERE file_id = %s""", + (f"DUBLET: Fakturanummer {document_number} findes i e-conomic (Bilag #{economic_duplicate.get('voucher_number')})", + file_id) + ) + + raise HTTPException( + status_code=409, # 409 Conflict + detail=f"🚫 DUBLET: Fakturanummer {document_number} findes i e-conomic (Bilag #{economic_duplicate.get('voucher_number')}, {economic_duplicate.get('date')})" + ) + + logger.info(f"✅ No duplicate found for invoice {document_number}") + # Try template matching logger.info(f"📋 Matching template...") template_id, confidence = template_service.match_template(text) @@ -1699,7 +1915,8 @@ async def upload_supplier_invoice(file: UploadFile = File(...)): """INSERT INTO extraction_lines (extraction_id, line_number, description, quantity, unit_price, line_total, vat_rate, vat_note, confidence) - VALUES (%s, %s, %s, %s, %s, %s, %s, 
%s, %s)""", + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) + RETURNING line_id""", (extraction_id, idx, line.get('description'), line.get('quantity'), line.get('unit_price'), line.get('line_total'), line.get('vat_rate'), @@ -1744,13 +1961,41 @@ async def upload_supplier_invoice(file: UploadFile = File(...)): "confidence": confidence, "extracted_fields": extracted_fields, "pdf_text": text[:500], # First 500 chars for reference + # Quick analysis results (available IMMEDIATELY on upload) + "quick_analysis": { + "cvr": quick_result.get('cvr'), + "vendor_id": quick_result.get('vendor_id'), + "vendor_name": quick_result.get('vendor_name'), + "document_type": quick_result.get('document_type'), + "document_number": quick_result.get('document_number') + }, "message": "Upload gennemført - gennemgå og bekræft data" } - except HTTPException: + except HTTPException as he: + # Mark file as failed if we have file_id + if 'file_id' in locals(): + execute_update( + """UPDATE incoming_files + SET status = 'failed', + error_message = %s, + processed_at = CURRENT_TIMESTAMP + WHERE file_id = %s""", + (str(he.detail), file_id) + ) raise except Exception as e: logger.error(f"❌ Upload failed (inner): {e}", exc_info=True) + # Mark file as failed if we have file_id + if 'file_id' in locals(): + execute_update( + """UPDATE incoming_files + SET status = 'failed', + error_message = %s, + processed_at = CURRENT_TIMESTAMP + WHERE file_id = %s""", + (str(e), file_id) + ) raise HTTPException(status_code=500, detail=f"Upload fejlede: {str(e)}") except HTTPException: @@ -1809,51 +2054,174 @@ async def reprocess_uploaded_file(file_id: int): logger.info(f"✅ Matched template {template_id} ({confidence:.0%})") extracted_fields = template_service.extract_fields(text, template_id) - template = template_service.templates_cache.get(template_id) - if template: - vendor_id = template.get('vendor_id') + # Check if this is an invoice2data template (ID -1) + is_invoice2data = (template_id == -1) - 
template_service.log_usage(template_id, file_id, True, confidence, extracted_fields) + if is_invoice2data: + # Invoice2data doesn't have vendor in cache + logger.info(f"📋 Using invoice2data template") + # Try to find vendor from extracted CVR + if extracted_fields.get('vendor_vat'): + vendor = execute_query( + "SELECT id FROM vendors WHERE cvr_number = %s", + (extracted_fields['vendor_vat'],), + fetchone=True + ) + if vendor: + vendor_id = vendor['id'] + + # Store invoice2data extraction in database + extraction_id = execute_insert( + """INSERT INTO extractions + (file_id, vendor_matched_id, vendor_name, vendor_cvr, + document_id, document_date, due_date, document_type, document_type_detected, + total_amount, currency, confidence, llm_response_json, status) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + RETURNING extraction_id""", + (file_id, vendor_id, + extracted_fields.get('issuer'), # vendor_name + extracted_fields.get('vendor_vat'), # vendor_cvr + str(extracted_fields.get('invoice_number')), # document_id + extracted_fields.get('invoice_date'), # document_date + extracted_fields.get('due_date'), + 'invoice', # document_type + 'invoice', # document_type_detected + extracted_fields.get('amount_total'), + extracted_fields.get('currency', 'DKK'), + 1.0, # invoice2data always 100% confidence + json.dumps(extracted_fields), # llm_response_json + 'extracted') # status + ) + + # Insert line items if extracted + if extracted_fields.get('lines'): + for idx, line in enumerate(extracted_fields['lines'], start=1): + execute_insert( + """INSERT INTO extraction_lines + (extraction_id, line_number, description, quantity, unit_price, + line_total, vat_rate, vat_note, confidence, + ip_address, contract_number, location_street, location_zip, location_city) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + RETURNING line_id""", + (extraction_id, idx, line.get('description'), + line.get('quantity'), line.get('unit_price'), + 
line.get('line_total'), None, None, 1.0, + line.get('ip_address'), line.get('contract_number'), + line.get('location_street'), line.get('location_zip'), line.get('location_city')) + ) + logger.info(f"✅ Saved {len(extracted_fields['lines'])} line items") + else: + # Custom template from database + template = template_service.templates_cache.get(template_id) + if template: + vendor_id = template.get('vendor_id') + + template_service.log_usage(template_id, file_id, True, confidence, extracted_fields) + # Update file - use NULL for invoice2data templates to avoid FK constraint + db_template_id = None if is_invoice2data else template_id execute_update( """UPDATE incoming_files SET status = 'processed', template_id = %s, processed_at = CURRENT_TIMESTAMP WHERE file_id = %s""", - (template_id, file_id) + (db_template_id, file_id) ) else: - # NO AI FALLBACK - Require template matching - logger.warning(f"⚠️ Ingen template match (confidence: {confidence:.0%}) - afviser fil") + # FALLBACK TO AI EXTRACTION + logger.info(f"⚠️ Ingen template match (confidence: {confidence:.0%}) - bruger AI extraction") + # Use detected vendor from quick analysis if available + vendor_id = file_record.get('detected_vendor_id') + + # Call Ollama for full extraction + logger.info(f"🤖 Calling Ollama for AI extraction...") + llm_result = await ollama_service.extract_from_text(text) + + if not llm_result or 'error' in llm_result: + error_msg = llm_result.get('error') if llm_result else 'AI extraction fejlede' + logger.error(f"❌ AI extraction failed: {error_msg}") + + execute_update( + """UPDATE incoming_files + SET status = 'failed', + error_message = %s, + processed_at = CURRENT_TIMESTAMP + WHERE file_id = %s""", + (f"AI extraction fejlede: {error_msg}", file_id) + ) + + raise HTTPException(status_code=500, detail=f"AI extraction fejlede: {error_msg}") + + extracted_fields = llm_result + confidence = llm_result.get('confidence', 0.75) + + # Store AI extracted data in extractions table + extraction_id 
= execute_insert( + """INSERT INTO supplier_invoice_extractions + (file_id, vendor_id, invoice_number, invoice_date, due_date, + total_amount, currency, document_type, confidence, llm_data) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING extraction_id""", + (file_id, vendor_id, + llm_result.get('invoice_number'), + llm_result.get('invoice_date'), + llm_result.get('due_date'), + llm_result.get('total_amount'), + llm_result.get('currency', 'DKK'), + llm_result.get('document_type'), + confidence, + json.dumps(llm_result)) + ) + + # Insert line items if extracted + if llm_result.get('lines'): + for idx, line in enumerate(llm_result['lines'], start=1): + execute_insert( + """INSERT INTO extraction_lines + (extraction_id, line_number, description, quantity, unit_price, + line_total, vat_rate, vat_note, confidence) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) + RETURNING line_id""", + (extraction_id, idx, line.get('description'), + line.get('quantity'), line.get('unit_price'), + line.get('line_total'), line.get('vat_rate'), + line.get('vat_note'), confidence) + ) + + # Update file status to ai_extracted execute_update( """UPDATE incoming_files - SET status = 'failed', - error_message = 'Ingen template match - opret template for denne leverandør', - processed_at = CURRENT_TIMESTAMP + SET status = 'ai_extracted', processed_at = CURRENT_TIMESTAMP WHERE file_id = %s""", (file_id,) ) - return { - "status": "failed", - "file_id": file_id, - "error": "Ingen template match - opret template for denne leverandør", - "confidence": confidence - } + logger.info(f"✅ AI extraction completed for file {file_id}") - # Return success with template data - return { + # Return success with template data or AI extraction result + result = { "status": "success", "file_id": file_id, "filename": file_record['filename'], "template_matched": template_id is not None, "template_id": template_id, "vendor_id": vendor_id, - "confidence": confidence if template_id else 0.8, + "confidence": 
confidence if template_id else llm_result.get('confidence', 0.75), "extracted_fields": extracted_fields, "pdf_text": text[:1000] if not template_id else text } + # Add warning if no template exists + if not template_id and vendor_id: + vendor = execute_query( + "SELECT name FROM vendors WHERE id = %s", + (vendor_id,), + fetchone=True + ) + if vendor: + result["warning"] = f"⚠️ Ingen template fundet for {vendor['name']} - brugte AI extraction (langsommere)" + + return result + except HTTPException: raise except Exception as e: @@ -1866,6 +2234,7 @@ async def update_template( template_name: Optional[str] = None, detection_patterns: Optional[List[Dict]] = None, field_mappings: Optional[Dict] = None, + default_product_category: Optional[str] = None, is_active: Optional[bool] = None ): """Opdater eksisterende template""" @@ -1884,6 +2253,9 @@ async def update_template( if field_mappings is not None: updates.append("field_mappings = %s") params.append(json.dumps(field_mappings)) + if default_product_category is not None: + updates.append("default_product_category = %s") + params.append(default_product_category) if is_active is not None: updates.append("is_active = %s") params.append(is_active) @@ -1911,6 +2283,114 @@ async def update_template( raise HTTPException(status_code=500, detail=str(e)) +@router.post("/supplier-invoices/templates/invoice2data/{template_name}/test") +async def test_invoice2data_template(template_name: str, request: Dict): + """ + Test invoice2data YAML template mod PDF tekst + + Request body: + { + "pdf_text": "Full PDF text content..." 
+ } + + Returns samme format som test_template endpoint + """ + try: + pdf_text = request.get('pdf_text', '') + if not pdf_text: + raise HTTPException(status_code=400, detail="pdf_text er påkrævet") + + # Get invoice2data service + invoice2data_service = get_invoice2data_service() + + # Check if template exists + if template_name not in invoice2data_service.templates: + raise HTTPException(status_code=404, detail=f"Template '{template_name}' ikke fundet") + + template_data = invoice2data_service.templates[template_name] + + # Test extraction + result = invoice2data_service.extract_with_template(pdf_text, template_name) + + if not result: + # Template didn't match + keywords = template_data.get('keywords', []) + detection_results = [] + for keyword in keywords: + found = str(keyword).lower() in pdf_text.lower() + detection_results.append({ + "pattern": str(keyword), + "type": "keyword", + "found": found, + "weight": 0.5 + }) + + return { + "matched": False, + "confidence": 0.0, + "extracted_fields": {}, + "line_items": [], + "detection_results": detection_results, + "template_name": template_name, + "error": "Template matchede ikke PDF'en" + } + + # Extract line items + line_items = [] + if 'lines' in result: + for line in result['lines']: + line_items.append({ + "line_number": line.get('line_number', ''), + "item_number": line.get('item_number', ''), + "description": line.get('description_raw', '') or line.get('description', ''), + "quantity": line.get('quantity', ''), + "unit_price": line.get('unit_price', ''), + "line_total": line.get('line_total', ''), + # Context fields (circuit/location info) + "circuit_id": line.get('circuit_id', ''), + "ip_address": line.get('ip_address', ''), + "contract_number": line.get('contract_number', ''), + "location_street": line.get('location_street', ''), + "location_zip": line.get('location_zip', ''), + "location_city": line.get('location_city', ''), + }) + + # Build detection results + keywords = template_data.get('keywords', 
[]) + detection_results = [] + matched_count = 0 + for keyword in keywords: + found = str(keyword).lower() in pdf_text.lower() + if found: + matched_count += 1 + detection_results.append({ + "pattern": str(keyword), + "type": "keyword", + "found": found, + "weight": 0.5 + }) + + confidence = matched_count / len(keywords) if keywords else 1.0 + + # Remove 'lines' from extracted_fields to avoid duplication + extracted_fields = {k: v for k, v in result.items() if k != 'lines'} + + return { + "matched": True, + "confidence": confidence, + "extracted_fields": extracted_fields, + "line_items": line_items, + "detection_results": detection_results, + "template_name": template_name + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"❌ Invoice2data template test failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + + @router.post("/supplier-invoices/templates/{template_id}/test") async def test_template(template_id: int, request: Dict): """ @@ -2076,6 +2556,102 @@ async def test_template(template_id: int, request: Dict): raise HTTPException(status_code=500, detail=str(e)) +@router.put("/supplier-invoices/templates/invoice2data/{template_name}/category") +async def update_yaml_category(template_name: str, request: Dict): + """ + Opdater default_product_category i YAML template fil + + Request body: + { + "category": "drift" // varesalg, drift, anlæg, abonnement, lager, udlejning + } + """ + try: + import yaml + from pathlib import Path + + new_category = request.get('category') + if not new_category: + raise HTTPException(status_code=400, detail="category er påkrævet") + + # Validate category + valid_categories = ['varesalg', 'drift', 'anlæg', 'abonnement', 'lager', 'udlejning'] + if new_category not in valid_categories: + raise HTTPException(status_code=400, detail=f"Ugyldig kategori. 
Skal være en af: {', '.join(valid_categories)}") + + # Find YAML file + templates_dir = Path(__file__).parent.parent.parent.parent / 'data' / 'invoice_templates' + yaml_file = templates_dir / f"{template_name}.yml" + + if not yaml_file.exists(): + raise HTTPException(status_code=404, detail=f"YAML fil ikke fundet: {template_name}.yml") + + # Load YAML + with open(yaml_file, 'r', encoding='utf-8') as f: + template_data = yaml.safe_load(f) + + # Update category + template_data['default_product_category'] = new_category + + # Save YAML with preserved formatting + with open(yaml_file, 'w', encoding='utf-8') as f: + yaml.dump(template_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + # Reload invoice2data service to pick up changes + invoice2data_service = get_invoice2data_service() + invoice2data_service.__init__() # Reinitialize to reload templates + + logger.info(f"✅ Updated category for {template_name}.yml to {new_category}") + + return { + "message": "Kategori opdateret", + "template_name": template_name, + "new_category": new_category + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"❌ Failed to update YAML category: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/supplier-invoices/templates/invoice2data/{template_name}/content") +async def get_yaml_content(template_name: str): + """ + Hent råt YAML indhold fra template fil + + Returns: + { + "content": "issuer: DCS ApS\nkeywords: ..." 
+ } + """ + try: + from pathlib import Path + + # Find template file + template_dir = Path("data/invoice_templates") + template_file = template_dir / f"{template_name}.yml" + + if not template_file.exists(): + raise HTTPException(status_code=404, detail=f"Template fil ikke fundet: {template_name}.yml") + + # Read file content + content = template_file.read_text(encoding='utf-8') + + return { + "template_name": template_name, + "filename": f"{template_name}.yml", + "content": content + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"❌ Failed to read YAML content: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + + @router.delete("/supplier-invoices/templates/{template_id}") async def delete_template(template_id: int): """Slet template (soft delete - sæt is_active=false)""" diff --git a/app/billing/frontend/supplier_invoices.html b/app/billing/frontend/supplier_invoices.html index 0bcf4cf..a43bd39 100644 --- a/app/billing/frontend/supplier_invoices.html +++ b/app/billing/frontend/supplier_invoices.html @@ -163,7 +163,7 @@
| + + | Fakturanr. | Leverandør | Fakturadato | @@ -217,7 +240,7 @@||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| + |
Indlæser...
@@ -238,18 +261,43 @@
-
+
+
+
+
📁 Uploadede filer afventer behandling+⏳ Filer der mangler behandling
| ||||||||||||||||||||||||||||||||||||
diff --git a/app/services/economic_service.py b/app/services/economic_service.py index f7cb6b2..c5d6375 100644 --- a/app/services/economic_service.py +++ b/app/services/economic_service.py @@ -271,6 +271,54 @@ class EconomicService: # ========== KASSEKLADDE (JOURNALS/VOUCHERS) ========== + async def check_invoice_number_exists(self, invoice_number: str, journal_number: Optional[int] = None) -> Optional[Dict]: + """ + Check if an invoice number already exists in e-conomic journals + + Args: + invoice_number: Invoice number to check + journal_number: Optional specific journal to search (if None, searches all) + + Returns: + Dict with voucher info if found, None otherwise + """ + try: + # Search in vouchers (posted journal entries) + url = f"{self.api_url}/vouchers" + params = { + 'filter': f'voucherNumber${invoice_number}', # e-conomic filter syntax + 'pagesize': 100 + } + + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=self._get_headers(), params=params) as response: + if response.status != 200: + logger.warning(f"⚠️ Failed to search vouchers: {response.status}") + return None + + data = await response.json() + vouchers = data.get('collection', []) + + # Check if any voucher matches the invoice number + for voucher in vouchers: + # Check if invoice number appears in voucher text or entries + if invoice_number in str(voucher): + logger.warning(f"⚠️ Invoice number {invoice_number} found in e-conomic voucher #{voucher.get('voucherNumber')}") + return { + 'found_in': 'e-conomic', + 'voucher_number': voucher.get('voucherNumber'), + 'date': voucher.get('date'), + 'journal': voucher.get('journal', {}).get('journalNumber') + } + + logger.info(f"✅ Invoice number {invoice_number} not found in e-conomic") + return None + + except Exception as e: + logger.error(f"❌ Error checking invoice number in e-conomic: {e}") + # Don't block on e-conomic errors - assume not found + return None + async def get_supplier_invoice_journals(self) -> list: 
"""
Invoice2Data Service
Wrapper around invoice2data-style YAML templates for template-based invoice extraction.
"""

import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

# Buyer's own CVR (BMC); filtered out when searching for the *seller's* CVR.
OWN_CVR = '29522790'

# Danish -> English month names so datetime.strptime can parse Danish dates.
_DANISH_MONTHS = {
    'januar': 'January', 'februar': 'February', 'marts': 'March',
    'april': 'April', 'maj': 'May', 'juni': 'June',
    'juli': 'July', 'august': 'August', 'september': 'September',
    'oktober': 'October', 'november': 'November', 'december': 'December',
}


class Invoice2DataService:
    """Service for extracting invoice data using invoice2data-style YAML templates."""

    def __init__(self):
        # Templates live in <repo>/data/invoice_templates/*.yml
        self.template_dir = Path(__file__).parent.parent.parent / "data" / "invoice_templates"
        self.templates = self._load_templates()
        logger.info(f"📋 Loaded {len(self.templates)} invoice2data templates")

    def _load_templates(self) -> Dict[str, Dict]:
        """Load all YAML templates from the template directory, keyed by file stem."""
        templates: Dict[str, Dict] = {}

        if not self.template_dir.exists():
            logger.warning(f"Template directory not found: {self.template_dir}")
            return templates

        try:
            # Lazy import: PyYAML is only needed when templates are actually loaded,
            # so the module itself imports cleanly without the dependency.
            import yaml
        except ImportError:
            logger.error("PyYAML not installed - cannot load invoice templates")
            return templates

        for template_file in self.template_dir.glob("*.yml"):
            try:
                with open(template_file, 'r', encoding='utf-8') as f:
                    templates[template_file.stem] = yaml.safe_load(f)
                logger.debug(f"  ✓ Loaded template: {template_file.stem}")
            except Exception as e:
                logger.error(f"  ✗ Failed to load template {template_file}: {e}")

        return templates

    def match_template(self, text: str) -> Optional[str]:
        """
        Find a matching template based on its keyword list.

        A template matches when at least 70% of its keywords occur in the text
        (case-insensitive). Templates with an empty keyword list are skipped -
        previously an empty list matched *every* document (0 >= 0).

        Returns the template name, or None if nothing matches.
        """
        text_lower = text.lower()

        for template_name, template_data in self.templates.items():
            keywords = template_data.get('keywords', [])
            if not keywords:
                # Bug fix: a template without keywords must never match.
                continue

            matches = sum(1 for keyword in keywords if str(keyword).lower() in text_lower)

            if matches >= len(keywords) * 0.7:  # 70% of keywords must match
                logger.info(f"✅ Matched template: {template_name} ({matches}/{len(keywords)} keywords)")
                return template_name

        logger.warning("⚠️ No template matched")
        return None

    @staticmethod
    def _parse_float(value: str, options: Dict) -> float:
        """
        Parse a localized number string into a float.

        Handles both Danish (148.587,98) and English (148,587.98) formats using
        the template's configured separators (Danish defaults: decimal=',' and
        thousands='.'). Shared by header fields and line items so both paths use
        identical rules - previously the line-item path had the defaults swapped.
        """
        decimal_sep = options.get('decimal_separator', ',')
        thousands_sep = options.get('thousands_separator', '.')

        value = value.replace(' ', '')

        if thousands_sep in value and decimal_sep in value:
            # Both present: strip thousands separator, normalize decimal to '.'
            value = value.replace(thousands_sep, '').replace(decimal_sep, '.')
        elif thousands_sep in value:
            # Only thousands separator present - just remove it
            value = value.replace(thousands_sep, '')
        elif decimal_sep in value and decimal_sep == ',':
            # Only the Danish decimal comma - convert to '.'
            value = value.replace(',', '.')

        return float(value)

    @staticmethod
    def _parse_date(value: str, options: Dict) -> str:
        """
        Parse a (possibly Danish) date string into ISO 'YYYY-MM-DD'.

        Danish month names are translated to English first. When no configured
        format matches, the translated input is returned unchanged (original
        behavior preserved).
        """
        for da, en in _DANISH_MONTHS.items():
            value = value.replace(da, en)

        for date_format in options.get('date_formats', ['%B %d, %Y', '%d-%m-%Y']):
            try:
                return datetime.strptime(value, date_format).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return value

    def extract_with_template(self, text: str, template_name: str) -> Dict[str, Any]:
        """
        Extract invoice fields (and line items) from text using a specific template.

        Args:
            text: Full text of the invoice.
            template_name: Key of a loaded template.

        Returns:
            Dict with template/issuer/country/currency plus every extracted field.

        Raises:
            ValueError: if template_name is not loaded.
        """
        if template_name not in self.templates:
            raise ValueError(f"Template not found: {template_name}")

        template = self.templates[template_name]
        fields = template.get('fields', {})
        options = template.get('options', {})

        extracted: Dict[str, Any] = {
            'template': template_name,
            'issuer': template.get('issuer'),
            'country': template.get('country'),
            'currency': options.get('currency', 'DKK'),
        }

        for field_name, field_config in fields.items():
            if field_config.get('parser') != 'regex':
                continue

            pattern = field_config.get('regex')
            field_type = field_config.get('type', 'string')
            group = field_config.get('group', 1)

            try:
                match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
                if not match:
                    logger.debug(f"  ✗ {field_name}: No match")
                    continue

                value = match.group(group).strip()
                logger.debug(f"  🔍 Extracted raw value for {field_name}: '{value}' (type: {field_type})")

                # vendor_vat: re-scan for ALL CVR numbers and drop the buyer's
                # own CVR so we keep the seller's, not our own.
                if field_name == 'vendor_vat':
                    cvr_numbers = [m.group(1) for m in
                                   re.finditer(r'SE/CVR-nr\.\s+(\d{8})', text, re.IGNORECASE)]
                    vendor_cvrs = [cvr for cvr in cvr_numbers if cvr != OWN_CVR]
                    if vendor_cvrs:
                        value = vendor_cvrs[0]
                        logger.debug(f"  ✓ {field_name}: {value} (filtered from {cvr_numbers})")
                    else:
                        logger.warning(f"  ⚠️ Only customer CVR found, no vendor CVR")
                        continue

                # Convert the raw string according to the field's declared type.
                if field_type == 'float':
                    value = self._parse_float(value, options)
                elif field_type == 'int':
                    value = int(value)
                elif field_type == 'date':
                    value = self._parse_date(value, options)

                extracted[field_name] = value
                logger.debug(f"  ✓ {field_name}: {value}")

            except Exception as e:
                logger.warning(f"  ✗ Failed to extract {field_name}: {e}")

        # Extract line items if defined in the template
        lines_config = template.get('lines', [])
        if lines_config:
            extracted['lines'] = self._extract_lines(text, lines_config, options)

        return extracted

    def _extract_lines(self, text: str, lines_configs: List[Dict], options: Dict) -> List[Dict]:
        """Extract line items from invoice text per the template's 'lines' configs."""
        all_lines: List[Dict] = []

        logger.debug(f"🔍 Extracting lines with {len(lines_configs)} configurations")

        for lines_config in lines_configs:
            start_pattern = lines_config.get('start')
            end_pattern = lines_config.get('end')
            line_config = lines_config.get('line', {})

            if not start_pattern or not line_config:
                continue

            try:
                # Isolate the section between start and (optional) end markers.
                if end_pattern:
                    section_pattern = f"{start_pattern}(.*?){end_pattern}"
                else:
                    section_pattern = f"{start_pattern}(.*?)$"
                section_match = re.search(section_pattern, text, re.DOTALL | re.IGNORECASE)

                if not section_match:
                    logger.debug(f"  ✗ Line section not found (start: {start_pattern[:50]}, end: {end_pattern[:50] if end_pattern else 'None'})")
                    continue

                section_text = section_match.group(1)
                logger.debug(f"  ✓ Found line section ({len(section_text)} chars)")

                line_pattern = line_config.get('regex')
                field_names = line_config.get('fields', [])
                field_types = line_config.get('types', {})
                context_config = line_config.get('context_before', {})

                if not line_pattern or not field_names:
                    continue

                # Split into physical lines so context_before can look upward.
                section_lines = section_text.split('\n')
                line_matches = []
                for line_idx, line_text in enumerate(section_lines):
                    match = re.search(line_pattern, line_text, re.MULTILINE)
                    if match:
                        line_matches.append((line_idx, line_text, match))

                logger.debug(f"  ✓ Found {len(line_matches)} matching lines")

                for line_idx, line_text, match in line_matches:
                    line_data: Dict[str, Any] = {}

                    # Main line fields: capture groups map 1:1 onto field_names.
                    for idx, field_name in enumerate(field_names, start=1):
                        try:
                            value = match.group(idx).strip()
                            field_type = field_types.get(field_name, 'string')
                            if field_type == 'float':
                                value = self._parse_float(value, options)
                            elif field_type == 'int':
                                value = int(value)
                            line_data[field_name] = value
                        except Exception as e:
                            logger.debug(f"  ✗ Failed to extract line field {field_name}: {e}")

                    # Context fields come from the lines *above* the matched line.
                    if context_config and line_idx > 0:
                        self._apply_context_before(section_lines, line_idx, context_config, line_data)

                    if line_data:
                        all_lines.append(line_data)

                logger.info(f"  ✓ Extracted {len(all_lines)} line items")

            except Exception as e:
                logger.warning(f"  ✗ Failed to extract lines: {e}")

        return all_lines

    @staticmethod
    def _apply_context_before(section_lines: List[str], line_idx: int,
                              context_config: Dict, line_data: Dict) -> None:
        """
        Scan up to max_lines lines *before* a matched line item for context
        patterns (IP addresses, contract numbers, addresses) and merge the
        captured groups into line_data. Most recent context line wins.
        """
        max_lines = context_config.get('max_lines', 5)
        start_idx = max(0, line_idx - max_lines)
        context_lines = section_lines[start_idx:line_idx]

        for pattern_config in context_config.get('patterns', []):
            pattern_regex = pattern_config.get('regex')
            pattern_fields = pattern_config.get('fields', [])
            if not pattern_regex or not pattern_fields:
                continue

            for ctx_line in reversed(context_lines):
                ctx_match = re.search(pattern_regex, ctx_line)
                if not ctx_match:
                    continue
                for ctx_idx, ctx_field_name in enumerate(pattern_fields, start=1):
                    try:
                        line_data[ctx_field_name] = ctx_match.group(ctx_idx).strip()
                    except Exception as e:
                        logger.debug(f"  ✗ Failed to extract context field {ctx_field_name}: {e}")
                break  # Stop after first match for this pattern

    def extract(self, text: str, template_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """
        Extract invoice data from text.

        Auto-detects the template when template_name is None. Returns None when
        no template matches or extraction raises.
        """
        try:
            if template_name is None:
                template_name = self.match_template(text)
                if template_name is None:
                    return None

            result = self.extract_with_template(text, template_name)
            logger.info(f"✅ Extracted {len(result)} fields using template: {template_name}")
            return result

        except Exception as e:
            logger.error(f"❌ Extraction failed: {e}")
            return None

    def get_template_list(self) -> List[Dict[str, str]]:
        """Get list of available templates as {name, issuer, country} dicts."""
        return [
            {
                'name': name,
                'issuer': template.get('issuer'),
                'country': template.get('country'),
            }
            for name, template in self.templates.items()
        ]


# Singleton instance
_invoice2data_service: Optional[Invoice2DataService] = None


def get_invoice2data_service() -> Invoice2DataService:
    """Get (lazily creating) the singleton Invoice2Data service instance."""
    global _invoice2data_service
    if _invoice2data_service is None:
        _invoice2data_service = Invoice2DataService()
    return _invoice2data_service
+ + Args: + pdf_text: Extracted text from PDF + + Returns: + Dict with cvr, document_type, document_number, vendor_id, vendor_name, is_own_invoice + """ + from app.core.config import settings + + logger.info("⚡ Running quick analysis on upload...") + + result = { + "cvr": None, + "document_type": None, # 'invoice' or 'credit_note' + "document_number": None, + "vendor_id": None, + "vendor_name": None, + "is_own_invoice": False # True if this is an outgoing invoice (BMC's own CVR) + } + + # 1. FIND CVR NUMBER (8 digits) + # Look for patterns like "CVR: 12345678", "CVR-nr.: 12345678", "CVR 12345678" + # Important: Supplier invoices have BOTH buyer (BMC=29522790) and seller CVR + # We need the SELLER's CVR (not BMC's own) + + cvr_patterns = [ + r'CVR[:\-\s]*(\d{8})', + r'CVR[:\-\s]*nr\.?\s*(\d{8})', + r'CVR[:\-\s]*nummer\s*(\d{8})', + r'SE[:\-\s]*(\d{8})', # SE = Svensk CVR, men også brugt i DK + r'\b(\d{8})\b' # Fallback: any 8-digit number + ] + + # Find ALL CVR numbers in document + found_cvrs = [] + for pattern in cvr_patterns: + matches = re.finditer(pattern, pdf_text, re.IGNORECASE) + for match in matches: + cvr_candidate = match.group(1) + # Validate it's a real CVR (starts with 1-4, not a random number) + if cvr_candidate[0] in '1234' and cvr_candidate not in found_cvrs: + found_cvrs.append(cvr_candidate) + + # Remove BMC's own CVR from list (buyer CVR, not seller) + vendor_cvrs = [cvr for cvr in found_cvrs if cvr != settings.OWN_CVR] + + if settings.OWN_CVR in found_cvrs: + # This is a proper invoice where BMC is the buyer + if len(vendor_cvrs) > 0: + # Found vendor CVR - use the first non-BMC CVR + result['cvr'] = vendor_cvrs[0] + logger.info(f"📋 Found vendor CVR: {vendor_cvrs[0]} (ignored BMC CVR: {settings.OWN_CVR})") + + # Try to match vendor + vendor = self.match_vendor_by_cvr(vendor_cvrs[0]) + if vendor: + result['vendor_id'] = vendor['id'] + result['vendor_name'] = vendor['name'] + else: + # Only BMC's CVR found = this is an outgoing invoice + 
result['is_own_invoice'] = True + result['cvr'] = settings.OWN_CVR + logger.warning(f"⚠️ OUTGOING INVOICE: Only BMC CVR found") + elif len(vendor_cvrs) > 0: + # No BMC CVR, but other CVR found - use first one + result['cvr'] = vendor_cvrs[0] + logger.info(f"📋 Found CVR: {vendor_cvrs[0]}") + + vendor = self.match_vendor_by_cvr(vendor_cvrs[0]) + if vendor: + result['vendor_id'] = vendor['id'] + result['vendor_name'] = vendor['name'] + + # 2. DETECT DOCUMENT TYPE (Invoice vs Credit Note) + credit_keywords = [ + 'kreditnota', 'credit note', 'creditnote', 'kreditfaktura', + 'refusion', 'tilbagebetaling', 'godtgørelse', 'tilbageførsel' + ] + + text_lower = pdf_text.lower() + is_credit_note = any(keyword in text_lower for keyword in credit_keywords) + + if is_credit_note: + result['document_type'] = 'credit_note' + logger.info("📄 Document type: CREDIT NOTE") + else: + result['document_type'] = 'invoice' + logger.info("📄 Document type: INVOICE") + + # 3. EXTRACT DOCUMENT NUMBER + # For invoices: "Faktura nr.", "Invoice number:", "Fakturanr." 
+ # For credit notes: "Kreditnota nr.", "Credit note number:" + + if result['document_type'] == 'credit_note': + number_patterns = [ + r'kreditnota\s*(?:nr\.?|nummer)[:\s]*(\S+)', + r'credit\s*note\s*(?:no\.?|number)[:\s]*(\S+)', + r'kreditfaktura\s*(?:nr\.?|nummer)[:\s]*(\S+)', + ] + else: + number_patterns = [ + r'faktura\s*(?:nr\.?|nummer)[:\s]*(\S+)', + r'invoice\s*(?:no\.?|number)[:\s]*(\S+)', + r'fakturanr\.?\s*[:\s]*(\S+)', + ] + + for pattern in number_patterns: + match = re.search(pattern, pdf_text, re.IGNORECASE) + if match: + result['document_number'] = match.group(1).strip() + logger.info(f"🔢 Document number: {result['document_number']}") + break + + logger.info(f"✅ Quick analysis complete: CVR={result['cvr']}, Type={result['document_type']}, Number={result['document_number']}, Vendor={result['vendor_name']}") + return result + def match_vendor_by_cvr(self, vendor_cvr: Optional[str]) -> Optional[Dict]: """ Match vendor from database using CVR number @@ -459,7 +583,7 @@ Output: { # Search vendors table vendor = execute_query( - "SELECT * FROM vendors WHERE cvr = %s", + "SELECT * FROM vendors WHERE cvr_number = %s", (cvr_clean,), fetchone=True ) diff --git a/app/services/template_service.py b/app/services/template_service.py index 9e99b06..e8db2dd 100644 --- a/app/services/template_service.py +++ b/app/services/template_service.py @@ -1,6 +1,6 @@ """ Supplier Invoice Template Service -Simple template-based invoice field extraction (no AI) +Hybrid approach: invoice2data templates + custom regex templates Inspired by OmniSync's invoice template system """ @@ -11,6 +11,7 @@ from datetime import datetime from pathlib import Path from app.core.database import execute_query, execute_insert, execute_update +from app.services.invoice2data_service import get_invoice2data_service logger = logging.getLogger(__name__) @@ -21,12 +22,19 @@ class TemplateService: def __init__(self): self.templates_cache = {} self._initialized = False + self.invoice2data = None def 
_ensure_loaded(self): """Lazy load templates on first use""" if not self._initialized: logger.info("🔄 Lazy loading templates...") self._load_templates() + # Also load invoice2data templates + try: + self.invoice2data = get_invoice2data_service() + logger.info(f"✅ Invoice2Data service initialized") + except Exception as e: + logger.warning(f"⚠️ Failed to load invoice2data: {e}") self._initialized = True def _load_templates(self): @@ -51,11 +59,24 @@ class TemplateService: def match_template(self, pdf_text: str) -> Tuple[Optional[int], float]: """ Find best matching template for PDF text + First tries invoice2data templates, then falls back to custom templates Returns: (template_id, confidence_score) """ self._ensure_loaded() # Lazy load templates - logger.info(f"🔍 Matching against {len(self.templates_cache)} templates") + # Try invoice2data templates first + if self.invoice2data: + try: + template_name = self.invoice2data.match_template(pdf_text) + if template_name: + logger.info(f"✅ Matched invoice2data template: {template_name}") + # Return special ID to indicate invoice2data template + return (-1, 1.0) # -1 = invoice2data, 100% confidence + except Exception as e: + logger.warning(f"⚠️ Invoice2data matching failed: {e}") + + # Fallback to custom templates + logger.info(f"🔍 Matching against {len(self.templates_cache)} custom templates") best_match = None best_score = 0.0 @@ -112,6 +133,19 @@ class TemplateService: """Extract invoice fields using template's regex patterns""" self._ensure_loaded() # Lazy load templates + # Check if this is an invoice2data template + if template_id == -1: + if self.invoice2data: + try: + result = self.invoice2data.extract(pdf_text) + if result: + logger.info(f"✅ Extracted fields using invoice2data") + return result + except Exception as e: + logger.error(f"❌ Invoice2data extraction failed: {e}") + return {} + + # Use custom template template = self.templates_cache.get(template_id) if not template: logger.warning(f"⚠️ Template 
{template_id} not found in cache") @@ -128,11 +162,36 @@ class TemplateService: continue try: - match = re.search(pattern, pdf_text, re.IGNORECASE | re.MULTILINE) - if match and len(match.groups()) >= group: - value = match.group(group).strip() - extracted[field_name] = value - logger.debug(f" ✓ {field_name}: {value}") + # Special handling for CVR to avoid extracting own CVR + if field_name == 'vendor_cvr': + from app.core.config import settings + own_cvr = getattr(settings, 'OWN_CVR', '29522790') + + # Find ALL CVR matches + all_matches = list(re.finditer(pattern, pdf_text, re.IGNORECASE | re.MULTILINE)) + found_cvrs = [] + + for match in all_matches: + if len(match.groups()) >= group: + cvr = match.group(group).strip() + found_cvrs.append(cvr) + + # Filter out own CVR + vendor_cvrs = [cvr for cvr in found_cvrs if cvr != own_cvr] + + if vendor_cvrs: + # Use first non-own CVR as vendor CVR + extracted[field_name] = vendor_cvrs[0] + logger.debug(f" ✓ {field_name}: {vendor_cvrs[0]} (filtered out own CVR: {own_cvr})") + else: + logger.warning(f" ⚠️ Only found own CVR ({own_cvr}), no vendor CVR found") + else: + # Normal extraction for other fields + match = re.search(pattern, pdf_text, re.IGNORECASE | re.MULTILINE) + if match and len(match.groups()) >= group: + value = match.group(group).strip() + extracted[field_name] = value + logger.debug(f" ✓ {field_name}: {value}") except Exception as e: logger.warning(f" ✗ Failed to extract {field_name}: {e}") diff --git a/migrations/011_extraction_lines_context.sql b/migrations/011_extraction_lines_context.sql new file mode 100644 index 0000000..f8c7d6d --- /dev/null +++ b/migrations/011_extraction_lines_context.sql @@ -0,0 +1,18 @@ +-- Migration 011: Add context fields to extraction_lines +-- These fields capture additional context information from invoice line items + +ALTER TABLE extraction_lines +ADD COLUMN IF NOT EXISTS ip_address VARCHAR(50), +ADD COLUMN IF NOT EXISTS contract_number VARCHAR(100), +ADD COLUMN IF NOT 
EXISTS location_street VARCHAR(255), +ADD COLUMN IF NOT EXISTS location_zip VARCHAR(10), +ADD COLUMN IF NOT EXISTS location_city VARCHAR(100); + +-- Add index for contract number lookups +CREATE INDEX IF NOT EXISTS idx_extraction_lines_contract_number ON extraction_lines(contract_number); + +COMMENT ON COLUMN extraction_lines.ip_address IS 'IP address/subnet from line context (e.g., 152.115.56.192/27)'; +COMMENT ON COLUMN extraction_lines.contract_number IS 'Contract number from line context (e.g., NKA-008225)'; +COMMENT ON COLUMN extraction_lines.location_street IS 'Street address from line context'; +COMMENT ON COLUMN extraction_lines.location_zip IS 'Zip code from line context'; +COMMENT ON COLUMN extraction_lines.location_city IS 'City from line context'; diff --git a/migrations/011_quick_analysis.sql b/migrations/011_quick_analysis.sql new file mode 100644 index 0000000..38fb45a --- /dev/null +++ b/migrations/011_quick_analysis.sql @@ -0,0 +1,19 @@ +-- Migration 011: Quick Analysis on Upload +-- Adds fields to store automatic CVR, document type, and document number detection + +-- Add quick analysis fields to incoming_files +ALTER TABLE incoming_files +ADD COLUMN IF NOT EXISTS detected_cvr VARCHAR(8), +ADD COLUMN IF NOT EXISTS detected_vendor_id INTEGER REFERENCES vendors(id), +ADD COLUMN IF NOT EXISTS detected_document_type VARCHAR(20), -- 'invoice' or 'credit_note' +ADD COLUMN IF NOT EXISTS detected_document_number VARCHAR(100); + +-- Add index for CVR lookups +CREATE INDEX IF NOT EXISTS idx_incoming_files_detected_cvr ON incoming_files(detected_cvr); +CREATE INDEX IF NOT EXISTS idx_incoming_files_detected_vendor ON incoming_files(detected_vendor_id); + +-- Add comments +COMMENT ON COLUMN incoming_files.detected_cvr IS 'Automatically detected CVR number from PDF text'; +COMMENT ON COLUMN incoming_files.detected_vendor_id IS 'Vendor matched by CVR on upload'; +COMMENT ON COLUMN incoming_files.detected_document_type IS 'Auto-detected: invoice or credit_note'; 
+COMMENT ON COLUMN incoming_files.detected_document_number IS 'Automatically extracted invoice/credit note number'; diff --git a/migrations/012_own_invoice_filter.sql b/migrations/012_own_invoice_filter.sql new file mode 100644 index 0000000..5aab3b3 --- /dev/null +++ b/migrations/012_own_invoice_filter.sql @@ -0,0 +1,20 @@ +-- Migration 012: Add is_own_invoice flag to filter outgoing invoices +-- BMC's own CVR: 29522790 + +-- Add column to track outgoing invoices (BMC's own invoices to customers) +ALTER TABLE incoming_files +ADD COLUMN IF NOT EXISTS is_own_invoice BOOLEAN DEFAULT FALSE; + +-- Mark existing files with BMC's CVR as outgoing invoices +UPDATE incoming_files +SET is_own_invoice = TRUE +WHERE detected_cvr = '29522790'; + +-- Add index for faster filtering +CREATE INDEX IF NOT EXISTS idx_incoming_files_is_own_invoice +ON incoming_files(is_own_invoice) +WHERE is_own_invoice = TRUE; + +-- Add comment +COMMENT ON COLUMN incoming_files.is_own_invoice IS +'TRUE hvis filen er en udgående faktura fra BMC (CVR 29522790), FALSE hvis leverandør faktura'; diff --git a/migrations/012_template_default_category.sql b/migrations/012_template_default_category.sql new file mode 100644 index 0000000..35e32db --- /dev/null +++ b/migrations/012_template_default_category.sql @@ -0,0 +1,13 @@ +-- Migration 012: Add default product category to templates +-- Allows templates to specify default category for line items (varesalg, drift, etc.) 
#!/usr/bin/env python3
"""
Backfill quick analysis for existing files.

Re-runs the upload-time quick analysis (CVR, vendor match, document type and
number) for incoming files that were uploaded before the feature existed.
"""
import asyncio
import sys
from pathlib import Path

# Add parent directory (project root) to path so 'app' is importable
sys.path.insert(0, str(Path(__file__).parent.parent))

from app.core.database import execute_query, execute_update, init_db
from app.services.ollama_service import ollama_service


async def backfill_quick_analysis():
    """Run quick analysis on all files that don't have it."""

    # Initialize database
    init_db()

    try:
        # Get files without quick analysis (skip duplicates and pathless rows).
        # Bug fix: execute_query may return None on empty result sets elsewhere
        # in this codebase; guard so len()/iteration never crash.
        files = execute_query(
            """SELECT file_id, filename, file_path
            FROM incoming_files
            WHERE (detected_cvr IS NULL OR detected_document_number IS NULL)
            AND status NOT IN ('duplicate')
            AND file_path IS NOT NULL
            ORDER BY file_id DESC"""
        ) or []

        print(f"📋 Found {len(files)} files without quick analysis")

        success_count = 0
        fail_count = 0

        for file in files:
            try:
                file_path = Path(file['file_path'])

                if not file_path.exists():
                    print(f"⚠️ File not found: {file_path}")
                    fail_count += 1
                    continue

                print(f"\n🔍 Processing: {file['filename']} (ID: {file['file_id']})")

                # Extract text, then run the same quick analysis as on upload
                text = await ollama_service._extract_text_from_file(file_path)
                quick_result = await ollama_service.quick_analysis_on_upload(text)

                # Persist detected fields back onto the incoming file row
                execute_update(
                    """UPDATE incoming_files
                    SET detected_cvr = %s,
                        detected_vendor_id = %s,
                        detected_document_type = %s,
                        detected_document_number = %s
                    WHERE file_id = %s""",
                    (quick_result.get('cvr'),
                     quick_result.get('vendor_id'),
                     quick_result.get('document_type'),
                     quick_result.get('document_number'),
                     file['file_id'])
                )

                print(f"✅ Updated: CVR={quick_result.get('cvr')}, "
                      f"Type={quick_result.get('document_type')}, "
                      f"Number={quick_result.get('document_number')}, "
                      f"Vendor={quick_result.get('vendor_name')}")

                success_count += 1

            except Exception as e:
                # Per-file failures are counted but never abort the whole run
                print(f"❌ Error processing {file['filename']}: {e}")
                fail_count += 1

        print(f"\n📊 Summary: {success_count} successful, {fail_count} failed")

    except Exception as e:
        print(f"❌ Fatal error: {e}")
        raise


if __name__ == "__main__":
    asyncio.run(backfill_quick_analysis())
#!/usr/bin/env python3
"""
Test Quick Analysis on Upload
Tests CVR detection, document type, and invoice number extraction
"""

import asyncio
import sys
from pathlib import Path

# Bug fix: the directory that must be on sys.path is the project root (the one
# CONTAINING the 'app' package), not '<root>/app' itself - the old insert made
# 'from app.services...' fail when run from another working directory.
sys.path.insert(0, str(Path(__file__).parent))

from app.services.ollama_service import ollama_service


def _print_result(result):
    """Print the key fields of a quick-analysis result."""
    print(f"CVR: {result['cvr']}")
    print(f"Document Type: {result['document_type']}")
    print(f"Document Number: {result['document_number']}")
    print(f"Vendor ID: {result['vendor_id']}")
    print(f"Vendor Name: {result['vendor_name']}")


async def test_quick_analysis():
    """Test quick analysis with sample invoice and credit-note texts."""

    # Sample invoice text with CVR
    sample_invoice = """
    ALSO Danmark A/S
    Jupitervej 4
    6000 Kolding

    CVR-nr.: 35812428

    FAKTURA

    Faktura nr.: INV-2024-12345
    Dato: 2024-12-08

    Beløb i alt: 5.965,18 DKK
    """

    # Sample credit note text
    sample_credit_note = """
    Test Leverandør A/S
    CVR: 12345678

    KREDITNOTA

    Kreditnota nr.: CN-2024-5678
    Original faktura: INV-2024-1000

    Beløb: -1.234,56 DKK
    """

    print("🧪 Testing Quick Analysis\n")
    print("=" * 60)

    # Test 1: Invoice with CVR
    print("\n📄 TEST 1: Invoice with CVR")
    print("-" * 60)
    result1 = await ollama_service.quick_analysis_on_upload(sample_invoice)
    _print_result(result1)

    assert result1['cvr'] == '35812428', f"Expected CVR 35812428, got {result1['cvr']}"
    assert result1['document_type'] == 'invoice', f"Expected invoice, got {result1['document_type']}"
    assert result1['document_number'] == 'INV-2024-12345', f"Expected INV-2024-12345, got {result1['document_number']}"
    print("✅ Test 1 PASSED")

    # Test 2: Credit Note
    print("\n📄 TEST 2: Credit Note")
    print("-" * 60)
    result2 = await ollama_service.quick_analysis_on_upload(sample_credit_note)
    _print_result(result2)

    assert result2['cvr'] == '12345678', f"Expected CVR 12345678, got {result2['cvr']}"
    assert result2['document_type'] == 'credit_note', f"Expected credit_note, got {result2['document_type']}"
    assert result2['document_number'] == 'CN-2024-5678', f"Expected CN-2024-5678, got {result2['document_number']}"
    print("✅ Test 2 PASSED")

    print("\n" + "=" * 60)
    print("✅ ALL TESTS PASSED!")
    print("=" * 60)


if __name__ == "__main__":
    asyncio.run(test_quick_analysis())