diff --git a/VERSION b/VERSION
index c36c648..8c57128 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.2.17
+2.2.18
diff --git a/app/services/email_analysis_service.py b/app/services/email_analysis_service.py
index 9c685a5..e601763 100644
--- a/app/services/email_analysis_service.py
+++ b/app/services/email_analysis_service.py
@@ -77,7 +77,7 @@ Response format (JSON only, no other text):
 IMPORTANT: Return ONLY the JSON object. Do not include any explanation, thinking, or additional text."""
 
     def _build_email_context(self, email_data: Dict) -> str:
-        """Build email context for AI analysis"""
+        """Build email context for AI classification (email body only - fast)"""
         subject = email_data.get('subject', '')
         sender = email_data.get('sender_email', '')
 
@@ -87,9 +87,17 @@ IMPORTANT: Return ONLY the JSON object. Do not include any explanation, thinking
         if len(body) > 2000:
             body = body[:2000] + "... [truncated]"
 
+        # Also note if PDF attachments exist (helps classification even without reading them)
+        attachments = email_data.get('attachments') or []
+        pdf_filenames = [a.get('filename', '') for a in attachments
+                         if (a.get('filename') or '').lower().endswith('.pdf')]
+        attachment_note = ''
+        if pdf_filenames:
+            attachment_note = f"\n\nVedhæftede filer: {', '.join(pdf_filenames)}"
+
         context = f"""**Email Information:**
 From: {sender}
-Subject: {subject}
+Subject: {subject}{attachment_note}
 
 **Email Body:**
 {body}
@@ -97,6 +105,116 @@ Subject: {subject}
 Klassificer denne email."""
 
         return context
+
+    def _extract_pdf_texts_from_attachments(self, email_data: Dict) -> List[str]:
+        """Extract text from PDF attachments in email_data (in-memory bytes)"""
+        pdf_texts = []
+        attachments = email_data.get('attachments') or []
+        for att in attachments:
+            filename = att.get('filename') or ''
+            if not filename.lower().endswith('.pdf'):
+                continue
+            content = att.get('content', b'')
+            if not content:
+                continue
+            try:
+                import pdfplumber
+                import io
+                with pdfplumber.open(io.BytesIO(content)) as pdf:
+                    pages = []
+                    for page in pdf.pages:
+                        text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
+                        if text:
+                            pages.append(text)
+                    if pages:
+                        pdf_texts.append(f"=== PDF: {filename} ===\n" + "\n".join(pages))
+                        logger.info(f"📄 Extracted PDF text from attachment {filename} ({len(pages)} pages)")
+            except Exception as e:
+                logger.warning(f"⚠️ Could not extract PDF text from {filename}: {e}")
+        return pdf_texts
+
+    def _get_attachment_texts_from_db(self, email_id: int) -> List[str]:
+        """Fetch PDF attachment text from DB (content_data column) for already-saved emails"""
+        from pathlib import Path
+        pdf_texts = []
+        try:
+            attachments = execute_query(
+                """SELECT filename, content_data, file_path
+                   FROM email_attachments
+                   WHERE email_id = %s AND filename ILIKE %s""",
+                (email_id, '%.pdf')  # pattern is bound as a parameter: a literal % in the SQL text would be parsed as a placeholder
+            )
+            for att in (attachments or []):
+                filename = att.get('filename', 'unknown.pdf')
+                content = None
+                # Prefer content_data (bytes in DB)
+                if att.get('content_data'):
+                    content = bytes(att['content_data'])
+                # Fallback: read from disk
+                elif att.get('file_path'):
+                    fp = Path(att['file_path'])
+                    if fp.exists():
+                        content = fp.read_bytes()
+                if not content:
+                    continue
+                try:
+                    import pdfplumber
+                    import io
+                    with pdfplumber.open(io.BytesIO(content)) as pdf:
+                        pages = []
+                        for page in pdf.pages:
+                            text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
+                            if text:
+                                pages.append(text)
+                        if pages:
+                            pdf_texts.append(f"=== PDF: {filename} ===\n" + "\n".join(pages))
+                            logger.info(f"📄 Extracted PDF text from DB for {filename} ({len(pages)} pages)")
+                except Exception as e:
+                    logger.warning(f"⚠️ Could not extract PDF text for {filename} from DB: {e}")
+        except Exception as e:
+            logger.error(f"❌ Error fetching attachment texts from DB for email {email_id}: {e}")
+        return pdf_texts
+
+    def _build_invoice_extraction_context(self, email_data: Dict) -> str:
+        """Build extraction context with PDF as PRIMARY data source.
+        Email body/sender are ignored for invoice data — only the attached PDF counts.
+        Sender can be a forwarder or external bookkeeper, not the actual vendor.
+        """
+        subject = email_data.get('subject', '')
+        body = email_data.get('body_text', '') or ''
+        # Keep body brief — it's secondary context at best
+        if len(body) > 300:
+            body = body[:300] + "..."
+
+        # 1. Try in-memory attachment bytes first (during initial fetch)
+        pdf_texts = self._extract_pdf_texts_from_attachments(email_data)
+
+        # 2. Fallback: load from DB for already-processed emails
+        if not pdf_texts and email_data.get('id'):
+            pdf_texts = self._get_attachment_texts_from_db(email_data['id'])
+
+        if pdf_texts:
+            pdf_section = "\n\n".join(pdf_texts)
+            return f"""VEDHÆFTET FAKTURA (primær datakilde - analyser grundigt):
+{pdf_section}
+
+---
+Email emne: {subject}
+Email tekst (sekundær): {body}
+
+VIGTIGT: Udtrækket SKAL baseres på PDF-indholdet ovenfor.
+Afsenderens email-adresse er IKKE leverandøren — leverandøren fremgår af fakturaen."""
+        else:
+            # No PDF found — fall back to email body
+            logger.warning(f"⚠️ No PDF attachment found for email {email_data.get('id')} — using email body only")
+            body_full = email_data.get('body_text', '') or email_data.get('body_html', '') or ''
+            if len(body_full) > 3000:
+                body_full = body_full[:3000] + "..."
+            return f"""Email emne: {subject}
+Email tekst:
+{body_full}
+
+Ingen PDF vedhæftet — udtræk fakturadata fra email-teksten."""
 
     async def _call_ollama(self, system_prompt: str, user_message: str) -> Optional[Dict]:
         """Call Ollama API for classification"""
@@ -279,9 +397,9 @@ Klassificer denne email."""
             logger.info(f"✅ Using cached extraction for email {email_data['id']}")
             return cached_result
 
-        # Build extraction prompt
+        # Build extraction prompt — use PDF-first context, not email sender
         system_prompt = self._build_extraction_prompt()
-        user_message = self._build_email_context(email_data)
+        user_message = self._build_invoice_extraction_context(email_data)
 
         # Call Ollama
         result = await self._call_ollama_extraction(system_prompt, user_message)
@@ -294,39 +412,61 @@ Klassificer denne email."""
         return None
 
     def _build_extraction_prompt(self) -> str:
-        """Build Danish system prompt for invoice data extraction"""
-        return """Du er en ekspert i at udtrække struktureret data fra danske fakturaer.
+        """Build comprehensive Danish system prompt for deep invoice data extraction."""
+        from app.core.config import settings as cfg
+        own_cvr = getattr(cfg, 'OWN_CVR', '')
+        return f"""Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer og kreditnotaer.
 
-Din opgave er at finde og udtrække følgende information fra emailen:
+DU SKAL ANALYSERE SELVE FAKTURAEN (PDF-indholdet) - IKKE email-afsenderen.
+Afsender kan være os selv der videresender, eller en ekstern bogholder - IGNORER afsender.
+Leverandørens navn og CVR fremgår ALTID af selve fakturadokumentet.
 
-**Felter at udtrække:**
-- `invoice_number` (string) - Fakturanummer
-- `amount` (decimal) - Fakturabeløb i DKK (uden valutasymbol)
-- `due_date` (string YYYY-MM-DD) - Forfaldsdato
-- `vendor_name` (string) - Leverandørens navn
-- `order_number` (string) - Ordrenummer (hvis angivet)
-- `cvr_number` (string) - CVR-nummer (hvis angivet)
+VIGTIGE REGLER:
+1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
+2. Hvis et felt ikke findes, sæt det til null
+3. Datoer skal være i format YYYY-MM-DD
+4. DANSKE PRISFORMATER:
+   - Tusind-separator: . (punkt) eller mellemrum: "5.965,18" eller "5 965,18"
+   - Decimal-separator: , (komma): "1.234,56 kr"
+   - I JSON: brug . (punkt) som decimal: 1234.56
+   - Eksempel: "5.965,18 kr" → 5965.18
+5. CVR-nummer: 8 cifre uden mellemrum
+   - IGNORER CVR {own_cvr} — det er VORES eget CVR (køber), ikke leverandørens!
+   - Find LEVERANDØRENS CVR i toppen/headeren af fakturaen
+6. DOKUMENTTYPE:
+   - "invoice" = Almindelig faktura
+   - "credit_note" = Kreditnota (Kreditnota, Refusion, Tilbagebetaling, Credit Note)
+7. Varelinjer: udtræk ALLE linjer med beskrivelse, antal, enhedspris, total
 
-**Vigtige regler:**
-- Hvis et felt ikke findes, brug `null`
-- Beløb skal være numerisk (uden "kr", "DKK" osv.)
-- Datoer skal være i formatet YYYY-MM-DD
-- Vær præcis - returner kun data du er sikker på
+JSON STRUKTUR:
+{{
+  "document_type": "invoice" eller "credit_note",
+  "invoice_number": "fakturanummer",
+  "vendor_name": "leverandørens firmanavn",
+  "vendor_cvr": "12345678",
+  "invoice_date": "YYYY-MM-DD",
+  "due_date": "YYYY-MM-DD",
+  "currency": "DKK",
+  "total_amount": 1234.56,
+  "vat_amount": 246.91,
+  "net_amount": 987.65,
+  "order_number": "ordrenummer eller null",
+  "original_invoice_reference": "ref til original faktura (kun kreditnotaer) eller null",
+  "lines": [
+    {{
+      "line_number": 1,
+      "description": "varebeskrivelse",
+      "quantity": 2.0,
+      "unit_price": 500.00,
+      "line_total": 1000.00,
+      "vat_rate": 25.00,
+      "vat_amount": 250.00
+    }}
+  ],
+  "confidence": 0.95
+}}
 
-**Output format (JSON):**
-```json
-{
-  "invoice_number": "INV-2024-001",
-  "amount": 5250.00,
-  "due_date": "2025-01-15",
-  "vendor_name": "Acme Leverandør A/S",
-  "order_number": "ORD-123",
-  "cvr_number": "12345678"
-}
-```
-
-Returner KUN JSON - ingen anden tekst.
-"""
+Returner KUN JSON - ingen anden tekst."""
 
     async def _call_ollama_extraction(self, system_prompt: str, user_message: str) -> Optional[Dict]:
         """Call Ollama for data extraction"""
@@ -340,20 +480,23 @@ Returner KUN JSON - ingen anden tekst.
                 {"role": "user", "content": user_message}
             ],
             "stream": False,
+            "format": "json",
             "options": {
-                "temperature": 0.0,  # Zero temperature for deterministic extraction
-                "num_predict": 300
+                "temperature": 0.0,  # Deterministic extraction
+                "num_predict": 3000  # Enough for full invoice with many lines
             }
         }
 
         try:
             async with aiohttp.ClientSession() as session:
-                async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=30)) as response:
+                async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=120)) as response:
                     if response.status != 200:
                         return None
 
                     data = await response.json()
-                    content = data.get('message', {}).get('content', '')
+                    msg = data.get('message', {})
+                    # qwen3 sometimes returns content in 'thinking' field
+                    content = msg.get('content', '') or msg.get('thinking', '')
 
                     # Parse JSON response
                     result = self._parse_extraction_response(content)
diff --git a/app/services/email_processor_service.py b/app/services/email_processor_service.py
index 3225b08..fd4a49d 100644
--- a/app/services/email_processor_service.py
+++ b/app/services/email_processor_service.py
@@ -240,25 +240,40 @@ class EmailProcessorService:
             logger.error(f"❌ Classification failed for email {email_data['id']}: {e}")
 
     async def _update_extracted_fields(self, email_id: int, extraction: Dict):
-        """Update email with extracted invoice fields"""
+        """Update email with extracted invoice fields (from PDF attachment analysis)"""
         try:
+            # Normalize amount field (new extraction uses total_amount, old used amount)
+            amount = extraction.get('total_amount') or extraction.get('amount')
+
             query = """
                 UPDATE email_messages
                 SET extracted_invoice_number = %s,
-                    extracted_amount = %s,
-                    extracted_due_date = %s
+                    extracted_amount = %s,
+                    extracted_due_date = %s,
+                    extracted_vendor_name = %s,
+                    extracted_vendor_cvr = %s,
+                    extracted_invoice_date = %s
                 WHERE id = %s
             """
-            
+
             execute_query(query, (
                 extraction.get('invoice_number'),
-                extraction.get('amount'),
+                amount,
                 extraction.get('due_date'),
+                extraction.get('vendor_name'),
+                extraction.get('vendor_cvr'),
+                extraction.get('invoice_date'),
                 email_id
             ))
-            
-            logger.info(f"✅ Updated extracted fields for email {email_id}")
-            
+
+            logger.info(
+                f"✅ Updated extracted fields for email {email_id}: "
+                f"invoice={extraction.get('invoice_number')}, "
+                f"vendor={extraction.get('vendor_name')}, "
+                f"cvr={extraction.get('vendor_cvr')}, "
+                f"amount={amount}"
+            )
+
         except Exception as e:
             logger.error(f"❌ Error updating extracted fields: {e}")
 
diff --git a/migrations/140_email_extracted_vendor_fields.sql b/migrations/140_email_extracted_vendor_fields.sql
new file mode 100644
index 0000000..bf3535f
--- /dev/null
+++ b/migrations/140_email_extracted_vendor_fields.sql
@@ -0,0 +1,11 @@
+-- Migration 140: Add vendor extraction fields to email_messages
+-- Stores vendor info extracted from attached invoice PDFs
+
+ALTER TABLE email_messages
+    ADD COLUMN IF NOT EXISTS extracted_vendor_name VARCHAR(255),
+    ADD COLUMN IF NOT EXISTS extracted_vendor_cvr VARCHAR(20),
+    ADD COLUMN IF NOT EXISTS extracted_invoice_date DATE;
+
+COMMENT ON COLUMN email_messages.extracted_vendor_name IS 'Vendor name from attached invoice PDF';
+COMMENT ON COLUMN email_messages.extracted_vendor_cvr IS 'Vendor CVR from attached invoice PDF';
+COMMENT ON COLUMN email_messages.extracted_invoice_date IS 'Invoice date from attached invoice PDF';