feat: analyze PDF attachments for invoice extraction v2.2.18

- email_analysis_service: extract PDF text from attachments as PRIMARY source
  - _build_invoice_extraction_context: reads PDF bytes (in-memory or DB)
  - _extract_pdf_texts_from_attachments: pdfplumber on in-memory bytes
  - _get_attachment_texts_from_db: fallback to content_data/file_path
  - _build_extraction_prompt: comprehensive schema (vendor, CVR, lines, dates)
  - num_predict 300→3000, timeout 30→120s, format=json
- email_processor_service: _update_extracted_fields saves vendor_name, CVR, invoice_date
- migration 140: extracted_vendor_name, extracted_vendor_cvr, extracted_invoice_date columns

Sender (forwarder/external bookkeeper) is now ignored for vendor detection.
The actual invoice PDF determines vendor/amounts/lines.
This commit is contained in:
Christian 2026-03-02 00:17:41 +01:00
parent 3d24987365
commit c6d310e96d
4 changed files with 215 additions and 46 deletions

View File

@ -1 +1 @@
2.2.17
2.2.18

View File

@ -77,7 +77,7 @@ Response format (JSON only, no other text):
IMPORTANT: Return ONLY the JSON object. Do not include any explanation, thinking, or additional text."""
def _build_email_context(self, email_data: Dict) -> str:
"""Build email context for AI analysis"""
"""Build email context for AI classification (email body only - fast)"""
subject = email_data.get('subject', '')
sender = email_data.get('sender_email', '')
@ -87,9 +87,17 @@ IMPORTANT: Return ONLY the JSON object. Do not include any explanation, thinking
if len(body) > 2000:
body = body[:2000] + "... [truncated]"
# Also note if PDF attachments exist (helps classification even without reading them)
attachments = email_data.get('attachments', [])
pdf_filenames = [a.get('filename', '') for a in attachments
if a.get('filename', '').lower().endswith('.pdf')]
attachment_note = ''
if pdf_filenames:
attachment_note = f"\n\nVedhæftede filer: {', '.join(pdf_filenames)}"
context = f"""**Email Information:**
From: {sender}
Subject: {subject}
Subject: {subject}{attachment_note}
**Email Body:**
{body}
@ -98,6 +106,116 @@ Klassificer denne email."""
return context
def _extract_pdf_texts_from_attachments(self, email_data: Dict) -> List[str]:
"""Extract text from PDF attachments in email_data (in-memory bytes)"""
pdf_texts = []
attachments = email_data.get('attachments', [])
for att in attachments:
filename = att.get('filename', '')
if not filename.lower().endswith('.pdf'):
continue
content = att.get('content', b'')
if not content:
continue
try:
import pdfplumber
import io
with pdfplumber.open(io.BytesIO(content)) as pdf:
pages = []
for page in pdf.pages:
text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
if text:
pages.append(text)
if pages:
pdf_texts.append(f"=== PDF: {filename} ===\n" + "\n".join(pages))
logger.info(f"📄 Extracted PDF text from attachment {filename} ({len(pages)} pages)")
except Exception as e:
logger.warning(f"⚠️ Could not extract PDF text from {filename}: {e}")
return pdf_texts
def _get_attachment_texts_from_db(self, email_id: int) -> List[str]:
    """Extract PDF text for an already-saved email from its stored attachments.

    Looks up the email's ``email_attachments`` rows whose filename ends in
    ``.pdf`` and reads each PDF from ``content_data`` when it is set,
    otherwise from ``file_path`` on disk when that file exists.

    Args:
        email_id: Value matched against ``email_attachments.email_id``.

    Returns:
        One string per PDF that yielded text: a ``=== PDF: <filename> ===``
        header line followed by the page texts joined with newlines.
        Returns an empty list when nothing could be read; query errors are
        logged rather than raised.
    """
    from pathlib import Path
    pdf_texts = []
    try:
        # The ILIKE pattern is bound as a parameter. A literal '%' inside a
        # query that also uses %s placeholders is parsed by pyformat drivers
        # as a malformed placeholder, which makes the query raise and this
        # fallback silently return nothing.
        attachments = execute_query(
            """SELECT filename, content_data, file_path
               FROM email_attachments
               WHERE email_id = %s AND filename ILIKE %s""",
            (email_id, '%.pdf')
        )
        # NOTE(review): rows are accessed with .get(), so execute_query is
        # assumed to return dict-like rows — confirm against its definition.
        for att in (attachments or []):
            filename = att.get('filename', 'unknown.pdf')
            content = None
            # Prefer content_data (bytes in DB)
            if att.get('content_data'):
                # bytes() normalises whatever buffer type the driver returns.
                content = bytes(att['content_data'])
            # Fallback: read from disk
            elif att.get('file_path'):
                fp = Path(att['file_path'])
                if fp.exists():
                    content = fp.read_bytes()
            if not content:
                continue
            try:
                import pdfplumber
                import io
                with pdfplumber.open(io.BytesIO(content)) as pdf:
                    pages = []
                    for page in pdf.pages:
                        text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
                        if text:
                            pages.append(text)
                if pages:
                    pdf_texts.append(f"=== PDF: {filename} ===\n" + "\n".join(pages))
                    logger.info(f"📄 Extracted PDF text from DB for {filename} ({len(pages)} pages)")
            except Exception as e:
                # Best effort: one unreadable PDF must not block the others.
                logger.warning(f"⚠️ Could not extract PDF text for {filename} from DB: {e}")
    except Exception as e:
        logger.error(f"❌ Error fetching attachment texts from DB for email {email_id}: {e}")
    return pdf_texts
def _build_invoice_extraction_context(self, email_data: Dict) -> str:
"""Build extraction context with PDF as PRIMARY data source.
Email body/sender are ignored for invoice data only the attached PDF counts.
Sender can be a forwarder or external bookkeeper, not the actual vendor.
"""
subject = email_data.get('subject', '')
body = email_data.get('body_text', '') or ''
# Keep body brief — it's secondary context at best
if len(body) > 300:
body = body[:300] + "..."
# 1. Try in-memory attachment bytes first (during initial fetch)
pdf_texts = self._extract_pdf_texts_from_attachments(email_data)
# 2. Fallback: load from DB for already-processed emails
if not pdf_texts and email_data.get('id'):
pdf_texts = self._get_attachment_texts_from_db(email_data['id'])
if pdf_texts:
pdf_section = "\n\n".join(pdf_texts)
return f"""VEDHÆFTET FAKTURA (primær datakilde - analyser grundigt):
{pdf_section}
---
Email emne: {subject}
Email tekst (sekundær): {body}
VIGTIGT: Udtrækket SKAL baseres PDF-indholdet ovenfor.
Afsenderens email-adresse er IKKE leverandøren leverandøren fremgår af fakturaen."""
else:
# No PDF found — fall back to email body
logger.warning(f"⚠️ No PDF attachment found for email {email_data.get('id')} — using email body only")
body_full = email_data.get('body_text', '') or email_data.get('body_html', '') or ''
if len(body_full) > 3000:
body_full = body_full[:3000] + "..."
return f"""Email emne: {subject}
Email tekst:
{body_full}
Ingen PDF vedhæftet udtræk fakturadata fra email-teksten."""
async def _call_ollama(self, system_prompt: str, user_message: str) -> Optional[Dict]:
"""Call Ollama API for classification"""
@ -279,9 +397,9 @@ Klassificer denne email."""
logger.info(f"✅ Using cached extraction for email {email_data['id']}")
return cached_result
# Build extraction prompt
# Build extraction prompt — use PDF-first context, not email sender
system_prompt = self._build_extraction_prompt()
user_message = self._build_email_context(email_data)
user_message = self._build_invoice_extraction_context(email_data)
# Call Ollama
result = await self._call_ollama_extraction(system_prompt, user_message)
@ -294,39 +412,61 @@ Klassificer denne email."""
return None
def _build_extraction_prompt(self) -> str:
"""Build Danish system prompt for invoice data extraction"""
return """Du er en ekspert i at udtrække struktureret data fra danske fakturaer.
"""Build comprehensive Danish system prompt for deep invoice data extraction."""
from app.core.config import settings as cfg
own_cvr = getattr(cfg, 'OWN_CVR', '')
return f"""Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer og kreditnotaer.
Din opgave er at finde og udtrække følgende information fra emailen:
DU SKAL ANALYSERE SELVE FAKTURAEN (PDF-indholdet) - IKKE email-afsenderen.
Afsender kan være os selv der videresender, eller en ekstern bogholder - IGNORER afsender.
Leverandørens navn og CVR fremgår ALTID af selve fakturadokumentet.
**Felter at udtrække:**
- `invoice_number` (string) - Fakturanummer
- `amount` (decimal) - Fakturabeløb i DKK (uden valutasymbol)
- `due_date` (string YYYY-MM-DD) - Forfaldsdato
- `vendor_name` (string) - Leverandørens navn
- `order_number` (string) - Ordrenummer (hvis angivet)
- `cvr_number` (string) - CVR-nummer (hvis angivet)
VIGTIGE REGLER:
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
2. Hvis et felt ikke findes, sæt det til null
3. Datoer skal være i format YYYY-MM-DD
4. DANSKE PRISFORMATER:
- Tusind-separator: . (punkt) eller mellemrum: "5.965,18" eller "5 965,18"
- Decimal-separator: , (komma): "1.234,56 kr"
- I JSON: brug . (punkt) som decimal: 1234.56
- Eksempel: "5.965,18 kr" 5965.18
5. CVR-nummer: 8 cifre uden mellemrum
- IGNORER CVR {own_cvr} det er VORES eget CVR (køber), ikke leverandørens!
- Find LEVERANDØRENS CVR i toppen/headeren af fakturaen
6. DOKUMENTTYPE:
- "invoice" = Almindelig faktura
- "credit_note" = Kreditnota (Kreditnota, Refusion, Tilbagebetaling, Credit Note)
7. Varelinjer: udtræk ALLE linjer med beskrivelse, antal, enhedspris, total
**Vigtige regler:**
- Hvis et felt ikke findes, brug `null`
- Beløb skal være numerisk (uden "kr", "DKK" osv.)
- Datoer skal være i formatet YYYY-MM-DD
- Vær præcis - returner kun data du er sikker
JSON STRUKTUR:
{{
"document_type": "invoice" eller "credit_note",
"invoice_number": "fakturanummer",
"vendor_name": "leverandørens firmanavn",
"vendor_cvr": "12345678",
"invoice_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"currency": "DKK",
"total_amount": 1234.56,
"vat_amount": 246.91,
"net_amount": 987.65,
"order_number": "ordrenummer eller null",
"original_invoice_reference": "ref til original faktura (kun kreditnotaer) eller null",
"lines": [
{{
"line_number": 1,
"description": "varebeskrivelse",
"quantity": 2.0,
"unit_price": 500.00,
"line_total": 1000.00,
"vat_rate": 25.00,
"vat_amount": 250.00
}}
],
"confidence": 0.95
}}
**Output format (JSON):**
```json
{
"invoice_number": "INV-2024-001",
"amount": 5250.00,
"due_date": "2025-01-15",
"vendor_name": "Acme Leverandør A/S",
"order_number": "ORD-123",
"cvr_number": "12345678"
}
```
Returner KUN JSON - ingen anden tekst.
"""
Returner KUN JSON - ingen anden tekst."""
async def _call_ollama_extraction(self, system_prompt: str, user_message: str) -> Optional[Dict]:
"""Call Ollama for data extraction"""
@ -340,20 +480,23 @@ Returner KUN JSON - ingen anden tekst.
{"role": "user", "content": user_message}
],
"stream": False,
"format": "json",
"options": {
"temperature": 0.0, # Zero temperature for deterministic extraction
"num_predict": 300
"temperature": 0.0, # Deterministic extraction
"num_predict": 3000 # Enough for full invoice with many lines
}
}
try:
async with aiohttp.ClientSession() as session:
async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=30)) as response:
async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=120)) as response:
if response.status != 200:
return None
data = await response.json()
content = data.get('message', {}).get('content', '')
msg = data.get('message', {})
# qwen3 sometimes returns content in 'thinking' field
content = msg.get('content', '') or msg.get('thinking', '')
# Parse JSON response
result = self._parse_extraction_response(content)

View File

@ -240,24 +240,39 @@ class EmailProcessorService:
logger.error(f"❌ Classification failed for email {email_data['id']}: {e}")
async def _update_extracted_fields(self, email_id: int, extraction: Dict):
"""Update email with extracted invoice fields"""
"""Update email with extracted invoice fields (from PDF attachment analysis)"""
try:
# Normalize amount field (new extraction uses total_amount, old used amount)
amount = extraction.get('total_amount') or extraction.get('amount')
query = """
UPDATE email_messages
SET extracted_invoice_number = %s,
extracted_amount = %s,
extracted_due_date = %s
extracted_due_date = %s,
extracted_vendor_name = %s,
extracted_vendor_cvr = %s,
extracted_invoice_date = %s
WHERE id = %s
"""
execute_query(query, (
extraction.get('invoice_number'),
extraction.get('amount'),
amount,
extraction.get('due_date'),
extraction.get('vendor_name'),
extraction.get('vendor_cvr'),
extraction.get('invoice_date'),
email_id
))
logger.info(f"✅ Updated extracted fields for email {email_id}")
logger.info(
f"✅ Updated extracted fields for email {email_id}: "
f"invoice={extraction.get('invoice_number')}, "
f"vendor={extraction.get('vendor_name')}, "
f"cvr={extraction.get('vendor_cvr')}, "
f"amount={amount}"
)
except Exception as e:
logger.error(f"❌ Error updating extracted fields: {e}")

View File

@ -0,0 +1,11 @@
-- Migration 140: Add vendor extraction fields to email_messages
-- Stores vendor info extracted from attached invoice PDFs
--
-- The three columns are added without NOT NULL or DEFAULT, so rows that
-- already exist simply hold NULL for them. ADD COLUMN IF NOT EXISTS lets the
-- migration be re-run without failing on columns that are already present.
ALTER TABLE email_messages
ADD COLUMN IF NOT EXISTS extracted_vendor_name VARCHAR(255),
ADD COLUMN IF NOT EXISTS extracted_vendor_cvr VARCHAR(20),
ADD COLUMN IF NOT EXISTS extracted_invoice_date DATE;
-- Record the meaning of each new column in the schema itself.
COMMENT ON COLUMN email_messages.extracted_vendor_name IS 'Vendor name from attached invoice PDF';
COMMENT ON COLUMN email_messages.extracted_vendor_cvr IS 'Vendor CVR from attached invoice PDF';
COMMENT ON COLUMN email_messages.extracted_invoice_date IS 'Invoice date from attached invoice PDF';