feat: analyze PDF attachments for invoice extraction v2.2.18
- email_analysis_service: extract PDF text from attachments as PRIMARY source
  - _build_invoice_extraction_context: reads PDF bytes (in-memory or DB)
  - _extract_pdf_texts_from_attachments: pdfplumber on in-memory bytes
  - _get_attachment_texts_from_db: fallback to content_data/file_path
  - _build_extraction_prompt: comprehensive schema (vendor, CVR, lines, dates)
  - num_predict 300→3000, timeout 30→120s, format=json
- email_processor_service: _update_extracted_fields saves vendor_name, CVR, invoice_date
- migration 140: extracted_vendor_name, extracted_vendor_cvr, extracted_invoice_date columns

Sender (forwarder/external bookkeeper) is now ignored for vendor detection.
The actual invoice PDF determines vendor/amounts/lines.
This commit is contained in:
parent
3d24987365
commit
c6d310e96d
@ -77,7 +77,7 @@ Response format (JSON only, no other text):
|
||||
IMPORTANT: Return ONLY the JSON object. Do not include any explanation, thinking, or additional text."""
|
||||
|
||||
def _build_email_context(self, email_data: Dict) -> str:
|
||||
"""Build email context for AI analysis"""
|
||||
"""Build email context for AI classification (email body only - fast)"""
|
||||
|
||||
subject = email_data.get('subject', '')
|
||||
sender = email_data.get('sender_email', '')
|
||||
@ -87,9 +87,17 @@ IMPORTANT: Return ONLY the JSON object. Do not include any explanation, thinking
|
||||
if len(body) > 2000:
|
||||
body = body[:2000] + "... [truncated]"
|
||||
|
||||
# Also note if PDF attachments exist (helps classification even without reading them)
|
||||
attachments = email_data.get('attachments', [])
|
||||
pdf_filenames = [a.get('filename', '') for a in attachments
|
||||
if a.get('filename', '').lower().endswith('.pdf')]
|
||||
attachment_note = ''
|
||||
if pdf_filenames:
|
||||
attachment_note = f"\n\nVedhæftede filer: {', '.join(pdf_filenames)}"
|
||||
|
||||
context = f"""**Email Information:**
|
||||
From: {sender}
|
||||
Subject: {subject}
|
||||
Subject: {subject}{attachment_note}
|
||||
|
||||
**Email Body:**
|
||||
{body}
|
||||
@ -97,6 +105,116 @@ Subject: {subject}
|
||||
Klassificer denne email."""
|
||||
|
||||
return context
|
||||
|
||||
def _extract_pdf_texts_from_attachments(self, email_data: Dict) -> List[str]:
|
||||
"""Extract text from PDF attachments in email_data (in-memory bytes)"""
|
||||
pdf_texts = []
|
||||
attachments = email_data.get('attachments', [])
|
||||
for att in attachments:
|
||||
filename = att.get('filename', '')
|
||||
if not filename.lower().endswith('.pdf'):
|
||||
continue
|
||||
content = att.get('content', b'')
|
||||
if not content:
|
||||
continue
|
||||
try:
|
||||
import pdfplumber
|
||||
import io
|
||||
with pdfplumber.open(io.BytesIO(content)) as pdf:
|
||||
pages = []
|
||||
for page in pdf.pages:
|
||||
text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
|
||||
if text:
|
||||
pages.append(text)
|
||||
if pages:
|
||||
pdf_texts.append(f"=== PDF: {filename} ===\n" + "\n".join(pages))
|
||||
logger.info(f"📄 Extracted PDF text from attachment {filename} ({len(pages)} pages)")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Could not extract PDF text from {filename}: {e}")
|
||||
return pdf_texts
|
||||
|
||||
def _get_attachment_texts_from_db(self, email_id: int) -> List[str]:
    """Extract PDF attachment text for an email that is already stored in the DB.

    Rows are read from ``email_attachments``. For each row the bytes come from
    ``content_data`` when it is set, otherwise from the file at ``file_path``
    if that file exists.

    Args:
        email_id: Value matched against ``email_attachments.email_id``.

    Returns:
        One string per PDF that yielded text: a ``=== PDF: <filename> ===``
        header line followed by the page texts joined with newlines. Returns
        whatever was collected so far (possibly an empty list) if the query or
        a file read fails; such failures are logged, not raised.
    """
    from pathlib import Path

    pdf_texts = []
    try:
        # The LIKE pattern is passed as a bound parameter. A literal '%' inside
        # the SQL text is parsed as a placeholder by %-style parameter binding
        # (the style the %s marker indicates), which made the original query
        # fail before it reached the database.
        attachments = execute_query(
            """SELECT filename, content_data, file_path
               FROM email_attachments
               WHERE email_id = %s AND filename ILIKE %s""",
            (email_id, '%.pdf')
        )
        for att in (attachments or []):
            filename = att.get('filename') or 'unknown.pdf'
            content = None
            # Prefer content_data (bytes in DB)
            if att.get('content_data'):
                # bytes() normalises buffer-like values to plain bytes
                content = bytes(att['content_data'])
            # Fallback: read from disk
            elif att.get('file_path'):
                fp = Path(att['file_path'])
                if fp.exists():
                    content = fp.read_bytes()
            if not content:
                continue
            try:
                # Imported inside the try so a missing pdfplumber install is
                # reported as a per-attachment warning instead of propagating.
                import pdfplumber
                import io
                with pdfplumber.open(io.BytesIO(content)) as pdf:
                    page_texts = (
                        page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
                        for page in pdf.pages
                    )
                    # Pages without extractable text return None/'' and are dropped
                    pages = [text for text in page_texts if text]
                if pages:
                    pdf_texts.append(f"=== PDF: {filename} ===\n" + "\n".join(pages))
                    logger.info(f"📄 Extracted PDF text from DB for {filename} ({len(pages)} pages)")
            except Exception as e:
                # Best effort: one unreadable PDF must not block the others
                logger.warning(f"⚠️ Could not extract PDF text for {filename} from DB: {e}")
    except Exception as e:
        logger.error(f"❌ Error fetching attachment texts from DB for email {email_id}: {e}")
    return pdf_texts
|
||||
|
||||
def _build_invoice_extraction_context(self, email_data: Dict) -> str:
|
||||
"""Build extraction context with PDF as PRIMARY data source.
|
||||
Email body/sender are ignored for invoice data — only the attached PDF counts.
|
||||
Sender can be a forwarder or external bookkeeper, not the actual vendor.
|
||||
"""
|
||||
subject = email_data.get('subject', '')
|
||||
body = email_data.get('body_text', '') or ''
|
||||
# Keep body brief — it's secondary context at best
|
||||
if len(body) > 300:
|
||||
body = body[:300] + "..."
|
||||
|
||||
# 1. Try in-memory attachment bytes first (during initial fetch)
|
||||
pdf_texts = self._extract_pdf_texts_from_attachments(email_data)
|
||||
|
||||
# 2. Fallback: load from DB for already-processed emails
|
||||
if not pdf_texts and email_data.get('id'):
|
||||
pdf_texts = self._get_attachment_texts_from_db(email_data['id'])
|
||||
|
||||
if pdf_texts:
|
||||
pdf_section = "\n\n".join(pdf_texts)
|
||||
return f"""VEDHÆFTET FAKTURA (primær datakilde - analyser grundigt):
|
||||
{pdf_section}
|
||||
|
||||
---
|
||||
Email emne: {subject}
|
||||
Email tekst (sekundær): {body}
|
||||
|
||||
VIGTIGT: Udtrækket SKAL baseres på PDF-indholdet ovenfor.
|
||||
Afsenderens email-adresse er IKKE leverandøren — leverandøren fremgår af fakturaen."""
|
||||
else:
|
||||
# No PDF found — fall back to email body
|
||||
logger.warning(f"⚠️ No PDF attachment found for email {email_data.get('id')} — using email body only")
|
||||
body_full = email_data.get('body_text', '') or email_data.get('body_html', '') or ''
|
||||
if len(body_full) > 3000:
|
||||
body_full = body_full[:3000] + "..."
|
||||
return f"""Email emne: {subject}
|
||||
Email tekst:
|
||||
{body_full}
|
||||
|
||||
Ingen PDF vedhæftet — udtræk fakturadata fra email-teksten."""
|
||||
|
||||
async def _call_ollama(self, system_prompt: str, user_message: str) -> Optional[Dict]:
|
||||
"""Call Ollama API for classification"""
|
||||
@ -279,9 +397,9 @@ Klassificer denne email."""
|
||||
logger.info(f"✅ Using cached extraction for email {email_data['id']}")
|
||||
return cached_result
|
||||
|
||||
# Build extraction prompt
|
||||
# Build extraction prompt — use PDF-first context, not email sender
|
||||
system_prompt = self._build_extraction_prompt()
|
||||
user_message = self._build_email_context(email_data)
|
||||
user_message = self._build_invoice_extraction_context(email_data)
|
||||
|
||||
# Call Ollama
|
||||
result = await self._call_ollama_extraction(system_prompt, user_message)
|
||||
@ -294,39 +412,61 @@ Klassificer denne email."""
|
||||
return None
|
||||
|
||||
def _build_extraction_prompt(self) -> str:
    """Build the Danish system prompt for invoice / credit-note extraction.

    The prompt instructs the model to read the invoice document itself, to
    ignore the email sender, and to answer with a single JSON object holding
    document type, invoice number, vendor name and CVR, dates, currency,
    totals, order number, credit-note reference, line items and a confidence.

    Returns:
        The system prompt text. The rule that tells the model to ignore our
        own CVR is only included when ``settings.OWN_CVR`` is set.
    """
    from app.core.config import settings as cfg

    own_cvr = str(getattr(cfg, 'OWN_CVR', '') or '').strip()
    # With no OWN_CVR configured the rule would read "IGNORER CVR  — ..." and
    # name no number at all, so it is left out instead of being sent blank.
    own_cvr_rule = (
        f"- IGNORER CVR {own_cvr} — det er VORES eget CVR (køber), ikke leverandørens!\n"
        if own_cvr
        else ""
    )

    # NOTE(review): the template lines are reproduced without leading
    # whitespace, as they appear in the reviewed diff — confirm the indentation
    # of the sub-bullets and the JSON example against the repository file.
    # Literal braces are doubled because this is an f-string.
    return f"""Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer og kreditnotaer.

DU SKAL ANALYSERE SELVE FAKTURAEN (PDF-indholdet) - IKKE email-afsenderen.
Afsender kan være os selv der videresender, eller en ekstern bogholder - IGNORER afsender.
Leverandørens navn og CVR fremgår ALTID af selve fakturadokumentet.

VIGTIGE REGLER:
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
2. Hvis et felt ikke findes, sæt det til null
3. Datoer skal være i format YYYY-MM-DD
4. DANSKE PRISFORMATER:
- Tusind-separator: . (punkt) eller mellemrum: "5.965,18" eller "5 965,18"
- Decimal-separator: , (komma): "1.234,56 kr"
- I JSON: brug . (punkt) som decimal: 1234.56
- Eksempel: "5.965,18 kr" → 5965.18
5. CVR-nummer: 8 cifre uden mellemrum
{own_cvr_rule}- Find LEVERANDØRENS CVR i toppen/headeren af fakturaen
6. DOKUMENTTYPE:
- "invoice" = Almindelig faktura
- "credit_note" = Kreditnota (Kreditnota, Refusion, Tilbagebetaling, Credit Note)
7. Varelinjer: udtræk ALLE linjer med beskrivelse, antal, enhedspris, total

JSON STRUKTUR:
{{
"document_type": "invoice" eller "credit_note",
"invoice_number": "fakturanummer",
"vendor_name": "leverandørens firmanavn",
"vendor_cvr": "12345678",
"invoice_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"currency": "DKK",
"total_amount": 1234.56,
"vat_amount": 246.91,
"net_amount": 987.65,
"order_number": "ordrenummer eller null",
"original_invoice_reference": "ref til original faktura (kun kreditnotaer) eller null",
"lines": [
{{
"line_number": 1,
"description": "varebeskrivelse",
"quantity": 2.0,
"unit_price": 500.00,
"line_total": 1000.00,
"vat_rate": 25.00,
"vat_amount": 250.00
}}
],
"confidence": 0.95
}}

Returner KUN JSON - ingen anden tekst."""
|
||||
|
||||
async def _call_ollama_extraction(self, system_prompt: str, user_message: str) -> Optional[Dict]:
|
||||
"""Call Ollama for data extraction"""
|
||||
@ -340,20 +480,23 @@ Returner KUN JSON - ingen anden tekst.
|
||||
{"role": "user", "content": user_message}
|
||||
],
|
||||
"stream": False,
|
||||
"format": "json",
|
||||
"options": {
|
||||
"temperature": 0.0, # Zero temperature for deterministic extraction
|
||||
"num_predict": 300
|
||||
"temperature": 0.0, # Deterministic extraction
|
||||
"num_predict": 3000 # Enough for full invoice with many lines
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=30)) as response:
|
||||
async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=120)) as response:
|
||||
if response.status != 200:
|
||||
return None
|
||||
|
||||
data = await response.json()
|
||||
content = data.get('message', {}).get('content', '')
|
||||
msg = data.get('message', {})
|
||||
# qwen3 sometimes returns content in 'thinking' field
|
||||
content = msg.get('content', '') or msg.get('thinking', '')
|
||||
|
||||
# Parse JSON response
|
||||
result = self._parse_extraction_response(content)
|
||||
|
||||
@ -240,25 +240,40 @@ class EmailProcessorService:
|
||||
logger.error(f"❌ Classification failed for email {email_data['id']}: {e}")
|
||||
|
||||
async def _update_extracted_fields(self, email_id: int, extraction: Dict):
    """Persist invoice fields extracted from the PDF analysis on the email row.

    Writes invoice number, amount, due date, vendor name, vendor CVR and
    invoice date to the ``extracted_*`` columns of ``email_messages``.

    Args:
        email_id: ``email_messages.id`` of the row to update.
        extraction: Parsed extraction result. The current schema uses
            ``total_amount`` / ``vendor_cvr``; the earlier schema's
            ``amount`` / ``cvr_number`` keys are accepted as fallbacks.

    Errors are logged and swallowed, so a failed update does not propagate to
    the caller.
    """
    try:
        # Normalize amount field (new extraction uses total_amount, old used
        # amount). Checked against None explicitly: `or` would treat a
        # legitimate total of 0 as missing.
        amount = extraction.get('total_amount')
        if amount is None:
            amount = extraction.get('amount')

        # Results produced with the earlier prompt schema carry the CVR under
        # `cvr_number`; fall back to it so that value is not dropped.
        vendor_cvr = extraction.get('vendor_cvr') or extraction.get('cvr_number')

        query = """
            UPDATE email_messages
            SET extracted_invoice_number = %s,
                extracted_amount = %s,
                extracted_due_date = %s,
                extracted_vendor_name = %s,
                extracted_vendor_cvr = %s,
                extracted_invoice_date = %s
            WHERE id = %s
        """

        execute_query(query, (
            extraction.get('invoice_number'),
            amount,
            extraction.get('due_date'),
            extraction.get('vendor_name'),
            vendor_cvr,
            extraction.get('invoice_date'),
            email_id
        ))

        logger.info(
            f"✅ Updated extracted fields for email {email_id}: "
            f"invoice={extraction.get('invoice_number')}, "
            f"vendor={extraction.get('vendor_name')}, "
            f"cvr={vendor_cvr}, "
            f"amount={amount}"
        )

    except Exception as e:
        logger.error(f"❌ Error updating extracted fields: {e}")
|
||||
|
||||
|
||||
11
migrations/140_email_extracted_vendor_fields.sql
Normal file
11
migrations/140_email_extracted_vendor_fields.sql
Normal file
@ -0,0 +1,11 @@
|
||||
-- Migration 140: Add vendor extraction fields to email_messages
-- Stores vendor info extracted from attached invoice PDFs
--
-- Adds three nullable columns (no NOT NULL / DEFAULT is specified).
-- Each column is added with IF NOT EXISTS, so the statement does not fail
-- on columns that are already present.

ALTER TABLE email_messages
    ADD COLUMN IF NOT EXISTS extracted_vendor_name VARCHAR(255),
    ADD COLUMN IF NOT EXISTS extracted_vendor_cvr VARCHAR(20),
    ADD COLUMN IF NOT EXISTS extracted_invoice_date DATE;

-- Column descriptions stored alongside the schema
COMMENT ON COLUMN email_messages.extracted_vendor_name IS 'Vendor name from attached invoice PDF';
COMMENT ON COLUMN email_messages.extracted_vendor_cvr IS 'Vendor CVR from attached invoice PDF';
COMMENT ON COLUMN email_messages.extracted_invoice_date IS 'Invoice date from attached invoice PDF';
|
||||
Loading…
Reference in New Issue
Block a user