feat: analyze PDF attachments for invoice extraction v2.2.18
- email_analysis_service: extract PDF text from attachments as PRIMARY source
- _build_invoice_extraction_context: reads PDF bytes (in-memory or DB)
- _extract_pdf_texts_from_attachments: pdfplumber on in-memory bytes
- _get_attachment_texts_from_db: fallback to content_data/file_path
- _build_extraction_prompt: comprehensive schema (vendor, CVR, lines, dates)
- num_predict 300→3000, timeout 30→120s, format=json
- email_processor_service: _update_extracted_fields saves vendor_name, CVR, invoice_date
- migration 140: extracted_vendor_name, extracted_vendor_cvr, extracted_invoice_date columns

Sender (forwarder/external bookkeeper) is now ignored for vendor detection. The actual invoice PDF determines vendor/amounts/lines.
This commit is contained in:
parent
3d24987365
commit
c6d310e96d
@ -77,7 +77,7 @@ Response format (JSON only, no other text):
|
|||||||
IMPORTANT: Return ONLY the JSON object. Do not include any explanation, thinking, or additional text."""
|
IMPORTANT: Return ONLY the JSON object. Do not include any explanation, thinking, or additional text."""
|
||||||
|
|
||||||
def _build_email_context(self, email_data: Dict) -> str:
|
def _build_email_context(self, email_data: Dict) -> str:
|
||||||
"""Build email context for AI analysis"""
|
"""Build email context for AI classification (email body only - fast)"""
|
||||||
|
|
||||||
subject = email_data.get('subject', '')
|
subject = email_data.get('subject', '')
|
||||||
sender = email_data.get('sender_email', '')
|
sender = email_data.get('sender_email', '')
|
||||||
@ -87,9 +87,17 @@ IMPORTANT: Return ONLY the JSON object. Do not include any explanation, thinking
|
|||||||
if len(body) > 2000:
|
if len(body) > 2000:
|
||||||
body = body[:2000] + "... [truncated]"
|
body = body[:2000] + "... [truncated]"
|
||||||
|
|
||||||
|
# Also note if PDF attachments exist (helps classification even without reading them)
|
||||||
|
attachments = email_data.get('attachments', [])
|
||||||
|
pdf_filenames = [a.get('filename', '') for a in attachments
|
||||||
|
if a.get('filename', '').lower().endswith('.pdf')]
|
||||||
|
attachment_note = ''
|
||||||
|
if pdf_filenames:
|
||||||
|
attachment_note = f"\n\nVedhæftede filer: {', '.join(pdf_filenames)}"
|
||||||
|
|
||||||
context = f"""**Email Information:**
|
context = f"""**Email Information:**
|
||||||
From: {sender}
|
From: {sender}
|
||||||
Subject: {subject}
|
Subject: {subject}{attachment_note}
|
||||||
|
|
||||||
**Email Body:**
|
**Email Body:**
|
||||||
{body}
|
{body}
|
||||||
@ -97,6 +105,116 @@ Subject: {subject}
|
|||||||
Klassificer denne email."""
|
Klassificer denne email."""
|
||||||
|
|
||||||
return context
|
return context
|
||||||
|
|
||||||
|
def _extract_pdf_texts_from_attachments(self, email_data: Dict) -> List[str]:
|
||||||
|
"""Extract text from PDF attachments in email_data (in-memory bytes)"""
|
||||||
|
pdf_texts = []
|
||||||
|
attachments = email_data.get('attachments', [])
|
||||||
|
for att in attachments:
|
||||||
|
filename = att.get('filename', '')
|
||||||
|
if not filename.lower().endswith('.pdf'):
|
||||||
|
continue
|
||||||
|
content = att.get('content', b'')
|
||||||
|
if not content:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
import pdfplumber
|
||||||
|
import io
|
||||||
|
with pdfplumber.open(io.BytesIO(content)) as pdf:
|
||||||
|
pages = []
|
||||||
|
for page in pdf.pages:
|
||||||
|
text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
|
||||||
|
if text:
|
||||||
|
pages.append(text)
|
||||||
|
if pages:
|
||||||
|
pdf_texts.append(f"=== PDF: {filename} ===\n" + "\n".join(pages))
|
||||||
|
logger.info(f"📄 Extracted PDF text from attachment {filename} ({len(pages)} pages)")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"⚠️ Could not extract PDF text from {filename}: {e}")
|
||||||
|
return pdf_texts
|
||||||
|
|
||||||
|
def _get_attachment_texts_from_db(self, email_id: int) -> List[str]:
    """Fetch PDF attachment text for an already-saved email.

    Queries ``email_attachments`` for rows of ``email_id`` whose filename
    matches ``%.pdf`` (case-insensitive). For each row the bytes stored in
    ``content_data`` are preferred; if that is empty, the file at
    ``file_path`` is read when it exists. The bytes are parsed with
    pdfplumber.

    Args:
        email_id: Primary key of the email whose attachments are loaded.

    Returns:
        One string per PDF that yielded text: a ``=== PDF: <filename> ===``
        header line followed by the page texts joined with newlines.
        Returns whatever was collected so far if the query or a file read
        fails; failures are logged, never raised.
    """
    from pathlib import Path

    pdf_texts: List[str] = []
    try:
        # The LIKE pattern is passed as a bound parameter. A literal '%'
        # written inside a query that is executed with parameters is read
        # as a placeholder by format-paramstyle drivers (e.g. psycopg2) and
        # makes the query fail. NOTE(review): assumes execute_query forwards
        # to such a driver, as the %s placeholder suggests.
        attachments = execute_query(
            """SELECT filename, content_data, file_path
               FROM email_attachments
               WHERE email_id = %s AND filename ILIKE %s""",
            (email_id, '%.pdf')
        )

        for att in (attachments or []):
            filename = att.get('filename') or 'unknown.pdf'
            content = None

            # Prefer content_data (bytes in DB)
            if att.get('content_data'):
                # bytes() normalises buffer-like values to plain bytes.
                content = bytes(att['content_data'])
            # Fallback: read from disk
            elif att.get('file_path'):
                fp = Path(att['file_path'])
                if fp.exists():
                    content = fp.read_bytes()

            if not content:
                continue

            try:
                # Imported lazily so a missing pdfplumber install only
                # disables PDF extraction (logged below).
                import io
                import pdfplumber

                with pdfplumber.open(io.BytesIO(content)) as pdf:
                    page_texts = (
                        page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
                        for page in pdf.pages
                    )
                    # Pages without extractable text are dropped.
                    pages = [text for text in page_texts if text]

                if pages:
                    pdf_texts.append(f"=== PDF: {filename} ===\n" + "\n".join(pages))
                    logger.info(f"📄 Extracted PDF text from DB for {filename} ({len(pages)} pages)")
            except Exception as e:
                # Best effort: one unreadable PDF must not block the others.
                logger.warning(f"⚠️ Could not extract PDF text for {filename} from DB: {e}")
    except Exception as e:
        logger.error(f"❌ Error fetching attachment texts from DB for email {email_id}: {e}")

    return pdf_texts
|
||||||
|
|
||||||
|
def _build_invoice_extraction_context(self, email_data: Dict) -> str:
|
||||||
|
"""Build extraction context with PDF as PRIMARY data source.
|
||||||
|
Email body/sender are ignored for invoice data — only the attached PDF counts.
|
||||||
|
Sender can be a forwarder or external bookkeeper, not the actual vendor.
|
||||||
|
"""
|
||||||
|
subject = email_data.get('subject', '')
|
||||||
|
body = email_data.get('body_text', '') or ''
|
||||||
|
# Keep body brief — it's secondary context at best
|
||||||
|
if len(body) > 300:
|
||||||
|
body = body[:300] + "..."
|
||||||
|
|
||||||
|
# 1. Try in-memory attachment bytes first (during initial fetch)
|
||||||
|
pdf_texts = self._extract_pdf_texts_from_attachments(email_data)
|
||||||
|
|
||||||
|
# 2. Fallback: load from DB for already-processed emails
|
||||||
|
if not pdf_texts and email_data.get('id'):
|
||||||
|
pdf_texts = self._get_attachment_texts_from_db(email_data['id'])
|
||||||
|
|
||||||
|
if pdf_texts:
|
||||||
|
pdf_section = "\n\n".join(pdf_texts)
|
||||||
|
return f"""VEDHÆFTET FAKTURA (primær datakilde - analyser grundigt):
|
||||||
|
{pdf_section}
|
||||||
|
|
||||||
|
---
|
||||||
|
Email emne: {subject}
|
||||||
|
Email tekst (sekundær): {body}
|
||||||
|
|
||||||
|
VIGTIGT: Udtrækket SKAL baseres på PDF-indholdet ovenfor.
|
||||||
|
Afsenderens email-adresse er IKKE leverandøren — leverandøren fremgår af fakturaen."""
|
||||||
|
else:
|
||||||
|
# No PDF found — fall back to email body
|
||||||
|
logger.warning(f"⚠️ No PDF attachment found for email {email_data.get('id')} — using email body only")
|
||||||
|
body_full = email_data.get('body_text', '') or email_data.get('body_html', '') or ''
|
||||||
|
if len(body_full) > 3000:
|
||||||
|
body_full = body_full[:3000] + "..."
|
||||||
|
return f"""Email emne: {subject}
|
||||||
|
Email tekst:
|
||||||
|
{body_full}
|
||||||
|
|
||||||
|
Ingen PDF vedhæftet — udtræk fakturadata fra email-teksten."""
|
||||||
|
|
||||||
async def _call_ollama(self, system_prompt: str, user_message: str) -> Optional[Dict]:
|
async def _call_ollama(self, system_prompt: str, user_message: str) -> Optional[Dict]:
|
||||||
"""Call Ollama API for classification"""
|
"""Call Ollama API for classification"""
|
||||||
@ -279,9 +397,9 @@ Klassificer denne email."""
|
|||||||
logger.info(f"✅ Using cached extraction for email {email_data['id']}")
|
logger.info(f"✅ Using cached extraction for email {email_data['id']}")
|
||||||
return cached_result
|
return cached_result
|
||||||
|
|
||||||
# Build extraction prompt
|
# Build extraction prompt — use PDF-first context, not email sender
|
||||||
system_prompt = self._build_extraction_prompt()
|
system_prompt = self._build_extraction_prompt()
|
||||||
user_message = self._build_email_context(email_data)
|
user_message = self._build_invoice_extraction_context(email_data)
|
||||||
|
|
||||||
# Call Ollama
|
# Call Ollama
|
||||||
result = await self._call_ollama_extraction(system_prompt, user_message)
|
result = await self._call_ollama_extraction(system_prompt, user_message)
|
||||||
@ -294,39 +412,61 @@ Klassificer denne email."""
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def _build_extraction_prompt(self) -> str:
|
def _build_extraction_prompt(self) -> str:
|
||||||
"""Build Danish system prompt for invoice data extraction"""
|
"""Build comprehensive Danish system prompt for deep invoice data extraction."""
|
||||||
return """Du er en ekspert i at udtrække struktureret data fra danske fakturaer.
|
from app.core.config import settings as cfg
|
||||||
|
own_cvr = getattr(cfg, 'OWN_CVR', '')
|
||||||
|
return f"""Du er en ekspert i at læse og udtrække strukturerede data fra danske fakturaer og kreditnotaer.
|
||||||
|
|
||||||
Din opgave er at finde og udtrække følgende information fra emailen:
|
DU SKAL ANALYSERE SELVE FAKTURAEN (PDF-indholdet) - IKKE email-afsenderen.
|
||||||
|
Afsender kan være os selv der videresender, eller en ekstern bogholder - IGNORER afsender.
|
||||||
|
Leverandørens navn og CVR fremgår ALTID af selve fakturadokumentet.
|
||||||
|
|
||||||
**Felter at udtrække:**
|
VIGTIGE REGLER:
|
||||||
- `invoice_number` (string) - Fakturanummer
|
1. Returner KUN gyldig JSON - ingen forklaring eller ekstra tekst
|
||||||
- `amount` (decimal) - Fakturabeløb i DKK (uden valutasymbol)
|
2. Hvis et felt ikke findes, sæt det til null
|
||||||
- `due_date` (string YYYY-MM-DD) - Forfaldsdato
|
3. Datoer skal være i format YYYY-MM-DD
|
||||||
- `vendor_name` (string) - Leverandørens navn
|
4. DANSKE PRISFORMATER:
|
||||||
- `order_number` (string) - Ordrenummer (hvis angivet)
|
- Tusind-separator: . (punkt) eller mellemrum: "5.965,18" eller "5 965,18"
|
||||||
- `cvr_number` (string) - CVR-nummer (hvis angivet)
|
- Decimal-separator: , (komma): "1.234,56 kr"
|
||||||
|
- I JSON: brug . (punkt) som decimal: 1234.56
|
||||||
|
- Eksempel: "5.965,18 kr" → 5965.18
|
||||||
|
5. CVR-nummer: 8 cifre uden mellemrum
|
||||||
|
- IGNORER CVR {own_cvr} — det er VORES eget CVR (køber), ikke leverandørens!
|
||||||
|
- Find LEVERANDØRENS CVR i toppen/headeren af fakturaen
|
||||||
|
6. DOKUMENTTYPE:
|
||||||
|
- "invoice" = Almindelig faktura
|
||||||
|
- "credit_note" = Kreditnota (Kreditnota, Refusion, Tilbagebetaling, Credit Note)
|
||||||
|
7. Varelinjer: udtræk ALLE linjer med beskrivelse, antal, enhedspris, total
|
||||||
|
|
||||||
**Vigtige regler:**
|
JSON STRUKTUR:
|
||||||
- Hvis et felt ikke findes, brug `null`
|
{{
|
||||||
- Beløb skal være numerisk (uden "kr", "DKK" osv.)
|
"document_type": "invoice" eller "credit_note",
|
||||||
- Datoer skal være i formatet YYYY-MM-DD
|
"invoice_number": "fakturanummer",
|
||||||
- Vær præcis - returner kun data du er sikker på
|
"vendor_name": "leverandørens firmanavn",
|
||||||
|
"vendor_cvr": "12345678",
|
||||||
|
"invoice_date": "YYYY-MM-DD",
|
||||||
|
"due_date": "YYYY-MM-DD",
|
||||||
|
"currency": "DKK",
|
||||||
|
"total_amount": 1234.56,
|
||||||
|
"vat_amount": 246.91,
|
||||||
|
"net_amount": 987.65,
|
||||||
|
"order_number": "ordrenummer eller null",
|
||||||
|
"original_invoice_reference": "ref til original faktura (kun kreditnotaer) eller null",
|
||||||
|
"lines": [
|
||||||
|
{{
|
||||||
|
"line_number": 1,
|
||||||
|
"description": "varebeskrivelse",
|
||||||
|
"quantity": 2.0,
|
||||||
|
"unit_price": 500.00,
|
||||||
|
"line_total": 1000.00,
|
||||||
|
"vat_rate": 25.00,
|
||||||
|
"vat_amount": 250.00
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
"confidence": 0.95
|
||||||
|
}}
|
||||||
|
|
||||||
**Output format (JSON):**
|
Returner KUN JSON - ingen anden tekst."""
|
||||||
```json
|
|
||||||
{
|
|
||||||
"invoice_number": "INV-2024-001",
|
|
||||||
"amount": 5250.00,
|
|
||||||
"due_date": "2025-01-15",
|
|
||||||
"vendor_name": "Acme Leverandør A/S",
|
|
||||||
"order_number": "ORD-123",
|
|
||||||
"cvr_number": "12345678"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Returner KUN JSON - ingen anden tekst.
|
|
||||||
"""
|
|
||||||
|
|
||||||
async def _call_ollama_extraction(self, system_prompt: str, user_message: str) -> Optional[Dict]:
|
async def _call_ollama_extraction(self, system_prompt: str, user_message: str) -> Optional[Dict]:
|
||||||
"""Call Ollama for data extraction"""
|
"""Call Ollama for data extraction"""
|
||||||
@ -340,20 +480,23 @@ Returner KUN JSON - ingen anden tekst.
|
|||||||
{"role": "user", "content": user_message}
|
{"role": "user", "content": user_message}
|
||||||
],
|
],
|
||||||
"stream": False,
|
"stream": False,
|
||||||
|
"format": "json",
|
||||||
"options": {
|
"options": {
|
||||||
"temperature": 0.0, # Zero temperature for deterministic extraction
|
"temperature": 0.0, # Deterministic extraction
|
||||||
"num_predict": 300
|
"num_predict": 3000 # Enough for full invoice with many lines
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=30)) as response:
|
async with session.post(url, json=payload, timeout=aiohttp.ClientTimeout(total=120)) as response:
|
||||||
if response.status != 200:
|
if response.status != 200:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
data = await response.json()
|
data = await response.json()
|
||||||
content = data.get('message', {}).get('content', '')
|
msg = data.get('message', {})
|
||||||
|
# qwen3 sometimes returns content in 'thinking' field
|
||||||
|
content = msg.get('content', '') or msg.get('thinking', '')
|
||||||
|
|
||||||
# Parse JSON response
|
# Parse JSON response
|
||||||
result = self._parse_extraction_response(content)
|
result = self._parse_extraction_response(content)
|
||||||
|
|||||||
@ -240,25 +240,40 @@ class EmailProcessorService:
|
|||||||
logger.error(f"❌ Classification failed for email {email_data['id']}: {e}")
|
logger.error(f"❌ Classification failed for email {email_data['id']}: {e}")
|
||||||
|
|
||||||
async def _update_extracted_fields(self, email_id: int, extraction: Dict):
|
async def _update_extracted_fields(self, email_id: int, extraction: Dict):
|
||||||
"""Update email with extracted invoice fields"""
|
"""Update email with extracted invoice fields (from PDF attachment analysis)"""
|
||||||
try:
|
try:
|
||||||
|
# Normalize amount field (new extraction uses total_amount, old used amount)
|
||||||
|
amount = extraction.get('total_amount') or extraction.get('amount')
|
||||||
|
|
||||||
query = """
|
query = """
|
||||||
UPDATE email_messages
|
UPDATE email_messages
|
||||||
SET extracted_invoice_number = %s,
|
SET extracted_invoice_number = %s,
|
||||||
extracted_amount = %s,
|
extracted_amount = %s,
|
||||||
extracted_due_date = %s
|
extracted_due_date = %s,
|
||||||
|
extracted_vendor_name = %s,
|
||||||
|
extracted_vendor_cvr = %s,
|
||||||
|
extracted_invoice_date = %s
|
||||||
WHERE id = %s
|
WHERE id = %s
|
||||||
"""
|
"""
|
||||||
|
|
||||||
execute_query(query, (
|
execute_query(query, (
|
||||||
extraction.get('invoice_number'),
|
extraction.get('invoice_number'),
|
||||||
extraction.get('amount'),
|
amount,
|
||||||
extraction.get('due_date'),
|
extraction.get('due_date'),
|
||||||
|
extraction.get('vendor_name'),
|
||||||
|
extraction.get('vendor_cvr'),
|
||||||
|
extraction.get('invoice_date'),
|
||||||
email_id
|
email_id
|
||||||
))
|
))
|
||||||
|
|
||||||
logger.info(f"✅ Updated extracted fields for email {email_id}")
|
logger.info(
|
||||||
|
f"✅ Updated extracted fields for email {email_id}: "
|
||||||
|
f"invoice={extraction.get('invoice_number')}, "
|
||||||
|
f"vendor={extraction.get('vendor_name')}, "
|
||||||
|
f"cvr={extraction.get('vendor_cvr')}, "
|
||||||
|
f"amount={amount}"
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"❌ Error updating extracted fields: {e}")
|
logger.error(f"❌ Error updating extracted fields: {e}")
|
||||||
|
|
||||||
|
|||||||
11
migrations/140_email_extracted_vendor_fields.sql
Normal file
11
migrations/140_email_extracted_vendor_fields.sql
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
-- Migration 140: vendor extraction columns on email_messages.
-- Adds the vendor name, vendor CVR and invoice date extracted from an
-- attached invoice PDF. IF NOT EXISTS keeps the ALTER safe to re-run.

ALTER TABLE email_messages
    ADD COLUMN IF NOT EXISTS extracted_vendor_name VARCHAR(255),
    ADD COLUMN IF NOT EXISTS extracted_vendor_cvr VARCHAR(20),
    ADD COLUMN IF NOT EXISTS extracted_invoice_date DATE;

-- Column descriptions shown in schema tooling.
COMMENT ON COLUMN email_messages.extracted_vendor_name IS 'Vendor name from attached invoice PDF';
COMMENT ON COLUMN email_messages.extracted_vendor_cvr IS 'Vendor CVR from attached invoice PDF';
COMMENT ON COLUMN email_messages.extracted_invoice_date IS 'Invoice date from attached invoice PDF';
|
||||||
Loading…
Reference in New Issue
Block a user