fix: store PDF bytes in DB (content_data) + re-save existing email attachments v2.2.11

This commit is contained in:
Christian 2026-03-01 16:36:05 +01:00
parent eb0dad8a10
commit 84c837f303
4 changed files with 101 additions and 32 deletions

View File

@ -1 +1 @@
2.2.9
2.2.11

View File

@ -681,22 +681,40 @@ async def extract_vendor_suggestion(email_id: int):
)
for att in (attachments or []):
raw_path = att.get('file_path')
if not raw_path:
continue
file_path = resolve_file_path(raw_path)
if not file_path:
logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}")
continue
ct = att.get('content_type', '')
if 'pdf' in ct or raw_path.lower().endswith('.pdf'):
filename = att.get('filename', '')
is_pdf = 'pdf' in ct or filename.lower().endswith('.pdf')
# Try disk path first
file_path = resolve_file_path(raw_path) if raw_path else None
if file_path:
if is_pdf:
try:
from pathlib import Path as PathLib
pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
if pdf_text:
text_parts.append(("pdf", pdf_text))
logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)")
logger.info(f"📎 PDF udtrukket fra disk: {filename} ({len(pdf_text)} tegn)")
except Exception as e:
logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
logger.warning(f"⚠️ Kunne ikke læse PDF fra disk {file_path}: {e}")
elif is_pdf:
# Fallback: read from content_data column in DB
content_data = att.get('content_data')
if content_data:
try:
import io
import pdfplumber
pdf_bytes = bytes(content_data)
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
pdf_text = "\n".join(p.extract_text() or '' for p in pdf.pages)
if pdf_text.strip():
text_parts.append(("pdf", pdf_text))
logger.info(f"📎 PDF udtrukket fra DB content_data: {filename} ({len(pdf_text)} tegn)")
except Exception as e:
logger.warning(f"⚠️ Kunne ikke læse PDF fra content_data ({filename}): {e}")
else:
logger.warning(f"⚠️ Bilag ikke fundet på disk og ingen content_data: {raw_path or filename}")
# Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
# Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument

View File

@ -212,6 +212,12 @@ class EmailService:
logger.info(f"✅ New email: {parsed_email['subject'][:50]}... from {parsed_email['sender_email']}")
else:
logger.debug(f"⏭️ Email already exists: {parsed_email['message_id']}")
# Re-save attachment bytes for existing emails (fills content_data for old emails)
if parsed_email.get('attachments'):
await self._resave_attachment_content(
parsed_email['message_id'],
parsed_email['attachments']
)
except Exception as e:
logger.error(f"❌ Error parsing Graph message: {e}")
@ -554,47 +560,85 @@ class EmailService:
return None
async def _save_attachments(self, email_id: int, attachments: List[Dict]):
    """Save email attachments to disk and database (also stores bytes as fallback).

    Each attachment is written to ``<UPLOAD_DIR>/email_attachments`` under a
    name prefixed with the MD5 of its content, and a row is inserted into
    ``email_attachments`` that carries the raw bytes in ``content_data``.
    A failed disk write is not fatal: the row is still inserted with
    ``file_path`` set to NULL so the bytes remain available from the database.

    Args:
        email_id: Primary key of the owning row in the email table.
        attachments: Dicts with ``filename`` and ``content`` (bytes), and
            optionally ``content_type`` and ``size``.

    Failures for a single attachment are logged and do not stop the loop.
    """
    import os
    import hashlib
    from pathlib import Path

    from psycopg2 import Binary
    from app.core.config import settings

    # Base directory comes from the UPLOAD_DIR setting
    upload_dir = Path(settings.UPLOAD_DIR) / "email_attachments"
    try:
        upload_dir.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        # Not fatal: content_data in the database is the fallback.
        logger.warning(f"⚠️ Could not create upload dir {upload_dir}: {e}")

    # Save to database — always store content_data as fallback
    query = """
        INSERT INTO email_attachments
        (email_id, filename, content_type, size_bytes, file_path, content_data)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT DO NOTHING
    """

    for att in attachments:
        try:
            filename = att['filename']
            content = att['content']  # bytes
            if not content:
                continue

            content_type = att.get('content_type', 'application/octet-stream')
            size_bytes = att.get('size', len(content))

            # Generate MD5 hash for deduplication
            md5_hash = hashlib.md5(content).hexdigest()

            # The filename is supplied by the email sender. Keep only its last
            # path component so a name containing "/" or "\" cannot point the
            # write at a (non-existent) subdirectory of upload_dir.
            safe_name = os.path.basename(str(filename).replace("\\", "/")) or "attachment"

            # Try to save to disk
            file_path_str = None
            try:
                file_path = upload_dir / f"{md5_hash}_{safe_name}"
                file_path.write_bytes(content)
                file_path_str = str(file_path)
            except Exception as e:
                logger.warning(f"⚠️ Could not save attachment to disk ({filename}): {e}")

            # The original filename (not safe_name) is stored in the row.
            execute_query(query, (
                email_id,
                filename,
                content_type,
                size_bytes,
                file_path_str,
                Binary(content)
            ))

            logger.info(f"📎 Saved attachment: {filename} ({size_bytes} bytes, disk={file_path_str is not None})")
        except Exception as e:
            logger.error(f"❌ Failed to save attachment {att.get('filename', '?')}: {e}")
async def _resave_attachment_content(self, message_id: str, attachments: List[Dict]):
    """Backfill ``content_data`` for the attachments of an already-stored email.

    For every attachment that has both a filename and content, runs an UPDATE
    that writes the bytes into ``email_attachments.content_data`` for the rows
    belonging to the email with the given ``message_id`` and the same filename,
    restricted to rows whose ``content_data`` is still NULL.

    Args:
        message_id: Value matched against ``email_messages.message_id``.
        attachments: Dicts providing ``filename`` and ``content``.

    Errors are logged per attachment as warnings and never raised to the caller.
    """
    from psycopg2 import Binary

    # NOTE(review): rows are matched by filename only, so several attachments
    # sharing one filename on the same email would all match — confirm whether
    # that can occur in practice.
    backfill_sql = """
        UPDATE email_attachments
        SET content_data = %s
        WHERE email_id = (
        SELECT id FROM email_messages WHERE message_id = %s LIMIT 1
        )
        AND filename = %s
        AND content_data IS NULL
    """

    for attachment in attachments:
        try:
            name = attachment.get('filename')
            payload = attachment.get('content')
            if name and payload:
                execute_query(backfill_sql, (Binary(payload), message_id, name))
                logger.debug(f"💾 Re-saved content_data for attachment: {name}")
        except Exception as e:
            logger.warning(f"⚠️ Could not re-save content_data for {attachment.get('filename', '?')}: {e}")
async def get_unprocessed_emails(self, limit: int = 100) -> List[Dict]:
"""Get emails from database that haven't been processed yet"""

View File

@ -0,0 +1,7 @@
-- Migration 139: Store email attachment bytes in DB
-- Allows PDF text extraction even when file is not on disk (container restarts etc.)
-- The column is added without NOT NULL or a default, so existing rows hold NULL.
-- IF NOT EXISTS makes the ALTER a no-op when the column is already present.
ALTER TABLE email_attachments
ADD COLUMN IF NOT EXISTS content_data BYTEA;
-- Record the column's purpose in the catalog.
COMMENT ON COLUMN email_attachments.content_data IS 'Raw attachment bytes stored in DB as fallback when file_path is unavailable';