fix: store PDF bytes in DB (content_data) + re-save existing email attachments v2.2.11

This commit is contained in:
Christian 2026-03-01 16:36:05 +01:00
parent eb0dad8a10
commit 84c837f303
4 changed files with 101 additions and 32 deletions

View File

@ -1 +1 @@
2.2.9 2.2.11

View File

@ -681,22 +681,40 @@ async def extract_vendor_suggestion(email_id: int):
) )
for att in (attachments or []): for att in (attachments or []):
raw_path = att.get('file_path') raw_path = att.get('file_path')
if not raw_path:
continue
file_path = resolve_file_path(raw_path)
if not file_path:
logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}")
continue
ct = att.get('content_type', '') ct = att.get('content_type', '')
if 'pdf' in ct or raw_path.lower().endswith('.pdf'): filename = att.get('filename', '')
try: is_pdf = 'pdf' in ct or filename.lower().endswith('.pdf')
from pathlib import Path as PathLib
pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path)) # Try disk path first
if pdf_text: file_path = resolve_file_path(raw_path) if raw_path else None
text_parts.append(("pdf", pdf_text))
logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)") if file_path:
except Exception as e: if is_pdf:
logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}") try:
from pathlib import Path as PathLib
pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
if pdf_text:
text_parts.append(("pdf", pdf_text))
logger.info(f"📎 PDF udtrukket fra disk: {filename} ({len(pdf_text)} tegn)")
except Exception as e:
logger.warning(f"⚠️ Kunne ikke læse PDF fra disk {file_path}: {e}")
elif is_pdf:
# Fallback: read from content_data column in DB
content_data = att.get('content_data')
if content_data:
try:
import io
import pdfplumber
pdf_bytes = bytes(content_data)
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
pdf_text = "\n".join(p.extract_text() or '' for p in pdf.pages)
if pdf_text.strip():
text_parts.append(("pdf", pdf_text))
logger.info(f"📎 PDF udtrukket fra DB content_data: {filename} ({len(pdf_text)} tegn)")
except Exception as e:
logger.warning(f"⚠️ Kunne ikke læse PDF fra content_data ({filename}): {e}")
else:
logger.warning(f"⚠️ Bilag ikke fundet på disk og ingen content_data: {raw_path or filename}")
# Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo) # Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
# Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument # Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument

View File

@ -212,6 +212,12 @@ class EmailService:
logger.info(f"✅ New email: {parsed_email['subject'][:50]}... from {parsed_email['sender_email']}") logger.info(f"✅ New email: {parsed_email['subject'][:50]}... from {parsed_email['sender_email']}")
else: else:
logger.debug(f"⏭️ Email already exists: {parsed_email['message_id']}") logger.debug(f"⏭️ Email already exists: {parsed_email['message_id']}")
# Re-save attachment bytes for existing emails (fills content_data for old emails)
if parsed_email.get('attachments'):
await self._resave_attachment_content(
parsed_email['message_id'],
parsed_email['attachments']
)
except Exception as e: except Exception as e:
logger.error(f"❌ Error parsing Graph message: {e}") logger.error(f"❌ Error parsing Graph message: {e}")
@ -554,48 +560,86 @@ class EmailService:
return None return None
async def _save_attachments(self, email_id: int, attachments: List[Dict]):
    """Save email attachments to disk and database (also stores bytes as fallback).

    For each attachment with non-empty content, the bytes are written to
    ``<settings.UPLOAD_DIR>/email_attachments/<md5>_<name>`` on a best-effort
    basis, and a row is inserted into ``email_attachments``. The row always
    carries the raw bytes in ``content_data``; ``file_path`` is NULL when the
    disk write failed.

    Args:
        email_id: Value stored in the ``email_id`` column of every inserted row.
        attachments: Dicts with the keys ``filename`` and ``content`` (bytes),
            plus optional ``content_type`` and ``size``.

    Per-attachment failures are logged and do not abort the remaining
    attachments.
    """
    import hashlib
    from pathlib import Path

    # Use absolute path based on UPLOAD_DIR setting
    from app.core.config import settings
    upload_dir = Path(settings.UPLOAD_DIR) / "email_attachments"
    try:
        upload_dir.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        # Not fatal: the bytes are still stored in the database below.
        logger.warning(f"⚠️ Could not create upload dir {upload_dir}: {e}")

    for att in attachments:
        try:
            filename = att['filename']
            content = att['content']  # bytes
            content_type = att.get('content_type', 'application/octet-stream')
            size_bytes = att.get('size', len(content) if content else 0)

            if not content:
                continue

            # Generate MD5 hash for deduplication
            md5_hash = hashlib.md5(content).hexdigest()

            # The filename comes from the email itself. Keep only its final
            # path component for the on-disk name so that a name containing
            # '/' or '\' can neither point into a missing sub-directory
            # (which made the disk write fail) nor leave upload_dir.
            safe_name = Path(str(filename).replace("\\", "/")).name or "attachment"

            # Try to save to disk
            file_path_str = None
            try:
                file_path = upload_dir / f"{md5_hash}_{safe_name}"
                file_path.write_bytes(content)
                file_path_str = str(file_path)
            except Exception as e:
                logger.warning(f"⚠️ Could not save attachment to disk ({filename}): {e}")

            # Save to database — always store content_data as fallback
            query = """
                INSERT INTO email_attachments
                (email_id, filename, content_type, size_bytes, file_path, content_data)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT DO NOTHING
            """
            from psycopg2 import Binary
            execute_query(query, (
                email_id,
                filename,  # original name is kept in the DB row
                content_type,
                size_bytes,
                file_path_str,
                Binary(content)
            ))

            logger.info(f"📎 Saved attachment: {filename} ({size_bytes} bytes, disk={file_path_str is not None})")

        except Exception as e:
            # att.get avoids a NameError when the failure happened before
            # `filename` was bound.
            logger.error(f"❌ Failed to save attachment {att.get('filename', '?')}: {e}")
async def _resave_attachment_content(self, message_id: str, attachments: List[Dict]):
    """Backfill ``content_data`` for attachments of an email that is already stored.

    For every attachment dict that has both a ``filename`` and ``content``,
    an UPDATE is issued against ``email_attachments`` for the rows that belong
    to the email with the given ``message_id``, carry the same filename and
    still have ``content_data`` set to NULL.

    Args:
        message_id: Value matched against ``email_messages.message_id``.
        attachments: Attachment dicts; entries without a filename or without
            content are skipped.

    Errors are logged as warnings per attachment and processing continues.
    """
    from psycopg2 import Binary

    # The statement is identical for every attachment; only parameters vary.
    backfill_sql = """
        UPDATE email_attachments
        SET content_data = %s
        WHERE email_id = (
            SELECT id FROM email_messages WHERE message_id = %s LIMIT 1
        )
        AND filename = %s
        AND content_data IS NULL
    """

    for attachment in attachments:
        try:
            name = attachment.get('filename')
            payload = attachment.get('content')
            if name and payload:
                params = (Binary(payload), message_id, name)
                execute_query(backfill_sql, params)
                logger.debug(f"💾 Re-saved content_data for attachment: {name}")
        except Exception as exc:
            logger.warning(f"⚠️ Could not re-save content_data for {attachment.get('filename', '?')}: {exc}")
async def get_unprocessed_emails(self, limit: int = 100) -> List[Dict]: async def get_unprocessed_emails(self, limit: int = 100) -> List[Dict]:
"""Get emails from database that haven't been processed yet""" """Get emails from database that haven't been processed yet"""
query = """ query = """

View File

@ -0,0 +1,7 @@
-- Migration 139: Store email attachment bytes in DB
-- Allows PDF text extraction even when file is not on disk (container restarts etc.)
-- The column is added without NOT NULL or DEFAULT, so rows that already exist
-- keep content_data = NULL until the application fills them in.
-- IF NOT EXISTS lets the migration be re-run without failing on the column.
ALTER TABLE email_attachments
ADD COLUMN IF NOT EXISTS content_data BYTEA;
-- Record the purpose of the column in the database catalog.
COMMENT ON COLUMN email_attachments.content_data IS 'Raw attachment bytes stored in DB as fallback when file_path is unavailable';