From 84c837f303a8f4c7a16a35602372ce99fdff0700 Mon Sep 17 00:00:00 2001 From: Christian Date: Sun, 1 Mar 2026 16:36:05 +0100 Subject: [PATCH] fix: store PDF bytes in DB (content_data) + re-save existing email attachments v2.2.11 --- VERSION | 2 +- app/emails/backend/router.py | 48 ++++++++---- app/services/email_service.py | 76 +++++++++++++++---- .../139_email_attachment_content_data.sql | 7 ++ 4 files changed, 101 insertions(+), 32 deletions(-) create mode 100644 migrations/139_email_attachment_content_data.sql diff --git a/VERSION b/VERSION index a6333e4..0b6e431 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.9 +2.2.11 diff --git a/app/emails/backend/router.py b/app/emails/backend/router.py index 7eb462f..a4a2392 100644 --- a/app/emails/backend/router.py +++ b/app/emails/backend/router.py @@ -681,22 +681,40 @@ async def extract_vendor_suggestion(email_id: int): ) for att in (attachments or []): raw_path = att.get('file_path') - if not raw_path: - continue - file_path = resolve_file_path(raw_path) - if not file_path: - logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}") - continue ct = att.get('content_type', '') - if 'pdf' in ct or raw_path.lower().endswith('.pdf'): - try: - from pathlib import Path as PathLib - pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path)) - if pdf_text: - text_parts.append(("pdf", pdf_text)) - logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)") - except Exception as e: - logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}") + filename = att.get('filename', '') + is_pdf = 'pdf' in ct or filename.lower().endswith('.pdf') + + # Try disk path first + file_path = resolve_file_path(raw_path) if raw_path else None + + if file_path: + if is_pdf: + try: + from pathlib import Path as PathLib + pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path)) + if pdf_text: + text_parts.append(("pdf", pdf_text)) + logger.info(f"📎 PDF udtrukket fra disk: {filename} ({len(pdf_text)} tegn)") + except Exception as e: + logger.warning(f"⚠️ Kunne ikke læse PDF fra disk {file_path}: {e}") + elif is_pdf: + # Fallback: read from content_data column in DB + content_data = att.get('content_data') + if content_data: + try: + import io + import pdfplumber + pdf_bytes = bytes(content_data) + with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: + pdf_text = "\n".join(p.extract_text() or '' for p in pdf.pages) + if pdf_text.strip(): + text_parts.append(("pdf", pdf_text)) + logger.info(f"📎 PDF udtrukket fra DB content_data: {filename} ({len(pdf_text)} tegn)") + except Exception as e: + logger.warning(f"⚠️ Kunne ikke læse PDF fra content_data ({filename}): {e}") + else: + logger.warning(f"⚠️ Bilag ikke fundet på disk og ingen content_data: {raw_path or filename}") # Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo) # Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument diff --git a/app/services/email_service.py b/app/services/email_service.py index 4ce6126..5dc8626 100644 --- a/app/services/email_service.py +++ b/app/services/email_service.py @@ -212,6 +212,12 @@ class EmailService: logger.info(f"✅ New email: {parsed_email['subject'][:50]}... from {parsed_email['sender_email']}") else: logger.debug(f"⏭️ Email already exists: {parsed_email['message_id']}") + # Re-save attachment bytes for existing emails (fills content_data for old emails) + if parsed_email.get('attachments'): + await self._resave_attachment_content( + parsed_email['message_id'], + parsed_email['attachments'] + ) except Exception as e: logger.error(f"❌ Error parsing Graph message: {e}") @@ -554,48 +560,86 @@ class EmailService: return None async def _save_attachments(self, email_id: int, attachments: List[Dict]): - """Save email attachments to disk and database""" + """Save email attachments to disk and database (also stores bytes as fallback)""" import os import hashlib from pathlib import Path - # Create uploads directory if not exists - upload_dir = Path("uploads/email_attachments") - upload_dir.mkdir(parents=True, exist_ok=True) + # Use absolute path based on UPLOAD_DIR setting + from app.core.config import settings + upload_dir = Path(settings.UPLOAD_DIR) / "email_attachments" + try: + upload_dir.mkdir(parents=True, exist_ok=True) + except Exception as e: + logger.warning(f"⚠️ Could not create upload dir {upload_dir}: {e}") for att in attachments: try: filename = att['filename'] - content = att['content'] + content = att['content'] # bytes content_type = att.get('content_type', 'application/octet-stream') - size_bytes = att['size'] + size_bytes = att.get('size', len(content) if content else 0) + if not content: + continue + # Generate MD5 hash for deduplication md5_hash = hashlib.md5(content).hexdigest() - # Save to disk with hash prefix - file_path = upload_dir / f"{md5_hash}_{filename}" - file_path.write_bytes(content) + # Try to save to disk + file_path_str = None + try: + file_path = upload_dir / f"{md5_hash}_{filename}" + file_path.write_bytes(content) + file_path_str = str(file_path) + except Exception as e: + logger.warning(f"⚠️ Could not save attachment to disk ({filename}): {e}") - # Save to database + # Save to database — always store content_data as fallback query = """ INSERT INTO email_attachments - (email_id, filename, content_type, size_bytes, file_path) - VALUES (%s, %s, %s, %s, %s) + (email_id, filename, content_type, size_bytes, file_path, content_data) + VALUES (%s, %s, %s, %s, %s, %s) + ON CONFLICT DO NOTHING """ - execute_insert(query, ( + from psycopg2 import Binary + execute_query(query, ( email_id, filename, content_type, size_bytes, - str(file_path) + file_path_str, + Binary(content) )) - logger.info(f"📎 Saved attachment: {filename} ({size_bytes} bytes)") + logger.info(f"📎 Saved attachment: {filename} ({size_bytes} bytes, disk={file_path_str is not None})") except Exception as e: - logger.error(f"❌ Failed to save attachment {filename}: {e}") + logger.error(f"❌ Failed to save attachment {att.get('filename', '?')}: {e}") + async def _resave_attachment_content(self, message_id: str, attachments: List[Dict]): + """For existing emails, store attachment bytes in content_data if not already saved""" + from psycopg2 import Binary + for att in attachments: + try: + filename = att.get('filename') + content = att.get('content') + if not filename or not content: + continue + query = """ + UPDATE email_attachments + SET content_data = %s + WHERE email_id = ( + SELECT id FROM email_messages WHERE message_id = %s LIMIT 1 + ) + AND filename = %s + AND content_data IS NULL + """ + execute_query(query, (Binary(content), message_id, filename)) + logger.debug(f"💾 Re-saved content_data for attachment: {filename}") + except Exception as e: + logger.warning(f"⚠️ Could not re-save content_data for {att.get('filename', '?')}: {e}") + async def get_unprocessed_emails(self, limit: int = 100) -> List[Dict]: """Get emails from database that haven't been processed yet""" query = """ diff --git a/migrations/139_email_attachment_content_data.sql b/migrations/139_email_attachment_content_data.sql new file mode 100644 index 0000000..b98345b --- /dev/null +++ b/migrations/139_email_attachment_content_data.sql @@ -0,0 +1,7 @@ +-- Migration 139: Store email attachment bytes in DB +-- Allows PDF text extraction even when file is not on disk (container restarts etc.) + +ALTER TABLE email_attachments + ADD COLUMN IF NOT EXISTS content_data BYTEA; + +COMMENT ON COLUMN email_attachments.content_data IS 'Raw attachment bytes stored in DB as fallback when file_path is unavailable';