From 84c837f303a8f4c7a16a35602372ce99fdff0700 Mon Sep 17 00:00:00 2001
From: Christian <christian@blaahund.dk>
Date: Sun, 1 Mar 2026 16:36:05 +0100
Subject: [PATCH] fix: store PDF bytes in DB (content_data) + re-save existing
 email attachments v2.2.11

---
 VERSION                                       |  2 +-
 app/emails/backend/router.py                  | 48 ++++++++----
 app/services/email_service.py                 | 76 +++++++++++++++----
 .../139_email_attachment_content_data.sql     |  7 ++
 4 files changed, 101 insertions(+), 32 deletions(-)
 create mode 100644 migrations/139_email_attachment_content_data.sql

diff --git a/VERSION b/VERSION
index a6333e4..0b6e431 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.2.9
+2.2.11
diff --git a/app/emails/backend/router.py b/app/emails/backend/router.py
index 7eb462f..a4a2392 100644
--- a/app/emails/backend/router.py
+++ b/app/emails/backend/router.py
@@ -681,22 +681,40 @@ async def extract_vendor_suggestion(email_id: int):
         )
         for att in (attachments or []):
             raw_path = att.get('file_path')
-            if not raw_path:
-                continue
-            file_path = resolve_file_path(raw_path)
-            if not file_path:
-                logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}")
-                continue
             ct = att.get('content_type', '')
-            if 'pdf' in ct or raw_path.lower().endswith('.pdf'):
-                try:
-                    from pathlib import Path as PathLib
-                    pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
-                    if pdf_text:
-                        text_parts.append(("pdf", pdf_text))
-                        logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)")
-                except Exception as e:
-                    logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
+            filename = att.get('filename') or ''
+            # content_type / filename may be NULL in the DB row: normalise before matching
+            is_pdf = 'pdf' in (ct or '').lower() or filename.lower().endswith('.pdf')
+            if not is_pdf:
+                continue
+            pdf_text = None
+            # Try disk path first
+            file_path = resolve_file_path(raw_path) if raw_path else None
+            if file_path:
+                try:
+                    from pathlib import Path as PathLib
+                    pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
+                    if pdf_text:
+                        logger.info(f"📎 PDF udtrukket fra disk: {filename} ({len(pdf_text)} tegn)")
+                except Exception as e:
+                    logger.warning(f"⚠️ Kunne ikke læse PDF fra disk {file_path}: {e}")
+
+            # Fallback: content_data column in DB, used when the file is missing, unreadable or empty
+            content_data = att.get('content_data')
+            if content_data and not (pdf_text or '').strip():
+                try:
+                    import io
+                    import pdfplumber
+                    with pdfplumber.open(io.BytesIO(bytes(content_data))) as pdf:
+                        pdf_text = "\n".join(p.extract_text() or '' for p in pdf.pages)
+                    if pdf_text.strip():
+                        logger.info(f"📎 PDF udtrukket fra DB content_data: {filename} ({len(pdf_text)} tegn)")
+                except Exception as e:
+                    logger.warning(f"⚠️ Kunne ikke læse PDF fra content_data ({filename}): {e}")
+            elif not file_path:
+                logger.warning(f"⚠️ Bilag ikke fundet på disk og ingen content_data: {raw_path or filename}")
+            if pdf_text and pdf_text.strip():
+                text_parts.append(("pdf", pdf_text))
 
         # Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
         # Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument
diff --git a/app/services/email_service.py b/app/services/email_service.py
index 4ce6126..5dc8626 100644
--- a/app/services/email_service.py
+++ b/app/services/email_service.py
@@ -212,6 +212,12 @@ class EmailService:
                                 logger.info(f"✅ New email: {parsed_email['subject'][:50]}... from {parsed_email['sender_email']}")
                             else:
                                 logger.debug(f"⏭️ Email already exists: {parsed_email['message_id']}")
+                                # Backfill content_data for emails stored before attachment bytes were kept in the DB
+                                existing_attachments = parsed_email.get('attachments')
+                                if existing_attachments:
+                                    await self._resave_attachment_content(
+                                        parsed_email['message_id'], existing_attachments
+                                    )
                         
                         except Exception as e:
                             logger.error(f"❌ Error parsing Graph message: {e}")
@@ -554,48 +560,86 @@ class EmailService:
             return None
     
     async def _save_attachments(self, email_id: int, attachments: List[Dict]):
-        """Save email attachments to disk and database"""
+        """Save email attachments to disk and database (also stores bytes as fallback)"""
         import os
         import hashlib
         from pathlib import Path
         
-        # Create uploads directory if not exists
-        upload_dir = Path("uploads/email_attachments")
-        upload_dir.mkdir(parents=True, exist_ok=True)
+        # Attachment directory lives under the configured UPLOAD_DIR; a mkdir failure is only logged here
+        from app.core.config import settings
+        upload_dir = Path(settings.UPLOAD_DIR) / "email_attachments"
+        try:
+            upload_dir.mkdir(parents=True, exist_ok=True)
+        except Exception as e:
+            logger.warning(f"⚠️ Could not create upload dir {upload_dir}: {e}")
         
         for att in attachments:
             try:
                 filename = att['filename']
-                content = att['content']
+                content = att['content']  # bytes
                 content_type = att.get('content_type', 'application/octet-stream')
-                size_bytes = att['size']
+                size_bytes = att.get('size', len(content) if content else 0)
                 
+                if not content:
+                    continue
+
                 # Generate MD5 hash for deduplication
                 md5_hash = hashlib.md5(content).hexdigest()
                 
-                # Save to disk with hash prefix
-                file_path = upload_dir / f"{md5_hash}_{filename}"
-                file_path.write_bytes(content)
+                # Try to save to disk; use only the basename because attachment names are untrusted input
+                file_path_str = None
+                try:
+                    file_path = upload_dir / f"{md5_hash}_{Path(filename).name}"
+                    file_path.write_bytes(content)
+                    file_path_str = str(file_path)
+                except Exception as e:
+                    logger.warning(f"⚠️ Could not save attachment to disk ({filename}): {e}")
                 
-                # Save to database
+                # Save to database — always store content_data as fallback
                 query = """
                     INSERT INTO email_attachments 
-                    (email_id, filename, content_type, size_bytes, file_path)
-                    VALUES (%s, %s, %s, %s, %s)
+                    (email_id, filename, content_type, size_bytes, file_path, content_data)
+                    VALUES (%s, %s, %s, %s, %s, %s)
+                    ON CONFLICT DO NOTHING
                 """
-                execute_insert(query, (
+                from psycopg2 import Binary
+                execute_query(query, (
                     email_id,
                     filename,
                     content_type,
                     size_bytes,
-                    str(file_path)
+                    file_path_str,
+                    Binary(content)
                 ))
                 
-                logger.info(f"📎 Saved attachment: {filename} ({size_bytes} bytes)")
+                logger.info(f"📎 Saved attachment: {filename} ({size_bytes} bytes, disk={file_path_str is not None})")
                 
             except Exception as e:
-                logger.error(f"❌ Failed to save attachment {filename}: {e}")
+                logger.error(f"❌ Failed to save attachment {att.get('filename', '?')}: {e}")
     
+    async def _resave_attachment_content(self, message_id: str, attachments: List[Dict]):
+        """Backfill content_data on the stored attachment rows of an already-saved email.
+        Rows are matched by message_id and filename; only rows whose content_data is NULL are written."""
+        from psycopg2 import Binary
+        for item in attachments:
+            try:
+                name, payload = item.get('filename'), item.get('content')
+                if not (name and payload):
+                    continue
+                # The NULL guard in the WHERE clause keeps repeated syncs from overwriting stored bytes
+                execute_query("""
+                    UPDATE email_attachments
+                    SET content_data = %s
+                    WHERE email_id = (
+                        SELECT id FROM email_messages WHERE message_id = %s LIMIT 1
+                    )
+                    AND filename = %s
+                    AND content_data IS NULL
+                """, (Binary(payload), message_id, name))
+                logger.debug(f"💾 Re-saved content_data for attachment: {name}")
+            except Exception as e:
+                logger.warning(f"⚠️ Could not re-save content_data for {item.get('filename', '?')}: {e}")
+
     async def get_unprocessed_emails(self, limit: int = 100) -> List[Dict]:
         """Get emails from database that haven't been processed yet"""
         query = """
diff --git a/migrations/139_email_attachment_content_data.sql b/migrations/139_email_attachment_content_data.sql
new file mode 100644
index 0000000..b98345b
--- /dev/null
+++ b/migrations/139_email_attachment_content_data.sql
@@ -0,0 +1,7 @@
+-- Migration 139: Store email attachment bytes in DB
+-- Allows PDF text extraction even when file is not on disk (container restarts etc.)
+-- The column is nullable: existing rows keep content_data NULL until the email sync backfills them.
+ALTER TABLE email_attachments
+    ADD COLUMN IF NOT EXISTS content_data BYTEA;
+
+COMMENT ON COLUMN email_attachments.content_data IS 'Raw attachment bytes stored in DB as fallback when file_path is unavailable';