fix: store PDF bytes in DB (content_data) + re-save existing email attachments v2.2.11

2026-03-01 16:36:05 +01:00 · 2026-03-01 16:36:05 +01:00 · 84c837f303
commit 84c837f303
parent eb0dad8a10
4 changed files with 101 additions and 32 deletions
--- a/2
+++ b/2
@ -1 +1 @@
-2.2.9
+2.2.11
--- a/app/emails/backend/router.py
+++ b/app/emails/backend/router.py
@ -681,22 +681,40 @@ async def extract_vendor_suggestion(email_id: int):
        )
        for att in (attachments or []):
            raw_path = att.get('file_path')
-            if not raw_path:
-                continue
-            file_path = resolve_file_path(raw_path)
-            if not file_path:
-                logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}")
-                continue
            ct = att.get('content_type', '')
-            if 'pdf' in ct or raw_path.lower().endswith('.pdf'):
-                try:
-                    from pathlib import Path as PathLib
-                    pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
-                    if pdf_text:
-                        text_parts.append(("pdf", pdf_text))
-                        logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)")
-                except Exception as e:
-                    logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
+            filename = att.get('filename', '')
+            is_pdf = 'pdf' in ct or filename.lower().endswith('.pdf')
+
+            # Try disk path first
+            file_path = resolve_file_path(raw_path) if raw_path else None
+
+            if file_path:
+                if is_pdf:
+                    try:
+                        from pathlib import Path as PathLib
+                        pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
+                        if pdf_text:
+                            text_parts.append(("pdf", pdf_text))
+                            logger.info(f"📎 PDF udtrukket fra disk: {filename} ({len(pdf_text)} tegn)")
+                    except Exception as e:
+                        logger.warning(f"⚠️ Kunne ikke læse PDF fra disk {file_path}: {e}")
+            elif is_pdf:
+                # Fallback: read from content_data column in DB
+                content_data = att.get('content_data')
+                if content_data:
+                    try:
+                        import io
+                        import pdfplumber
+                        pdf_bytes = bytes(content_data)
+                        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
+                            pdf_text = "\n".join(p.extract_text() or '' for p in pdf.pages)
+                        if pdf_text.strip():
+                            text_parts.append(("pdf", pdf_text))
+                            logger.info(f"📎 PDF udtrukket fra DB content_data: {filename} ({len(pdf_text)} tegn)")
+                    except Exception as e:
+                        logger.warning(f"⚠️ Kunne ikke læse PDF fra content_data ({filename}): {e}")
+                else:
+                    logger.warning(f"⚠️ Bilag ikke fundet på disk og ingen content_data: {raw_path or filename}")

        # Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
        # Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument
--- a/app/services/email_service.py
+++ b/app/services/email_service.py
@ -212,6 +212,12 @@ class EmailService:
                                logger.info(f"✅ New email: {parsed_email['subject'][:50]}... from {parsed_email['sender_email']}")
                            else:
                                logger.debug(f"⏭️ Email already exists: {parsed_email['message_id']}")
+                                # Re-save attachment bytes for existing emails (fills content_data for old emails)
+                                if parsed_email.get('attachments'):
+                                    await self._resave_attachment_content(
+                                        parsed_email['message_id'],
+                                        parsed_email['attachments']
+                                    )
                        
                        except Exception as e:
                            logger.error(f"❌ Error parsing Graph message: {e}")
@ -554,48 +560,86 @@ class EmailService:
            return None
    
    async def _save_attachments(self, email_id: int, attachments: List[Dict]):
-        """Save email attachments to disk and database"""
+        """Save email attachments to disk and database (also stores bytes as fallback)"""
        import os
        import hashlib
        from pathlib import Path
        
-        # Create uploads directory if not exists
-        upload_dir = Path("uploads/email_attachments")
-        upload_dir.mkdir(parents=True, exist_ok=True)
+        # Use absolute path based on UPLOAD_DIR setting
+        from app.core.config import settings
+        upload_dir = Path(settings.UPLOAD_DIR) / "email_attachments"
+        try:
+            upload_dir.mkdir(parents=True, exist_ok=True)
+        except Exception as e:
+            logger.warning(f"⚠️ Could not create upload dir {upload_dir}: {e}")
        
        for att in attachments:
            try:
                filename = att['filename']
-                content = att['content']
+                content = att['content']  # bytes
                content_type = att.get('content_type', 'application/octet-stream')
-                size_bytes = att['size']
+                size_bytes = att.get('size', len(content) if content else 0)
                
+                if not content:
+                    continue
+
                # Generate MD5 hash for deduplication
                md5_hash = hashlib.md5(content).hexdigest()
                
-                # Save to disk with hash prefix
-                file_path = upload_dir / f"{md5_hash}_{filename}"
-                file_path.write_bytes(content)
+                # Try to save to disk
+                file_path_str = None
+                try:
+                    file_path = upload_dir / f"{md5_hash}_{filename}"
+                    file_path.write_bytes(content)
+                    file_path_str = str(file_path)
+                except Exception as e:
+                    logger.warning(f"⚠️ Could not save attachment to disk ({filename}): {e}")
                
-                # Save to database
+                # Save to database — always store content_data as fallback
                query = """
                    INSERT INTO email_attachments 
-                    (email_id, filename, content_type, size_bytes, file_path)
-                    VALUES (%s, %s, %s, %s, %s)
+                    (email_id, filename, content_type, size_bytes, file_path, content_data)
+                    VALUES (%s, %s, %s, %s, %s, %s)
+                    ON CONFLICT DO NOTHING
                """
-                execute_insert(query, (
+                from psycopg2 import Binary
+                execute_query(query, (
                    email_id,
                    filename,
                    content_type,
                    size_bytes,
-                    str(file_path)
+                    file_path_str,
+                    Binary(content)
                ))
                
-                logger.info(f"📎 Saved attachment: {filename} ({size_bytes} bytes)")
+                logger.info(f"📎 Saved attachment: {filename} ({size_bytes} bytes, disk={file_path_str is not None})")
                
            except Exception as e:
-                logger.error(f"❌ Failed to save attachment {filename}: {e}")
+                logger.error(f"❌ Failed to save attachment {att.get('filename', '?')}: {e}")
    
+    async def _resave_attachment_content(self, message_id: str, attachments: List[Dict]):
+        """For existing emails, store attachment bytes in content_data if not already saved"""
+        from psycopg2 import Binary
+        for att in attachments:
+            try:
+                filename = att.get('filename')
+                content = att.get('content')
+                if not filename or not content:
+                    continue
+                query = """
+                    UPDATE email_attachments
+                    SET content_data = %s
+                    WHERE email_id = (
+                        SELECT id FROM email_messages WHERE message_id = %s LIMIT 1
+                    )
+                    AND filename = %s
+                    AND content_data IS NULL
+                """
+                execute_query(query, (Binary(content), message_id, filename))
+                logger.debug(f"💾 Re-saved content_data for attachment: {filename}")
+            except Exception as e:
+                logger.warning(f"⚠️ Could not re-save content_data for {att.get('filename', '?')}: {e}")
+
    async def get_unprocessed_emails(self, limit: int = 100) -> List[Dict]:
        """Get emails from database that haven't been processed yet"""
        query = """
--- a/migrations/139_email_attachment_content_data.sql
+++ b/migrations/139_email_attachment_content_data.sql
@ -0,0 +1,7 @@
+-- Migration 139: Store email attachment bytes in DB
+-- Allows PDF text extraction even when file is not on disk (container restarts etc.)
+
+ALTER TABLE email_attachments
+    ADD COLUMN IF NOT EXISTS content_data BYTEA;
+
+COMMENT ON COLUMN email_attachments.content_data IS 'Raw attachment bytes stored in DB as fallback when file_path is unavailable';
 @ -1 +1 @@
 .2.9
 .2.11