fix: store PDF bytes in DB (content_data) + re-save existing email attachments v2.2.11
This commit is contained in:
parent
eb0dad8a10
commit
84c837f303
@ -681,22 +681,40 @@ async def extract_vendor_suggestion(email_id: int):
|
||||
)
|
||||
for att in (attachments or []):
|
||||
raw_path = att.get('file_path')
|
||||
if not raw_path:
|
||||
continue
|
||||
file_path = resolve_file_path(raw_path)
|
||||
if not file_path:
|
||||
logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}")
|
||||
continue
|
||||
ct = att.get('content_type', '')
|
||||
if 'pdf' in ct or raw_path.lower().endswith('.pdf'):
|
||||
try:
|
||||
from pathlib import Path as PathLib
|
||||
pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
|
||||
if pdf_text:
|
||||
text_parts.append(("pdf", pdf_text))
|
||||
logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
|
||||
filename = att.get('filename', '')
|
||||
is_pdf = 'pdf' in ct or filename.lower().endswith('.pdf')
|
||||
|
||||
# Try disk path first
|
||||
file_path = resolve_file_path(raw_path) if raw_path else None
|
||||
|
||||
if file_path:
|
||||
if is_pdf:
|
||||
try:
|
||||
from pathlib import Path as PathLib
|
||||
pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
|
||||
if pdf_text:
|
||||
text_parts.append(("pdf", pdf_text))
|
||||
logger.info(f"📎 PDF udtrukket fra disk: {filename} ({len(pdf_text)} tegn)")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Kunne ikke læse PDF fra disk {file_path}: {e}")
|
||||
elif is_pdf:
|
||||
# Fallback: read from content_data column in DB
|
||||
content_data = att.get('content_data')
|
||||
if content_data:
|
||||
try:
|
||||
import io
|
||||
import pdfplumber
|
||||
pdf_bytes = bytes(content_data)
|
||||
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
|
||||
pdf_text = "\n".join(p.extract_text() or '' for p in pdf.pages)
|
||||
if pdf_text.strip():
|
||||
text_parts.append(("pdf", pdf_text))
|
||||
logger.info(f"📎 PDF udtrukket fra DB content_data: {filename} ({len(pdf_text)} tegn)")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Kunne ikke læse PDF fra content_data ({filename}): {e}")
|
||||
else:
|
||||
logger.warning(f"⚠️ Bilag ikke fundet på disk og ingen content_data: {raw_path or filename}")
|
||||
|
||||
# Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
|
||||
# Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument
|
||||
|
||||
@ -212,6 +212,12 @@ class EmailService:
|
||||
logger.info(f"✅ New email: {parsed_email['subject'][:50]}... from {parsed_email['sender_email']}")
|
||||
else:
|
||||
logger.debug(f"⏭️ Email already exists: {parsed_email['message_id']}")
|
||||
# Re-save attachment bytes for existing emails (fills content_data for old emails)
|
||||
if parsed_email.get('attachments'):
|
||||
await self._resave_attachment_content(
|
||||
parsed_email['message_id'],
|
||||
parsed_email['attachments']
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Error parsing Graph message: {e}")
|
||||
@ -554,48 +560,86 @@ class EmailService:
|
||||
return None
|
||||
|
||||
async def _save_attachments(self, email_id: int, attachments: List[Dict]) -> None:
    """Save email attachments to disk and database (also stores bytes as fallback).

    Each attachment is written below ``settings.UPLOAD_DIR/email_attachments``
    and inserted into ``email_attachments``. The raw bytes are always passed
    for the ``content_data`` column, so a failed disk write is only logged
    and the row is inserted with ``file_path`` set to NULL.

    Args:
        email_id: Value stored in ``email_attachments.email_id``.
        attachments: Dicts with ``filename`` and ``content`` (bytes) and,
            optionally, ``content_type`` and ``size``.

    A failure while handling one attachment is logged and does not stop the
    remaining attachments from being processed.
    """
    import os
    import hashlib
    from pathlib import Path
    from psycopg2 import Binary

    # Use absolute path based on UPLOAD_DIR setting
    from app.core.config import settings
    upload_dir = Path(settings.UPLOAD_DIR) / "email_attachments"
    try:
        upload_dir.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        # Not fatal: the bytes are still stored in the database below.
        logger.warning(f"⚠️ Could not create upload dir {upload_dir}: {e}")

    # Always store content_data as fallback; loop-invariant, so built once.
    query = """
        INSERT INTO email_attachments
        (email_id, filename, content_type, size_bytes, file_path, content_data)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT DO NOTHING
    """

    for att in attachments:
        try:
            filename = att['filename']
            content = att['content']  # bytes
            content_type = att.get('content_type', 'application/octet-stream')

            if not content:
                continue

            # Fall back to the real length when 'size' is missing or empty.
            size_bytes = att.get('size') or len(content)

            # Generate MD5 hash for deduplication
            md5_hash = hashlib.md5(content).hexdigest()

            # Try to save to disk
            file_path_str = None
            try:
                # The filename comes from the email and is untrusted: keep only
                # its final component so "../" or absolute names cannot write
                # outside upload_dir. Backslashes are normalised first so
                # Windows-style paths are reduced as well.
                safe_name = os.path.basename(str(filename).replace("\\", "/"))
                file_path = upload_dir / f"{md5_hash}_{safe_name}"
                file_path.write_bytes(content)
                file_path_str = str(file_path)
            except Exception as e:
                logger.warning(f"⚠️ Could not save attachment to disk ({filename}): {e}")

            # Save to database; the original filename is kept in the row.
            execute_query(query, (
                email_id,
                filename,
                content_type,
                size_bytes,
                file_path_str,
                Binary(content),
            ))

            logger.info(f"📎 Saved attachment: {filename} ({size_bytes} bytes, disk={file_path_str is not None})")

        except Exception as e:
            # att.get() instead of the local name: 'filename' may be unbound
            # if the attachment dict itself was malformed.
            logger.error(f"❌ Failed to save attachment {att.get('filename', '?')}: {e}")
|
||||
|
||||
async def _resave_attachment_content(self, message_id: str, attachments: List[Dict]):
    """Backfill ``content_data`` for the attachments of an already stored email.

    For every attachment that has both a filename and content, an UPDATE is
    issued against ``email_attachments`` for the email whose ``message_id``
    matches. The statement only targets rows where ``content_data`` is still
    NULL. Errors are logged as warnings per attachment and not re-raised.

    Args:
        message_id: Message id used to look up the row in ``email_messages``.
        attachments: Dicts read via the ``filename`` and ``content`` keys.
    """
    from psycopg2 import Binary

    backfill_sql = """
        UPDATE email_attachments
        SET content_data = %s
        WHERE email_id = (
            SELECT id FROM email_messages WHERE message_id = %s LIMIT 1
        )
        AND filename = %s
        AND content_data IS NULL
    """

    for item in attachments:
        try:
            name = item.get('filename')
            payload = item.get('content')
            # Entries without a name or without bytes are silently skipped.
            if name and payload:
                execute_query(backfill_sql, (Binary(payload), message_id, name))
                logger.debug(f"💾 Re-saved content_data for attachment: {name}")
        except Exception as err:
            logger.warning(f"⚠️ Could not re-save content_data for {item.get('filename', '?')}: {err}")
|
||||
|
||||
async def get_unprocessed_emails(self, limit: int = 100) -> List[Dict]:
|
||||
"""Get emails from database that haven't been processed yet"""
|
||||
query = """
|
||||
|
||||
7
migrations/139_email_attachment_content_data.sql
Normal file
7
migrations/139_email_attachment_content_data.sql
Normal file
@ -0,0 +1,7 @@
|
||||
-- Migration 139: store the raw bytes of email attachments in the database.
-- This allows PDF text extraction even when the file is not on disk
-- (for example after a container restart).

ALTER TABLE email_attachments ADD COLUMN IF NOT EXISTS content_data BYTEA;

COMMENT ON COLUMN email_attachments.content_data
    IS 'Raw attachment bytes stored in DB as fallback when file_path is unavailable';
|
||||
Loading…
Reference in New Issue
Block a user