fix: read body_html + resolve relative file paths for PDF extraction

2026-03-01 16:03:07 +01:00 · 2026-03-01 16:03:07 +01:00 · eb0dad8a10
commit eb0dad8a10
parent 14e1c87a4c
2 changed files with 57 additions and 13 deletions
--- a/2
+++ b/2
@ -1 +1 @@
-2.2.8
+2.2.9
--- a/app/emails/backend/router.py
+++ b/app/emails/backend/router.py
@ -633,26 +633,70 @@ async def extract_vendor_suggestion(email_id: int):
        from app.core.config import settings
        own_cvr = getattr(settings, 'OWN_CVR', '')

+        def resolve_file_path(raw_path: str) -> Optional[str]:
+            """Resolve a relative/absolute file path, trying /app prefixes (Docker); return None if nothing exists."""
+            import os
+            if os.path.exists(raw_path):
+                return raw_path
+            # Docker: CWD is expected to be /app, so try each base-directory variant in turn
+            for base in ('/app', '/app/app', ''):
+                candidate = os.path.join(base, raw_path.lstrip('/'))
+                if os.path.exists(candidate):
+                    return candidate
+            return None
+
+        def html_to_text(html: str) -> str:
+            """Strip HTML tags and decode entities, returning plain text with one non-empty line per row."""
+            import html as html_lib
+            # Remove style/script blocks together with their contents
+            text = re.sub(r'<(style|script)[^>]*>.*?</\1>', '', html, flags=re.DOTALL | re.IGNORECASE)
+            # Replace opening tags starting with br/p/div/tr/td/th/li by a newline (prefix match, so e.g. <link> also hits)
+            text = re.sub(r'<(?:br|p|div|tr|td|th|li)[^>]*>', '\n', text, flags=re.IGNORECASE)
+            # Replace every remaining tag with a single space
+            text = re.sub(r'<[^>]+>', ' ', text)
+            # Decode HTML entities (&nbsp; &amp; etc.)
+            text = html_lib.unescape(text)
+            # Collapse whitespace within each line, keep line breaks, and drop lines that end up empty
+            lines = [' '.join(line.split()) for line in text.split('\n')]
+            return '\n'.join(line for line in lines if line)
+
        # Saml tekst fra body + PDF-bilag
        text_parts = []
-        if email.get('body_text'):
-            text_parts.append(("body", email['body_text']))
+        body_text = email.get('body_text') or ''
+        body_html = email.get('body_html') or ''
+
+        if body_text.strip():
+            text_parts.append(("body", body_text))
+        elif body_html.strip():
+            # e-conomic og mange andre sender kun HTML - konverter til plain text
+            plain = html_to_text(body_html)
+            if plain.strip():
+                text_parts.append(("body", plain))
+                logger.info(f"📧 Email {email_id}: bruger HTML→tekst konvertering ({len(plain)} tegn)")
+

        attachments = execute_query(
            "SELECT * FROM email_attachments WHERE email_id = %s ORDER BY id",
            (email_id,)
        )
        for att in (attachments or []):
-            file_path = att.get('file_path')
-            if file_path and os.path.exists(file_path):
-                ct = att.get('content_type', '')
-                if 'pdf' in ct or file_path.lower().endswith('.pdf'):
-                    try:
-                        pdf_text = await ollama_service._extract_text_from_file(file_path)
-                        if pdf_text:
-                            text_parts.append(("pdf", pdf_text))
-                    except Exception as e:
-                        logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
+            raw_path = att.get('file_path')
+            if not raw_path:
+                continue
+            file_path = resolve_file_path(raw_path)
+            if not file_path:
+                logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}")
+                continue
+            ct = att.get('content_type', '')
+            if 'pdf' in ct or raw_path.lower().endswith('.pdf'):
+                try:
+                    from pathlib import Path as PathLib
+                    pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
+                    if pdf_text:
+                        text_parts.append(("pdf", pdf_text))
+                        logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)")
+                except Exception as e:
+                    logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")

        # Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
        # Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument
 @ -1 +1 @@
 .2.8
 .2.9