diff --git a/VERSION b/VERSION
index 23a63f5..a6333e4 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.2.8
+2.2.9
diff --git a/app/emails/backend/router.py b/app/emails/backend/router.py
index 642a94c..7eb462f 100644
--- a/app/emails/backend/router.py
+++ b/app/emails/backend/router.py
@@ -633,26 +633,70 @@ async def extract_vendor_suggestion(email_id: int):
from app.core.config import settings
own_cvr = getattr(settings, 'OWN_CVR', '')
+    def resolve_file_path(raw_path: str) -> Optional[str]:
+        """Return the first existing on-disk variant of ``raw_path``, or None.
+
+        The path is tried exactly as given first. If that does not exist, its
+        leading slashes are stripped and it is re-rooted under ``/app``,
+        ``/app/app`` and finally the current working directory, in that order.
+        """
+        import os
+        relative = raw_path.lstrip('/')
+        # Docker: CWD is /app, so try both prefixed variants before the bare relative path.
+        candidates = [raw_path]
+        candidates.extend(os.path.join(base, relative) for base in ('/app', '/app/app', ''))
+        return next((path for path in candidates if os.path.exists(path)), None)
+
+    def html_to_text(html: str) -> str:
+        """Strip HTML tags and decode entities, returning plain text.
+
+        Steps, in order: drop ``<style>``/``<script>`` elements together with
+        their content, turn line-breaking opening tags into newlines, replace
+        every remaining tag with a space, decode HTML entities, then collapse
+        whitespace within each line and drop empty lines.
+
+        Args:
+            html: HTML markup to convert.
+
+        Returns:
+            The text content, one non-empty line per detected line break.
+        """
+        import html as html_lib
+        # Remove style/script blocks. The closing tag must be matched as
+        # "</name>"; matching only "name>" would end the block early at any
+        # text that happens to end in "style>" or "script>".
+        text = re.sub(r'<(style|script)[^>]*>.*?</\1\s*>', '', html, flags=re.DOTALL | re.IGNORECASE)
+        # Replace <br>, <p>, <div>, <tr>, <td>, <th>, <li> opening tags with line breaks.
+        # There is no word boundary, so tags starting with these names (e.g. <pre>) match too.
+        text = re.sub(r'<(?:br|p|div|tr|td|th|li)[^>]*>', '\n', text, flags=re.IGNORECASE)
+        # Remove all remaining tags
+        text = re.sub(r'<[^>]+>', ' ', text)
+        # Decode HTML entities (&nbsp;, &amp; etc.)
+        text = html_lib.unescape(text)
+        # Normalise whitespace but keep line breaks
+        lines = [' '.join(line.split()) for line in text.split('\n')]
+        return '\n'.join(line for line in lines if line)
+
# Saml tekst fra body + PDF-bilag
text_parts = []
- if email.get('body_text'):
- text_parts.append(("body", email['body_text']))
+    # Prefer the plain-text body; fall back to the HTML body converted to text.
+    body_text, body_html = (email.get(key) or '' for key in ('body_text', 'body_html'))
+
+    has_text_body = bool(body_text.strip())
+    has_html_body = bool(body_html.strip())
+    if has_text_body:
+        text_parts.append(("body", body_text))
+    elif has_html_body:
+        # e-conomic and many other senders deliver HTML only - convert it to plain text
+        plain = html_to_text(body_html)
+        if plain.strip():
+            text_parts.append(("body", plain))
+            logger.info(f"📧 Email {email_id}: bruger HTML→tekst konvertering ({len(plain)} tegn)")
+
attachments = execute_query(
"SELECT * FROM email_attachments WHERE email_id = %s ORDER BY id",
(email_id,)
)
for att in (attachments or []):
- file_path = att.get('file_path')
- if file_path and os.path.exists(file_path):
- ct = att.get('content_type', '')
- if 'pdf' in ct or file_path.lower().endswith('.pdf'):
- try:
- pdf_text = await ollama_service._extract_text_from_file(file_path)
- if pdf_text:
- text_parts.append(("pdf", pdf_text))
- except Exception as e:
- logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
+        raw_path = att.get('file_path')
+        if not raw_path:
+            continue
+        # The stored path may not exist as written; resolve_file_path tries the
+        # /app-prefixed variants and returns None when nothing is found on disk.
+        file_path = resolve_file_path(raw_path)
+        if not file_path:
+            logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}")
+            continue
+        # `.get(key, '')` still yields None when the key exists with a None value
+        # (e.g. a NULL column), which would make the `in` test below raise
+        # TypeError outside the try block. Coalesce to '' instead.
+        ct = att.get('content_type') or ''
+        # Media type names are case-insensitive, so compare in lowercase.
+        if 'pdf' in ct.lower() or raw_path.lower().endswith('.pdf'):
+            try:
+                from pathlib import Path as PathLib
+                # The extractor is handed a Path object rather than a plain string.
+                pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
+                if pdf_text:
+                    text_parts.append(("pdf", pdf_text))
+                    logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)")
+            except Exception as e:
+                # Best effort: an unreadable PDF must not abort the whole suggestion.
+                logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
# Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
# Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument
|