diff --git a/VERSION b/VERSION
index 23a63f5..a6333e4 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.2.8
+2.2.9
diff --git a/app/emails/backend/router.py b/app/emails/backend/router.py
index 642a94c..7eb462f 100644
--- a/app/emails/backend/router.py
+++ b/app/emails/backend/router.py
@@ -633,26 +633,70 @@ async def extract_vendor_suggestion(email_id: int):
     from app.core.config import settings
     own_cvr = getattr(settings, 'OWN_CVR', '')
 
+    def resolve_file_path(raw_path: str) -> Optional[str]:
+        """Løs relativ/absolut filsti — prøv /app-prefix i Docker"""
+        import os
+        if os.path.exists(raw_path):
+            return raw_path
+        # Docker: CWD er /app, så prøv begge varianter
+        for base in ('/app', '/app/app', ''):
+            candidate = os.path.join(base, raw_path.lstrip('/'))
+            if os.path.exists(candidate):
+                return candidate
+        return None
+
+    def html_to_text(html: str) -> str:
+        """Fjern HTML-tags og decode entities til plain text"""
+        import html as html_lib
+        # Fjern style/script blokke
+        text = re.sub(r'<(style|script)[^>]*>.*?</\1>', '', html, flags=re.DOTALL | re.IGNORECASE)
+        # Erstat <br>, <p>, <div>, <tr> med linjeskift
+        text = re.sub(r'<(?:br|p|div|tr|td|th|li)[^>]*>', '\n', text, flags=re.IGNORECASE)
+        # Fjern alle resterende tags
+        text = re.sub(r'<[^>]+>', ' ', text)
+        # Decode HTML entities (&nbsp; &amp; osv.)
+        text = html_lib.unescape(text)
+        # Normaliser whitespace men bevar linjeskift
+        lines = [' '.join(line.split()) for line in text.split('\n')]
+        return '\n'.join(line for line in lines if line)
+
     # Saml tekst fra body + PDF-bilag
     text_parts = []
-    if email.get('body_text'):
-        text_parts.append(("body", email['body_text']))
+    body_text = email.get('body_text') or ''
+    body_html = email.get('body_html') or ''
+
+    if body_text.strip():
+        text_parts.append(("body", body_text))
+    elif body_html.strip():
+        # e-conomic og mange andre sender kun HTML - konverter til plain text
+        plain = html_to_text(body_html)
+        if plain.strip():
+            text_parts.append(("body", plain))
+            logger.info(f"📧 Email {email_id}: bruger HTML→tekst konvertering ({len(plain)} tegn)")
+
     attachments = execute_query(
         "SELECT * FROM email_attachments WHERE email_id = %s ORDER BY id",
         (email_id,)
     )
 
     for att in (attachments or []):
-        file_path = att.get('file_path')
-        if file_path and os.path.exists(file_path):
-            ct = att.get('content_type', '')
-            if 'pdf' in ct or file_path.lower().endswith('.pdf'):
-                try:
-                    pdf_text = await ollama_service._extract_text_from_file(file_path)
-                    if pdf_text:
-                        text_parts.append(("pdf", pdf_text))
-                except Exception as e:
-                    logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
+        raw_path = att.get('file_path')
+        if not raw_path:
+            continue
+        file_path = resolve_file_path(raw_path)
+        if not file_path:
+            logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}")
+            continue
+        ct = att.get('content_type', '')
+        if 'pdf' in ct or raw_path.lower().endswith('.pdf'):
+            try:
+                from pathlib import Path as PathLib
+                pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
+                if pdf_text:
+                    text_parts.append(("pdf", pdf_text))
+                    logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)")
+            except Exception as e:
+                logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
 
     # Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
     # Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument