fix: read body_html + resolve relative file paths for PDF extraction
This commit is contained in:
parent
14e1c87a4c
commit
eb0dad8a10
@ -633,26 +633,70 @@ async def extract_vendor_suggestion(email_id: int):
|
|||||||
from app.core.config import settings
|
from app.core.config import settings
|
||||||
own_cvr = getattr(settings, 'OWN_CVR', '')
|
own_cvr = getattr(settings, 'OWN_CVR', '')
|
||||||
|
|
||||||
|
def resolve_file_path(raw_path: str) -> Optional[str]:
    """Resolve a stored attachment path to an existing file on disk.

    The path is first tried exactly as given. If that fails, it is
    re-anchored (with any leading slashes removed) under '/app',
    '/app/app' and finally the current working directory, in that order.

    Args:
        raw_path: Relative or absolute path as stored for the attachment.

    Returns:
        The first candidate that is an existing regular file, or None when
        the input is empty or no candidate exists.
    """
    import os

    # An empty path would otherwise join to '/app/' and "resolve" to a
    # directory, so reject it up front.
    if not raw_path:
        return None

    # Only regular files are useful to callers; a directory that happens to
    # exist at the path must not count as a hit.
    if os.path.isfile(raw_path):
        return raw_path

    # Docker: the working directory is /app, so try both layouts plus a
    # CWD-relative fallback. Leading slashes are stripped so os.path.join
    # does not discard the base.
    relative = raw_path.lstrip('/')
    for base in ('/app', '/app/app', ''):
        candidate = os.path.join(base, relative)
        if os.path.isfile(candidate):
            return candidate
    return None
|
||||||
|
|
||||||
|
def html_to_text(html: str) -> str:
    """Strip HTML markup and decode entities, returning plain text.

    Line structure is kept: block-level opening tags (br, p, div, tr, td,
    th, li) become line breaks, every other tag becomes a space, runs of
    whitespace inside a line collapse to one space, and empty lines are
    dropped.

    Args:
        html: HTML source, e.g. the HTML body of an email.

    Returns:
        The visible text with one non-empty line per detected block.
    """
    # The parameter shadows the stdlib module name, hence the alias.
    import html as html_lib

    # Drop <style>/<script> blocks including their contents.
    text = re.sub(r'<(style|script)\b[^>]*>.*?</\1\s*>', '', html,
                  flags=re.DOTALL | re.IGNORECASE)
    # Drop HTML comments as a whole. Without this, a comment containing '>'
    # (e.g. Outlook conditional comments "<!--[if mso]>...<![endif]-->")
    # is only partially removed by the generic tag pattern below and its
    # contents leak into the text.
    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.DOTALL)
    # Replace block-level opening tags with a line break. The \b keeps the
    # match to whole tag names, so <link>, <pre>, <thead>, <picture> are
    # not mistaken for <li>, <p>, <th>, <p>.
    text = re.sub(r'<(?:br|p|div|tr|td|th|li)\b[^>]*>', '\n', text,
                  flags=re.IGNORECASE)
    # Remove all remaining tags.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode HTML entities (&nbsp; &amp; etc.) only after the tags are
    # gone, so an escaped "&lt;b&gt;" survives as literal text.
    text = html_lib.unescape(text)
    # Normalise whitespace within each line but preserve line breaks.
    lines = [' '.join(line.split()) for line in text.split('\n')]
    return '\n'.join(line for line in lines if line)
|
||||||
|
|
||||||
# Saml tekst fra body + PDF-bilag
|
# Saml tekst fra body + PDF-bilag
|
||||||
text_parts = []
|
text_parts = []
|
||||||
if email.get('body_text'):
|
body_text = email.get('body_text') or ''
|
||||||
text_parts.append(("body", email['body_text']))
|
body_html = email.get('body_html') or ''
|
||||||
|
|
||||||
|
if body_text.strip():
|
||||||
|
text_parts.append(("body", body_text))
|
||||||
|
elif body_html.strip():
|
||||||
|
# e-conomic og mange andre sender kun HTML - konverter til plain text
|
||||||
|
plain = html_to_text(body_html)
|
||||||
|
if plain.strip():
|
||||||
|
text_parts.append(("body", plain))
|
||||||
|
logger.info(f"📧 Email {email_id}: bruger HTML→tekst konvertering ({len(plain)} tegn)")
|
||||||
|
|
||||||
|
|
||||||
attachments = execute_query(
|
attachments = execute_query(
|
||||||
"SELECT * FROM email_attachments WHERE email_id = %s ORDER BY id",
|
"SELECT * FROM email_attachments WHERE email_id = %s ORDER BY id",
|
||||||
(email_id,)
|
(email_id,)
|
||||||
)
|
)
|
||||||
for att in (attachments or []):
|
for att in (attachments or []):
|
||||||
file_path = att.get('file_path')
|
raw_path = att.get('file_path')
|
||||||
if file_path and os.path.exists(file_path):
|
if not raw_path:
|
||||||
ct = att.get('content_type', '')
|
continue
|
||||||
if 'pdf' in ct or file_path.lower().endswith('.pdf'):
|
file_path = resolve_file_path(raw_path)
|
||||||
try:
|
if not file_path:
|
||||||
pdf_text = await ollama_service._extract_text_from_file(file_path)
|
logger.warning(f"⚠️ Bilag ikke fundet på disk: {raw_path}")
|
||||||
if pdf_text:
|
continue
|
||||||
text_parts.append(("pdf", pdf_text))
|
ct = att.get('content_type', '')
|
||||||
except Exception as e:
|
if 'pdf' in ct or raw_path.lower().endswith('.pdf'):
|
||||||
logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
|
try:
|
||||||
|
from pathlib import Path as PathLib
|
||||||
|
pdf_text = await ollama_service._extract_text_from_file(PathLib(file_path))
|
||||||
|
if pdf_text:
|
||||||
|
text_parts.append(("pdf", pdf_text))
|
||||||
|
logger.info(f"📎 PDF udtrukket: {att.get('filename')} ({len(pdf_text)} tegn)")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
|
||||||
|
|
||||||
# Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
|
# Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
|
||||||
# Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument
|
# Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user