From 14e1c87a4c46dcd8aef19f8ec554e7fb7aad8521 Mon Sep 17 00:00:00 2001 From: Christian Date: Sun, 1 Mar 2026 15:51:45 +0100 Subject: [PATCH] fix: dedicated footer parser + debug logging for PDF text extraction --- VERSION | 2 +- app/emails/backend/router.py | 137 ++++++++++++++++++++++++++++------- 2 files changed, 113 insertions(+), 26 deletions(-) diff --git a/VERSION b/VERSION index 5bc1cc4..23a63f5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.7 +2.2.8 diff --git a/app/emails/backend/router.py b/app/emails/backend/router.py index be2d6ae..642a94c 100644 --- a/app/emails/backend/router.py +++ b/app/emails/backend/router.py @@ -520,29 +520,106 @@ async def extract_vendor_suggestion(email_id: int): if m: return m.group(1).strip() - # Prioritet 2: typisk faktura-footer format: - # "FirmaNavn - Adresse - Postnr By - CVR-nr.: XXXXXXXX" + # Prioritet 2: e-conomic footer: "FirmaNavn - Adresse - ..." + # Virker både med og uden linjeskift foran m = re.search( - r'^([A-ZÆØÅ][A-Za-zæøåÆØÅ\s\-&\'\.]{2,50}?)\s*[-–]\s*[A-ZÆØÅ][a-zæøåA-ZÆØÅ]', + r'(?:^|\n)([A-ZÆØÅ][A-Za-zæøåÆØÅ][^\n\-]{1,40}?)\s*[-–]\s*[A-ZÆØÅ][a-zæøåA-ZÆØÅ]', text, re.MULTILINE ) if m: name = m.group(1).strip() - if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil')): + if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil', 'kontonr', 'faktura')): return name - # Prioritet 3: tekst lige FORAN "CVR" på samme linje + # Prioritet 3: tekst umiddelbart FORAN "CVR" (typisk "FirmaNavn CVR-nr.") m = re.search( - r'([A-ZÆØÅ][A-Za-zæøåÆØÅ\s\-&\'\.]{2,50}?)\s*[-–,]?\s*(?:CVR|Cvr)', + r'([A-ZÆØÅ][A-Za-zæøåÆØÅ\s&\'\.]{2,50}?)\s*[-–,]?\s*(?:CVR|cvr)', text ) if m: - name = m.group(1).strip().rstrip('-–, ') - if len(name) > 2: + name = m.group(1).strip().rstrip('-–, \t') + if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'mail', 'bank')): return name return sender_name or None + def 
parse_vendor_footer(text: str, own_cvr: str = '') -> dict: + """ + Parser specifikt til e-conomic/Dinero footer-format: + "KONI Accounting - Jernbanegade 12K, st.tv - 4000 Roskilde - DK - CVR-nr.: 35962344" + + Splitter på ' - ' og identificerer segmenterne. + """ + result = {} + # Find linjer der indeholder både vejnavn/postnummer OG CVR-lignende mønstre + # eller blot det klassiske "Firma - Adresse - Postnr By" mønster + for line in text.replace('\r', '\n').split('\n'): + line = line.strip() + if len(line) < 10: + continue + + # Forsøg: split på ' - ' eller ' – ' + parts = re.split(r'\s*[-–]\s*', line) + if len(parts) < 3: + continue + + # Del 0 er typisk firmanavnet (ingen tal, ingen '@') + # Del 1 er typisk adressen (indeholder tal + vejnavn) + # Del 2 (eller del med 4 cifre) er postnummer + by + name_candidate = parts[0].strip() + if not name_candidate or any(c.isdigit() for c in name_candidate[:3]): + continue + if any(w in name_candidate.lower() for w in ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil', 'kontonr')): + continue + + # Find adresse-del (indeholder et vejnummer: bogstaver + tal) + addr_part = None + zip_city_part = None + for part in parts[1:]: + part = part.strip() + # Postnummer-format: 4 cifre + by + if re.match(r'^\d{4}\s+[A-ZÆØÅ]', part): + zip_city_part = part + elif re.search(r'\d', part) and addr_part is None: + # Del med tal = adresse + if not re.match(r'^DK$', part.strip(), re.IGNORECASE): + addr_part = part + + if name_candidate and (addr_part or zip_city_part): + result['name'] = name_candidate + if addr_part and zip_city_part: + result['address'] = f"{addr_part}, {zip_city_part}" + elif addr_part: + result['address'] = addr_part + elif zip_city_part: + result['address'] = zip_city_part + + # Find CVR i denne linje + cvr_m = re.search(r'CVR[^:]*:\s*(\d{8})', line, re.IGNORECASE) + if cvr_m: + val = cvr_m.group(1) + if val != own_cvr and not is_placeholder_cvr(val): + result['cvr_number'] = val + + # Find telefon i denne linje + phone_m 
= re.search(r'(?:Tlf|Tel|Mobil)[.:]?\s*(\+?[\d][\d\s\-]{6,15})', line, re.IGNORECASE) + if phone_m: + result['phone'] = clean_phone(phone_m.group(1)) + + # Find email i denne linje + email_m = re.search(r'(?:Mail|E-mail|Email)[.:]?\s*([\w.\-+]+@[\w\-]+\.[\w\-]+)', line, re.IGNORECASE) + if email_m: + dom = email_m.group(1).split('@')[1].lower() + if dom not in PLATFORM_DOMAINS and 'bmc' not in dom: + result['email'] = email_m.group(1) + result['domain'] = dom + + if result.get('name') or result.get('cvr_number'): + break # Første matchende linje er nok + + return result + + # ── Hoved-logik ───────────────────────────────────────────────────────── try: email_result = execute_query( @@ -589,6 +666,11 @@ async def extract_vendor_suggestion(email_id: int): focused_text = "\n\n".join(focused_parts) combined_text = "\n\n".join(t for _, t in text_parts) + # Debug: log de første og sidste 300 tegn af hvert dokument så vi kan se hvad PDF'en producerer + for src, txt in text_parts: + logger.info(f"📄 [{src}] tekstlængde={len(txt)} — første 300 tegn: {repr(txt[:300])}") + logger.info(f"📄 [{src}] — sidste 300 tegn: {repr(txt[-300:])}") + sender_name = email.get('sender_name') or '' sender_email = email.get('sender_email') or '' @@ -596,26 +678,31 @@ async def extract_vendor_suggestion(email_id: int): sender_domain = sender_email.split('@')[1].lower() if '@' in sender_email else '' is_platform_sender = sender_domain in PLATFORM_DOMAINS - # Brug ikke sender_email som leverandør-email når det er en platform - # Prøv i stedet at finde en rigtig email i teksten - vendor_email = None - if not is_platform_sender and sender_email: - vendor_email = sender_email - else: - for em in re.finditer(r'[\w.\-+]+@([\w\-]+\.[\w\-]+(?:\.[\w]{2,6})?)', focused_text): - dom = em.group(1).lower() - if dom not in PLATFORM_DOMAINS and 'bmc' not in dom: - vendor_email = em.group(0) - break + # ── Trin 1: Prøv dedikeret footer-parser på FULD tekst ────────────── + # (finder "Firma - Adresse - PostnrBy - CVR" 
linjer overalt i dokumentet) + footer_result = parse_vendor_footer(combined_text, own_cvr) + logger.info(f"🏷️ Footer-parser resultat: {footer_result}") - # ── Regex udtræk ──────────────────────────────────────────────────── + # Brug ikke sender_email som leverandør-email når det er en platform + vendor_email = footer_result.get('email') + if not vendor_email: + if not is_platform_sender and sender_email: + vendor_email = sender_email + else: + for em in re.finditer(r'[\w.\-+]+@([\w\-]+\.[\w\-]+(?:\.[\w]{2,6})?)', focused_text): + dom = em.group(1).lower() + if dom not in PLATFORM_DOMAINS and 'bmc' not in dom: + vendor_email = em.group(0) + break + + # ── Trin 2: Generisk regex udtræk (supplerer footer-parser) ────────── suggestion = { - "name": extract_company_name(focused_text, sender_name) or sender_name, + "name": footer_result.get('name') or extract_company_name(focused_text, sender_name) or sender_name, "email": vendor_email, - "cvr_number": extract_cvr(focused_text, own_cvr), - "phone": extract_phones(focused_text), - "address": extract_address(focused_text), - "domain": extract_domain(focused_text, sender_email if not is_platform_sender else ''), + "cvr_number": footer_result.get('cvr_number') or extract_cvr(focused_text, own_cvr), + "phone": footer_result.get('phone') or extract_phones(focused_text), + "address": footer_result.get('address') or extract_address(focused_text), + "domain": footer_result.get('domain') or extract_domain(focused_text, sender_email if not is_platform_sender else ''), "source": "regex" }