fix: dedicated footer parser + debug logging for PDF text extraction

This commit is contained in:
Christian 2026-03-01 15:51:45 +01:00
parent 04acdecb91
commit 14e1c87a4c
2 changed files with 113 additions and 26 deletions

View File

@ -1 +1 @@
2.2.7
2.2.8

View File

@ -520,29 +520,106 @@ async def extract_vendor_suggestion(email_id: int):
if m:
return m.group(1).strip()
# Prioritet 2: typisk faktura-footer format:
# "FirmaNavn - Adresse - Postnr By - CVR-nr.: XXXXXXXX"
# Prioritet 2: e-conomic footer: "FirmaNavn - Adresse - ..."
# Virker både med og uden linjeskift foran
m = re.search(
r'^([A-ZÆØÅ][A-Za-zæøåÆØÅ\s\-&\'\.]{2,50}?)\s*[-]\s*[A-ZÆØÅ][a-zæøåA-ZÆØÅ]',
r'(?:^|\n)([A-ZÆØÅ][A-Za-zæøåÆØÅ][^\n\-]{1,40}?)\s*[-]\s*[A-ZÆØÅ][a-zæøåA-ZÆØÅ]',
text, re.MULTILINE
)
if m:
name = m.group(1).strip()
if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil')):
if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil', 'kontonr', 'faktura')):
return name
# Prioritet 3: tekst lige FORAN "CVR" på samme linje
# Prioritet 3: tekst umiddelbart FORAN "CVR" (typisk "FirmaNavn CVR-nr.")
m = re.search(
r'([A-ZÆØÅ][A-Za-zæøåÆØÅ\s\-&\'\.]{2,50}?)\s*[-,]?\s*(?:CVR|Cvr)',
r'([A-ZÆØÅ][A-Za-zæøåÆØÅ\s&\'\.]{2,50}?)\s*[-,]?\s*(?:CVR|cvr)',
text
)
if m:
name = m.group(1).strip().rstrip('-, ')
if len(name) > 2:
name = m.group(1).strip().rstrip('-, \t')
if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'mail', 'bank')):
return name
return sender_name or None
def parse_vendor_footer(text: str, own_cvr: str = '') -> dict:
    """
    Parse a one-line e-conomic/Dinero style vendor footer, for example:

        "KONI Accounting - Jernbanegade 12K, st.tv - 4000 Roskilde - DK - CVR-nr.: 35962344"

    Every line of ``text`` is split on ' - ' (a hyphen with whitespace on both
    sides) and the segments are classified as company name, street address and
    "zip + city". The first line that yields a name or a CVR number wins.

    Args:
        text: Full document text; may contain ``\\r`` and/or ``\\n`` line breaks.
        own_cvr: CVR number that must never be reported as the vendor's.

    Returns:
        A dict with any of the keys ``name``, ``address``, ``cvr_number``,
        ``phone``, ``email`` and ``domain``; empty when no footer line matched.
    """
    result = {}
    # Words that mark a segment as contact/bank info rather than a company name.
    name_stop_words = ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil', 'kontonr')
    # A segment that *starts* with one of these labels carries labelled data
    # (CVR, phone, ...) and must not be mistaken for a street address just
    # because it contains digits. Anchored at the start so that street names
    # such as "Hotelvej 12" are still accepted.
    labelled_segment = r'^(?:CVR|Tlf|Tel|Telefon|Mobil|Mail|E-mail|Email|Bank|Kontonr|Konto|Reg)\b'

    for line in text.replace('\r', '\n').split('\n'):
        line = line.strip()
        if len(line) < 10:
            continue

        # Split only on a hyphen surrounded by whitespace. Splitting on every
        # hyphen would cut hyphenated names ("A-Z Byg"), house-number ranges
        # ("Nygade 12-14") and "CVR-nr." into pieces.
        parts = [segment.strip() for segment in re.split(r'\s+-\s+', line)]
        if len(parts) < 3:
            continue

        # Segment 0 is expected to be the company name: it must not start with
        # digits and must not look like contact/bank information.
        name_candidate = parts[0]
        if not name_candidate or any(c.isdigit() for c in name_candidate[:3]):
            continue
        lowered_name = name_candidate.lower()
        if any(word in lowered_name for word in name_stop_words):
            continue

        addr_part = None
        zip_city_part = None
        for part in parts[1:]:
            if re.match(r'^\d{4}\s+[A-ZÆØÅ]', part):
                # Four digits followed by a capitalised word: zip code + city.
                # A later match overrides an earlier one.
                zip_city_part = part
            elif (
                addr_part is None
                and re.search(r'\d', part)
                and not re.match(labelled_segment, part, re.IGNORECASE)
            ):
                # First remaining segment containing a digit is taken as the
                # street address (street name + house number).
                addr_part = part

        if addr_part or zip_city_part:
            result['name'] = name_candidate
            if addr_part and zip_city_part:
                result['address'] = f"{addr_part}, {zip_city_part}"
            else:
                result['address'] = addr_part or zip_city_part

        # CVR number on this line; the vendor's must differ from our own and
        # must pass is_placeholder_cvr (helper defined elsewhere in the file).
        cvr_m = re.search(r'CVR[^:]*:\s*(\d{8})', line, re.IGNORECASE)
        if cvr_m:
            val = cvr_m.group(1)
            if val != own_cvr and not is_placeholder_cvr(val):
                result['cvr_number'] = val

        # Phone number on this line, normalised by clean_phone.
        phone_m = re.search(r'(?:Tlf|Tel|Mobil)[.:]?\s*(\+?[\d][\d\s\-]{6,15})', line, re.IGNORECASE)
        if phone_m:
            result['phone'] = clean_phone(phone_m.group(1))

        # Labelled e-mail address on this line. The domain may have more than
        # two labels ("firma.co.uk"); platform and 'bmc' domains are ignored.
        email_m = re.search(
            r'(?:Mail|E-mail|Email)[.:]?\s*([\w.\-+]+@[\w\-]+(?:\.[\w\-]+)+)',
            line, re.IGNORECASE
        )
        if email_m:
            dom = email_m.group(1).split('@')[1].lower()
            if dom not in PLATFORM_DOMAINS and 'bmc' not in dom:
                result['email'] = email_m.group(1)
                result['domain'] = dom

        if result.get('name') or result.get('cvr_number'):
            break  # The first matching line is enough.

    return result
# ── Hoved-logik ─────────────────────────────────────────────────────────
try:
email_result = execute_query(
@ -589,6 +666,11 @@ async def extract_vendor_suggestion(email_id: int):
focused_text = "\n\n".join(focused_parts)
combined_text = "\n\n".join(t for _, t in text_parts)
# Debug: log de første 500 tegn af hvert dokument så vi kan se hvad PDF'en producerer
for src, txt in text_parts:
logger.info(f"📄 [{src}] tekstlængde={len(txt)} — første 300 tegn: {repr(txt[:300])}")
logger.info(f"📄 [{src}] — sidste 300 tegn: {repr(txt[-300:])}")
sender_name = email.get('sender_name') or ''
sender_email = email.get('sender_email') or ''
@ -596,26 +678,31 @@ async def extract_vendor_suggestion(email_id: int):
sender_domain = sender_email.split('@')[1].lower() if '@' in sender_email else ''
is_platform_sender = sender_domain in PLATFORM_DOMAINS
# Brug ikke sender_email som leverandør-email når det er en platform
# Prøv i stedet at finde en rigtig email i teksten
vendor_email = None
if not is_platform_sender and sender_email:
vendor_email = sender_email
else:
for em in re.finditer(r'[\w.\-+]+@([\w\-]+\.[\w\-]+(?:\.[\w]{2,6})?)', focused_text):
dom = em.group(1).lower()
if dom not in PLATFORM_DOMAINS and 'bmc' not in dom:
vendor_email = em.group(0)
break
# ── Trin 1: Prøv dedikeret footer-parser på FULD tekst ──────────────
# (finder "Firma - Adresse - PostnrBy - CVR" linjer overalt i dokumentet)
footer_result = parse_vendor_footer(combined_text, own_cvr)
logger.info(f"🏷️ Footer-parser resultat: {footer_result}")
# ── Regex udtræk ────────────────────────────────────────────────────
# Brug ikke sender_email som leverandør-email når det er en platform
vendor_email = footer_result.get('email')
if not vendor_email:
if not is_platform_sender and sender_email:
vendor_email = sender_email
else:
for em in re.finditer(r'[\w.\-+]+@([\w\-]+\.[\w\-]+(?:\.[\w]{2,6})?)', focused_text):
dom = em.group(1).lower()
if dom not in PLATFORM_DOMAINS and 'bmc' not in dom:
vendor_email = em.group(0)
break
# ── Trin 2: Generisk regex udtræk (supplerer footer-parser) ──────────
suggestion = {
"name": extract_company_name(focused_text, sender_name) or sender_name,
"name": footer_result.get('name') or extract_company_name(focused_text, sender_name) or sender_name,
"email": vendor_email,
"cvr_number": extract_cvr(focused_text, own_cvr),
"phone": extract_phones(focused_text),
"address": extract_address(focused_text),
"domain": extract_domain(focused_text, sender_email if not is_platform_sender else ''),
"cvr_number": footer_result.get('cvr_number') or extract_cvr(focused_text, own_cvr),
"phone": footer_result.get('phone') or extract_phones(focused_text),
"address": footer_result.get('address') or extract_address(focused_text),
"domain": footer_result.get('domain') or extract_domain(focused_text, sender_email if not is_platform_sender else ''),
"source": "regex"
}