fix: dedicated footer parser + debug logging for PDF text extraction
This commit is contained in:
parent
04acdecb91
commit
14e1c87a4c
@ -520,29 +520,106 @@ async def extract_vendor_suggestion(email_id: int):
|
|||||||
if m:
|
if m:
|
||||||
return m.group(1).strip()
|
return m.group(1).strip()
|
||||||
|
|
||||||
# Prioritet 2: typisk faktura-footer format:
|
# Prioritet 2: e-conomic footer: "FirmaNavn - Adresse - ..."
|
||||||
# "FirmaNavn - Adresse - Postnr By - CVR-nr.: XXXXXXXX"
|
# Virker både med og uden linjeskift foran
|
||||||
m = re.search(
|
m = re.search(
|
||||||
r'^([A-ZÆØÅ][A-Za-zæøåÆØÅ\s\-&\'\.]{2,50}?)\s*[-–]\s*[A-ZÆØÅ][a-zæøåA-ZÆØÅ]',
|
r'(?:^|\n)([A-ZÆØÅ][A-Za-zæøåÆØÅ][^\n\-]{1,40}?)\s*[-–]\s*[A-ZÆØÅ][a-zæøåA-ZÆØÅ]',
|
||||||
text, re.MULTILINE
|
text, re.MULTILINE
|
||||||
)
|
)
|
||||||
if m:
|
if m:
|
||||||
name = m.group(1).strip()
|
name = m.group(1).strip()
|
||||||
if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil')):
|
if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil', 'kontonr', 'faktura')):
|
||||||
return name
|
return name
|
||||||
|
|
||||||
# Prioritet 3: tekst lige FORAN "CVR" på samme linje
|
# Prioritet 3: tekst umiddelbart FORAN "CVR" (typisk "FirmaNavn CVR-nr.")
|
||||||
m = re.search(
|
m = re.search(
|
||||||
r'([A-ZÆØÅ][A-Za-zæøåÆØÅ\s\-&\'\.]{2,50}?)\s*[-–,]?\s*(?:CVR|Cvr)',
|
r'([A-ZÆØÅ][A-Za-zæøåÆØÅ\s&\'\.]{2,50}?)\s*[-–,]?\s*(?:CVR|cvr)',
|
||||||
text
|
text
|
||||||
)
|
)
|
||||||
if m:
|
if m:
|
||||||
name = m.group(1).strip().rstrip('-–, ')
|
name = m.group(1).strip().rstrip('-–, \t')
|
||||||
if len(name) > 2:
|
if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'mail', 'bank')):
|
||||||
return name
|
return name
|
||||||
|
|
||||||
return sender_name or None
|
return sender_name or None
|
||||||
|
|
||||||
|
def parse_vendor_footer(text: str, own_cvr: str = '') -> dict:
    """Parse an e-conomic/Dinero style invoice footer line into vendor fields.

    The expected footer is a single line whose segments are separated by
    a spaced dash (" - " or " – "), for example:

        "KONI Accounting - Jernbanegade 12K, st.tv - 4000 Roskilde - DK - CVR-nr.: 35962344"

    The text is scanned line by line. Scanning stops at the first line that
    yields a company name or a CVR number.

    Args:
        text: Full document text; both "\\r" and "\\n" are treated as line breaks.
        own_cvr: A CVR number that must never be reported as the vendor's
            (presumably the receiving company's own number - confirm with caller).

    Returns:
        A dict containing any of the keys ``name``, ``address``,
        ``cvr_number``, ``phone``, ``email`` and ``domain`` that could be
        extracted. Empty when no footer-like line was found.
    """
    result = {}

    # Words that show the first segment is a contact/bank label, not a company name.
    name_stopwords = ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil', 'kontonr')

    for raw_line in text.replace('\r', '\n').split('\n'):
        line = raw_line.strip()
        if len(line) < 10:
            continue

        # Split only on a dash surrounded by whitespace (' - ' or ' – ').
        # Splitting on bare hyphens would cut hyphenated company names
        # ("Hansen-Jensen ApS") and labels such as "CVR-nr." or "E-mail" apart.
        parts = re.split(r'\s+[-–]\s+', line)
        if len(parts) < 3:
            continue

        # Segment 0 is typically the company name: it must not start with
        # digits and must not look like a contact/bank label.
        name_candidate = parts[0].strip()
        if not name_candidate or any(c.isdigit() for c in name_candidate[:3]):
            continue
        lowered_name = name_candidate.lower()
        if any(w in lowered_name for w in name_stopwords):
            continue

        # Classify the remaining segments: "NNNN City" is zip + city, the
        # first other segment containing a digit is the street address.
        addr_part = None
        zip_city_part = None
        for part in parts[1:]:
            part = part.strip()
            if re.match(r'^\d{4}\s+[A-ZÆØÅ]', part):
                zip_city_part = part
            elif addr_part is None and re.search(r'\d', part):
                # A labelled segment ("CVR-nr.: 12345678", "Tlf. 12 34 56 78")
                # contains digits but is not an address.
                if not re.match(
                    r'(?:CVR|Tlf|Tel|Telefon|Mobil|Mail|E-?mail|Bank|Kontonr)\b',
                    part,
                    re.IGNORECASE,
                ):
                    addr_part = part

        if addr_part or zip_city_part:
            result['name'] = name_candidate
            if addr_part and zip_city_part:
                result['address'] = f"{addr_part}, {zip_city_part}"
            else:
                result['address'] = addr_part or zip_city_part

        # CVR number on this line; the own number and values rejected by
        # is_placeholder_cvr (defined elsewhere in the file) are ignored.
        cvr_m = re.search(r'CVR[^:]*:\s*(\d{8})', line, re.IGNORECASE)
        if cvr_m:
            val = cvr_m.group(1)
            if val != own_cvr and not is_placeholder_cvr(val):
                result['cvr_number'] = val

        # Phone number on this line, normalised by clean_phone (defined elsewhere).
        phone_m = re.search(r'(?:Tlf|Tel|Mobil)[.:]?\s*(\+?[\d][\d\s\-]{6,15})', line, re.IGNORECASE)
        if phone_m:
            result['phone'] = clean_phone(phone_m.group(1))

        # E-mail on this line; addresses on platform domains or on domains
        # containing 'bmc' are skipped (presumably the receiver's own domain - confirm).
        email_m = re.search(r'(?:Mail|E-mail|Email)[.:]?\s*([\w.\-+]+@[\w\-]+\.[\w\-]+)', line, re.IGNORECASE)
        if email_m:
            dom = email_m.group(1).split('@')[1].lower()
            if dom not in PLATFORM_DOMAINS and 'bmc' not in dom:
                result['email'] = email_m.group(1)
                result['domain'] = dom

        if result.get('name') or result.get('cvr_number'):
            break  # the first matching line is enough

    return result
|
||||||
|
|
||||||
|
|
||||||
# ── Hoved-logik ─────────────────────────────────────────────────────────
|
# ── Hoved-logik ─────────────────────────────────────────────────────────
|
||||||
try:
|
try:
|
||||||
email_result = execute_query(
|
email_result = execute_query(
|
||||||
@ -589,6 +666,11 @@ async def extract_vendor_suggestion(email_id: int):
|
|||||||
focused_text = "\n\n".join(focused_parts)
|
focused_text = "\n\n".join(focused_parts)
|
||||||
combined_text = "\n\n".join(t for _, t in text_parts)
|
combined_text = "\n\n".join(t for _, t in text_parts)
|
||||||
|
|
||||||
|
# Debug: log de første 500 tegn af hvert dokument så vi kan se hvad PDF'en producerer
|
||||||
|
for src, txt in text_parts:
|
||||||
|
logger.info(f"📄 [{src}] tekstlængde={len(txt)} — første 300 tegn: {repr(txt[:300])}")
|
||||||
|
logger.info(f"📄 [{src}] — sidste 300 tegn: {repr(txt[-300:])}")
|
||||||
|
|
||||||
sender_name = email.get('sender_name') or ''
|
sender_name = email.get('sender_name') or ''
|
||||||
sender_email = email.get('sender_email') or ''
|
sender_email = email.get('sender_email') or ''
|
||||||
|
|
||||||
@ -596,26 +678,31 @@ async def extract_vendor_suggestion(email_id: int):
|
|||||||
sender_domain = sender_email.split('@')[1].lower() if '@' in sender_email else ''
|
sender_domain = sender_email.split('@')[1].lower() if '@' in sender_email else ''
|
||||||
is_platform_sender = sender_domain in PLATFORM_DOMAINS
|
is_platform_sender = sender_domain in PLATFORM_DOMAINS
|
||||||
|
|
||||||
# Brug ikke sender_email som leverandør-email når det er en platform
|
# ── Trin 1: Prøv dedikeret footer-parser på FULD tekst ──────────────
|
||||||
# Prøv i stedet at finde en rigtig email i teksten
|
# (finder "Firma - Adresse - PostnrBy - CVR" linjer overalt i dokumentet)
|
||||||
vendor_email = None
|
footer_result = parse_vendor_footer(combined_text, own_cvr)
|
||||||
if not is_platform_sender and sender_email:
|
logger.info(f"🏷️ Footer-parser resultat: {footer_result}")
|
||||||
vendor_email = sender_email
|
|
||||||
else:
|
|
||||||
for em in re.finditer(r'[\w.\-+]+@([\w\-]+\.[\w\-]+(?:\.[\w]{2,6})?)', focused_text):
|
|
||||||
dom = em.group(1).lower()
|
|
||||||
if dom not in PLATFORM_DOMAINS and 'bmc' not in dom:
|
|
||||||
vendor_email = em.group(0)
|
|
||||||
break
|
|
||||||
|
|
||||||
# ── Regex udtræk ────────────────────────────────────────────────────
|
# Brug ikke sender_email som leverandør-email når det er en platform
|
||||||
|
vendor_email = footer_result.get('email')
|
||||||
|
if not vendor_email:
|
||||||
|
if not is_platform_sender and sender_email:
|
||||||
|
vendor_email = sender_email
|
||||||
|
else:
|
||||||
|
for em in re.finditer(r'[\w.\-+]+@([\w\-]+\.[\w\-]+(?:\.[\w]{2,6})?)', focused_text):
|
||||||
|
dom = em.group(1).lower()
|
||||||
|
if dom not in PLATFORM_DOMAINS and 'bmc' not in dom:
|
||||||
|
vendor_email = em.group(0)
|
||||||
|
break
|
||||||
|
|
||||||
|
# ── Trin 2: Generisk regex udtræk (supplerer footer-parser) ──────────
|
||||||
suggestion = {
|
suggestion = {
|
||||||
"name": extract_company_name(focused_text, sender_name) or sender_name,
|
"name": footer_result.get('name') or extract_company_name(focused_text, sender_name) or sender_name,
|
||||||
"email": vendor_email,
|
"email": vendor_email,
|
||||||
"cvr_number": extract_cvr(focused_text, own_cvr),
|
"cvr_number": footer_result.get('cvr_number') or extract_cvr(focused_text, own_cvr),
|
||||||
"phone": extract_phones(focused_text),
|
"phone": footer_result.get('phone') or extract_phones(focused_text),
|
||||||
"address": extract_address(focused_text),
|
"address": footer_result.get('address') or extract_address(focused_text),
|
||||||
"domain": extract_domain(focused_text, sender_email if not is_platform_sender else ''),
|
"domain": footer_result.get('domain') or extract_domain(focused_text, sender_email if not is_platform_sender else ''),
|
||||||
"source": "regex"
|
"source": "regex"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user