fix: dedicated footer parser + debug logging for PDF text extraction
This commit is contained in:
parent
04acdecb91
commit
14e1c87a4c
@ -520,29 +520,106 @@ async def extract_vendor_suggestion(email_id: int):
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
|
||||
# Prioritet 2: typisk faktura-footer format:
|
||||
# "FirmaNavn - Adresse - Postnr By - CVR-nr.: XXXXXXXX"
|
||||
# Prioritet 2: e-conomic footer: "FirmaNavn - Adresse - ..."
|
||||
# Virker både med og uden linjeskift foran
|
||||
m = re.search(
|
||||
r'^([A-ZÆØÅ][A-Za-zæøåÆØÅ\s\-&\'\.]{2,50}?)\s*[-–]\s*[A-ZÆØÅ][a-zæøåA-ZÆØÅ]',
|
||||
r'(?:^|\n)([A-ZÆØÅ][A-Za-zæøåÆØÅ][^\n\-]{1,40}?)\s*[-–]\s*[A-ZÆØÅ][a-zæøåA-ZÆØÅ]',
|
||||
text, re.MULTILINE
|
||||
)
|
||||
if m:
|
||||
name = m.group(1).strip()
|
||||
if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil')):
|
||||
if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil', 'kontonr', 'faktura')):
|
||||
return name
|
||||
|
||||
# Prioritet 3: tekst lige FORAN "CVR" på samme linje
|
||||
# Prioritet 3: tekst umiddelbart FORAN "CVR" (typisk "FirmaNavn CVR-nr.")
|
||||
m = re.search(
|
||||
r'([A-ZÆØÅ][A-Za-zæøåÆØÅ\s\-&\'\.]{2,50}?)\s*[-–,]?\s*(?:CVR|Cvr)',
|
||||
r'([A-ZÆØÅ][A-Za-zæøåÆØÅ\s&\'\.]{2,50}?)\s*[-–,]?\s*(?:CVR|cvr)',
|
||||
text
|
||||
)
|
||||
if m:
|
||||
name = m.group(1).strip().rstrip('-–, ')
|
||||
if len(name) > 2:
|
||||
name = m.group(1).strip().rstrip('-–, \t')
|
||||
if len(name) > 2 and not any(w in name.lower() for w in ('tlf', 'mail', 'bank')):
|
||||
return name
|
||||
|
||||
return sender_name or None
|
||||
|
||||
def parse_vendor_footer(text: str, own_cvr: str = '') -> dict:
    """
    Parse a single-line e-conomic/Dinero style invoice footer.

    Expected footer format, all on one line:
    "KONI Accounting - Jernbanegade 12K, st.tv - 4000 Roskilde - DK - CVR-nr.: 35962344"

    Every line of ``text`` is split into segments on a dash that has
    whitespace on both sides (' - ' or ' – '). The first segment is the
    company-name candidate; the remaining segments are classified as street
    address or zip code + city. CVR number, phone and email are searched
    for in the same line.

    Args:
        text: Document text to scan; CR and LF both count as line breaks.
        own_cvr: A CVR number equal to this value is never reported.

    Returns:
        A dict holding any of the keys 'name', 'address', 'cvr_number',
        'phone', 'email' and 'domain'. Scanning stops at the first line
        that yields a name or a CVR number. The dict is empty when nothing
        was recognised.
    """
    result = {}

    # A separator needs whitespace on BOTH sides of the dash. Splitting on a
    # bare hyphen would cut hyphenated names ("Hans-Peter"), house-number
    # ranges ("12-14") and labels such as "CVR-nr." into pieces.
    segment_sep = re.compile(r'\s+[-–]\s+')
    # Segments starting with a contact/registration label contain digits
    # but are not street addresses.
    contact_label = re.compile(
        r'(?:CVR|Tlf|Tel|Telefon|Mobil|Mail|E-?mail|Bank|Kontonr)\b',
        re.IGNORECASE,
    )
    # A first segment containing any of these is a contact line, not a name.
    name_blocklist = ('tlf', 'tel', 'mail', 'bank', 'cvr', 'mobil', 'kontonr')

    for line in text.replace('\r', '\n').split('\n'):
        line = line.strip()
        if len(line) < 10:
            continue

        parts = segment_sep.split(line)
        if len(parts) < 3:
            continue

        # Segment 0 is typically the company name: it must not start with
        # digits and must not look like a contact line.
        name_candidate = parts[0].strip()
        if not name_candidate or any(c.isdigit() for c in name_candidate[:3]):
            continue
        if any(w in name_candidate.lower() for w in name_blocklist):
            continue

        addr_part = None
        zip_city_part = None
        for part in parts[1:]:
            part = part.strip()
            if re.match(r'^\d{4}\s+[A-ZÆØÅ]', part):
                # Zip format: four digits followed by a capitalised city.
                zip_city_part = part
            elif (
                addr_part is None
                and re.search(r'\d', part)
                and not contact_label.match(part)
            ):
                # First remaining segment with a digit = street address.
                addr_part = part

        if addr_part or zip_city_part:
            result['name'] = name_candidate
            result['address'] = ', '.join(
                p for p in (addr_part, zip_city_part) if p
            )

        # CVR number in this line (skipping our own and placeholder values).
        cvr_m = re.search(r'CVR[^:]*:\s*(\d{8})', line, re.IGNORECASE)
        if cvr_m:
            val = cvr_m.group(1)
            if val != own_cvr and not is_placeholder_cvr(val):
                result['cvr_number'] = val

        # Phone number in this line.
        phone_m = re.search(r'(?:Tlf|Tel|Mobil)[.:]?\s*(\+?[\d][\d\s\-]{6,15})', line, re.IGNORECASE)
        if phone_m:
            result['phone'] = clean_phone(phone_m.group(1))

        # Email in this line; PLATFORM_DOMAINS and any domain containing
        # 'bmc' are rejected.
        email_m = re.search(r'(?:Mail|E-mail|Email)[.:]?\s*([\w.\-+]+@[\w\-]+\.[\w\-]+)', line, re.IGNORECASE)
        if email_m:
            dom = email_m.group(1).split('@')[1].lower()
            if dom not in PLATFORM_DOMAINS and 'bmc' not in dom:
                result['email'] = email_m.group(1)
                result['domain'] = dom

        if result.get('name') or result.get('cvr_number'):
            break  # The first matching line is enough

    return result
|
||||
|
||||
|
||||
# ── Hoved-logik ─────────────────────────────────────────────────────────
|
||||
try:
|
||||
email_result = execute_query(
|
||||
@ -589,6 +666,11 @@ async def extract_vendor_suggestion(email_id: int):
|
||||
focused_text = "\n\n".join(focused_parts)
|
||||
combined_text = "\n\n".join(t for _, t in text_parts)
|
||||
|
||||
# Debug: log de første 500 tegn af hvert dokument så vi kan se hvad PDF'en producerer
|
||||
for src, txt in text_parts:
|
||||
logger.info(f"📄 [{src}] tekstlængde={len(txt)} — første 300 tegn: {repr(txt[:300])}")
|
||||
logger.info(f"📄 [{src}] — sidste 300 tegn: {repr(txt[-300:])}")
|
||||
|
||||
sender_name = email.get('sender_name') or ''
|
||||
sender_email = email.get('sender_email') or ''
|
||||
|
||||
@ -596,26 +678,31 @@ async def extract_vendor_suggestion(email_id: int):
|
||||
sender_domain = sender_email.split('@')[1].lower() if '@' in sender_email else ''
|
||||
is_platform_sender = sender_domain in PLATFORM_DOMAINS
|
||||
|
||||
# Brug ikke sender_email som leverandør-email når det er en platform
|
||||
# Prøv i stedet at finde en rigtig email i teksten
|
||||
vendor_email = None
|
||||
if not is_platform_sender and sender_email:
|
||||
vendor_email = sender_email
|
||||
else:
|
||||
for em in re.finditer(r'[\w.\-+]+@([\w\-]+\.[\w\-]+(?:\.[\w]{2,6})?)', focused_text):
|
||||
dom = em.group(1).lower()
|
||||
if dom not in PLATFORM_DOMAINS and 'bmc' not in dom:
|
||||
vendor_email = em.group(0)
|
||||
break
|
||||
# ── Trin 1: Prøv dedikeret footer-parser på FULD tekst ──────────────
|
||||
# (finder "Firma - Adresse - PostnrBy - CVR" linjer overalt i dokumentet)
|
||||
footer_result = parse_vendor_footer(combined_text, own_cvr)
|
||||
logger.info(f"🏷️ Footer-parser resultat: {footer_result}")
|
||||
|
||||
# ── Regex udtræk ────────────────────────────────────────────────────
|
||||
# Brug ikke sender_email som leverandør-email når det er en platform
|
||||
vendor_email = footer_result.get('email')
|
||||
if not vendor_email:
|
||||
if not is_platform_sender and sender_email:
|
||||
vendor_email = sender_email
|
||||
else:
|
||||
for em in re.finditer(r'[\w.\-+]+@([\w\-]+\.[\w\-]+(?:\.[\w]{2,6})?)', focused_text):
|
||||
dom = em.group(1).lower()
|
||||
if dom not in PLATFORM_DOMAINS and 'bmc' not in dom:
|
||||
vendor_email = em.group(0)
|
||||
break
|
||||
|
||||
# ── Trin 2: Generisk regex udtræk (supplerer footer-parser) ──────────
|
||||
suggestion = {
|
||||
"name": extract_company_name(focused_text, sender_name) or sender_name,
|
||||
"name": footer_result.get('name') or extract_company_name(focused_text, sender_name) or sender_name,
|
||||
"email": vendor_email,
|
||||
"cvr_number": extract_cvr(focused_text, own_cvr),
|
||||
"phone": extract_phones(focused_text),
|
||||
"address": extract_address(focused_text),
|
||||
"domain": extract_domain(focused_text, sender_email if not is_platform_sender else ''),
|
||||
"cvr_number": footer_result.get('cvr_number') or extract_cvr(focused_text, own_cvr),
|
||||
"phone": footer_result.get('phone') or extract_phones(focused_text),
|
||||
"address": footer_result.get('address') or extract_address(focused_text),
|
||||
"domain": footer_result.get('domain') or extract_domain(focused_text, sender_email if not is_platform_sender else ''),
|
||||
"source": "regex"
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user