fix: massively improved vendor info extraction (CVR/address/phone/domain)
This commit is contained in:
parent
07584b1b0c
commit
a8970701ab
@ -377,14 +377,103 @@ async def link_email(email_id: int, payload: Dict):
|
|||||||
@router.post("/emails/{email_id}/extract-vendor-suggestion")
|
@router.post("/emails/{email_id}/extract-vendor-suggestion")
|
||||||
async def extract_vendor_suggestion(email_id: int):
|
async def extract_vendor_suggestion(email_id: int):
|
||||||
"""
|
"""
|
||||||
Forsøger at udtrække leverandørinfo fra email body og vedhæftede fakturaer.
|
Udtrækker leverandørinfo fra email body og vedhæftede PDF-fakturaer.
|
||||||
Returnerer forslag til navn, CVR, adresse, telefon, email, domæne.
|
Bruger stærke regex-mønstre + AI for CVR, adresse, telefon, domæne.
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
# ── Hjælpefunktioner ────────────────────────────────────────────────────
|
||||||
|
def clean_phone(raw: str) -> str:
    """Normalise a Danish phone number.

    Returns ``+45XXXXXXXX`` (no spaces) when a Danish country code is
    present (written as ``+45``, ``0045`` or ``45``), the bare 8 digits when
    the input is a plain national number, and otherwise the input with
    whitespace collapsed and truncated to 20 characters.
    """
    # Keep '+' so an explicit international prefix can be recognised.
    compact = re.sub(r'[^\d+]', '', raw)
    if compact.startswith('+45') and len(compact) == 11:
        return compact
    # '00' is the international dialling prefix, equivalent to '+'.
    if compact.startswith('0045') and len(compact) == 12:
        return '+' + compact[2:]
    if compact.startswith('45') and len(compact) == 10:
        return '+' + compact
    bare = re.sub(r'\D', '', raw)
    if len(bare) == 8:
        return bare
    # Unknown format: hand back something printable on a single line.
    return ' '.join(raw.split())[:20]
|
||||||
|
|
||||||
|
def extract_cvr(text: str, own_cvr: str = '') -> Optional[str]:
    """Return the first 8-digit CVR number in *text* that is not *own_cvr*.

    Patterns are tried from most to least specific: a labelled number
    ("CVR", "CVR-nr.", "CVR nr.", "Momsnr.", "VAT no." ...), a ``DK``
    prefixed number, and finally any standalone 8-digit number. All
    matches of a more specific pattern are exhausted before a less
    specific one is tried.

    Args:
        text: Text to search.
        own_cvr: CVR number to ignore; non-digit characters (spaces, a
            ``DK`` prefix) are stripped before comparing.

    Returns:
        The CVR number as a string of 8 digits, or ``None`` if none is found.
    """
    if not text:
        return None
    ignored = re.sub(r'\D', '', own_cvr or '')
    patterns = [
        # Labelled. The "nr/no/nummer/number" suffix may be separated from
        # the label by space, dot or hyphen. (?!\d) stops the first 8 digits
        # of a longer number from being accepted.
        r'(?:'
        r'CVR(?:[\s.\-]*(?:nr|no|nummer|number)\.?)?'
        r'|Moms(?:registrerings?)?[\s.\-]*(?:nr|nummer)\.?'
        r'|VAT[\s.\-]*(?:no|nr|number)\.?'
        r')'
        r'[:\s.\-–]*(?:DK)?[\s\-]?(\d{8})(?!\d)',
        # DK prefix
        r'\bDK[\s\-]?(\d{8})\b',
        # Standalone 8 digits (last: least specific, may also hit phone numbers)
        r'\b(\d{8})\b',
    ]
    for pattern in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            candidate = match.group(1)
            if candidate != ignored:
                return candidate
    return None
|
||||||
|
|
||||||
|
def extract_phones(text: str) -> Optional[str]:
    """Return the first phone number found in *text*, normalised.

    Patterns are tried from most to least specific: a labelled number
    ("Tlf", "Tel", "Telefon", "Phone", "Mobil", "Fax"), a ``+45`` number,
    and finally 8 digits grouped as ``12 34 56 78`` or ``1234 5678``.
    The match is passed through ``clean_phone``.

    The number itself must sit on a single line: only space, tab and
    hyphen are accepted between digit groups, so a number at the end of a
    line cannot absorb digits (e.g. a postal code) from the next line.

    Returns:
        The normalised number, or ``None`` if nothing matched.
    """
    if not text:
        return None
    patterns = [
        # Labelled (the label may be followed by a line break, the number may not contain one)
        r'(?:Tlf\.?|Tel\.?|Telefon|Phone|Mobil|Fax)[:\s.\-–]*(\+?\d[\d \t\-().]{6,18})',
        # +45 XXXXXXXX
        r'(\+45[ \t\-]?\d{2}[ \t\-]?\d{2}[ \t\-]?\d{2}[ \t\-]?\d{2})',
        # 8 digits in groups: 12 34 56 78 / 1234 5678
        r'\b(\d{2}[ \t\-]\d{2}[ \t\-]\d{2}[ \t\-]\d{2})\b',
        r'\b(\d{4}[ \t\-]\d{4})\b',
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return clean_phone(match.group(1))
    return None
|
||||||
|
|
||||||
|
def extract_address(text: str) -> Optional[str]:
    """Return the first Danish-looking postal address found in *text*.

    Three patterns are tried in order:

    1. street word + house number (optional floor/side) + 4-digit postal
       code + city,
    2. the same with the house number optional,
    3. a word ending in a common street suffix (vej, gade, ...) + house
       number + postal code, without a city.

    Trying the strict form first keeps "Word 1234 Word" sequences such as
    "Faktura 2024 Januar" from shadowing a real address later in the text.
    The postal code and city must share a line, so the result never runs
    into the text of the following line.

    Returns:
        The matched address with surrounding whitespace stripped, or
        ``None`` if no pattern matched.
    """
    if not text:
        return None
    street = r'[A-ZÆØÅ][a-zæøåA-ZÆØÅ\-\.]+'
    house = r'\s+\d+[A-Za-z]?(?:,?\s*(?:st|tv|th|\d+\.?\s*(?:sal|etage)?))?'
    # Only space/tab inside the city part: multi-word cities ("Kongens
    # Lyngby") still match, but the match stops at the end of the line.
    zip_city = r',?\s*\d{4}[ \t]+[A-ZÆØÅ][a-zæøåA-ZÆØÅ \t\-]+'
    candidates = (
        (street + house + zip_city, 0),
        (street + '(?:' + house + ')?' + zip_city, 0),
        # Fallback: street name + house number + postal code
        (
            r'[A-ZÆØÅ][a-zæøåA-ZÆØÅ]+(?:vej|gade|alle|vænge|torv|plads|stræde|boulevard|have|bakke|skov|park|strand|mark|eng)\s*\d+[A-Za-z]?\s*,?\s*\d{4}',
            re.IGNORECASE,
        ),
    )
    for pattern, flags in candidates:
        match = re.search(pattern, text, flags)
        if match:
            return match.group(0).strip()
    return None
|
||||||
|
|
||||||
|
def extract_domain(text: str, sender_email: str = '') -> Optional[str]:
    """Return the most likely vendor domain, lower-cased, or ``None``.

    Sources are tried in this order:

    1. URLs in *text* (``www.`` or ``http(s)://``); a leading ``www.`` is
       never part of the result, whichever way the URL was written.
    2. E-mail addresses in *text*.
    3. The domain part of *sender_email*.

    Domains containing "bmc" are skipped for sources 1 and 2, and domains
    containing "gmail", "outlook" or "hotmail" are skipped for all three.
    """
    freemail = ('gmail', 'outlook', 'hotmail')
    own = ('bmc',)

    def usable(domain: str, blocked: tuple) -> bool:
        return bool(domain) and not any(marker in domain for marker in blocked)

    text = text or ''
    host = r'([\w\-]+\.[\w\-]+(?:\.[\w]{2,6})?)'

    # Explicit URL. "https://www." is consumed as a whole, so the captured
    # host is the same as for a bare "www." URL.
    for match in re.finditer(r'(?:https?://(?:www\.)?|www\.)' + host, text, re.IGNORECASE):
        domain = match.group(1).lower()
        if usable(domain, own + freemail):
            return domain

    # E-mail addresses in the text
    for match in re.finditer(r'[\w.\-+]+@' + host, text):
        domain = match.group(1).lower()
        if usable(domain, own + freemail):
            return domain

    # Sender e-mail; tolerate the "Name <user@host>" form.
    if sender_email and '@' in sender_email:
        domain = sender_email.rpartition('@')[2].strip().rstrip('>').strip().lower()
        if usable(domain, freemail):
            return domain
    return None
|
||||||
|
|
||||||
|
def extract_company_name(text: str, sender_name: str = '') -> Optional[str]:
    """Find a company name by its legal-form suffix, else use *sender_name*.

    A name is up to six whitespace-separated tokens on one line followed
    by a standalone legal-form token (A/S, ApS, IVS, I/S, K/S, P/S, GmbH,
    Ltd, LLC, AB, AS). Requiring the suffix to be its own token keeps
    ordinary upper-case words that merely end in "AS"/"AB" from matching,
    and staying on one line keeps preceding lines out of the result.

    Returns:
        The matched name (first suffix reached from the leftmost possible
        start), *sender_name* if nothing matched, or ``None`` if that is
        empty too.
    """
    match = re.search(
        # Start at a token boundary, then lazily add tokens until a suffix.
        r"(?<![\w&'.\-])"
        r"([\w&'.\-]+(?:[ \t]+[\w&'.\-]+){0,5}?"
        r"[ \t]+(?:A/S|ApS|IVS|I/S|K/S|P/S|GmbH|Ltd\.?|LLC|AB|AS))"
        r"(?!\w)",
        text or '',
    )
    if match:
        return match.group(1).strip()
    return sender_name or None
|
||||||
|
|
||||||
|
# ── Hoved-logik ─────────────────────────────────────────────────────────
|
||||||
try:
|
try:
|
||||||
# Hent email
|
|
||||||
email_result = execute_query(
|
email_result = execute_query(
|
||||||
"SELECT * FROM email_messages WHERE id = %s AND deleted_at IS NULL",
|
"SELECT * FROM email_messages WHERE id = %s AND deleted_at IS NULL",
|
||||||
(email_id,)
|
(email_id,)
|
||||||
@ -393,10 +482,13 @@ async def extract_vendor_suggestion(email_id: int):
|
|||||||
raise HTTPException(status_code=404, detail="Email ikke fundet")
|
raise HTTPException(status_code=404, detail="Email ikke fundet")
|
||||||
email = email_result[0]
|
email = email_result[0]
|
||||||
|
|
||||||
# Saml tekst fra body + vedhæftede PDF-filer
|
from app.core.config import settings
|
||||||
|
own_cvr = getattr(settings, 'OWN_CVR', '')
|
||||||
|
|
||||||
|
# Saml tekst fra body + PDF-bilag
|
||||||
text_parts = []
|
text_parts = []
|
||||||
if email.get('body_text'):
|
if email.get('body_text'):
|
||||||
text_parts.append(email['body_text'])
|
text_parts.append(("body", email['body_text']))
|
||||||
|
|
||||||
attachments = execute_query(
|
attachments = execute_query(
|
||||||
"SELECT * FROM email_attachments WHERE email_id = %s ORDER BY id",
|
"SELECT * FROM email_attachments WHERE email_id = %s ORDER BY id",
|
||||||
@ -405,89 +497,109 @@ async def extract_vendor_suggestion(email_id: int):
|
|||||||
for att in (attachments or []):
|
for att in (attachments or []):
|
||||||
file_path = att.get('file_path')
|
file_path = att.get('file_path')
|
||||||
if file_path and os.path.exists(file_path):
|
if file_path and os.path.exists(file_path):
|
||||||
content_type = att.get('content_type', '')
|
ct = att.get('content_type', '')
|
||||||
if 'pdf' in content_type or file_path.lower().endswith('.pdf'):
|
if 'pdf' in ct or file_path.lower().endswith('.pdf'):
|
||||||
try:
|
try:
|
||||||
pdf_text = await ollama_service._extract_text_from_file(file_path)
|
pdf_text = await ollama_service._extract_text_from_file(file_path)
|
||||||
if pdf_text:
|
if pdf_text:
|
||||||
text_parts.append(f"[PDF: {att['filename']}]\n{pdf_text}")
|
text_parts.append(("pdf", pdf_text))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
|
logger.warning(f"⚠️ Kunne ikke læse PDF {file_path}: {e}")
|
||||||
|
|
||||||
combined_text = "\n\n".join(text_parts)
|
# Prioriter PDF-tekst for leverandørinfo (header + footer indeholder firmainfo)
|
||||||
|
# Tag: første 800 tegn (header) + sidste 800 tegn (footer) fra hvert dokument
|
||||||
|
focused_parts = []
|
||||||
|
for src, txt in text_parts:
|
||||||
|
if len(txt) > 1200:
|
||||||
|
focused_parts.append(f"[{src} header]\n{txt[:800]}")
|
||||||
|
focused_parts.append(f"[{src} footer]\n{txt[-800:]}")
|
||||||
|
else:
|
||||||
|
focused_parts.append(f"[{src}]\n{txt}")
|
||||||
|
focused_text = "\n\n".join(focused_parts)
|
||||||
|
combined_text = "\n\n".join(t for _, t in text_parts)
|
||||||
|
|
||||||
|
sender_name = email.get('sender_name') or ''
|
||||||
|
sender_email = email.get('sender_email') or ''
|
||||||
|
|
||||||
|
# ── Regex udtræk ────────────────────────────────────────────────────
|
||||||
suggestion = {
|
suggestion = {
|
||||||
"name": email.get('sender_name') or '',
|
"name": extract_company_name(focused_text, sender_name) or sender_name,
|
||||||
"email": email.get('sender_email') or '',
|
"email": sender_email,
|
||||||
"cvr_number": None,
|
"cvr_number": extract_cvr(focused_text, own_cvr),
|
||||||
"phone": None,
|
"phone": extract_phones(focused_text),
|
||||||
"address": None,
|
"address": extract_address(focused_text),
|
||||||
"domain": None,
|
"domain": extract_domain(focused_text, sender_email),
|
||||||
"source": "regex"
|
"source": "regex"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Regex fallback: uddrag CVR (8 cifre efter CVR/Momsnr/DK)
|
logger.info(f"🔍 Regex udtræk for email {email_id}: {suggestion}")
|
||||||
cvr_match = re.search(
|
|
||||||
r'(?:CVR|Cvr|cvr|Momsnr\.?|DK)[\s:.-]*([0-9]{8})',
|
|
||||||
combined_text
|
|
||||||
)
|
|
||||||
if cvr_match:
|
|
||||||
suggestion['cvr_number'] = cvr_match.group(1)
|
|
||||||
|
|
||||||
# Regex: telefon (dansk format)
|
# ── AI udtræk (forbedrer regex-resultat) ────────────────────────────
|
||||||
phone_match = re.search(
|
if focused_text.strip():
|
||||||
r'(?:Tlf|Tel|Telefon|Phone)[\s.:]*([+\d][\d\s\-().]{6,15})',
|
|
||||||
combined_text, re.IGNORECASE
|
|
||||||
)
|
|
||||||
if phone_match:
|
|
||||||
suggestion['phone'] = phone_match.group(1).strip()
|
|
||||||
|
|
||||||
# Domæne fra sender email
|
|
||||||
if email.get('sender_email') and '@' in email['sender_email']:
|
|
||||||
suggestion['domain'] = email['sender_email'].split('@')[1]
|
|
||||||
|
|
||||||
# Brug AI hvis vi har tekst fra PDF
|
|
||||||
if len(combined_text) > 100:
|
|
||||||
try:
|
try:
|
||||||
from app.core.config import settings
|
# Send kun den fokuserede tekst (max 4000 tegn) til AI
|
||||||
own_cvr = getattr(settings, 'OWN_CVR', '')
|
ai_text = focused_text[:4000]
|
||||||
|
|
||||||
prompt = f"""OPGAVE: Udtræk leverandørens firmainfo fra denne tekst.
|
prompt = f"""Du er en ekspert i at udtrække firmaoplysninger fra danske fakturaer og e-mails.
|
||||||
TEKSTEN er body/footer fra en faktura-email eller vedhæftet faktura.
|
|
||||||
RETURNER KUN VALID JSON - ingen forklaring!
|
|
||||||
|
|
||||||
|
OPGAVE: Find LEVERANDØRENS firmaoplysninger i teksten nedenfor.
|
||||||
|
Leverandøren er AFSENDEREN - IKKE BMC Networks og IKKE køber.
|
||||||
|
|
||||||
|
RETURNER KUN DETTE JSON - ingen forklaring, ingen markdown:
|
||||||
{{
|
{{
|
||||||
\"name\": \"Firmanavn ApS\",
|
"name": "Firmanavn ApS",
|
||||||
\"cvr_number\": \"12345678\",
|
"cvr_number": "12345678",
|
||||||
\"address\": \"Vejnavn 1, 2000 By\",
|
"address": "Vejnavn 1, 2000 By",
|
||||||
\"phone\": \"12345678\",
|
"phone": "12345678",
|
||||||
\"email\": \"kontakt@firma.dk\",
|
"email": "kontakt@firma.dk",
|
||||||
\"domain\": \"firma.dk\"
|
"domain": "firma.dk"
|
||||||
}}
|
}}
|
||||||
|
|
||||||
REGLER:
|
REGLER:
|
||||||
- name: LEVERANDØRENS firmanavn (ikke køber, ikke BMC)
|
- name: Firmanavn med A/S, ApS, IVS osv. - IKKE BMC Networks
|
||||||
- cvr_number: 8-cifret CVR - IGNORER {own_cvr} (det er køber)
|
- cvr_number: Præcis 8 cifre efter "CVR", "CVR-nr", "Moms" eller "DK" - IGNORER {own_cvr}
|
||||||
- Sæt null hvis ikke fundet
|
- address: Fuld adresse med postnummer og by (dansk format: "Vejnavn 1, 1234 By")
|
||||||
- KUN JSON output!
|
- phone: Telefonnummer - foretrukket format: "+45 XXXX XXXX" eller "XXXX XXXX"
|
||||||
|
- email: Kontakt-email til firmaet (IKKE afsender-email hvis den er personlig)
|
||||||
|
- domain: Hjemmeside-domæne f.eks. "firma.dk" eller "www.firma.dk"
|
||||||
|
- Sæt null for felter der IKKE kan findes med sikkerhed
|
||||||
|
|
||||||
TEKST (max 3000 tegn):
|
KENDTE REGEX-RESULTATER (brug som hjælp, ret dem hvis de er forkerte):
|
||||||
{combined_text[:3000]}
|
- cvr: {suggestion.get('cvr_number') or 'ikke fundet'}
|
||||||
|
- phone: {suggestion.get('phone') or 'ikke fundet'}
|
||||||
|
- address: {suggestion.get('address') or 'ikke fundet'}
|
||||||
|
- domain: {suggestion.get('domain') or 'ikke fundet'}
|
||||||
|
|
||||||
KUN JSON:"""
|
TEKST:
|
||||||
|
{ai_text}
|
||||||
|
|
||||||
|
JSON:"""
|
||||||
|
|
||||||
ai_result = await ollama_service.extract_from_text(prompt)
|
ai_result = await ollama_service.extract_from_text(prompt)
|
||||||
if ai_result and isinstance(ai_result, dict):
|
if ai_result and isinstance(ai_result, dict):
|
||||||
# Merge AI resultat ind i suggestion (AI prioriteres over regex)
|
improved = False
|
||||||
for field in ('name', 'cvr_number', 'address', 'phone', 'email', 'domain'):
|
for field in ('name', 'cvr_number', 'address', 'phone', 'email', 'domain'):
|
||||||
val = ai_result.get(field)
|
val = ai_result.get(field)
|
||||||
if val and val not in (None, 'null', '', 'N/A'):
|
if val and str(val).strip() not in ('null', '', 'N/A', 'None', own_cvr):
|
||||||
suggestion[field] = str(val).strip()
|
new_val = str(val).strip()
|
||||||
|
if new_val != str(suggestion.get(field) or ''):
|
||||||
|
suggestion[field] = new_val
|
||||||
|
improved = True
|
||||||
|
if improved:
|
||||||
suggestion['source'] = 'ai'
|
suggestion['source'] = 'ai'
|
||||||
logger.info(f"✅ AI vendor suggestion for email {email_id}: {suggestion}")
|
logger.info(f"✅ AI vendor suggestion for email {email_id}: {suggestion}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"⚠️ AI udtræk fejlede, bruger regex-resultat: {e}")
|
logger.warning(f"⚠️ AI udtræk fejlede, bruger regex-resultat: {e}")
|
||||||
|
|
||||||
|
# Rens: fjern domæner der tilhører kendte mailservere
|
||||||
|
spam_domains = {'gmail.com', 'hotmail.com', 'outlook.com', 'yahoo.com', 'live.com', 'icloud.com'}
|
||||||
|
if suggestion.get('domain') in spam_domains:
|
||||||
|
suggestion['domain'] = None
|
||||||
|
|
||||||
|
# Fjern own_cvr hvis den sneg sig ind
|
||||||
|
if suggestion.get('cvr_number') == own_cvr:
|
||||||
|
suggestion['cvr_number'] = None
|
||||||
|
|
||||||
return suggestion
|
return suggestion
|
||||||
|
|
||||||
except HTTPException:
|
except HTTPException:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user