""" Simple Keyword-Based Email Classifier Fallback when AI classification is unavailable """ import logging from typing import Dict, Optional import re logger = logging.getLogger(__name__) class SimpleEmailClassifier: """Simple rule-based email classifier using keywords""" def __init__(self): self.keyword_rules = { 'invoice': [ 'faktura', 'invoice', 'kreditnota', 'credit note', 'ordrenr', 'order number', 'betalingspåmindelse', 'payment reminder', 'fakturanr', 'invoice number', 'betaling', 'payment' ], 'freight_note': [ 'fragtbrev', 'tracking', 'forsendelse', 'shipment', 'levering', 'delivery', 'pakke', 'package', 'fragtbreve' ], 'order_confirmation': [ 'ordrebekræftelse', 'order confirmation', 'bestilling bekræftet', 'ordre modtaget', 'order received' ], 'time_confirmation': [ 'timer', 'hours', 'tidsforbrug', 'time spent', 'tidsregistrering', 'time registration' ], 'case_notification': [ 'cc[0-9]{4}', 'case #', 'sag ', 'ticket', 'support' ], 'bankruptcy': [ 'konkurs', 'bankruptcy', 'rekonstruktion', 'insolvency', 'betalingsstandsning', 'administration' ], 'spam': [ 'unsubscribe', 'click here', 'free offer', 'gratis tilbud', 'vind nu', 'win now', 'limited time' ] } def classify(self, email_data: Dict) -> Dict: """ Classify email using simple keyword matching Returns: {classification: str, confidence: float, reasoning: str} """ subject = (email_data.get('subject', '') or '').lower() sender = (email_data.get('sender_email', '') or '').lower() body = (email_data.get('body_text', '') or '').lower()[:500] # First 500 chars logger.info(f"🔍 simple_classifier: subject='{subject}', body_len={len(body)}, sender='{sender}'") # Combine all text for analysis text = f"{subject} {body}" # Check each category scores = {} for category, keywords in self.keyword_rules.items(): matches = 0 matched_keywords = [] for keyword in keywords: # Use regex for patterns like CC[0-9]{4} if re.search(keyword, text, re.IGNORECASE): matches += 1 matched_keywords.append(keyword) if matches > 0: scores[category] = { 'matches': matches, 'keywords': matched_keywords } # Determine best match if not scores: return { 'classification': 'general', 'confidence': 0.5, 'reasoning': 'No specific keywords matched - classified as general' } # Get category with most matches best_category = max(scores.items(), key=lambda x: x[1]['matches']) category_name = best_category[0] match_count = best_category[1]['matches'] matched_keywords = best_category[1]['keywords'] # Calculate confidence (0.6-0.9 based on matches) confidence = min(0.9, 0.6 + (match_count * 0.1)) reasoning = f"Matched {match_count} keyword(s): {', '.join(matched_keywords[:3])}" logger.info(f"✅ Keyword classification: {category_name} (confidence: {confidence:.2f})") return { 'classification': category_name, 'confidence': confidence, 'reasoning': reasoning } # Global instance simple_classifier = SimpleEmailClassifier()