bmc_hub/app/services/simple_classifier.py

110 lines
3.8 KiB
Python
Raw Normal View History

"""
Simple Keyword-Based Email Classifier
Fallback when AI classification is unavailable
"""
import logging
from typing import Dict, Optional
import re
logger = logging.getLogger(__name__)


class SimpleEmailClassifier:
    """Rule-based email classifier driven by per-category keyword lists.

    Every keyword is applied as a case-insensitive regular expression to the
    email subject plus the beginning of the body, so an entry may be a plain
    substring or a pattern such as ``cc[0-9]{4}``. The category with the most
    distinct matching keywords wins; on a tie the category that appears first
    in ``keyword_rules`` is chosen.
    """

    # Only this many leading characters of the lower-cased body are scanned.
    BODY_SCAN_CHARS = 500
    # Confidence reported when no keyword matches at all.
    DEFAULT_CONFIDENCE = 0.5
    # confidence = min(MAX_CONFIDENCE,
    #                  BASE_CONFIDENCE + matches * CONFIDENCE_PER_MATCH)
    # A category needs at least one match to be selected, so the effective
    # range is 0.7 (one match) up to the 0.9 cap (three or more matches).
    BASE_CONFIDENCE = 0.6
    CONFIDENCE_PER_MATCH = 0.1
    MAX_CONFIDENCE = 0.9

    def __init__(self):
        """Build the category -> keyword list table used by ``classify``."""
        self.keyword_rules = {
            'invoice': [
                'faktura', 'invoice', 'kreditnota', 'credit note',
                'ordrenr', 'order number', 'betalingspåmindelse', 'payment reminder',
                'fakturanr', 'invoice number', 'betaling', 'payment'
            ],
            'freight_note': [
                'fragtbrev', 'tracking', 'forsendelse', 'shipment',
                'levering', 'delivery', 'pakke', 'package', 'fragtbreve'
            ],
            'order_confirmation': [
                'ordrebekræftelse', 'order confirmation', 'bestilling bekræftet',
                'ordre modtaget', 'order received'
            ],
            'time_confirmation': [
                'timer', 'hours', 'tidsforbrug', 'time spent',
                'tidsregistrering', 'time registration'
            ],
            'case_notification': [
                'cc[0-9]{4}', 'case #', 'sag ', 'ticket', 'support'
            ],
            'bankruptcy': [
                'konkurs', 'bankruptcy', 'rekonstruktion', 'insolvency',
                'betalingsstandsning', 'administration'
            ],
            'spam': [
                'unsubscribe', 'click here', 'free offer', 'gratis tilbud',
                'vind nu', 'win now', 'limited time'
            ]
        }

    def classify(self, email_data: Dict) -> Dict:
        """Classify an email by counting keyword hits per category.

        Args:
            email_data: Mapping that may contain ``subject``, ``sender_email``
                and ``body_text``. Missing or ``None`` fields are treated as
                empty strings, and ``None`` for the whole mapping is treated
                as an empty email. The sender is only logged, never matched.

        Returns:
            A dict with the keys ``classification`` (category name, or
            ``'general'`` when nothing matched), ``confidence`` (0.5 for
            ``'general'``, otherwise 0.7-0.9 depending on the number of
            matched keywords) and ``reasoning`` (short text listing up to
            three of the matched keywords).
        """
        # Tolerate a missing payload the same way missing fields are tolerated.
        email_data = email_data or {}

        subject = (email_data.get('subject', '') or '').lower()
        sender = (email_data.get('sender_email', '') or '').lower()
        body = (email_data.get('body_text', '') or '').lower()[:self.BODY_SCAN_CHARS]

        # Lazy %-style arguments: the message is only rendered if it is emitted.
        logger.info(
            "🔍 simple_classifier: subject='%s', body_len=%d, sender='%s'",
            subject, len(body), sender,
        )

        # Subject and truncated body are searched together as one string.
        text = f"{subject} {body}"

        # category -> keywords that matched, kept in keyword-list order.
        scores = {}
        for category, keywords in self.keyword_rules.items():
            # Keywords are regex patterns (e.g. 'cc[0-9]{4}'), hence re.search.
            matched = [
                keyword for keyword in keywords
                if re.search(keyword, text, re.IGNORECASE)
            ]
            if matched:
                scores[category] = matched

        if not scores:
            return {
                'classification': 'general',
                'confidence': self.DEFAULT_CONFIDENCE,
                'reasoning': 'No specific keywords matched - classified as general'
            }

        # max() returns the first maximal entry, so ties resolve to the
        # category defined earliest in keyword_rules.
        category_name = max(scores, key=lambda name: len(scores[name]))
        matched_keywords = scores[category_name]
        match_count = len(matched_keywords)

        confidence = min(
            self.MAX_CONFIDENCE,
            self.BASE_CONFIDENCE + (match_count * self.CONFIDENCE_PER_MATCH),
        )
        reasoning = f"Matched {match_count} keyword(s): {', '.join(matched_keywords[:3])}"

        logger.info(
            "✅ Keyword classification: %s (confidence: %.2f)",
            category_name, confidence,
        )
        return {
            'classification': category_name,
            'confidence': confidence,
            'reasoning': reasoning
        }
# Shared module-level instance, created once when this module is imported
simple_classifier = SimpleEmailClassifier()