bmc_hub/app/services/simple_classifier.py
Christian eacbd36e83 feat: Implement Transcription Service for audio files using Whisper API
- Added `transcription_service.py` to handle audio transcription via Whisper API.
- Integrated logging for transcription processes and error handling.
- Supported audio format checks based on configuration settings.

docs: Create Ordre System Implementation Plan

- Drafted comprehensive implementation plan for e-conomic order integration.
- Outlined business requirements, database changes, backend and frontend implementation details.
- Included testing plan and deployment steps for the new order system.

feat: Add AI prompts and regex action capabilities

- Created `ai_prompts` table for storing custom AI prompts.
- Added regex extraction and linking action to email workflow actions.

feat: Introduce conversations module for transcribed audio

- Created `conversations` table to store transcribed conversations with relevant metadata.
- Added indexing for customer, ticket, and user linkage.
- Implemented full-text search capabilities for Danish language.

fix: Add category column to conversations for classification

- Added `category` column to `conversations` table for better conversation classification.
2026-01-11 19:23:21 +01:00

118 lines
4.3 KiB
Python

"""
Simple Keyword-Based Email Classifier
Fallback when AI classification is unavailable
"""
import logging
from typing import Dict, Optional
import re
logger = logging.getLogger(__name__)


class SimpleEmailClassifier:
    """Rule-based email classifier that counts keyword hits per category.

    Every entry in ``keyword_rules`` is used as a regular-expression pattern
    (most are plain words; ``cc[0-9]{4}`` is a real pattern) and is searched
    for anywhere in the lowercased subject plus the leading part of the body.
    The category with the most distinct keyword hits wins.
    """

    # Number of leading body characters that are scanned for keywords.
    BODY_SCAN_LIMIT = 500
    # Result used when no keyword of any category matches.
    FALLBACK_CATEGORY = 'general'
    FALLBACK_CONFIDENCE = 0.5
    # confidence = min(MAX_CONFIDENCE, BASE_CONFIDENCE + hits * CONFIDENCE_STEP)
    BASE_CONFIDENCE = 0.6
    CONFIDENCE_STEP = 0.1
    MAX_CONFIDENCE = 0.9
    # How many matched keywords are listed in the human-readable reasoning.
    REASONING_KEYWORD_LIMIT = 3

    def __init__(self):
        # Category name -> list of regex patterns. Declaration order matters:
        # when two categories have the same number of hits, the one declared
        # first here wins (see classify()).
        self.keyword_rules = {
            'invoice': [
                'faktura', 'invoice', 'kreditnota', 'credit note',
                'ordrenr', 'order number', 'betalingspåmindelse', 'payment reminder',
                'fakturanr', 'invoice number', 'betaling', 'payment'
            ],
            'freight_note': [
                'fragtbrev', 'tracking', 'forsendelse', 'shipment',
                'levering', 'delivery', 'pakke', 'package', 'fragtbreve'
            ],
            'order_confirmation': [
                'ordrebekræftelse', 'order confirmation', 'bestilling bekræftet',
                'ordre modtaget', 'order received'
            ],
            'time_confirmation': [
                'timer', 'hours', 'tidsforbrug', 'time spent',
                'tidsregistrering', 'time registration'
            ],
            'case_notification': [
                'cc[0-9]{4}', 'case #', 'sag ', 'ticket', 'support'
            ],
            'bankruptcy': [
                'konkurs', 'bankruptcy', 'rekonstruktion', 'insolvency',
                'betalingsstandsning', 'administration'
            ],
            'newsletter': [
                'nyhedsbrev', 'newsletter', 'kampagne', 'campaign',
                'tilbud', 'offer', 'webinar', 'invitation', 'event',
                'update', 'opdatering', 'salg', 'sale', 'black friday',
                'cyber monday', 'sommerudsalg', 'vinterudsalg', 'rabat',
                'discount', 'no-reply', 'noreply', 'automatisk besked',
                'auto-generated'
            ],
            'spam': [
                'unsubscribe', 'click here', 'free offer', 'gratis tilbud',
                'vind nu', 'win now', 'limited time'
            ]
        }

    def _score_categories(self, text: str) -> Dict[str, list]:
        """Return ``{category: [matched patterns]}`` for categories with >= 1 hit.

        Patterns are applied with ``re.search`` (substring semantics, no word
        boundaries) and case-insensitively. The result preserves the order of
        ``keyword_rules``, and each list preserves the pattern order.
        """
        hits = {}
        for category, keywords in self.keyword_rules.items():
            matched = [
                keyword for keyword in keywords
                if re.search(keyword, text, re.IGNORECASE)
            ]
            if matched:
                hits[category] = matched
        return hits

    def classify(self, email_data: Dict) -> Dict:
        """Classify an email by keyword matching.

        Args:
            email_data: Mapping read via ``.get`` for the keys ``subject``,
                ``sender_email`` and ``body_text``. Missing or falsy values
                are treated as empty strings.

        Returns:
            A dict with the keys ``classification`` (category name, or
            ``'general'`` when nothing matched), ``confidence`` (0.5 for the
            fallback; otherwise 0.7 for one hit, 0.8 for two and 0.9 for
            three or more) and ``reasoning`` (human-readable explanation
            listing up to three matched keywords).
        """
        subject = (email_data.get('subject', '') or '').lower()
        sender = (email_data.get('sender_email', '') or '').lower()
        body = (email_data.get('body_text', '') or '').lower()[:self.BODY_SCAN_LIMIT]

        # Lazy %-formatting: the message is only built if INFO is enabled.
        logger.info(
            "🔍 simple_classifier: subject='%s', body_len=%d, sender='%s'",
            subject, len(body), sender,
        )

        # NOTE(review): the sender address is logged but NOT part of the text
        # that is matched, although patterns such as 'no-reply' / 'noreply'
        # look sender-oriented. Left as-is to keep classifications stable -
        # confirm whether the sender should be included.
        text = f"{subject} {body}"

        hits = self._score_categories(text)
        if not hits:
            return {
                'classification': self.FALLBACK_CATEGORY,
                'confidence': self.FALLBACK_CONFIDENCE,
                'reasoning': 'No specific keywords matched - classified as general'
            }

        # max() returns the first maximal item, so ties go to the category
        # declared first in keyword_rules.
        category_name, matched_keywords = max(
            hits.items(), key=lambda item: len(item[1])
        )
        match_count = len(matched_keywords)

        # At least one hit is guaranteed here, so the effective range is
        # BASE + STEP (0.7) up to MAX_CONFIDENCE (0.9).
        confidence = min(
            self.MAX_CONFIDENCE,
            self.BASE_CONFIDENCE + (match_count * self.CONFIDENCE_STEP),
        )
        shown = ', '.join(matched_keywords[:self.REASONING_KEYWORD_LIMIT])
        reasoning = f"Matched {match_count} keyword(s): {shown}"

        logger.info(
            "✅ Keyword classification: %s (confidence: %.2f)",
            category_name, confidence,
        )
        return {
            'classification': category_name,
            'confidence': confidence,
            'reasoning': reasoning
        }
# Shared module-level instance, constructed once when this module is imported
simple_classifier = SimpleEmailClassifier()