bmc_hub/scripts/backfill_quick_analysis.py

90 lines
3.0 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Backfill quick analysis for existing files
"""
import sys
import asyncio
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.core.database import execute_query, execute_update, init_db
from app.services.ollama_service import ollama_service
async def backfill_quick_analysis():
    """Fill in quick-analysis fields for incoming files that lack them.

    Selects rows of ``incoming_files`` where ``detected_cvr`` or
    ``detected_document_number`` is NULL, the status is not 'duplicate'
    and ``file_path`` is set, in descending ``file_id`` order. For every
    row whose file exists on disk, the text is extracted, handed to the
    quick analysis of ``ollama_service``, and the four ``detected_*``
    columns of that row are updated from the result.

    A missing file, or an exception raised while handling a single file,
    is printed and counted as a failure; the loop then moves on to the
    next row. An exception raised outside the per-file handling is
    printed and re-raised to the caller.
    """
    # The database is initialised before anything else, outside the
    # guarded section.
    init_db()

    try:
        # Rows that still miss a CVR or a document number.
        pending_rows = execute_query(
            """SELECT file_id, filename, file_path
               FROM incoming_files
               WHERE (detected_cvr IS NULL OR detected_document_number IS NULL)
               AND status NOT IN ('duplicate')
               AND file_path IS NOT NULL
               ORDER BY file_id DESC"""
        )
        print(f"📋 Found {len(pending_rows)} files without quick analysis")

        succeeded = 0
        failed = 0

        for row in pending_rows:
            try:
                source_path = Path(row['file_path'])
                if not source_path.exists():
                    # Nothing to analyse; count it as a failure and move on.
                    print(f"⚠️ File not found: {source_path}")
                    failed += 1
                    continue

                print(f"\n🔍 Processing: {row['filename']} (ID: {row['file_id']})")

                # Text extraction followed by the quick analysis itself.
                extracted_text = await ollama_service._extract_text_from_file(source_path)
                analysis = await ollama_service.quick_analysis_on_upload(extracted_text)

                cvr = analysis.get('cvr')
                vendor_id = analysis.get('vendor_id')
                document_type = analysis.get('document_type')
                document_number = analysis.get('document_number')

                # Persist the detected fields on the row just processed.
                update_params = (
                    cvr,
                    vendor_id,
                    document_type,
                    document_number,
                    row['file_id'],
                )
                execute_update(
                    """UPDATE incoming_files
                       SET detected_cvr = %s,
                           detected_vendor_id = %s,
                           detected_document_type = %s,
                           detected_document_number = %s
                       WHERE file_id = %s""",
                    update_params,
                )

                vendor_name = analysis.get('vendor_name')
                print(
                    f"✅ Updated: CVR={cvr}, "
                    f"Type={document_type}, "
                    f"Number={document_number}, "
                    f"Vendor={vendor_name}"
                )
                succeeded += 1
            except Exception as e:
                # Best effort: one bad file must not abort the whole run.
                print(f"❌ Error processing {row['filename']}: {e}")
                failed += 1

        print(f"\n📊 Summary: {succeeded} successful, {failed} failed")
    except Exception as e:
        print(f"❌ Fatal error: {e}")
        raise
# Script entry point: when executed directly (not imported), run the
# backfill coroutine to completion on a fresh asyncio event loop.
if __name__ == "__main__":
    asyncio.run(backfill_quick_analysis())