- Added `check_invoice_number_exists` method in `EconomicService` to verify invoice numbers in e-conomic journals. - Introduced `quick_analysis_on_upload` method in `OllamaService` for extracting critical fields from uploaded PDFs, including CVR, document type, and document number. - Created migration script to add new fields for storing detected CVR, vendor ID, document type, and document number in the `incoming_files` table. - Developed comprehensive tests for the quick analysis functionality, validating CVR detection, document type identification, and invoice number extraction.
90 lines
3.0 KiB
Python
90 lines
3.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Backfill quick analysis for existing files
|
|
"""
|
|
import sys
|
|
import asyncio
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from app.core.database import execute_query, execute_update, init_db
|
|
from app.services.ollama_service import ollama_service
|
|
|
|
|
|
async def backfill_quick_analysis():
    """Run quick analysis on every stored file that is missing detected fields.

    Selects rows from ``incoming_files`` whose ``detected_cvr`` or
    ``detected_document_number`` is NULL (skipping rows with status
    ``'duplicate'`` and rows without a ``file_path``), extracts the text of
    each file through ``ollama_service``, runs the quick analysis on that
    text, and stores the detected values back on the row.

    Per-file problems (missing file on disk, extraction/analysis/update
    errors) are printed and counted as failures; they do not stop the run.

    Raises:
        Exception: any error raised outside the per-file handling (for
            example by the initial query) is printed as a fatal error and
            re-raised unchanged.
    """
    # Initialize database
    init_db()

    try:
        # Get files without quick analysis. Fall back to an empty list so a
        # driver that returns None for "no rows" does not break len()/iteration.
        files = execute_query(
            """SELECT file_id, filename, file_path
               FROM incoming_files
               WHERE (detected_cvr IS NULL OR detected_document_number IS NULL)
                 AND status NOT IN ('duplicate')
                 AND file_path IS NOT NULL
               ORDER BY file_id DESC"""
        ) or []

        print(f"📋 Found {len(files)} files without quick analysis")

        success_count = 0
        fail_count = 0

        for file in files:
            try:
                file_path = Path(file['file_path'])

                if not file_path.exists():
                    print(f"⚠️ File not found: {file_path}")
                    fail_count += 1
                    continue

                print(f"\n🔍 Processing: {file['filename']} (ID: {file['file_id']})")

                # Extract text
                text = await ollama_service._extract_text_from_file(file_path)

                # Run quick analysis
                quick_result = await ollama_service.quick_analysis_on_upload(text)
                if quick_result is None:
                    # Report a clear reason instead of an AttributeError on
                    # the .get() calls below; handled by the except clause.
                    raise ValueError("quick analysis returned no result")

                # Update database. The SELECT above also matches rows where
                # only ONE of the detected fields is NULL, so COALESCE keeps
                # any value that is already stored when this run detects
                # nothing for that field, rather than overwriting it with NULL.
                execute_update(
                    """UPDATE incoming_files
                       SET detected_cvr = COALESCE(%s, detected_cvr),
                           detected_vendor_id = COALESCE(%s, detected_vendor_id),
                           detected_document_type = COALESCE(%s, detected_document_type),
                           detected_document_number = COALESCE(%s, detected_document_number)
                       WHERE file_id = %s""",
                    (
                        quick_result.get('cvr'),
                        quick_result.get('vendor_id'),
                        quick_result.get('document_type'),
                        quick_result.get('document_number'),
                        file['file_id'],
                    ),
                )

                print(f"✅ Updated: CVR={quick_result.get('cvr')}, "
                      f"Type={quick_result.get('document_type')}, "
                      f"Number={quick_result.get('document_number')}, "
                      f"Vendor={quick_result.get('vendor_name')}")

                success_count += 1

            except Exception as e:
                # Best-effort backfill: one bad file must not abort the rest.
                print(f"❌ Error processing {file['filename']}: {e}")
                fail_count += 1

        print(f"\n📊 Summary: {success_count} successful, {fail_count} failed")

    except Exception as e:
        print(f"❌ Fatal error: {e}")
        raise
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the backfill coroutine and drive it to
    # completion on a fresh event loop.
    backfill_job = backfill_quick_analysis()
    asyncio.run(backfill_job)
|