bmc_hub/migrations/009_document_extraction.sql

134 lines
5.5 KiB
MySQL
Raw Normal View History

-- Migration 009: Document Extraction and Upload System
-- Adds tables for file upload tracking, AI extraction, and duplicate prevention
-- Table: incoming_files
-- Tracks all uploaded files with SHA256 checksums for duplicate detection
CREATE TABLE IF NOT EXISTS incoming_files (
file_id SERIAL PRIMARY KEY,
filename VARCHAR(500) NOT NULL,
original_filename VARCHAR(500) NOT NULL,
file_path VARCHAR(1000),
file_size INTEGER,
mime_type VARCHAR(100),
checksum VARCHAR(64) NOT NULL UNIQUE, -- SHA256 hash for duplicate detection
status VARCHAR(50) DEFAULT 'pending', -- pending, processing, processed, failed, duplicate
uploaded_by INTEGER, -- Future: REFERENCES users(user_id)
uploaded_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
processed_at TIMESTAMP,
error_message TEXT
);
CREATE INDEX idx_incoming_files_status ON incoming_files(status);
CREATE INDEX idx_incoming_files_checksum ON incoming_files(checksum);
CREATE INDEX idx_incoming_files_uploaded_at ON incoming_files(uploaded_at DESC);
-- Table: extractions
-- Stores AI-extracted data from uploaded documents
CREATE TABLE IF NOT EXISTS extractions (
extraction_id SERIAL PRIMARY KEY,
file_id INTEGER REFERENCES incoming_files(file_id) ON DELETE CASCADE,
-- Document metadata
document_type VARCHAR(100), -- invoice, purchase_order, delivery_note, credit_note
document_id VARCHAR(100), -- Invoice/order number extracted by AI
vendor_name VARCHAR(255), -- Vendor name extracted by AI
vendor_cvr VARCHAR(20), -- CVR number extracted by AI
vendor_matched_id INTEGER REFERENCES vendors(id), -- Matched vendor from database
match_confidence DECIMAL(5,4) CHECK (match_confidence >= 0.0 AND match_confidence <= 1.0),
-- Financial data
document_date DATE,
due_date DATE,
currency VARCHAR(10) DEFAULT 'DKK',
total_amount DECIMAL(12,2),
vat_amount DECIMAL(12,2),
-- AI metadata
confidence DECIMAL(5,4) CHECK (confidence >= 0.0 AND confidence <= 1.0),
raw_text_snippet TEXT,
llm_response_json TEXT, -- Full JSON response from Ollama
-- Processing status
status VARCHAR(50) DEFAULT 'extracted', -- extracted, validated, matched, needs_review, converted
reviewed_by INTEGER, -- Future: REFERENCES users(user_id)
reviewed_at TIMESTAMP,
-- Timestamps
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_extractions_file_id ON extractions(file_id);
CREATE INDEX idx_extractions_vendor_matched_id ON extractions(vendor_matched_id);
CREATE INDEX idx_extractions_status ON extractions(status);
CREATE INDEX idx_extractions_document_id ON extractions(document_id);
CREATE INDEX idx_extractions_vendor_cvr ON extractions(vendor_cvr);
-- Table: extraction_lines
-- Stores individual line items extracted from invoices
CREATE TABLE IF NOT EXISTS extraction_lines (
line_id SERIAL PRIMARY KEY,
extraction_id INTEGER NOT NULL REFERENCES extractions(extraction_id) ON DELETE CASCADE,
line_number INTEGER NOT NULL,
-- Product identification
sku VARCHAR(100),
ean VARCHAR(13), -- 13-digit EAN barcode
description TEXT,
-- Quantities and pricing
quantity DECIMAL(10,2),
unit_price DECIMAL(12,4),
line_total DECIMAL(12,2),
vat_rate DECIMAL(5,2), -- VAT percentage (25.00 for 25%)
vat_amount DECIMAL(12,2),
-- AI metadata
confidence DECIMAL(5,4) CHECK (confidence >= 0.0 AND confidence <= 1.0),
-- Timestamps
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE(extraction_id, line_number)
);
CREATE INDEX idx_extraction_lines_extraction_id ON extraction_lines(extraction_id);
-- Add file_id column to supplier_invoices (link to uploaded file)
ALTER TABLE supplier_invoices
ADD COLUMN IF NOT EXISTS extraction_id INTEGER REFERENCES extractions(extraction_id);
CREATE INDEX idx_supplier_invoices_extraction_id ON supplier_invoices(extraction_id);
-- Add UNIQUE constraint to prevent duplicate invoices (same vendor + invoice number)
-- Drop existing index if it exists
DROP INDEX IF EXISTS idx_supplier_invoices_vendor_invoice;
-- Create UNIQUE index on vendor_id + invoice_number (simple version without WHERE clause)
CREATE UNIQUE INDEX idx_supplier_invoices_vendor_invoice
ON supplier_invoices(vendor_id, invoice_number);
-- Trigger to update extractions.updated_at
CREATE OR REPLACE FUNCTION update_extractions_updated_at()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = CURRENT_TIMESTAMP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
DROP TRIGGER IF EXISTS trigger_update_extractions_updated_at ON extractions;
CREATE TRIGGER trigger_update_extractions_updated_at
BEFORE UPDATE ON extractions
FOR EACH ROW
EXECUTE FUNCTION update_extractions_updated_at();
-- Comments
COMMENT ON TABLE incoming_files IS 'Tracks all uploaded files with SHA256 checksums for duplicate detection';
COMMENT ON TABLE extractions IS 'Stores AI-extracted data from uploaded documents with CVR vendor matching';
COMMENT ON TABLE extraction_lines IS 'Stores individual line items extracted from invoices';
COMMENT ON COLUMN incoming_files.checksum IS 'SHA256 hash of file content - prevents duplicate uploads';
COMMENT ON COLUMN extractions.vendor_cvr IS 'CVR number extracted by AI from invoice';
COMMENT ON COLUMN extractions.vendor_matched_id IS 'Matched vendor from vendors table based on CVR number';
COMMENT ON COLUMN extractions.match_confidence IS 'Confidence score for vendor matching (0.0-1.0)';