134 lines
5.5 KiB
MySQL
134 lines
5.5 KiB
MySQL
|
|
-- Migration 009: Document Extraction and Upload System
|
||
|
|
-- Adds tables for file upload tracking, AI extraction, and duplicate prevention
|
||
|
|
|
||
|
|
-- Table: incoming_files
|
||
|
|
-- Tracks all uploaded files with SHA256 checksums for duplicate detection
|
||
|
|
CREATE TABLE IF NOT EXISTS incoming_files (
|
||
|
|
file_id SERIAL PRIMARY KEY,
|
||
|
|
filename VARCHAR(500) NOT NULL,
|
||
|
|
original_filename VARCHAR(500) NOT NULL,
|
||
|
|
file_path VARCHAR(1000),
|
||
|
|
file_size INTEGER,
|
||
|
|
mime_type VARCHAR(100),
|
||
|
|
checksum VARCHAR(64) NOT NULL UNIQUE, -- SHA256 hash for duplicate detection
|
||
|
|
status VARCHAR(50) DEFAULT 'pending', -- pending, processing, processed, failed, duplicate
|
||
|
|
uploaded_by INTEGER, -- Future: REFERENCES users(user_id)
|
||
|
|
uploaded_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||
|
|
processed_at TIMESTAMP,
|
||
|
|
error_message TEXT
|
||
|
|
);
|
||
|
|
|
||
|
|
CREATE INDEX idx_incoming_files_status ON incoming_files(status);
|
||
|
|
CREATE INDEX idx_incoming_files_checksum ON incoming_files(checksum);
|
||
|
|
CREATE INDEX idx_incoming_files_uploaded_at ON incoming_files(uploaded_at DESC);
|
||
|
|
|
||
|
|
-- Table: extractions
|
||
|
|
-- Stores AI-extracted data from uploaded documents
|
||
|
|
CREATE TABLE IF NOT EXISTS extractions (
|
||
|
|
extraction_id SERIAL PRIMARY KEY,
|
||
|
|
file_id INTEGER REFERENCES incoming_files(file_id) ON DELETE CASCADE,
|
||
|
|
|
||
|
|
-- Document metadata
|
||
|
|
document_type VARCHAR(100), -- invoice, purchase_order, delivery_note, credit_note
|
||
|
|
document_id VARCHAR(100), -- Invoice/order number extracted by AI
|
||
|
|
vendor_name VARCHAR(255), -- Vendor name extracted by AI
|
||
|
|
vendor_cvr VARCHAR(20), -- CVR number extracted by AI
|
||
|
|
vendor_matched_id INTEGER REFERENCES vendors(id), -- Matched vendor from database
|
||
|
|
match_confidence DECIMAL(5,4) CHECK (match_confidence >= 0.0 AND match_confidence <= 1.0),
|
||
|
|
|
||
|
|
-- Financial data
|
||
|
|
document_date DATE,
|
||
|
|
due_date DATE,
|
||
|
|
currency VARCHAR(10) DEFAULT 'DKK',
|
||
|
|
total_amount DECIMAL(12,2),
|
||
|
|
vat_amount DECIMAL(12,2),
|
||
|
|
|
||
|
|
-- AI metadata
|
||
|
|
confidence DECIMAL(5,4) CHECK (confidence >= 0.0 AND confidence <= 1.0),
|
||
|
|
raw_text_snippet TEXT,
|
||
|
|
llm_response_json TEXT, -- Full JSON response from Ollama
|
||
|
|
|
||
|
|
-- Processing status
|
||
|
|
status VARCHAR(50) DEFAULT 'extracted', -- extracted, validated, matched, needs_review, converted
|
||
|
|
reviewed_by INTEGER, -- Future: REFERENCES users(user_id)
|
||
|
|
reviewed_at TIMESTAMP,
|
||
|
|
|
||
|
|
-- Timestamps
|
||
|
|
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||
|
|
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||
|
|
);
|
||
|
|
|
||
|
|
CREATE INDEX idx_extractions_file_id ON extractions(file_id);
|
||
|
|
CREATE INDEX idx_extractions_vendor_matched_id ON extractions(vendor_matched_id);
|
||
|
|
CREATE INDEX idx_extractions_status ON extractions(status);
|
||
|
|
CREATE INDEX idx_extractions_document_id ON extractions(document_id);
|
||
|
|
CREATE INDEX idx_extractions_vendor_cvr ON extractions(vendor_cvr);
|
||
|
|
|
||
|
|
-- Table: extraction_lines
|
||
|
|
-- Stores individual line items extracted from invoices
|
||
|
|
CREATE TABLE IF NOT EXISTS extraction_lines (
|
||
|
|
line_id SERIAL PRIMARY KEY,
|
||
|
|
extraction_id INTEGER NOT NULL REFERENCES extractions(extraction_id) ON DELETE CASCADE,
|
||
|
|
line_number INTEGER NOT NULL,
|
||
|
|
|
||
|
|
-- Product identification
|
||
|
|
sku VARCHAR(100),
|
||
|
|
ean VARCHAR(13), -- 13-digit EAN barcode
|
||
|
|
description TEXT,
|
||
|
|
|
||
|
|
-- Quantities and pricing
|
||
|
|
quantity DECIMAL(10,2),
|
||
|
|
unit_price DECIMAL(12,4),
|
||
|
|
line_total DECIMAL(12,2),
|
||
|
|
vat_rate DECIMAL(5,2), -- VAT percentage (25.00 for 25%)
|
||
|
|
vat_amount DECIMAL(12,2),
|
||
|
|
|
||
|
|
-- AI metadata
|
||
|
|
confidence DECIMAL(5,4) CHECK (confidence >= 0.0 AND confidence <= 1.0),
|
||
|
|
|
||
|
|
-- Timestamps
|
||
|
|
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||
|
|
|
||
|
|
UNIQUE(extraction_id, line_number)
|
||
|
|
);
|
||
|
|
|
||
|
|
CREATE INDEX idx_extraction_lines_extraction_id ON extraction_lines(extraction_id);
|
||
|
|
|
||
|
|
-- Add file_id column to supplier_invoices (link to uploaded file)
|
||
|
|
ALTER TABLE supplier_invoices
|
||
|
|
ADD COLUMN IF NOT EXISTS extraction_id INTEGER REFERENCES extractions(extraction_id);
|
||
|
|
|
||
|
|
CREATE INDEX idx_supplier_invoices_extraction_id ON supplier_invoices(extraction_id);
|
||
|
|
|
||
|
|
-- Add UNIQUE constraint to prevent duplicate invoices (same vendor + invoice number)
|
||
|
|
-- Drop existing index if it exists
|
||
|
|
DROP INDEX IF EXISTS idx_supplier_invoices_vendor_invoice;
|
||
|
|
|
||
|
|
-- Create UNIQUE index on vendor_id + invoice_number (simple version without WHERE clause)
|
||
|
|
CREATE UNIQUE INDEX idx_supplier_invoices_vendor_invoice
|
||
|
|
ON supplier_invoices(vendor_id, invoice_number);
|
||
|
|
|
||
|
|
-- Trigger to update extractions.updated_at
|
||
|
|
CREATE OR REPLACE FUNCTION update_extractions_updated_at()
|
||
|
|
RETURNS TRIGGER AS $$
|
||
|
|
BEGIN
|
||
|
|
NEW.updated_at = CURRENT_TIMESTAMP;
|
||
|
|
RETURN NEW;
|
||
|
|
END;
|
||
|
|
$$ LANGUAGE plpgsql;
|
||
|
|
|
||
|
|
DROP TRIGGER IF EXISTS trigger_update_extractions_updated_at ON extractions;
|
||
|
|
CREATE TRIGGER trigger_update_extractions_updated_at
|
||
|
|
BEFORE UPDATE ON extractions
|
||
|
|
FOR EACH ROW
|
||
|
|
EXECUTE FUNCTION update_extractions_updated_at();
|
||
|
|
|
||
|
|
-- Comments
|
||
|
|
COMMENT ON TABLE incoming_files IS 'Tracks all uploaded files with SHA256 checksums for duplicate detection';
|
||
|
|
COMMENT ON TABLE extractions IS 'Stores AI-extracted data from uploaded documents with CVR vendor matching';
|
||
|
|
COMMENT ON TABLE extraction_lines IS 'Stores individual line items extracted from invoices';
|
||
|
|
COMMENT ON COLUMN incoming_files.checksum IS 'SHA256 hash of file content - prevents duplicate uploads';
|
||
|
|
COMMENT ON COLUMN extractions.vendor_cvr IS 'CVR number extracted by AI from invoice';
|
||
|
|
COMMENT ON COLUMN extractions.vendor_matched_id IS 'Matched vendor from vendors table based on CVR number';
|
||
|
|
COMMENT ON COLUMN extractions.match_confidence IS 'Confidence score for vendor matching (0.0-1.0)';
|