-- Migration 009: Document Extraction and Upload System -- Adds tables for file upload tracking, AI extraction, and duplicate prevention -- Table: incoming_files -- Tracks all uploaded files with SHA256 checksums for duplicate detection CREATE TABLE IF NOT EXISTS incoming_files ( file_id SERIAL PRIMARY KEY, filename VARCHAR(500) NOT NULL, original_filename VARCHAR(500) NOT NULL, file_path VARCHAR(1000), file_size INTEGER, mime_type VARCHAR(100), checksum VARCHAR(64) NOT NULL UNIQUE, -- SHA256 hash for duplicate detection status VARCHAR(50) DEFAULT 'pending', -- pending, processing, processed, failed, duplicate uploaded_by INTEGER, -- Future: REFERENCES users(user_id) uploaded_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, processed_at TIMESTAMP, error_message TEXT ); CREATE INDEX idx_incoming_files_status ON incoming_files(status); CREATE INDEX idx_incoming_files_checksum ON incoming_files(checksum); CREATE INDEX idx_incoming_files_uploaded_at ON incoming_files(uploaded_at DESC); -- Table: extractions -- Stores AI-extracted data from uploaded documents CREATE TABLE IF NOT EXISTS extractions ( extraction_id SERIAL PRIMARY KEY, file_id INTEGER REFERENCES incoming_files(file_id) ON DELETE CASCADE, -- Document metadata document_type VARCHAR(100), -- invoice, purchase_order, delivery_note, credit_note document_id VARCHAR(100), -- Invoice/order number extracted by AI vendor_name VARCHAR(255), -- Vendor name extracted by AI vendor_cvr VARCHAR(20), -- CVR number extracted by AI vendor_matched_id INTEGER REFERENCES vendors(id), -- Matched vendor from database match_confidence DECIMAL(5,4) CHECK (match_confidence >= 0.0 AND match_confidence <= 1.0), -- Financial data document_date DATE, due_date DATE, currency VARCHAR(10) DEFAULT 'DKK', total_amount DECIMAL(12,2), vat_amount DECIMAL(12,2), -- AI metadata confidence DECIMAL(5,4) CHECK (confidence >= 0.0 AND confidence <= 1.0), raw_text_snippet TEXT, llm_response_json TEXT, -- Full JSON response from Ollama -- Processing status status VARCHAR(50) DEFAULT 'extracted', -- extracted, validated, matched, needs_review, converted reviewed_by INTEGER, -- Future: REFERENCES users(user_id) reviewed_at TIMESTAMP, -- Timestamps created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX idx_extractions_file_id ON extractions(file_id); CREATE INDEX idx_extractions_vendor_matched_id ON extractions(vendor_matched_id); CREATE INDEX idx_extractions_status ON extractions(status); CREATE INDEX idx_extractions_document_id ON extractions(document_id); CREATE INDEX idx_extractions_vendor_cvr ON extractions(vendor_cvr); -- Table: extraction_lines -- Stores individual line items extracted from invoices CREATE TABLE IF NOT EXISTS extraction_lines ( line_id SERIAL PRIMARY KEY, extraction_id INTEGER NOT NULL REFERENCES extractions(extraction_id) ON DELETE CASCADE, line_number INTEGER NOT NULL, -- Product identification sku VARCHAR(100), ean VARCHAR(13), -- 13-digit EAN barcode description TEXT, -- Quantities and pricing quantity DECIMAL(10,2), unit_price DECIMAL(12,4), line_total DECIMAL(12,2), vat_rate DECIMAL(5,2), -- VAT percentage (25.00 for 25%) vat_amount DECIMAL(12,2), -- AI metadata confidence DECIMAL(5,4) CHECK (confidence >= 0.0 AND confidence <= 1.0), -- Timestamps created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, UNIQUE(extraction_id, line_number) ); CREATE INDEX idx_extraction_lines_extraction_id ON extraction_lines(extraction_id); -- Add file_id column to supplier_invoices (link to uploaded file) ALTER TABLE supplier_invoices ADD COLUMN IF NOT EXISTS extraction_id INTEGER REFERENCES extractions(extraction_id); CREATE INDEX idx_supplier_invoices_extraction_id ON supplier_invoices(extraction_id); -- Add UNIQUE constraint to prevent duplicate invoices (same vendor + invoice number) -- Drop existing index if it exists DROP INDEX IF EXISTS idx_supplier_invoices_vendor_invoice; -- Create UNIQUE index on vendor_id + invoice_number (simple version without WHERE clause) CREATE UNIQUE INDEX idx_supplier_invoices_vendor_invoice ON supplier_invoices(vendor_id, invoice_number); -- Trigger to update extractions.updated_at CREATE OR REPLACE FUNCTION update_extractions_updated_at() RETURNS TRIGGER AS $$ BEGIN NEW.updated_at = CURRENT_TIMESTAMP; RETURN NEW; END; $$ LANGUAGE plpgsql; DROP TRIGGER IF EXISTS trigger_update_extractions_updated_at ON extractions; CREATE TRIGGER trigger_update_extractions_updated_at BEFORE UPDATE ON extractions FOR EACH ROW EXECUTE FUNCTION update_extractions_updated_at(); -- Comments COMMENT ON TABLE incoming_files IS 'Tracks all uploaded files with SHA256 checksums for duplicate detection'; COMMENT ON TABLE extractions IS 'Stores AI-extracted data from uploaded documents with CVR vendor matching'; COMMENT ON TABLE extraction_lines IS 'Stores individual line items extracted from invoices'; COMMENT ON COLUMN incoming_files.checksum IS 'SHA256 hash of file content - prevents duplicate uploads'; COMMENT ON COLUMN extractions.vendor_cvr IS 'CVR number extracted by AI from invoice'; COMMENT ON COLUMN extractions.vendor_matched_id IS 'Matched vendor from vendors table based on CVR number'; COMMENT ON COLUMN extractions.match_confidence IS 'Confidence score for vendor matching (0.0-1.0)';