fix: JSON truncation num_predict 8000 + 5-stage parser + batch-analyze endpoint v2.2.34

This commit is contained in:
Christian 2026-03-02 13:48:14 +01:00
parent 72acca9e8b
commit bf28e94d6e
3 changed files with 141 additions and 56 deletions

View File

@ -3,7 +3,7 @@ Supplier Invoices Router - Leverandørfakturaer (Kassekladde)
Backend API for managing supplier invoices that integrate with e-conomic Backend API for managing supplier invoices that integrate with e-conomic
""" """
from fastapi import APIRouter, HTTPException, UploadFile, File from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
from pydantic import BaseModel from pydantic import BaseModel
from typing import List, Dict, Optional from typing import List, Dict, Optional
from datetime import datetime, date, timedelta from datetime import datetime, date, timedelta
@ -2519,6 +2519,47 @@ async def reprocess_uploaded_file(file_id: int):
raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}") raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}")
@router.post("/supplier-invoices/files/batch-analyze")
async def batch_analyze_files(background_tasks: BackgroundTasks):
    """
    Run AI analysis on every unprocessed file, in the background.

    Returns immediately; files are then processed one by one by a
    background task, so the caller should refresh the page to see results.
    """
    # Pick up every file that still needs (re)processing, newest first.
    unprocessed = execute_query(
        """SELECT file_id, filename FROM incoming_files
           WHERE status IN ('pending', 'requires_vendor_selection', 'uploaded', 'failed')
           ORDER BY uploaded_at DESC
           LIMIT 100""",
        ()
    )
    if not unprocessed:
        return {"started": 0, "message": "Ingen filer at behandle"}

    ids = [row['file_id'] for row in unprocessed]
    logger.info(f"🚀 Batch-analyse startet for {len(ids)} filer")

    async def _process_all(file_ids):
        # Reuse the single-file reprocess endpoint per file; a failure on
        # one file is logged and counted but never aborts the batch.
        succeeded = 0
        failed = 0
        for fid in file_ids:
            try:
                await reprocess_uploaded_file(fid)
                succeeded += 1
            except Exception as ex:
                logger.error(f"❌ Batch fejl file {fid}: {ex}")
                failed += 1
        logger.info(f"✅ Batch færdig: {succeeded} ok, {failed} fejlet")

    background_tasks.add_task(_process_all, ids)

    # The per-outcome counters are zero here by design: the real numbers
    # are only known once the background task finishes; the frontend
    # polls for updated file state instead.
    return {
        "started": len(ids),
        "message": f"{len(ids)} filer sendt til analyse i baggrunden. Opdater siden om lidt.",
        "analyzed": 0,
        "requires_vendor_selection": 0,
        "failed": 0
    }
@router.put("/supplier-invoices/templates/{template_id}") @router.put("/supplier-invoices/templates/{template_id}")
async def update_template( async def update_template(
template_id: int, template_id: int,

View File

@ -2030,12 +2030,12 @@ function getFileStatusBadge(status) {
// NEW: Batch analyze all files // NEW: Batch analyze all files
async function batchAnalyzeAllFiles() { async function batchAnalyzeAllFiles() {
if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette vil:\n- Matche leverandører via CVR\n- Ekstrahere fakturadata\n- Oprette fakturaer i kassekladde ved 100% match')) { if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette kan tage flere minutter afhængigt af antal filer.\nSiden opdateres automatisk undervejs.')) {
return; return;
} }
try { try {
showLoadingOverlay('Analyserer filer...'); showLoadingOverlay('Starter analyse...');
const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', { const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', {
method: 'POST' method: 'POST'
@ -2047,19 +2047,27 @@ async function batchAnalyzeAllFiles() {
hideLoadingOverlay(); hideLoadingOverlay();
alert(`✅ Batch-analyse fuldført!\n\n` + if (result.started === 0) {
`Analyseret: ${result.analyzed}\n` + alert(' Ingen filer at behandle.');
`Kræver manuel leverandør-valg: ${result.requires_vendor_selection}\n` + return;
`Fejlet: ${result.failed}`); }
// Reload tables alert(`✅ ${result.message}`);
// Auto-opdater tabellen hvert 10. sekund i 5 minutter
let refreshes = 0;
const maxRefreshes = 30;
const interval = setInterval(() => {
loadUnhandledFiles();
refreshes++;
if (refreshes >= maxRefreshes) clearInterval(interval);
}, 10000);
loadUnhandledFiles(); loadUnhandledFiles();
loadKassekladdeView();
} catch (error) { } catch (error) {
hideLoadingOverlay(); hideLoadingOverlay();
console.error('Batch analysis error:', error); console.error('Batch analysis error:', error);
alert('❌ Fejl ved batch-analyse'); alert('❌ Fejl ved batch-analyse: ' + error.message);
} }
} }

View File

@ -182,10 +182,11 @@ Output: {
], ],
"stream": False, "stream": False,
"format": "json", "format": "json",
"think": False,
"options": { "options": {
"temperature": 0.1, "temperature": 0.1,
"top_p": 0.9, "top_p": 0.9,
"num_predict": 2000 "num_predict": 8000
} }
} }
) )
@ -201,7 +202,7 @@ Output: {
"options": { "options": {
"temperature": 0.1, "temperature": 0.1,
"top_p": 0.9, "top_p": 0.9,
"num_predict": 2000 "num_predict": 8000
} }
} }
) )
@ -313,53 +314,88 @@ Output: {
} }
def _parse_json_response(self, response: str) -> Dict: def _parse_json_response(self, response: str) -> Dict:
"""Parse JSON from LLM response with improved error handling""" """Parse JSON from LLM response with aggressive fallback strategies"""
logger.info(f"🔍 Response length: {len(response)}, preview: {response[:200]}")
# Find outermost JSON object
start = response.find('{')
end = response.rfind('}') + 1
if start < 0 or end <= start:
logger.error("❌ No JSON object found in response")
return self._extract_fields_with_regex(response)
json_str = response[start:end]
# Strategy 1: direct parse
try: try:
# Log preview of response for debugging return json.loads(json_str)
logger.info(f"🔍 Response preview (first 500 chars): {response[:500]}") except json.JSONDecodeError:
pass
# Find JSON in response (between first { and last })
start = response.find('{') # Strategy 2: remove trailing commas before } or ]
end = response.rfind('}') + 1 fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
try:
if start >= 0 and end > start: return json.loads(fixed)
json_str = response[start:end] except json.JSONDecodeError:
logger.info(f"🔍 Extracted JSON string length: {len(json_str)}, starts at position {start}") pass
# Try to fix common JSON issues # Strategy 3: remove JS-style comments (// and /* */)
# Remove trailing commas before } or ] fixed = re.sub(r'//[^\n]*', '', fixed)
json_str = re.sub(r',(\s*[}\]])', r'\1', json_str) fixed = re.sub(r'/\*.*?\*/', '', fixed, flags=re.DOTALL)
# Fix single quotes to double quotes (but not in values) try:
# This is risky, so we only do it if initial parse fails return json.loads(fixed)
except json.JSONDecodeError:
pass
# Strategy 4: truncate at last valid closing brace
# Walk backwards to find longest valid JSON prefix
for i in range(len(fixed) - 1, start, -1):
if fixed[i] == '}':
candidate = fixed[start - start:i + 1] if start == 0 else fixed[:i + 1]
# rebuild from inner start
c2 = fixed[:i + 1] if start == 0 else json_str[:i - start + 1]
try: try:
data = json.loads(json_str) data = json.loads(c2)
logger.warning(f"⚠️ JSON truncated to position {i} — partial parse OK")
return data return data
except json.JSONDecodeError: except json.JSONDecodeError:
# Try to fix common issues continue
# Replace single quotes with double quotes (simple approach) break
fixed_json = json_str.replace("'", '"')
try: # Strategy 5: regex extraction of key fields (always succeeds with partial data)
data = json.loads(fixed_json) logger.warning("⚠️ All JSON strategies failed — using regex field extraction")
logger.warning("⚠️ Fixed JSON with quote replacement") return self._extract_fields_with_regex(response)
return data
except: def _extract_fields_with_regex(self, text: str) -> Dict:
pass """Extract invoice fields from text using regex when JSON parsing fails"""
def _find(pattern, default=None):
# Last resort: log the problematic JSON m = re.search(pattern, text, re.IGNORECASE)
logger.error(f"❌ Problematic JSON: {json_str[:300]}") return m.group(1).strip() if m else default
raise
else: def _find_num(pattern):
raise ValueError("No JSON found in response") m = re.search(pattern, text, re.IGNORECASE)
if not m: return None
except json.JSONDecodeError as e: val = m.group(1).replace('.', '').replace(',', '.')
logger.error(f"❌ JSON parsing failed: {e}") try: return float(val)
logger.error(f"Raw response preview: {response[:500]}") except: return None
return {
"error": f"JSON parsing failed: {str(e)}", result = {
"confidence": 0.0, "document_type": _find(r'"document_type"\s*:\s*"([^"]+)"', 'invoice'),
"raw_response": response[:500] "invoice_number": _find(r'"invoice_number"\s*:\s*"?([^",\n}]+)"?'),
} "vendor_name": _find(r'"vendor_name"\s*:\s*"([^"]+)"'),
"vendor_cvr": _find(r'"vendor_cvr"\s*:\s*"?(\d{8})"?'),
"invoice_date": _find(r'"invoice_date"\s*:\s*"([^"]+)"'),
"due_date": _find(r'"due_date"\s*:\s*"([^"]+)"'),
"currency": _find(r'"currency"\s*:\s*"([^"]+)"', 'DKK'),
"total_amount": _find_num(r'"total_amount"\s*:\s*([\d.,]+)'),
"vat_amount": _find_num(r'"vat_amount"\s*:\s*([\d.,]+)'),
"confidence": 0.5,
"lines": [],
"_partial": True,
}
logger.info(f"🔧 Regex extraction: vendor={result['vendor_name']}, cvr={result['vendor_cvr']}, total={result['total_amount']}")
return result
def calculate_file_checksum(self, file_path: Path) -> str: def calculate_file_checksum(self, file_path: Path) -> str:
"""Calculate SHA256 checksum of file for duplicate detection""" """Calculate SHA256 checksum of file for duplicate detection"""