fix: JSON truncation num_predict 8000 + 5-stage parser + batch-analyze endpoint v2.2.34

This commit is contained in:
Christian 2026-03-02 13:48:14 +01:00
parent 72acca9e8b
commit bf28e94d6e
3 changed files with 141 additions and 56 deletions

View File

@ -3,7 +3,7 @@ Supplier Invoices Router - Leverandørfakturaer (Kassekladde)
Backend API for managing supplier invoices that integrate with e-conomic
"""
from fastapi import APIRouter, HTTPException, UploadFile, File
from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
from pydantic import BaseModel
from typing import List, Dict, Optional
from datetime import datetime, date, timedelta
@ -2519,6 +2519,47 @@ async def reprocess_uploaded_file(file_id: int):
raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}")
@router.post("/supplier-invoices/files/batch-analyze")
async def batch_analyze_files(background_tasks: BackgroundTasks):
"""
Kør AI-analyse alle ubehandlede filer i baggrunden.
Returnerer øjeblikkeligt filer behandles async.
"""
pending = execute_query(
"""SELECT file_id, filename FROM incoming_files
WHERE status IN ('pending', 'requires_vendor_selection', 'uploaded', 'failed')
ORDER BY uploaded_at DESC
LIMIT 100""",
()
)
if not pending:
return {"started": 0, "message": "Ingen filer at behandle"}
file_ids = [r['file_id'] for r in pending]
logger.info(f"🚀 Batch-analyse startet for {len(file_ids)} filer")
async def _run_batch(ids):
ok = err = 0
for fid in ids:
try:
await reprocess_uploaded_file(fid)
ok += 1
except Exception as ex:
logger.error(f"❌ Batch fejl file {fid}: {ex}")
err += 1
logger.info(f"✅ Batch færdig: {ok} ok, {err} fejlet")
background_tasks.add_task(_run_batch, file_ids)
return {
"started": len(file_ids),
"message": f"{len(file_ids)} filer sendt til analyse i baggrunden. Opdater siden om lidt.",
"analyzed": 0,
"requires_vendor_selection": 0,
"failed": 0
}
@router.put("/supplier-invoices/templates/{template_id}")
async def update_template(
template_id: int,

View File

@ -2030,12 +2030,12 @@ function getFileStatusBadge(status) {
// NEW: Batch analyze all files
async function batchAnalyzeAllFiles() {
if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette vil:\n- Matche leverandører via CVR\n- Ekstrahere fakturadata\n- Oprette fakturaer i kassekladde ved 100% match')) {
if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette kan tage flere minutter afhængigt af antal filer.\nSiden opdateres automatisk undervejs.')) {
return;
}
try {
showLoadingOverlay('Analyserer filer...');
showLoadingOverlay('Starter analyse...');
const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', {
method: 'POST'
@ -2047,19 +2047,27 @@ async function batchAnalyzeAllFiles() {
hideLoadingOverlay();
alert(`✅ Batch-analyse fuldført!\n\n` +
`Analyseret: ${result.analyzed}\n` +
`Kræver manuel leverandør-valg: ${result.requires_vendor_selection}\n` +
`Fejlet: ${result.failed}`);
if (result.started === 0) {
alert(' Ingen filer at behandle.');
return;
}
// Reload tables
alert(`✅ ${result.message}`);
// Auto-opdater tabellen hvert 10. sekund i 5 minutter
let refreshes = 0;
const maxRefreshes = 30;
const interval = setInterval(() => {
loadUnhandledFiles();
refreshes++;
if (refreshes >= maxRefreshes) clearInterval(interval);
}, 10000);
loadUnhandledFiles();
loadKassekladdeView();
} catch (error) {
hideLoadingOverlay();
console.error('Batch analysis error:', error);
alert('❌ Fejl ved batch-analyse');
alert('❌ Fejl ved batch-analyse: ' + error.message);
}
}

View File

@ -182,10 +182,11 @@ Output: {
],
"stream": False,
"format": "json",
"think": False,
"options": {
"temperature": 0.1,
"top_p": 0.9,
"num_predict": 2000
"num_predict": 8000
}
}
)
@ -201,7 +202,7 @@ Output: {
"options": {
"temperature": 0.1,
"top_p": 0.9,
"num_predict": 2000
"num_predict": 8000
}
}
)
@ -313,53 +314,88 @@ Output: {
}
def _parse_json_response(self, response: str) -> Dict:
"""Parse JSON from LLM response with improved error handling"""
"""Parse JSON from LLM response with aggressive fallback strategies"""
logger.info(f"🔍 Response length: {len(response)}, preview: {response[:200]}")
# Find outermost JSON object
start = response.find('{')
end = response.rfind('}') + 1
if start < 0 or end <= start:
logger.error("❌ No JSON object found in response")
return self._extract_fields_with_regex(response)
json_str = response[start:end]
# Strategy 1: direct parse
try:
# Log preview of response for debugging
logger.info(f"🔍 Response preview (first 500 chars): {response[:500]}")
return json.loads(json_str)
except json.JSONDecodeError:
pass
# Find JSON in response (between first { and last })
start = response.find('{')
end = response.rfind('}') + 1
# Strategy 2: remove trailing commas before } or ]
fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
try:
return json.loads(fixed)
except json.JSONDecodeError:
pass
if start >= 0 and end > start:
json_str = response[start:end]
logger.info(f"🔍 Extracted JSON string length: {len(json_str)}, starts at position {start}")
# Try to fix common JSON issues
# Remove trailing commas before } or ]
json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
# Fix single quotes to double quotes (but not in values)
# This is risky, so we only do it if initial parse fails
# Strategy 3: remove JS-style comments (// and /* */)
fixed = re.sub(r'//[^\n]*', '', fixed)
fixed = re.sub(r'/\*.*?\*/', '', fixed, flags=re.DOTALL)
try:
return json.loads(fixed)
except json.JSONDecodeError:
pass
# Strategy 4: truncate at last valid closing brace
# Walk backwards to find longest valid JSON prefix
for i in range(len(fixed) - 1, start, -1):
if fixed[i] == '}':
candidate = fixed[start - start:i + 1] if start == 0 else fixed[:i + 1]
# rebuild from inner start
c2 = fixed[:i + 1] if start == 0 else json_str[:i - start + 1]
try:
data = json.loads(json_str)
data = json.loads(c2)
logger.warning(f"⚠️ JSON truncated to position {i} — partial parse OK")
return data
except json.JSONDecodeError:
# Try to fix common issues
# Replace single quotes with double quotes (simple approach)
fixed_json = json_str.replace("'", '"')
try:
data = json.loads(fixed_json)
logger.warning("⚠️ Fixed JSON with quote replacement")
return data
except:
pass
continue
break
# Last resort: log the problematic JSON
logger.error(f"❌ Problematic JSON: {json_str[:300]}")
raise
else:
raise ValueError("No JSON found in response")
# Strategy 5: regex extraction of key fields (always succeeds with partial data)
logger.warning("⚠️ All JSON strategies failed — using regex field extraction")
return self._extract_fields_with_regex(response)
def _extract_fields_with_regex(self, text: str) -> Dict:
"""Extract invoice fields from text using regex when JSON parsing fails"""
def _find(pattern, default=None):
m = re.search(pattern, text, re.IGNORECASE)
return m.group(1).strip() if m else default
def _find_num(pattern):
m = re.search(pattern, text, re.IGNORECASE)
if not m: return None
val = m.group(1).replace('.', '').replace(',', '.')
try: return float(val)
except: return None
result = {
"document_type": _find(r'"document_type"\s*:\s*"([^"]+)"', 'invoice'),
"invoice_number": _find(r'"invoice_number"\s*:\s*"?([^",\n}]+)"?'),
"vendor_name": _find(r'"vendor_name"\s*:\s*"([^"]+)"'),
"vendor_cvr": _find(r'"vendor_cvr"\s*:\s*"?(\d{8})"?'),
"invoice_date": _find(r'"invoice_date"\s*:\s*"([^"]+)"'),
"due_date": _find(r'"due_date"\s*:\s*"([^"]+)"'),
"currency": _find(r'"currency"\s*:\s*"([^"]+)"', 'DKK'),
"total_amount": _find_num(r'"total_amount"\s*:\s*([\d.,]+)'),
"vat_amount": _find_num(r'"vat_amount"\s*:\s*([\d.,]+)'),
"confidence": 0.5,
"lines": [],
"_partial": True,
}
logger.info(f"🔧 Regex extraction: vendor={result['vendor_name']}, cvr={result['vendor_cvr']}, total={result['total_amount']}")
return result
except json.JSONDecodeError as e:
logger.error(f"❌ JSON parsing failed: {e}")
logger.error(f"Raw response preview: {response[:500]}")
return {
"error": f"JSON parsing failed: {str(e)}",
"confidence": 0.0,
"raw_response": response[:500]
}
def calculate_file_checksum(self, file_path: Path) -> str:
"""Calculate SHA256 checksum of file for duplicate detection"""