fix: JSON truncation num_predict 8000 + 5-stage parser + batch-analyze endpoint v2.2.34

This commit is contained in:
Christian 2026-03-02 13:48:14 +01:00
parent 72acca9e8b
commit bf28e94d6e
3 changed files with 141 additions and 56 deletions

View File

@ -3,7 +3,7 @@ Supplier Invoices Router - Leverandørfakturaer (Kassekladde)
Backend API for managing supplier invoices that integrate with e-conomic Backend API for managing supplier invoices that integrate with e-conomic
""" """
from fastapi import APIRouter, HTTPException, UploadFile, File from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
from pydantic import BaseModel from pydantic import BaseModel
from typing import List, Dict, Optional from typing import List, Dict, Optional
from datetime import datetime, date, timedelta from datetime import datetime, date, timedelta
@ -2519,6 +2519,47 @@ async def reprocess_uploaded_file(file_id: int):
raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}") raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}")
@router.post("/supplier-invoices/files/batch-analyze")
async def batch_analyze_files(background_tasks: BackgroundTasks):
    """Queue AI analysis of every unprocessed file and return immediately.

    Selects (at most 100 of) the newest files whose status marks them as
    not yet successfully analyzed, schedules a background coroutine that
    re-runs processing on each of them sequentially, and responds right
    away so the HTTP request does not block on the long-running analysis.

    Returns:
        dict with ``started`` (number of files queued), a human-readable
        ``message``, and zeroed ``analyzed``/``requires_vendor_selection``/
        ``failed`` counters kept for frontend backward compatibility.
    """
    rows = execute_query(
        """SELECT file_id, filename FROM incoming_files
        WHERE status IN ('pending', 'requires_vendor_selection', 'uploaded', 'failed')
        ORDER BY uploaded_at DESC
        LIMIT 100""",
        ()
    )
    if not rows:
        return {"started": 0, "message": "Ingen filer at behandle"}

    queued = [row['file_id'] for row in rows]
    logger.info(f"🚀 Batch-analyse startet for {len(queued)} filer")

    async def _run_batch(ids):
        # Re-process each file in turn; one failure must not stop the batch.
        succeeded, failed = 0, 0
        for fid in ids:
            try:
                await reprocess_uploaded_file(fid)
                succeeded += 1
            except Exception as ex:
                logger.error(f"❌ Batch fejl file {fid}: {ex}")
                failed += 1
        logger.info(f"✅ Batch færdig: {succeeded} ok, {failed} fejlet")

    background_tasks.add_task(_run_batch, queued)

    # Counters are zero here because the work happens asynchronously;
    # the keys are kept so existing frontend code keeps working.
    return {
        "started": len(queued),
        "message": f"{len(queued)} filer sendt til analyse i baggrunden. Opdater siden om lidt.",
        "analyzed": 0,
        "requires_vendor_selection": 0,
        "failed": 0
    }
@router.put("/supplier-invoices/templates/{template_id}") @router.put("/supplier-invoices/templates/{template_id}")
async def update_template( async def update_template(
template_id: int, template_id: int,

View File

@ -2030,12 +2030,12 @@ function getFileStatusBadge(status) {
// NEW: Batch analyze all files // NEW: Batch analyze all files
async function batchAnalyzeAllFiles() { async function batchAnalyzeAllFiles() {
if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette vil:\n- Matche leverandører via CVR\n- Ekstrahere fakturadata\n- Oprette fakturaer i kassekladde ved 100% match')) { if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette kan tage flere minutter afhængigt af antal filer.\nSiden opdateres automatisk undervejs.')) {
return; return;
} }
try { try {
showLoadingOverlay('Analyserer filer...'); showLoadingOverlay('Starter analyse...');
const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', { const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', {
method: 'POST' method: 'POST'
@ -2047,19 +2047,27 @@ async function batchAnalyzeAllFiles() {
hideLoadingOverlay(); hideLoadingOverlay();
alert(`✅ Batch-analyse fuldført!\n\n` + if (result.started === 0) {
`Analyseret: ${result.analyzed}\n` + alert(' Ingen filer at behandle.');
`Kræver manuel leverandør-valg: ${result.requires_vendor_selection}\n` + return;
`Fejlet: ${result.failed}`); }
// Reload tables alert(`✅ ${result.message}`);
// Auto-opdater tabellen hvert 10. sekund i 5 minutter
let refreshes = 0;
const maxRefreshes = 30;
const interval = setInterval(() => {
loadUnhandledFiles();
refreshes++;
if (refreshes >= maxRefreshes) clearInterval(interval);
}, 10000);
loadUnhandledFiles(); loadUnhandledFiles();
loadKassekladdeView();
} catch (error) { } catch (error) {
hideLoadingOverlay(); hideLoadingOverlay();
console.error('Batch analysis error:', error); console.error('Batch analysis error:', error);
alert('❌ Fejl ved batch-analyse'); alert('❌ Fejl ved batch-analyse: ' + error.message);
} }
} }

View File

@ -182,10 +182,11 @@ Output: {
], ],
"stream": False, "stream": False,
"format": "json", "format": "json",
"think": False,
"options": { "options": {
"temperature": 0.1, "temperature": 0.1,
"top_p": 0.9, "top_p": 0.9,
"num_predict": 2000 "num_predict": 8000
} }
} }
) )
@ -201,7 +202,7 @@ Output: {
"options": { "options": {
"temperature": 0.1, "temperature": 0.1,
"top_p": 0.9, "top_p": 0.9,
"num_predict": 2000 "num_predict": 8000
} }
} }
) )
@ -313,53 +314,88 @@ Output: {
} }
def _parse_json_response(self, response: str) -> Dict:
    """Parse JSON from an LLM response with aggressive fallback strategies.

    Tries, in order:
      1. Direct ``json.loads`` of the outermost ``{...}`` span.
      2. Same, after stripping trailing commas before ``}``/``]``.
      3. Same, after also stripping JS-style comments (``//`` and ``/* */``).
      4. Longest parseable prefix ending at a ``}`` — recovers partial data
         from output truncated mid-object by the model's token limit.
      5. Regex extraction of known fields (always returns a partial dict).

    Args:
        response: raw text returned by the LLM.

    Returns:
        A dict of parsed fields; possibly partial (see
        ``_extract_fields_with_regex``) but never raises.
    """
    logger.info(f"🔍 Response length: {len(response)}, preview: {response[:200]}")

    # Find outermost JSON object
    start = response.find('{')
    end = response.rfind('}') + 1
    if start < 0 or end <= start:
        logger.error("❌ No JSON object found in response")
        return self._extract_fields_with_regex(response)

    json_str = response[start:end]

    # Strategy 1: direct parse
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        pass

    # Strategy 2: remove trailing commas before } or ]
    fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
    try:
        return json.loads(fixed)
    except json.JSONDecodeError:
        pass

    # Strategy 3: remove JS-style comments (// and /* */)
    fixed = re.sub(r'//[^\n]*', '', fixed)
    fixed = re.sub(r'/\*.*?\*/', '', fixed, flags=re.DOTALL)
    try:
        return json.loads(fixed)
    except json.JSONDecodeError:
        pass

    # Strategy 4: truncated output — try the longest prefix of `fixed` that
    # ends at a closing brace.  `fixed` is already sliced relative to `start`,
    # so we scan its own indices only (the previous version mixed
    # response-relative and slice-relative indices and aborted early).
    for i in range(len(fixed) - 1, 0, -1):
        if fixed[i] != '}':
            continue
        try:
            data = json.loads(fixed[:i + 1])
            logger.warning(f"⚠️ JSON truncated to position {i} — partial parse OK")
            return data
        except json.JSONDecodeError:
            continue

    # Strategy 5: regex extraction of key fields (always succeeds with partial data)
    logger.warning("⚠️ All JSON strategies failed — using regex field extraction")
    return self._extract_fields_with_regex(response)
def _extract_fields_with_regex(self, text: str) -> Dict:
    """Extract invoice fields from text using regex when JSON parsing fails.

    Last-resort recovery: pulls individual key/value pairs out of a
    malformed or truncated LLM response with targeted regexes.

    Args:
        text: raw (unparseable) LLM response text.

    Returns:
        A dict with the standard invoice keys, ``confidence`` fixed at 0.5,
        an empty ``lines`` list, and ``_partial: True`` so callers can tell
        this came from the fallback path.
    """
    def _find(pattern, default=None):
        # First capture group of a case-insensitive match, stripped, or default.
        m = re.search(pattern, text, re.IGNORECASE)
        return m.group(1).strip() if m else default

    def _find_num(pattern):
        # Parse a numeric field.  A comma indicates Danish formatting
        # ("1.234,56" → 1234.56); otherwise the value is a plain JSON
        # number ("1234.56") and must NOT have its dot stripped.
        m = re.search(pattern, text, re.IGNORECASE)
        if not m:
            return None
        val = m.group(1)
        if ',' in val:
            val = val.replace('.', '').replace(',', '.')
        try:
            return float(val)
        except ValueError:
            return None

    result = {
        "document_type": _find(r'"document_type"\s*:\s*"([^"]+)"', 'invoice'),
        "invoice_number": _find(r'"invoice_number"\s*:\s*"?([^",\n}]+)"?'),
        "vendor_name": _find(r'"vendor_name"\s*:\s*"([^"]+)"'),
        "vendor_cvr": _find(r'"vendor_cvr"\s*:\s*"?(\d{8})"?'),
        "invoice_date": _find(r'"invoice_date"\s*:\s*"([^"]+)"'),
        "due_date": _find(r'"due_date"\s*:\s*"([^"]+)"'),
        "currency": _find(r'"currency"\s*:\s*"([^"]+)"', 'DKK'),
        "total_amount": _find_num(r'"total_amount"\s*:\s*([\d.,]+)'),
        "vat_amount": _find_num(r'"vat_amount"\s*:\s*([\d.,]+)'),
        "confidence": 0.5,
        "lines": [],
        "_partial": True,
    }
    logger.info(f"🔧 Regex extraction: vendor={result['vendor_name']}, cvr={result['vendor_cvr']}, total={result['total_amount']}")
    return result
def calculate_file_checksum(self, file_path: Path) -> str: def calculate_file_checksum(self, file_path: Path) -> str:
"""Calculate SHA256 checksum of file for duplicate detection""" """Calculate SHA256 checksum of file for duplicate detection"""