fix: JSON truncation num_predict 8000 + 5-stage parser + batch-analyze endpoint v2.2.34
This commit is contained in:
parent
72acca9e8b
commit
bf28e94d6e
@ -3,7 +3,7 @@ Supplier Invoices Router - Leverandørfakturaer (Kassekladde)
|
||||
Backend API for managing supplier invoices that integrate with e-conomic
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Dict, Optional
|
||||
from datetime import datetime, date, timedelta
|
||||
@ -2519,6 +2519,47 @@ async def reprocess_uploaded_file(file_id: int):
|
||||
raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}")
|
||||
|
||||
|
||||
@router.post("/supplier-invoices/files/batch-analyze")
async def batch_analyze_files(background_tasks: BackgroundTasks):
    """Run AI analysis on all unprocessed files in the background.

    Returns immediately; the selected files are then processed one by one
    after the HTTP response has been sent (Starlette awaits async callables
    registered via ``BackgroundTasks.add_task``).

    Returns:
        dict: ``started`` (number of files queued), a user-facing Danish
        ``message``, and zeroed ``analyzed`` / ``requires_vendor_selection`` /
        ``failed`` counters kept only for backward compatibility with the
        old synchronous response shape.
    """
    # Pick up every file that still needs (re)processing, newest first.
    # LIMIT 100 bounds a single batch so one request cannot queue unbounded work.
    pending = execute_query(
        """SELECT file_id, filename FROM incoming_files
           WHERE status IN ('pending', 'requires_vendor_selection', 'uploaded', 'failed')
           ORDER BY uploaded_at DESC
           LIMIT 100""",
        ()
    )
    if not pending:
        return {"started": 0, "message": "Ingen filer at behandle"}

    file_ids = [r['file_id'] for r in pending]
    # Lazy %-style args: the message is only formatted if the level is enabled.
    logger.info("🚀 Batch-analyse startet for %d filer", len(file_ids))

    async def _run_batch(ids):
        # Process sequentially: one file at a time keeps load on the
        # downstream LLM / e-conomic services predictable.
        ok = err = 0
        for fid in ids:
            try:
                await reprocess_uploaded_file(fid)
                ok += 1
            except Exception as ex:
                # Best-effort batch: one bad file must not abort the rest.
                logger.error("❌ Batch fejl file %s: %s", fid, ex)
                err += 1
        logger.info("✅ Batch færdig: %d ok, %d fejlet", ok, err)

    background_tasks.add_task(_run_batch, file_ids)

    return {
        "started": len(file_ids),
        "message": f"{len(file_ids)} filer sendt til analyse i baggrunden. Opdater siden om lidt.",
        # Legacy fields: older frontend code reads these keys from the old
        # synchronous endpoint; keep them so that code does not break.
        "analyzed": 0,
        "requires_vendor_selection": 0,
        "failed": 0
    }
|
||||
|
||||
|
||||
@router.put("/supplier-invoices/templates/{template_id}")
|
||||
async def update_template(
|
||||
template_id: int,
|
||||
|
||||
@ -2030,12 +2030,12 @@ function getFileStatusBadge(status) {
|
||||
|
||||
// NEW: Batch analyze all files
|
||||
async function batchAnalyzeAllFiles() {
|
||||
if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette vil:\n- Matche leverandører via CVR\n- Ekstrahere fakturadata\n- Oprette fakturaer i kassekladde ved 100% match')) {
|
||||
if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette kan tage flere minutter afhængigt af antal filer.\nSiden opdateres automatisk undervejs.')) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
showLoadingOverlay('Analyserer filer...');
|
||||
showLoadingOverlay('Starter analyse...');
|
||||
|
||||
const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', {
|
||||
method: 'POST'
|
||||
@ -2047,19 +2047,27 @@ async function batchAnalyzeAllFiles() {
|
||||
|
||||
hideLoadingOverlay();
|
||||
|
||||
alert(`✅ Batch-analyse fuldført!\n\n` +
|
||||
`Analyseret: ${result.analyzed}\n` +
|
||||
`Kræver manuel leverandør-valg: ${result.requires_vendor_selection}\n` +
|
||||
`Fejlet: ${result.failed}`);
|
||||
if (result.started === 0) {
|
||||
alert('ℹ️ Ingen filer at behandle.');
|
||||
return;
|
||||
}
|
||||
|
||||
// Reload tables
|
||||
alert(`✅ ${result.message}`);
|
||||
|
||||
// Auto-opdater tabellen hvert 10. sekund i 5 minutter
|
||||
let refreshes = 0;
|
||||
const maxRefreshes = 30;
|
||||
const interval = setInterval(() => {
|
||||
loadUnhandledFiles();
|
||||
refreshes++;
|
||||
if (refreshes >= maxRefreshes) clearInterval(interval);
|
||||
}, 10000);
|
||||
loadUnhandledFiles();
|
||||
loadKassekladdeView();
|
||||
|
||||
} catch (error) {
|
||||
hideLoadingOverlay();
|
||||
console.error('Batch analysis error:', error);
|
||||
alert('❌ Fejl ved batch-analyse');
|
||||
alert('❌ Fejl ved batch-analyse: ' + error.message);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -182,10 +182,11 @@ Output: {
|
||||
],
|
||||
"stream": False,
|
||||
"format": "json",
|
||||
"think": False,
|
||||
"options": {
|
||||
"temperature": 0.1,
|
||||
"top_p": 0.9,
|
||||
"num_predict": 2000
|
||||
"num_predict": 8000
|
||||
}
|
||||
}
|
||||
)
|
||||
@ -201,7 +202,7 @@ Output: {
|
||||
"options": {
|
||||
"temperature": 0.1,
|
||||
"top_p": 0.9,
|
||||
"num_predict": 2000
|
||||
"num_predict": 8000
|
||||
}
|
||||
}
|
||||
)
|
||||
@ -313,53 +314,88 @@ Output: {
|
||||
}
|
||||
|
||||
def _parse_json_response(self, response: str) -> Dict:
    """Parse JSON from an LLM response using staged fallback strategies.

    Strategies, tried in order:
      1. Direct ``json.loads`` of the outermost ``{...}`` span.
      2. Remove trailing commas before ``}`` / ``]``.
      3. Strip JS-style comments (``// ...`` and ``/* ... */``).
      4. Truncate at the last closing brace that yields valid JSON
         (recovers from responses cut off mid-generation).
      5. Regex extraction of individual fields — always yields a dict.

    This method never raises: if every JSON strategy fails, it falls back
    to :meth:`_extract_fields_with_regex`, which returns partial data.

    Args:
        response: Raw text returned by the LLM.

    Returns:
        Dict with the parsed invoice fields (possibly partial — see
        ``"_partial"`` in the regex-fallback result).
    """
    logger.info("🔍 Response length: %d, preview: %s", len(response), response[:200])

    # Locate the outermost JSON object; anything outside it is LLM chatter.
    start = response.find('{')
    end = response.rfind('}') + 1
    if start < 0 or end <= start:
        logger.error("❌ No JSON object found in response")
        return self._extract_fields_with_regex(response)

    json_str = response[start:end]

    # Strategy 1: direct parse.
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        pass

    # Strategy 2: remove trailing commas before } or ] (common LLM mistake).
    fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
    try:
        return json.loads(fixed)
    except json.JSONDecodeError:
        pass

    # Strategy 3: remove JS-style comments (// and /* */).
    fixed = re.sub(r'//[^\n]*', '', fixed)
    fixed = re.sub(r'/\*.*?\*/', '', fixed, flags=re.DOTALL)
    try:
        return json.loads(fixed)
    except json.JSONDecodeError:
        pass

    # Strategy 4: the response may have been truncated mid-object.
    # Walk backwards over closing braces and accept the longest prefix
    # that parses. `fixed` already starts at the object's '{', so the
    # walk goes all the way down to index 0 — not to the original `start`
    # offset, which belongs to the un-sliced `response`.
    for i in range(len(fixed) - 1, 0, -1):
        if fixed[i] != '}':
            continue
        try:
            data = json.loads(fixed[:i + 1])
            logger.warning("⚠️ JSON truncated to position %d — partial parse OK", i)
            return data
        except json.JSONDecodeError:
            continue

    # Strategy 5: regex extraction of key fields (always succeeds with partial data).
    logger.warning("⚠️ All JSON strategies failed — using regex field extraction")
    return self._extract_fields_with_regex(response)
|
||||
|
||||
def _extract_fields_with_regex(self, text: str) -> Dict:
    """Extract invoice fields from raw text when JSON parsing fails.

    Last-resort fallback: pulls individual ``"key": value`` pairs out of
    the (possibly truncated or malformed) LLM output with regexes, so the
    caller gets at least partial data instead of nothing.

    Args:
        text: Raw LLM response text.

    Returns:
        Dict shaped like a normal parse result. Missing fields are None,
        ``confidence`` is fixed at 0.5 and ``"_partial": True`` marks the
        result as regex-recovered.
    """

    def _find(pattern: str, default: Optional[str] = None) -> Optional[str]:
        # First match's capture group, stripped; `default` when absent.
        m = re.search(pattern, text, re.IGNORECASE)
        return m.group(1).strip() if m else default

    def _find_num(pattern: str) -> Optional[float]:
        # Normalize Danish number format: '.' is the thousands separator,
        # ',' the decimal mark (e.g. "1.234,56" -> 1234.56).
        # NOTE(review): an English-formatted "1234.56" would be misread as
        # 123456.0 — assumes LLM output follows the Danish convention; confirm.
        m = re.search(pattern, text, re.IGNORECASE)
        if not m:
            return None
        val = m.group(1).replace('.', '').replace(',', '.')
        try:
            return float(val)
        except ValueError:
            # Was a bare `except:`; float() on a str can only raise ValueError.
            return None

    result = {
        "document_type": _find(r'"document_type"\s*:\s*"([^"]+)"', 'invoice'),
        "invoice_number": _find(r'"invoice_number"\s*:\s*"?([^",\n}]+)"?'),
        "vendor_name": _find(r'"vendor_name"\s*:\s*"([^"]+)"'),
        "vendor_cvr": _find(r'"vendor_cvr"\s*:\s*"?(\d{8})"?'),
        "invoice_date": _find(r'"invoice_date"\s*:\s*"([^"]+)"'),
        "due_date": _find(r'"due_date"\s*:\s*"([^"]+)"'),
        "currency": _find(r'"currency"\s*:\s*"([^"]+)"', 'DKK'),
        "total_amount": _find_num(r'"total_amount"\s*:\s*([\d.,]+)'),
        "vat_amount": _find_num(r'"vat_amount"\s*:\s*([\d.,]+)'),
        # Fixed mid confidence: regex recovery is inherently less reliable.
        "confidence": 0.5,
        "lines": [],
        # Flag so downstream consumers know this was not a full JSON parse.
        "_partial": True,
    }
    logger.info(
        "🔧 Regex extraction: vendor=%s, cvr=%s, total=%s",
        result['vendor_name'], result['vendor_cvr'], result['total_amount'],
    )
    return result
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"❌ JSON parsing failed: {e}")
|
||||
logger.error(f"Raw response preview: {response[:500]}")
|
||||
return {
|
||||
"error": f"JSON parsing failed: {str(e)}",
|
||||
"confidence": 0.0,
|
||||
"raw_response": response[:500]
|
||||
}
|
||||
|
||||
def calculate_file_checksum(self, file_path: Path) -> str:
|
||||
"""Calculate SHA256 checksum of file for duplicate detection"""
|
||||
|
||||
Loading…
Reference in New Issue
Block a user