fix: JSON truncation via num_predict 8000 + 5-stage parser fallback + batch-analyze endpoint (v2.2.34)
parent 72acca9e8b
commit bf28e94d6e
@@ -3,7 +3,7 @@ Supplier Invoices Router - Leverandørfakturaer (Kassekladde)
 Backend API for managing supplier invoices that integrate with e-conomic
 """
 
-from fastapi import APIRouter, HTTPException, UploadFile, File
+from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
 from pydantic import BaseModel
 from typing import List, Dict, Optional
 from datetime import datetime, date, timedelta
@@ -2519,6 +2519,47 @@ async def reprocess_uploaded_file(file_id: int):
         raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}")
 
 
+@router.post("/supplier-invoices/files/batch-analyze")
+async def batch_analyze_files(background_tasks: BackgroundTasks):
+    """
+    Run AI analysis on all unprocessed files in the background.
+    Returns immediately; files are processed asynchronously.
+    """
+    pending = execute_query(
+        """SELECT file_id, filename FROM incoming_files
+           WHERE status IN ('pending', 'requires_vendor_selection', 'uploaded', 'failed')
+           ORDER BY uploaded_at DESC
+           LIMIT 100""",
+        ()
+    )
+    if not pending:
+        return {"started": 0, "message": "Ingen filer at behandle"}
+
+    file_ids = [r['file_id'] for r in pending]
+    logger.info(f"🚀 Batch-analyse startet for {len(file_ids)} filer")
+
+    async def _run_batch(ids):
+        ok = err = 0
+        for fid in ids:
+            try:
+                await reprocess_uploaded_file(fid)
+                ok += 1
+            except Exception as ex:
+                logger.error(f"❌ Batch fejl file {fid}: {ex}")
+                err += 1
+        logger.info(f"✅ Batch færdig: {ok} ok, {err} fejlet")
+
+    background_tasks.add_task(_run_batch, file_ids)
+
+    return {
+        "started": len(file_ids),
+        "message": f"{len(file_ids)} filer sendt til analyse i baggrunden. Opdater siden om lidt.",
+        "analyzed": 0,
+        "requires_vendor_selection": 0,
+        "failed": 0
+    }
+
+
 @router.put("/supplier-invoices/templates/{template_id}")
 async def update_template(
     template_id: int,
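The endpoint above leans on FastAPI's `BackgroundTasks`, which accepts both sync and async callables and runs them only after the HTTP response has been sent, so the client gets an immediate `{"started": N}` while the files are processed afterwards. A minimal standalone sketch of the pattern (names like `process_batch` are illustrative, not from this codebase):

```python
# Minimal sketch of the fire-and-forget pattern used above; runnable on its own.
from fastapi import BackgroundTasks, FastAPI

app = FastAPI()

async def process_batch(ids: list[int]) -> None:
    # Runs after the HTTP response is already sent, so exceptions raised
    # here can only be surfaced via logging, never to the caller.
    for item_id in ids:
        print(f"processing {item_id}")

@app.post("/demo/batch")
async def start_batch(background_tasks: BackgroundTasks):
    ids = [1, 2, 3]
    background_tasks.add_task(process_batch, ids)  # queued, not awaited here
    return {"started": len(ids)}
```

One consequence of this design shows in the response shape: `analyzed`, `requires_vendor_selection`, and `failed` are always 0 at response time, because the real counts only exist once the background task finishes. The frontend compensates by polling, as the next hunks show.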
@@ -2030,12 +2030,12 @@ function getFileStatusBadge(status) {
 
 // NEW: Batch analyze all files
 async function batchAnalyzeAllFiles() {
-    if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette vil:\n- Matche leverandører via CVR\n- Ekstrahere fakturadata\n- Oprette fakturaer i kassekladde ved 100% match')) {
+    if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette kan tage flere minutter afhængigt af antal filer.\nSiden opdateres automatisk undervejs.')) {
         return;
     }
 
     try {
-        showLoadingOverlay('Analyserer filer...');
+        showLoadingOverlay('Starter analyse...');
 
         const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', {
             method: 'POST'
@@ -2047,19 +2047,27 @@ async function batchAnalyzeAllFiles() {
 
         hideLoadingOverlay();
 
-        alert(`✅ Batch-analyse fuldført!\n\n` +
-              `Analyseret: ${result.analyzed}\n` +
-              `Kræver manuel leverandør-valg: ${result.requires_vendor_selection}\n` +
-              `Fejlet: ${result.failed}`);
+        if (result.started === 0) {
+            alert('ℹ️ Ingen filer at behandle.');
+            return;
+        }
 
-        // Reload tables
+        alert(`✅ ${result.message}`);
+
+        // Auto-refresh the table every 10 seconds for 5 minutes
+        let refreshes = 0;
+        const maxRefreshes = 30;
+        const interval = setInterval(() => {
+            loadUnhandledFiles();
+            refreshes++;
+            if (refreshes >= maxRefreshes) clearInterval(interval);
+        }, 10000);
         loadUnhandledFiles();
-        loadKassekladdeView();
 
     } catch (error) {
         hideLoadingOverlay();
         console.error('Batch analysis error:', error);
-        alert('❌ Fejl ved batch-analyse');
+        alert('❌ Fejl ved batch-analyse: ' + error.message);
     }
 }
@@ -182,10 +182,11 @@ Output: {
             ],
             "stream": False,
             "format": "json",
+            "think": False,
             "options": {
                 "temperature": 0.1,
                 "top_p": 0.9,
-                "num_predict": 2000
+                "num_predict": 8000
             }
         }
     )
@@ -201,7 +202,7 @@ Output: {
             "options": {
                 "temperature": 0.1,
                 "top_p": 0.9,
-                "num_predict": 2000
+                "num_predict": 8000
             }
         }
     )
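`num_predict` is Ollama's cap on the number of generated tokens. At 2000, invoices with many line items routinely hit the cap and the model's JSON was cut off mid-object, which is the truncation this commit works around; 8000 gives headroom for a full extraction. The new `"think": False` flag additionally asks Ollama (on thinking-capable models) not to spend the budget on a reasoning trace before the JSON. When generation stops because the cap was reached, Ollama reports it in the response. A hedged standalone sketch of checking for that (model name and prompt are illustrative, and the `done_reason` field is an assumption about Ollama's `/api/chat` response shape):

```python
# Sketch, assuming Ollama's /api/chat response reports "done_reason" as
# "length" when output was cut off by num_predict, "stop" on a clean finish.
import requests

resp = requests.post(
    "http://localhost:11434/api/chat",
    json={
        "model": "qwen2.5",  # illustrative
        "messages": [{"role": "user", "content": "Return the invoice as JSON"}],
        "stream": False,
        "format": "json",
        "options": {"temperature": 0.1, "top_p": 0.9, "num_predict": 8000},
    },
    timeout=300,
).json()

if resp.get("done_reason") == "length":
    print("⚠️ Output hit num_predict — JSON is likely truncated")
```

Even at 8000 the cap can still be hit, which is why the parser below never assumes the JSON is complete.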
@@ -313,53 +314,88 @@ Output: {
     }
 
     def _parse_json_response(self, response: str) -> Dict:
-        """Parse JSON from LLM response with improved error handling"""
-        try:
-            # Log preview of response for debugging
-            logger.info(f"🔍 Response preview (first 500 chars): {response[:500]}")
-
-            # Find JSON in response (between first { and last })
-            start = response.find('{')
-            end = response.rfind('}') + 1
-
-            if start >= 0 and end > start:
-                json_str = response[start:end]
-                logger.info(f"🔍 Extracted JSON string length: {len(json_str)}, starts at position {start}")
-
-                # Try to fix common JSON issues
-                # Remove trailing commas before } or ]
-                json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
-                # Fix single quotes to double quotes (but not in values)
-                # This is risky, so we only do it if initial parse fails
-
-                try:
-                    data = json.loads(json_str)
-                    return data
-                except json.JSONDecodeError:
-                    # Try to fix common issues
-                    # Replace single quotes with double quotes (simple approach)
-                    fixed_json = json_str.replace("'", '"')
-                    try:
-                        data = json.loads(fixed_json)
-                        logger.warning("⚠️ Fixed JSON with quote replacement")
-                        return data
-                    except:
-                        pass
-
-                    # Last resort: log the problematic JSON
-                    logger.error(f"❌ Problematic JSON: {json_str[:300]}")
-                    raise
-            else:
-                raise ValueError("No JSON found in response")
-
-        except json.JSONDecodeError as e:
-            logger.error(f"❌ JSON parsing failed: {e}")
-            logger.error(f"Raw response preview: {response[:500]}")
-            return {
-                "error": f"JSON parsing failed: {str(e)}",
-                "confidence": 0.0,
-                "raw_response": response[:500]
-            }
+        """Parse JSON from LLM response with aggressive fallback strategies"""
+        logger.info(f"🔍 Response length: {len(response)}, preview: {response[:200]}")
+
+        # Find outermost JSON object
+        start = response.find('{')
+        end = response.rfind('}') + 1
+        if start < 0 or end <= start:
+            logger.error("❌ No JSON object found in response")
+            return self._extract_fields_with_regex(response)
+
+        json_str = response[start:end]
+
+        # Strategy 1: direct parse
+        try:
+            return json.loads(json_str)
+        except json.JSONDecodeError:
+            pass
+
+        # Strategy 2: remove trailing commas before } or ]
+        fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
+        try:
+            return json.loads(fixed)
+        except json.JSONDecodeError:
+            pass
+
+        # Strategy 3: remove JS-style comments (// and /* */)
+        fixed = re.sub(r'//[^\n]*', '', fixed)
+        fixed = re.sub(r'/\*.*?\*/', '', fixed, flags=re.DOTALL)
+        try:
+            return json.loads(fixed)
+        except json.JSONDecodeError:
+            pass
+
+        # Strategy 4: truncate at the last closing brace that yields valid JSON
+        # (walk backwards to find the longest parsable prefix; indices are
+        # relative to `fixed`, which already starts at the opening brace)
+        for i in range(len(fixed) - 1, 0, -1):
+            if fixed[i] == '}':
+                try:
+                    data = json.loads(fixed[:i + 1])
+                    logger.warning(f"⚠️ JSON truncated to position {i} — partial parse OK")
+                    return data
+                except json.JSONDecodeError:
+                    continue
+
+        # Strategy 5: regex extraction of key fields (always succeeds with partial data)
+        logger.warning("⚠️ All JSON strategies failed — using regex field extraction")
+        return self._extract_fields_with_regex(response)
def _extract_fields_with_regex(self, text: str) -> Dict:
|
||||||
|
"""Extract invoice fields from text using regex when JSON parsing fails"""
|
||||||
|
def _find(pattern, default=None):
|
||||||
|
m = re.search(pattern, text, re.IGNORECASE)
|
||||||
|
return m.group(1).strip() if m else default
|
||||||
|
|
||||||
|
def _find_num(pattern):
|
||||||
|
m = re.search(pattern, text, re.IGNORECASE)
|
||||||
|
if not m: return None
|
||||||
|
val = m.group(1).replace('.', '').replace(',', '.')
|
||||||
|
try: return float(val)
|
||||||
|
except: return None
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"document_type": _find(r'"document_type"\s*:\s*"([^"]+)"', 'invoice'),
|
||||||
|
"invoice_number": _find(r'"invoice_number"\s*:\s*"?([^",\n}]+)"?'),
|
||||||
|
"vendor_name": _find(r'"vendor_name"\s*:\s*"([^"]+)"'),
|
||||||
|
"vendor_cvr": _find(r'"vendor_cvr"\s*:\s*"?(\d{8})"?'),
|
||||||
|
"invoice_date": _find(r'"invoice_date"\s*:\s*"([^"]+)"'),
|
||||||
|
"due_date": _find(r'"due_date"\s*:\s*"([^"]+)"'),
|
||||||
|
"currency": _find(r'"currency"\s*:\s*"([^"]+)"', 'DKK'),
|
||||||
|
"total_amount": _find_num(r'"total_amount"\s*:\s*([\d.,]+)'),
|
||||||
|
"vat_amount": _find_num(r'"vat_amount"\s*:\s*([\d.,]+)'),
|
||||||
|
"confidence": 0.5,
|
||||||
|
"lines": [],
|
||||||
|
"_partial": True,
|
||||||
}
|
}
|
||||||
|
logger.info(f"🔧 Regex extraction: vendor={result['vendor_name']}, cvr={result['vendor_cvr']}, total={result['total_amount']}")
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
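`_find_num` assumes Danish number formatting, where `.` is the thousands separator and `,` the decimal mark, so a captured `1.250,50` must become `1250.50` before `float()`. A quick standalone check of that conversion:

```python
# Danish-format amounts: strip thousands dots, turn the decimal comma into a dot.
def parse_danish_amount(s: str) -> float:
    return float(s.replace('.', '').replace(',', '.'))

assert parse_danish_amount('1.250,50') == 1250.50
assert parse_danish_amount('999') == 999.0
# Caveat: a US-style "1250.50" would be misread as 125050.0, so this
# conversion is only safe when the source is known to use Danish formatting.
print("ok")
```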
 
     def calculate_file_checksum(self, file_path: Path) -> str:
         """Calculate SHA256 checksum of file for duplicate detection"""