From bf28e94d6ebb04897c7bca619955895df3fecb4e Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 2 Mar 2026 13:48:14 +0100 Subject: [PATCH] fix: JSON truncation num_predict 8000 + 5-stage parser + batch-analyze endpoint v2.2.34 --- app/billing/backend/supplier_invoices.py | 43 ++++++- app/billing/frontend/supplier_invoices.html | 28 +++-- app/services/ollama_service.py | 126 +++++++++++++------- 3 files changed, 141 insertions(+), 56 deletions(-) diff --git a/app/billing/backend/supplier_invoices.py b/app/billing/backend/supplier_invoices.py index db96d65..2eed551 100644 --- a/app/billing/backend/supplier_invoices.py +++ b/app/billing/backend/supplier_invoices.py @@ -3,7 +3,7 @@ Supplier Invoices Router - Leverandørfakturaer (Kassekladde) Backend API for managing supplier invoices that integrate with e-conomic """ -from fastapi import APIRouter, HTTPException, UploadFile, File +from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks from pydantic import BaseModel from typing import List, Dict, Optional from datetime import datetime, date, timedelta @@ -2519,6 +2519,47 @@ async def reprocess_uploaded_file(file_id: int): raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}") +@router.post("/supplier-invoices/files/batch-analyze") +async def batch_analyze_files(background_tasks: BackgroundTasks): + """ + Kør AI-analyse på alle ubehandlede filer i baggrunden. + Returnerer øjeblikkeligt – filer behandles async. + """ + pending = execute_query( + """SELECT file_id, filename FROM incoming_files + WHERE status IN ('pending', 'requires_vendor_selection', 'uploaded', 'failed') + ORDER BY uploaded_at DESC + LIMIT 100""", + () + ) + if not pending: + return {"started": 0, "message": "Ingen filer at behandle"} + + file_ids = [r['file_id'] for r in pending] + logger.info(f"🚀 Batch-analyse startet for {len(file_ids)} filer") + + async def _run_batch(ids): + ok = err = 0 + for fid in ids: + try: + await reprocess_uploaded_file(fid) + ok += 1 + except Exception as ex: + logger.error(f"❌ Batch fejl file {fid}: {ex}") + err += 1 + logger.info(f"✅ Batch færdig: {ok} ok, {err} fejlet") + + background_tasks.add_task(_run_batch, file_ids) + + return { + "started": len(file_ids), + "message": f"{len(file_ids)} filer sendt til analyse i baggrunden. Opdater siden om lidt.", + "analyzed": 0, + "requires_vendor_selection": 0, + "failed": 0 + } + + @router.put("/supplier-invoices/templates/{template_id}") async def update_template( template_id: int, diff --git a/app/billing/frontend/supplier_invoices.html b/app/billing/frontend/supplier_invoices.html index 7232335..5337a71 100644 --- a/app/billing/frontend/supplier_invoices.html +++ b/app/billing/frontend/supplier_invoices.html @@ -2030,12 +2030,12 @@ function getFileStatusBadge(status) { // NEW: Batch analyze all files async function batchAnalyzeAllFiles() { - if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette vil:\n- Matche leverandører via CVR\n- Ekstrahere fakturadata\n- Oprette fakturaer i kassekladde ved 100% match')) { + if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette kan tage flere minutter afhængigt af antal filer.\nSiden opdateres automatisk undervejs.')) { return; } try { - showLoadingOverlay('Analyserer filer...'); + showLoadingOverlay('Starter analyse...'); const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', { method: 'POST' @@ -2047,19 +2047,27 @@ async function batchAnalyzeAllFiles() { hideLoadingOverlay(); - alert(`✅ Batch-analyse fuldført!\n\n` + - `Analyseret: ${result.analyzed}\n` + - `Kræver manuel leverandør-valg: ${result.requires_vendor_selection}\n` + - `Fejlet: ${result.failed}`); - - // Reload tables + if (result.started === 0) { + alert('ℹ️ Ingen filer at behandle.'); + return; + } + + alert(`✅ ${result.message}`); + + // Auto-opdater tabellen hvert 10. sekund i 5 minutter + let refreshes = 0; + const maxRefreshes = 30; + const interval = setInterval(() => { + loadUnhandledFiles(); + refreshes++; + if (refreshes >= maxRefreshes) clearInterval(interval); + }, 10000); loadUnhandledFiles(); - loadKassekladdeView(); } catch (error) { hideLoadingOverlay(); console.error('Batch analysis error:', error); - alert('❌ Fejl ved batch-analyse'); + alert('❌ Fejl ved batch-analyse: ' + error.message); } } diff --git a/app/services/ollama_service.py b/app/services/ollama_service.py index dd63131..06c0a89 100644 --- a/app/services/ollama_service.py +++ b/app/services/ollama_service.py @@ -182,10 +182,11 @@ Output: { ], "stream": False, "format": "json", + "think": False, "options": { "temperature": 0.1, "top_p": 0.9, - "num_predict": 2000 + "num_predict": 8000 } } ) @@ -201,7 +202,7 @@ Output: { "options": { "temperature": 0.1, "top_p": 0.9, - "num_predict": 2000 + "num_predict": 8000 } } ) @@ -313,53 +314,88 @@ Output: { } def _parse_json_response(self, response: str) -> Dict: - """Parse JSON from LLM response with improved error handling""" + """Parse JSON from LLM response with aggressive fallback strategies""" + logger.info(f"🔍 Response length: {len(response)}, preview: {response[:200]}") + + # Find outermost JSON object + start = response.find('{') + end = response.rfind('}') + 1 + if start < 0 or end <= start: + logger.error("❌ No JSON object found in response") + return self._extract_fields_with_regex(response) + + json_str = response[start:end] + + # Strategy 1: direct parse try: - # Log preview of response for debugging - logger.info(f"🔍 Response preview (first 500 chars): {response[:500]}") - - # Find JSON in response (between first { and last }) - start = response.find('{') - end = response.rfind('}') + 1 - - if start >= 0 and end > start: - json_str = response[start:end] - logger.info(f"🔍 Extracted JSON string length: {len(json_str)}, starts at position {start}") - - # Try to fix common JSON issues - # Remove trailing commas before } or ] - json_str = re.sub(r',(\s*[}\]])', r'\1', json_str) - # Fix single quotes to double quotes (but not in values) - # This is risky, so we only do it if initial parse fails - + return json.loads(json_str) + except json.JSONDecodeError: + pass + + # Strategy 2: remove trailing commas before } or ] + fixed = re.sub(r',(\s*[}\]])', r'\1', json_str) + try: + return json.loads(fixed) + except json.JSONDecodeError: + pass + + # Strategy 3: remove JS-style comments (// and /* */) + fixed = re.sub(r'//[^\n]*', '', fixed) + fixed = re.sub(r'/\*.*?\*/', '', fixed, flags=re.DOTALL) + try: + return json.loads(fixed) + except json.JSONDecodeError: + pass + + # Strategy 4: truncate at last valid closing brace + # Walk backwards to find longest valid JSON prefix + for i in range(len(fixed) - 1, start, -1): + if fixed[i] == '}': + candidate = fixed[start - start:i + 1] if start == 0 else fixed[:i + 1] + # rebuild from inner start + c2 = fixed[:i + 1] if start == 0 else json_str[:i - start + 1] try: - data = json.loads(json_str) + data = json.loads(c2) + logger.warning(f"⚠️ JSON truncated to position {i} — partial parse OK") return data except json.JSONDecodeError: - # Try to fix common issues - # Replace single quotes with double quotes (simple approach) - fixed_json = json_str.replace("'", '"') - try: - data = json.loads(fixed_json) - logger.warning("⚠️ Fixed JSON with quote replacement") - return data - except: - pass - - # Last resort: log the problematic JSON - logger.error(f"❌ Problematic JSON: {json_str[:300]}") - raise - else: - raise ValueError("No JSON found in response") - - except json.JSONDecodeError as e: - logger.error(f"❌ JSON parsing failed: {e}") - logger.error(f"Raw response preview: {response[:500]}") - return { - "error": f"JSON parsing failed: {str(e)}", - "confidence": 0.0, - "raw_response": response[:500] - } + continue + break + + # Strategy 5: regex extraction of key fields (always succeeds with partial data) + logger.warning("⚠️ All JSON strategies failed — using regex field extraction") + return self._extract_fields_with_regex(response) + + def _extract_fields_with_regex(self, text: str) -> Dict: + """Extract invoice fields from text using regex when JSON parsing fails""" + def _find(pattern, default=None): + m = re.search(pattern, text, re.IGNORECASE) + return m.group(1).strip() if m else default + + def _find_num(pattern): + m = re.search(pattern, text, re.IGNORECASE) + if not m: return None + val = m.group(1).replace('.', '').replace(',', '.') + try: return float(val) + except: return None + + result = { + "document_type": _find(r'"document_type"\s*:\s*"([^"]+)"', 'invoice'), + "invoice_number": _find(r'"invoice_number"\s*:\s*"?([^",\n}]+)"?'), + "vendor_name": _find(r'"vendor_name"\s*:\s*"([^"]+)"'), + "vendor_cvr": _find(r'"vendor_cvr"\s*:\s*"?(\d{8})"?'), + "invoice_date": _find(r'"invoice_date"\s*:\s*"([^"]+)"'), + "due_date": _find(r'"due_date"\s*:\s*"([^"]+)"'), + "currency": _find(r'"currency"\s*:\s*"([^"]+)"', 'DKK'), + "total_amount": _find_num(r'"total_amount"\s*:\s*([\d.,]+)'), + "vat_amount": _find_num(r'"vat_amount"\s*:\s*([\d.,]+)'), + "confidence": 0.5, + "lines": [], + "_partial": True, + } + logger.info(f"🔧 Regex extraction: vendor={result['vendor_name']}, cvr={result['vendor_cvr']}, total={result['total_amount']}") + return result + def calculate_file_checksum(self, file_path: Path) -> str: """Calculate SHA256 checksum of file for duplicate detection"""