fix: JSON truncation num_predict 8000 + 5-stage parser + batch-analyze endpoint v2.2.34

This commit is contained in:
Christian 2026-03-02 13:48:14 +01:00
parent 72acca9e8b
commit bf28e94d6e
3 changed files with 141 additions and 56 deletions

View File

@ -3,7 +3,7 @@ Supplier Invoices Router - Leverandørfakturaer (Kassekladde)
Backend API for managing supplier invoices that integrate with e-conomic Backend API for managing supplier invoices that integrate with e-conomic
""" """
from fastapi import APIRouter, HTTPException, UploadFile, File from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
from pydantic import BaseModel from pydantic import BaseModel
from typing import List, Dict, Optional from typing import List, Dict, Optional
from datetime import datetime, date, timedelta from datetime import datetime, date, timedelta
@ -2519,6 +2519,47 @@ async def reprocess_uploaded_file(file_id: int):
raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}") raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}")
@router.post("/supplier-invoices/files/batch-analyze")
async def batch_analyze_files(background_tasks: BackgroundTasks):
    """
    Run AI analysis on every unprocessed file, in the background.

    Returns immediately; files are then processed one by one by a
    background task, so the caller should refresh the page to see results.
    """
    # Pick up every file that still needs (re)processing, newest first.
    unprocessed = execute_query(
        """SELECT file_id, filename FROM incoming_files
           WHERE status IN ('pending', 'requires_vendor_selection', 'uploaded', 'failed')
           ORDER BY uploaded_at DESC
           LIMIT 100""",
        ()
    )
    if not unprocessed:
        return {"started": 0, "message": "Ingen filer at behandle"}

    ids = [row['file_id'] for row in unprocessed]
    logger.info(f"🚀 Batch-analyse startet for {len(ids)} filer")

    async def _process_all(file_ids):
        # Reuse the single-file reprocess endpoint per file; a failure on
        # one file is logged and counted but never aborts the batch.
        succeeded = 0
        failed = 0
        for fid in file_ids:
            try:
                await reprocess_uploaded_file(fid)
                succeeded += 1
            except Exception as ex:
                logger.error(f"❌ Batch fejl file {fid}: {ex}")
                failed += 1
        logger.info(f"✅ Batch færdig: {succeeded} ok, {failed} fejlet")

    background_tasks.add_task(_process_all, ids)

    # The per-outcome counters are zero here by design: the real numbers
    # are only known once the background task finishes; the frontend
    # polls for updated file state instead.
    return {
        "started": len(ids),
        "message": f"{len(ids)} filer sendt til analyse i baggrunden. Opdater siden om lidt.",
        "analyzed": 0,
        "requires_vendor_selection": 0,
        "failed": 0
    }
@router.put("/supplier-invoices/templates/{template_id}") @router.put("/supplier-invoices/templates/{template_id}")
async def update_template( async def update_template(
template_id: int, template_id: int,

View File

@ -2030,12 +2030,12 @@ function getFileStatusBadge(status) {
// NEW: Batch analyze all files // NEW: Batch analyze all files
async function batchAnalyzeAllFiles() { async function batchAnalyzeAllFiles() {
if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette vil:\n- Matche leverandører via CVR\n- Ekstrahere fakturadata\n- Oprette fakturaer i kassekladde ved 100% match')) { if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette kan tage flere minutter afhængigt af antal filer.\nSiden opdateres automatisk undervejs.')) {
return; return;
} }
try { try {
showLoadingOverlay('Analyserer filer...'); showLoadingOverlay('Starter analyse...');
const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', { const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', {
method: 'POST' method: 'POST'
@ -2047,19 +2047,27 @@ async function batchAnalyzeAllFiles() {
hideLoadingOverlay(); hideLoadingOverlay();
alert(`✅ Batch-analyse fuldført!\n\n` + if (result.started === 0) {
`Analyseret: ${result.analyzed}\n` + alert(' Ingen filer at behandle.');
`Kræver manuel leverandør-valg: ${result.requires_vendor_selection}\n` + return;
`Fejlet: ${result.failed}`); }
// Reload tables alert(`✅ ${result.message}`);
// Auto-opdater tabellen hvert 10. sekund i 5 minutter
let refreshes = 0;
const maxRefreshes = 30;
const interval = setInterval(() => {
loadUnhandledFiles();
refreshes++;
if (refreshes >= maxRefreshes) clearInterval(interval);
}, 10000);
loadUnhandledFiles(); loadUnhandledFiles();
loadKassekladdeView();
} catch (error) { } catch (error) {
hideLoadingOverlay(); hideLoadingOverlay();
console.error('Batch analysis error:', error); console.error('Batch analysis error:', error);
alert('❌ Fejl ved batch-analyse'); alert('❌ Fejl ved batch-analyse: ' + error.message);
} }
} }

View File

@ -182,10 +182,11 @@ Output: {
], ],
"stream": False, "stream": False,
"format": "json", "format": "json",
"think": False,
"options": { "options": {
"temperature": 0.1, "temperature": 0.1,
"top_p": 0.9, "top_p": 0.9,
"num_predict": 2000 "num_predict": 8000
} }
} }
) )
@ -201,7 +202,7 @@ Output: {
"options": { "options": {
"temperature": 0.1, "temperature": 0.1,
"top_p": 0.9, "top_p": 0.9,
"num_predict": 2000 "num_predict": 8000
} }
} }
) )
@ -313,53 +314,88 @@ Output: {
} }
def _parse_json_response(self, response: str) -> Dict: def _parse_json_response(self, response: str) -> Dict:
"""Parse JSON from LLM response with improved error handling""" """Parse JSON from LLM response with aggressive fallback strategies"""
logger.info(f"🔍 Response length: {len(response)}, preview: {response[:200]}")
# Find outermost JSON object
start = response.find('{')
end = response.rfind('}') + 1
if start < 0 or end <= start:
logger.error("❌ No JSON object found in response")
return self._extract_fields_with_regex(response)
json_str = response[start:end]
# Strategy 1: direct parse
try: try:
# Log preview of response for debugging return json.loads(json_str)
logger.info(f"🔍 Response preview (first 500 chars): {response[:500]}") except json.JSONDecodeError:
pass
# Find JSON in response (between first { and last })
start = response.find('{') # Strategy 2: remove trailing commas before } or ]
end = response.rfind('}') + 1 fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
try:
if start >= 0 and end > start: return json.loads(fixed)
json_str = response[start:end] except json.JSONDecodeError:
logger.info(f"🔍 Extracted JSON string length: {len(json_str)}, starts at position {start}") pass
# Try to fix common JSON issues # Strategy 3: remove JS-style comments (// and /* */)
# Remove trailing commas before } or ] fixed = re.sub(r'//[^\n]*', '', fixed)
json_str = re.sub(r',(\s*[}\]])', r'\1', json_str) fixed = re.sub(r'/\*.*?\*/', '', fixed, flags=re.DOTALL)
# Fix single quotes to double quotes (but not in values) try:
# This is risky, so we only do it if initial parse fails return json.loads(fixed)
except json.JSONDecodeError:
pass
# Strategy 4: truncate at last valid closing brace
# Walk backwards to find longest valid JSON prefix
for i in range(len(fixed) - 1, start, -1):
if fixed[i] == '}':
candidate = fixed[start - start:i + 1] if start == 0 else fixed[:i + 1]
# rebuild from inner start
c2 = fixed[:i + 1] if start == 0 else json_str[:i - start + 1]
try: try:
data = json.loads(json_str) data = json.loads(c2)
logger.warning(f"⚠️ JSON truncated to position {i} — partial parse OK")
return data return data
except json.JSONDecodeError: except json.JSONDecodeError:
# Try to fix common issues continue
# Replace single quotes with double quotes (simple approach) break
fixed_json = json_str.replace("'", '"')
try: # Strategy 5: regex extraction of key fields (always succeeds with partial data)
data = json.loads(fixed_json) logger.warning("⚠️ All JSON strategies failed — using regex field extraction")
logger.warning("⚠️ Fixed JSON with quote replacement") return self._extract_fields_with_regex(response)
return data
except: def _extract_fields_with_regex(self, text: str) -> Dict:
pass """Extract invoice fields from text using regex when JSON parsing fails"""
def _find(pattern, default=None):
# Last resort: log the problematic JSON m = re.search(pattern, text, re.IGNORECASE)
logger.error(f"❌ Problematic JSON: {json_str[:300]}") return m.group(1).strip() if m else default
raise
else: def _find_num(pattern):
raise ValueError("No JSON found in response") m = re.search(pattern, text, re.IGNORECASE)
if not m: return None
except json.JSONDecodeError as e: val = m.group(1).replace('.', '').replace(',', '.')
logger.error(f"❌ JSON parsing failed: {e}") try: return float(val)
logger.error(f"Raw response preview: {response[:500]}") except: return None
return {
"error": f"JSON parsing failed: {str(e)}", result = {
"confidence": 0.0, "document_type": _find(r'"document_type"\s*:\s*"([^"]+)"', 'invoice'),
"raw_response": response[:500] "invoice_number": _find(r'"invoice_number"\s*:\s*"?([^",\n}]+)"?'),
} "vendor_name": _find(r'"vendor_name"\s*:\s*"([^"]+)"'),
"vendor_cvr": _find(r'"vendor_cvr"\s*:\s*"?(\d{8})"?'),
"invoice_date": _find(r'"invoice_date"\s*:\s*"([^"]+)"'),
"due_date": _find(r'"due_date"\s*:\s*"([^"]+)"'),
"currency": _find(r'"currency"\s*:\s*"([^"]+)"', 'DKK'),
"total_amount": _find_num(r'"total_amount"\s*:\s*([\d.,]+)'),
"vat_amount": _find_num(r'"vat_amount"\s*:\s*([\d.,]+)'),
"confidence": 0.5,
"lines": [],
"_partial": True,
}
logger.info(f"🔧 Regex extraction: vendor={result['vendor_name']}, cvr={result['vendor_cvr']}, total={result['total_amount']}")
return result
def calculate_file_checksum(self, file_path: Path) -> str: def calculate_file_checksum(self, file_path: Path) -> str:
"""Calculate SHA256 checksum of file for duplicate detection""" """Calculate SHA256 checksum of file for duplicate detection"""