fix: JSON truncation num_predict 8000 + 5-stage parser + batch-analyze endpoint v2.2.34

2026-03-02 13:48:14 +01:00 · 2026-03-02 13:48:14 +01:00 · bf28e94d6e
commit bf28e94d6e
parent 72acca9e8b
3 changed files with 141 additions and 56 deletions
--- a/app/billing/backend/supplier_invoices.py
+++ b/app/billing/backend/supplier_invoices.py
@ -3,7 +3,7 @@ Supplier Invoices Router - Leverandørfakturaer (Kassekladde)
 Backend API for managing supplier invoices that integrate with e-conomic
 """

-from fastapi import APIRouter, HTTPException, UploadFile, File
+from fastapi import APIRouter, HTTPException, UploadFile, File, BackgroundTasks
 from pydantic import BaseModel
 from typing import List, Dict, Optional
 from datetime import datetime, date, timedelta
@ -2519,6 +2519,47 @@ async def reprocess_uploaded_file(file_id: int):
        raise HTTPException(status_code=500, detail=f"Genbehandling fejlede: {str(e)}")


+@router.post("/supplier-invoices/files/batch-analyze")
+async def batch_analyze_files(background_tasks: BackgroundTasks):
+    """
+    Kør AI-analyse på alle ubehandlede filer i baggrunden.
+    Returnerer øjeblikkeligt – filer behandles async.
+    """
+    pending = execute_query(
+        """SELECT file_id, filename FROM incoming_files
+           WHERE status IN ('pending', 'requires_vendor_selection', 'uploaded', 'failed')
+           ORDER BY uploaded_at DESC
+           LIMIT 100""",
+        ()
+    )
+    if not pending:
+        return {"started": 0, "message": "Ingen filer at behandle"}
+
+    file_ids = [r['file_id'] for r in pending]
+    logger.info(f"🚀 Batch-analyse startet for {len(file_ids)} filer")
+
+    async def _run_batch(ids):
+        ok = err = 0
+        for fid in ids:
+            try:
+                await reprocess_uploaded_file(fid)
+                ok += 1
+            except Exception as ex:
+                logger.error(f"❌ Batch fejl file {fid}: {ex}")
+                err += 1
+        logger.info(f"✅ Batch færdig: {ok} ok, {err} fejlet")
+
+    background_tasks.add_task(_run_batch, file_ids)
+
+    return {
+        "started": len(file_ids),
+        "message": f"{len(file_ids)} filer sendt til analyse i baggrunden. Opdater siden om lidt.",
+        "analyzed": 0,
+        "requires_vendor_selection": 0,
+        "failed": 0
+    }
+
+
@router.put("/supplier-invoices/templates/{template_id}")
 async def update_template(
    template_id: int,
--- a/app/billing/frontend/supplier_invoices.html
+++ b/app/billing/frontend/supplier_invoices.html
@ -2030,12 +2030,12 @@ function getFileStatusBadge(status) {

 // NEW: Batch analyze all files
 async function batchAnalyzeAllFiles() {
-    if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette vil:\n- Matche leverandører via CVR\n- Ekstrahere fakturadata\n- Oprette fakturaer i kassekladde ved 100% match')) {
+    if (!confirm('Kør automatisk analyse på alle ubehandlede filer?\n\nDette kan tage flere minutter afhængigt af antal filer.\nSiden opdateres automatisk undervejs.')) {
        return;
    }
    
    try {
-        showLoadingOverlay('Analyserer filer...');
+        showLoadingOverlay('Starter analyse...');
        
        const response = await fetch('/api/v1/supplier-invoices/files/batch-analyze', {
            method: 'POST'
@ -2047,19 +2047,27 @@ async function batchAnalyzeAllFiles() {
        
        hideLoadingOverlay();
        
-        alert(`✅ Batch-analyse fuldført!\n\n` +
-            `Analyseret: ${result.analyzed}\n` +
-            `Kræver manuel leverandør-valg: ${result.requires_vendor_selection}\n` +
-            `Fejlet: ${result.failed}`);
+        if (result.started === 0) {
+            alert('ℹ️ Ingen filer at behandle.');
+            return;
+        }

-        // Reload tables
+        alert(`✅ ${result.message}`);
+
+        // Auto-opdater tabellen hvert 10. sekund i 5 minutter
+        let refreshes = 0;
+        const maxRefreshes = 30;
+        const interval = setInterval(() => {
+            loadUnhandledFiles();
+            refreshes++;
+            if (refreshes >= maxRefreshes) clearInterval(interval);
+        }, 10000);
        loadUnhandledFiles();
-        loadKassekladdeView();
        
    } catch (error) {
        hideLoadingOverlay();
        console.error('Batch analysis error:', error);
-        alert('❌ Fejl ved batch-analyse');
+        alert('❌ Fejl ved batch-analyse: ' + error.message);
    }
 }

--- a/app/services/ollama_service.py
+++ b/app/services/ollama_service.py
@ -182,10 +182,11 @@ Output: {
                            ],
                            "stream": False,
                            "format": "json",
+                            "think": False,
                            "options": {
                                "temperature": 0.1,
                                "top_p": 0.9,
-                                "num_predict": 2000
+                                "num_predict": 8000
                            }
                        }
                    )
@ -201,7 +202,7 @@ Output: {
                            "options": {
                                "temperature": 0.1,
                                "top_p": 0.9,
-                                "num_predict": 2000
+                                "num_predict": 8000
                            }
                        }
                    )
@ -313,53 +314,88 @@ Output: {
                }
    
    def _parse_json_response(self, response: str) -> Dict:
-        """Parse JSON from LLM response with improved error handling"""
+        """Parse JSON from LLM response with aggressive fallback strategies"""
+        logger.info(f"🔍 Response length: {len(response)}, preview: {response[:200]}")
+
+        # Find outermost JSON object
+        start = response.find('{')
+        end = response.rfind('}') + 1
+        if start < 0 or end <= start:
+            logger.error("❌ No JSON object found in response")
+            return self._extract_fields_with_regex(response)
+
+        json_str = response[start:end]
+
+        # Strategy 1: direct parse
        try:
-            # Log preview of response for debugging
-            logger.info(f"🔍 Response preview (first 500 chars): {response[:500]}")
+            return json.loads(json_str)
+        except json.JSONDecodeError:
+            pass

-            # Find JSON in response (between first { and last })
-            start = response.find('{')
-            end = response.rfind('}') + 1
+        # Strategy 2: remove trailing commas before } or ]
+        fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
+        try:
+            return json.loads(fixed)
+        except json.JSONDecodeError:
+            pass

-            if start >= 0 and end > start:
-                json_str = response[start:end]
-                logger.info(f"🔍 Extracted JSON string length: {len(json_str)}, starts at position {start}")
-                
-                # Try to fix common JSON issues
-                # Remove trailing commas before } or ]
-                json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
-                # Fix single quotes to double quotes (but not in values)
-                # This is risky, so we only do it if initial parse fails
+        # Strategy 3: remove JS-style comments (// and /* */)
+        fixed = re.sub(r'//[^\n]*', '', fixed)
+        fixed = re.sub(r'/\*.*?\*/', '', fixed, flags=re.DOTALL)
+        try:
+            return json.loads(fixed)
+        except json.JSONDecodeError:
+            pass

+        # Strategy 4: truncate at last valid closing brace
+        # Walk backwards to find longest valid JSON prefix
+        for i in range(len(fixed) - 1, start, -1):
+            if fixed[i] == '}':
+                candidate = fixed[start - start:i + 1] if start == 0 else fixed[:i + 1]
+                # rebuild from inner start
+                c2 = fixed[:i + 1] if start == 0 else json_str[:i - start + 1]
                try:
-                    data = json.loads(json_str)
+                    data = json.loads(c2)
+                    logger.warning(f"⚠️ JSON truncated to position {i} — partial parse OK")
                    return data
                except json.JSONDecodeError:
-                    # Try to fix common issues
-                    # Replace single quotes with double quotes (simple approach)
-                    fixed_json = json_str.replace("'", '"')
-                    try:
-                        data = json.loads(fixed_json)
-                        logger.warning("⚠️ Fixed JSON with quote replacement")
-                        return data
-                    except:
-                        pass
+                    continue
+                break

-                    # Last resort: log the problematic JSON
-                    logger.error(f"❌ Problematic JSON: {json_str[:300]}")
-                    raise
-            else:
-                raise ValueError("No JSON found in response")
+        # Strategy 5: regex extraction of key fields (always succeeds with partial data)
+        logger.warning("⚠️ All JSON strategies failed — using regex field extraction")
+        return self._extract_fields_with_regex(response)
+
+    def _extract_fields_with_regex(self, text: str) -> Dict:
+        """Extract invoice fields from text using regex when JSON parsing fails"""
+        def _find(pattern, default=None):
+            m = re.search(pattern, text, re.IGNORECASE)
+            return m.group(1).strip() if m else default
+
+        def _find_num(pattern):
+            m = re.search(pattern, text, re.IGNORECASE)
+            if not m: return None
+            val = m.group(1).replace('.', '').replace(',', '.')
+            try: return float(val)
+            except: return None
+
+        result = {
+            "document_type":   _find(r'"document_type"\s*:\s*"([^"]+)"', 'invoice'),
+            "invoice_number":  _find(r'"invoice_number"\s*:\s*"?([^",\n}]+)"?'),
+            "vendor_name":     _find(r'"vendor_name"\s*:\s*"([^"]+)"'),
+            "vendor_cvr":      _find(r'"vendor_cvr"\s*:\s*"?(\d{8})"?'),
+            "invoice_date":    _find(r'"invoice_date"\s*:\s*"([^"]+)"'),
+            "due_date":        _find(r'"due_date"\s*:\s*"([^"]+)"'),
+            "currency":        _find(r'"currency"\s*:\s*"([^"]+)"', 'DKK'),
+            "total_amount":    _find_num(r'"total_amount"\s*:\s*([\d.,]+)'),
+            "vat_amount":      _find_num(r'"vat_amount"\s*:\s*([\d.,]+)'),
+            "confidence":      0.5,
+            "lines":           [],
+            "_partial":        True,
+        }
+        logger.info(f"🔧 Regex extraction: vendor={result['vendor_name']}, cvr={result['vendor_cvr']}, total={result['total_amount']}")
+        return result

-        except json.JSONDecodeError as e:
-            logger.error(f"❌ JSON parsing failed: {e}")
-            logger.error(f"Raw response preview: {response[:500]}")
-            return {
-                "error": f"JSON parsing failed: {str(e)}",
-                "confidence": 0.0,
-                "raw_response": response[:500]
-            }
    
    def calculate_file_checksum(self, file_path: Path) -> str:
        """Calculate SHA256 checksum of file for duplicate detection"""