From 52f283c7c9dcff436f14e98ecdb028c71fe29586 Mon Sep 17 00:00:00 2001
From: huangrh <huangrh@chinaweal.com.cn>
Date: Sun, 8 Feb 2026 13:46:56 +0800
Subject: [PATCH] feat(seal): add double verification and institution name
 cleaning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Key improvements:
1. Double verification mechanism for OCR failures
   - When unwarp OCR fails (empty text), automatically try PaddleOCRVL backup on crop
   - Fixes issue where correct seal was ignored due to unwarp image distortion
   - Test result: 4% → 93.8% similarity on problematic PDFs

2. Institution name cleaning
   - Remove unwanted suffixes: 检验检测专用章, 专用章, etc.
   - Clean names before adding to results and similarity calculation
   - Improves matching accuracy

3. Enhanced logging for institution selection
   - Show all extracted institutions with similarity scores
   - Track why specific institution was selected
   - Better debugging and transparency

Example impact:
- Before: "成都虹之川科技有限公司" (wrong seal, 4% similarity)
- After: "中科测试技术（广东）集团有限公司" (correct seal, 93.8% similarity)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 test_accuracy_batch_full.py | 103 +++++++++++++++++++++++++++++++++---
 1 file changed, 95 insertions(+), 8 deletions(-)

diff --git a/test_accuracy_batch_full.py b/test_accuracy_batch_full.py
index 343897f..6b1a1c5 100644
--- a/test_accuracy_batch_full.py
+++ b/test_accuracy_batch_full.py
@@ -717,8 +717,10 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
                 result['seals'].append(seal_data)
 
                 if ocr_result['success']:
-                    result['institutions'].append(ocr_result['text'])
-                    logger.info(f"  ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})")
+                    # Clean the institution name before adding
+                    cleaned_name = clean_institution_name(ocr_result['text'])
+                    result['institutions'].append(cleaned_name)
+                    logger.info(f"  ✓ Seal #{i} SUCCESS: {cleaned_name[:50]}... (confidence: {ocr_result['score']:.4f})")
                 else:
                     logger.warning(f"  ✗ Seal #{i} FAILED: Could not extract institution name")
 
@@ -872,8 +874,9 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
         marked_path = os.path.join(output_dir, f"seal_marked_{i}.png")
         imwrite_safe(marked_path, marked)
 
-        # OCR recognition
+        # OCR recognition with double verification
         ocr_result = {'text': '', 'score': 0.0, 'success': False}
+        ocr_method_used = method_used
 
         if unwarp is not None:
             # Standard path: Recognize unwarp image
@@ -885,13 +888,35 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
             else:
                 ocr_result = run_ocr_recognition(unwarp_path, rec_model)
 
-            logger.info(f"  Seal #{i} OCR Result:")
+            ocr_method_used = f"{method_used}_unwarp"
+            logger.info(f"  Seal #{i} OCR Result (unwarp):")
             logger.info(f"    - Text: '{ocr_result['text']}'")
             logger.info(f"    - Score: {ocr_result['score']:.4f}")
             logger.info(f"    - Success: {ocr_result['success']}")
             logger.info(f"    - Text length: {len(ocr_result['text'])} chars")
             if used_fallback:
                 logger.info(f"    - ** Used fallback angle range (7:30 to 4:30) **")
+
+            # ============ DOUBLE VERIFICATION: Try PaddleOCRVL on crop if unwarp OCR fails ============
+            # If unwarp OCR failed (empty text or success=False), try PaddleOCRVL backup on crop
+            if (not ocr_result['success'] or len(ocr_result['text'].strip()) == 0) and vl_pipeline is not None and PADDLEOCRVL_AVAILABLE:
+                logger.warning(f"  Seal #{i}: Unwarp OCR failed (empty result), trying PaddleOCRVL backup on crop image")
+                seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
+                backup_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline)
+
+                logger.info(f"  Seal #{i} PaddleOCRVL Backup Result (crop):")
+                logger.info(f"    - Text: '{backup_result['text']}'")
+                logger.info(f"    - Score: {backup_result['score']:.4f}")
+                logger.info(f"    - Success: {backup_result['success']}")
+                logger.info(f"    - Text length: {len(backup_result['text'])} chars")
+
+                # Use backup result if it's better (non-empty text)
+                if backup_result['success'] and len(backup_result['text'].strip()) > 0:
+                    logger.info(f"  Seal #{i}: ** Using PaddleOCRVL backup result (unwarp failed) **")
+                    ocr_result = backup_result
+                    ocr_method_used = f"{method_used}_crop_backup"
+                else:
+                    logger.warning(f"  Seal #{i}: ** Both unwarp and crop OCR failed **")
         else:
             # ============ BACKUP: Use PaddleOCRVL directly on seal crop ============
             logger.warning(f"  Seal #{i}: No unwarp image available (polar unwarp failed)")
@@ -900,6 +925,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
                 logger.info(f"  Seal #{i}: Using PaddleOCRVL backup - directly recognize seal crop image")
                 seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
                 ocr_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline)
+                ocr_method_used = f"{method_used}_crop_backup"
                 logger.info(f"  Seal #{i} PaddleOCRVL Backup Result:")
                 logger.info(f"    - Text: '{ocr_result['text']}'")
                 logger.info(f"    - Score: {ocr_result['score']:.4f}")
@@ -919,7 +945,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
             'text': ocr_result['text'],
             'confidence': float(ocr_result['score']),
             'success': bool(ocr_result['success']),
-            'method_used': method_used,  # Add method tracking
+            'method_used': ocr_method_used,  # Track actual OCR method used
             'used_fallback': used_fallback,  # Track if fallback was used
             'debug_info': {
                 'center': center,
@@ -934,8 +960,10 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
         result['seals'].append(seal_data)
 
         if ocr_result['success']:
-            result['institutions'].append(ocr_result['text'])
-            logger.info(f"  ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})")
+            # Clean the institution name before adding
+            cleaned_name = clean_institution_name(ocr_result['text'])
+            result['institutions'].append(cleaned_name)
+            logger.info(f"  ✓ Seal #{i} SUCCESS: {cleaned_name[:50]}... (confidence: {ocr_result['score']:.4f})")
         else:
             logger.warning(f"  ✗ Seal #{i} FAILED: Could not extract institution name")
 
@@ -943,6 +971,56 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
     return result
 
 
+# ============ Text Cleaning Functions ============
+
+def clean_institution_name(text: str) -> str:
+    """
+    Clean extracted institution name by removing unwanted suffixes.
+
+    Removes common seal-related text that is not part of the institution name:
+    - 检验检测专用章
+    - 检验检测专用
+    - 专用章
+    - 及其他变体
+
+    Args:
+        text: Raw extracted institution name
+
+    Returns:
+        Cleaned institution name
+    """
+    if not text:
+        return text
+
+    # Define patterns to remove (order matters: most specific first)
+    patterns_to_remove = [
+        '检验检测专用章',
+        '检验检测专用',
+        '检测专用章',
+        '检验专用章',
+        '专用章',
+        '（检验检测）',
+        '(检验检测)',
+        '【检验检测】',
+        '[检验检测]',
+    ]
+
+    cleaned = text
+    for pattern in patterns_to_remove:
+        if pattern in cleaned:
+            cleaned = cleaned.replace(pattern, '')
+            logger.debug(f"Removed pattern '{pattern}' from institution name")
+
+    # Strip whitespace
+    cleaned = cleaned.strip()
+
+    # Log if cleaning occurred
+    if cleaned != text:
+        logger.info(f"Cleaned institution name: '{text}' → '{cleaned}'")
+
+    return cleaned
+
+
 # ============ Similarity and Matching Functions ============
 
 def calculate_similarity(str1: str, str2: str) -> float:
@@ -1122,23 +1200,32 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str,
 
     # Select best institution match
     if seal_result['institutions']:
+        logger.info(f"  Institution Extraction:")
+        logger.info(f"    - Expected: {expected_inst if expected_inst else 'N/A'}")
+        logger.info(f"    - Found {len(seal_result['institutions'])} institution(s) from seals")
+
         # Find best matching institution
         best_inst = None
         best_similarity = 0.0
 
-        for inst in seal_result['institutions']:
+        for idx, inst in enumerate(seal_result['institutions']):
             if expected_inst and expected_inst != "无":
                 sim = calculate_similarity(inst, expected_inst)
+                logger.info(f"    - Inst #{idx+1}: '{inst[:50]}...' → Similarity: {sim:.1f}%")
                 if sim > best_similarity:
                     best_similarity = sim
                     best_inst = inst
+                    logger.info(f"      → New best match! ({sim:.1f}% > {best_similarity:.1f}%)")
             elif not best_inst:
                 best_inst = inst
+                logger.info(f"    - Inst #{idx+1}: '{inst[:50]}...' (no expected value for comparison)")
 
         # Fallback: if best_inst is still None (all similarities were 0), use first institution
         if best_inst is None and seal_result['institutions']:
             best_inst = seal_result['institutions'][0]
+            logger.warning(f"    - All similarities were 0%, using first institution: '{best_inst[:50]}...'")
 
+        logger.info(f"    - Selected: '{best_inst[:50]}...' (similarity: {best_similarity:.1f}%)")
         result['extracted']['institution'] = best_inst
 
         # Compare institution