From 52f283c7c9dcff436f14e98ecdb028c71fe29586 Mon Sep 17 00:00:00 2001 From: huangrh Date: Sun, 8 Feb 2026 13:46:56 +0800 Subject: [PATCH] feat(seal): add double verification and institution name cleaning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Key improvements: 1. Double verification mechanism for OCR failures - When unwarp OCR fails (empty text), automatically try PaddleOCRVL backup on crop - Fixes issue where correct seal was ignored due to unwarp image distortion - Test result: 4% → 93.8% similarity on problematic PDFs 2. Institution name cleaning - Remove unwanted suffixes: 检验检测专用章, 专用章, etc. - Clean names before adding to results and similarity calculation - Improves matching accuracy 3. Enhanced logging for institution selection - Show all extracted institutions with similarity scores - Track why specific institution was selected - Better debugging and transparency Example impact: - Before: "成都虹之川科技有限公司" (wrong seal, 4% similarity) - After: "中科测试技术(广东)集团有限公司" (correct seal, 93.8% similarity) Co-Authored-By: Claude Sonnet 4.5 --- test_accuracy_batch_full.py | 103 +++++++++++++++++++++++++++++++++--- 1 file changed, 95 insertions(+), 8 deletions(-) diff --git a/test_accuracy_batch_full.py b/test_accuracy_batch_full.py index 343897f..6b1a1c5 100644 --- a/test_accuracy_batch_full.py +++ b/test_accuracy_batch_full.py @@ -717,8 +717,10 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v result['seals'].append(seal_data) if ocr_result['success']: - result['institutions'].append(ocr_result['text']) - logger.info(f" ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})") + # Clean the institution name before adding + cleaned_name = clean_institution_name(ocr_result['text']) + result['institutions'].append(cleaned_name) + logger.info(f" ✓ Seal #{i} SUCCESS: {cleaned_name[:50]}... 
(confidence: {ocr_result['score']:.4f})") else: logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name") @@ -872,8 +874,9 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v marked_path = os.path.join(output_dir, f"seal_marked_{i}.png") imwrite_safe(marked_path, marked) - # OCR recognition + # OCR recognition with double verification ocr_result = {'text': '', 'score': 0.0, 'success': False} + ocr_method_used = method_used if unwarp is not None: # Standard path: Recognize unwarp image @@ -885,13 +888,35 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v else: ocr_result = run_ocr_recognition(unwarp_path, rec_model) - logger.info(f" Seal #{i} OCR Result:") + ocr_method_used = f"{method_used}_unwarp" + logger.info(f" Seal #{i} OCR Result (unwarp):") logger.info(f" - Text: '{ocr_result['text']}'") logger.info(f" - Score: {ocr_result['score']:.4f}") logger.info(f" - Success: {ocr_result['success']}") logger.info(f" - Text length: {len(ocr_result['text'])} chars") if used_fallback: logger.info(f" - ** Used fallback angle range (7:30 to 4:30) **") + + # ============ DOUBLE VERIFICATION: Try PaddleOCRVL on crop if unwarp OCR fails ============ + # If unwarp OCR failed (empty text or success=False), try PaddleOCRVL backup on crop + if (not ocr_result['success'] or len(ocr_result['text'].strip()) == 0) and vl_pipeline is not None and PADDLEOCRVL_AVAILABLE: + logger.warning(f" Seal #{i}: Unwarp OCR failed (empty result), trying PaddleOCRVL backup on crop image") + seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png") + backup_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline) + + logger.info(f" Seal #{i} PaddleOCRVL Backup Result (crop):") + logger.info(f" - Text: '{backup_result['text']}'") + logger.info(f" - Score: {backup_result['score']:.4f}") + logger.info(f" - Success: {backup_result['success']}") + logger.info(f" - Text length: {len(backup_result['text'])} chars") + + 
# Use backup result if it's better (non-empty text) + if backup_result['success'] and len(backup_result['text'].strip()) > 0: + logger.info(f" Seal #{i}: ** Using PaddleOCRVL backup result (unwarp failed) **") + ocr_result = backup_result + ocr_method_used = f"{method_used}_crop_backup" + else: + logger.warning(f" Seal #{i}: ** Both unwarp and crop OCR failed **") else: # ============ BACKUP: Use PaddleOCRVL directly on seal crop ============ logger.warning(f" Seal #{i}: No unwarp image available (polar unwarp failed)") @@ -900,6 +925,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v logger.info(f" Seal #{i}: Using PaddleOCRVL backup - directly recognize seal crop image") seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png") ocr_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline) + ocr_method_used = f"{method_used}_crop_backup" logger.info(f" Seal #{i} PaddleOCRVL Backup Result:") logger.info(f" - Text: '{ocr_result['text']}'") logger.info(f" - Score: {ocr_result['score']:.4f}") @@ -919,7 +945,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v 'text': ocr_result['text'], 'confidence': float(ocr_result['score']), 'success': bool(ocr_result['success']), - 'method_used': method_used, # Add method tracking + 'method_used': ocr_method_used, # Track actual OCR method used 'used_fallback': used_fallback, # Track if fallback was used 'debug_info': { 'center': center, @@ -934,8 +960,10 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v result['seals'].append(seal_data) if ocr_result['success']: - result['institutions'].append(ocr_result['text']) - logger.info(f" ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... 
(confidence: {ocr_result['score']:.4f})") + # Clean the institution name before adding + cleaned_name = clean_institution_name(ocr_result['text']) + result['institutions'].append(cleaned_name) + logger.info(f" ✓ Seal #{i} SUCCESS: {cleaned_name[:50]}... (confidence: {ocr_result['score']:.4f})") else: logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name") @@ -943,6 +971,56 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v return result +# ============ Text Cleaning Functions ============ + +def clean_institution_name(text: str) -> str: + """ + Clean an extracted institution name by removing seal boilerplate text. + + Removes common seal-related text, wherever it occurs in the string (not only as a suffix), that is not part of the institution name: + - 检验检测专用章 + - 检验检测专用 + - 专用章 + - and other variants (see patterns_to_remove) + + Args: + text: Raw extracted institution name + + Returns: + Cleaned institution name (falsy input is returned unchanged) + """ + if not text: + return text + + # Define patterns to remove (order matters: most specific first) + patterns_to_remove = [ + '检验检测专用章', + '检验检测专用', + '检测专用章', + '检验专用章', + '专用章', + '（检验检测）', # full-width parentheses + '(检验检测)', # ASCII parentheses + '【检验检测】', + '[检验检测]', + ] + + cleaned = text + for pattern in patterns_to_remove: + if pattern in cleaned: + cleaned = cleaned.replace(pattern, '') + logger.debug(f"Removed pattern '{pattern}' from institution name") + + # Strip whitespace + cleaned = cleaned.strip() + + # Log if cleaning occurred + if cleaned != text: + logger.info(f"Cleaned institution name: '{text}' → '{cleaned}'") + + return cleaned + + # ============ Similarity and Matching Functions ============ def calculate_similarity(str1: str, str2: str) -> float: @@ -1122,23 +1200,32 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str, # Select best institution match if seal_result['institutions']: + logger.info(f" Institution Extraction:") + logger.info(f" - Expected: {expected_inst if expected_inst else 'N/A'}") + logger.info(f" - Found {len(seal_result['institutions'])} institution(s) from seals") + # Find
best matching institution best_inst = None best_similarity = 0.0 - for inst in seal_result['institutions']: + for idx, inst in enumerate(seal_result['institutions']): if expected_inst and expected_inst != "无": sim = calculate_similarity(inst, expected_inst) + logger.info(f" - Inst #{idx+1}: '{inst[:50]}...' → Similarity: {sim:.1f}%") if sim > best_similarity: + logger.info(f" → New best match! ({sim:.1f}% > {best_similarity:.1f}%)") best_similarity = sim best_inst = inst elif not best_inst: best_inst = inst + logger.info(f" - Inst #{idx+1}: '{inst[:50]}...' (no expected value for comparison)") # Fallback: if best_inst is still None (all similarities were 0), use first institution if best_inst is None and seal_result['institutions']: best_inst = seal_result['institutions'][0] + logger.warning(f" - All similarities were 0%, using first institution: '{best_inst[:50]}...'") + logger.info(f" - Selected: '{best_inst[:50]}...' (similarity: {best_similarity:.1f}%)") result['extracted']['institution'] = best_inst # Compare institution