feat(seal): add double verification and institution name cleaning
Key improvements: 1. Double verification mechanism for OCR failures - When unwarp OCR fails (empty text), automatically try PaddleOCRVL backup on crop - Fixes issue where correct seal was ignored due to unwarp image distortion - Test result: 4% → 93.8% similarity on problematic PDFs 2. Institution name cleaning - Remove unwanted suffixes: 检验检测专用章, 专用章, etc. - Clean names before adding to results and similarity calculation - Improves matching accuracy 3. Enhanced logging for institution selection - Show all extracted institutions with similarity scores - Track why specific institution was selected - Better debugging and transparency Example impact: - Before: "成都虹之川科技有限公司" (wrong seal, 4% similarity) - After: "中科测试技术(广东)集团有限公司" (correct seal, 93.8% similarity) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
5a493b8d67
commit
52f283c7c9
|
|
@ -717,8 +717,10 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
result['seals'].append(seal_data)
|
result['seals'].append(seal_data)
|
||||||
|
|
||||||
if ocr_result['success']:
|
if ocr_result['success']:
|
||||||
result['institutions'].append(ocr_result['text'])
|
# Clean the institution name before adding
|
||||||
logger.info(f" ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})")
|
cleaned_name = clean_institution_name(ocr_result['text'])
|
||||||
|
result['institutions'].append(cleaned_name)
|
||||||
|
logger.info(f" ✓ Seal #{i} SUCCESS: {cleaned_name[:50]}... (confidence: {ocr_result['score']:.4f})")
|
||||||
else:
|
else:
|
||||||
logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name")
|
logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name")
|
||||||
|
|
||||||
|
|
@ -872,8 +874,9 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
marked_path = os.path.join(output_dir, f"seal_marked_{i}.png")
|
marked_path = os.path.join(output_dir, f"seal_marked_{i}.png")
|
||||||
imwrite_safe(marked_path, marked)
|
imwrite_safe(marked_path, marked)
|
||||||
|
|
||||||
# OCR recognition
|
# OCR recognition with double verification
|
||||||
ocr_result = {'text': '', 'score': 0.0, 'success': False}
|
ocr_result = {'text': '', 'score': 0.0, 'success': False}
|
||||||
|
ocr_method_used = method_used
|
||||||
|
|
||||||
if unwarp is not None:
|
if unwarp is not None:
|
||||||
# Standard path: Recognize unwarp image
|
# Standard path: Recognize unwarp image
|
||||||
|
|
@ -885,13 +888,35 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
else:
|
else:
|
||||||
ocr_result = run_ocr_recognition(unwarp_path, rec_model)
|
ocr_result = run_ocr_recognition(unwarp_path, rec_model)
|
||||||
|
|
||||||
logger.info(f" Seal #{i} OCR Result:")
|
ocr_method_used = f"{method_used}_unwarp"
|
||||||
|
logger.info(f" Seal #{i} OCR Result (unwarp):")
|
||||||
logger.info(f" - Text: '{ocr_result['text']}'")
|
logger.info(f" - Text: '{ocr_result['text']}'")
|
||||||
logger.info(f" - Score: {ocr_result['score']:.4f}")
|
logger.info(f" - Score: {ocr_result['score']:.4f}")
|
||||||
logger.info(f" - Success: {ocr_result['success']}")
|
logger.info(f" - Success: {ocr_result['success']}")
|
||||||
logger.info(f" - Text length: {len(ocr_result['text'])} chars")
|
logger.info(f" - Text length: {len(ocr_result['text'])} chars")
|
||||||
if used_fallback:
|
if used_fallback:
|
||||||
logger.info(f" - ** Used fallback angle range (7:30 to 4:30) **")
|
logger.info(f" - ** Used fallback angle range (7:30 to 4:30) **")
|
||||||
|
|
||||||
|
# ============ DOUBLE VERIFICATION: Try PaddleOCRVL on crop if unwarp OCR fails ============
|
||||||
|
# If unwarp OCR failed (empty text or success=False), try PaddleOCRVL backup on crop
|
||||||
|
if (not ocr_result['success'] or len(ocr_result['text'].strip()) == 0) and vl_pipeline is not None and PADDLEOCRVL_AVAILABLE:
|
||||||
|
logger.warning(f" Seal #{i}: Unwarp OCR failed (empty result), trying PaddleOCRVL backup on crop image")
|
||||||
|
seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
|
||||||
|
backup_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline)
|
||||||
|
|
||||||
|
logger.info(f" Seal #{i} PaddleOCRVL Backup Result (crop):")
|
||||||
|
logger.info(f" - Text: '{backup_result['text']}'")
|
||||||
|
logger.info(f" - Score: {backup_result['score']:.4f}")
|
||||||
|
logger.info(f" - Success: {backup_result['success']}")
|
||||||
|
logger.info(f" - Text length: {len(backup_result['text'])} chars")
|
||||||
|
|
||||||
|
# Use backup result if it's better (non-empty text)
|
||||||
|
if backup_result['success'] and len(backup_result['text'].strip()) > 0:
|
||||||
|
logger.info(f" Seal #{i}: ** Using PaddleOCRVL backup result (unwarp failed) **")
|
||||||
|
ocr_result = backup_result
|
||||||
|
ocr_method_used = f"{method_used}_crop_backup"
|
||||||
|
else:
|
||||||
|
logger.warning(f" Seal #{i}: ** Both unwarp and crop OCR failed **")
|
||||||
else:
|
else:
|
||||||
# ============ BACKUP: Use PaddleOCRVL directly on seal crop ============
|
# ============ BACKUP: Use PaddleOCRVL directly on seal crop ============
|
||||||
logger.warning(f" Seal #{i}: No unwarp image available (polar unwarp failed)")
|
logger.warning(f" Seal #{i}: No unwarp image available (polar unwarp failed)")
|
||||||
|
|
@ -900,6 +925,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
logger.info(f" Seal #{i}: Using PaddleOCRVL backup - directly recognize seal crop image")
|
logger.info(f" Seal #{i}: Using PaddleOCRVL backup - directly recognize seal crop image")
|
||||||
seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
|
seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
|
||||||
ocr_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline)
|
ocr_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline)
|
||||||
|
ocr_method_used = f"{method_used}_crop_backup"
|
||||||
logger.info(f" Seal #{i} PaddleOCRVL Backup Result:")
|
logger.info(f" Seal #{i} PaddleOCRVL Backup Result:")
|
||||||
logger.info(f" - Text: '{ocr_result['text']}'")
|
logger.info(f" - Text: '{ocr_result['text']}'")
|
||||||
logger.info(f" - Score: {ocr_result['score']:.4f}")
|
logger.info(f" - Score: {ocr_result['score']:.4f}")
|
||||||
|
|
@ -919,7 +945,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
'text': ocr_result['text'],
|
'text': ocr_result['text'],
|
||||||
'confidence': float(ocr_result['score']),
|
'confidence': float(ocr_result['score']),
|
||||||
'success': bool(ocr_result['success']),
|
'success': bool(ocr_result['success']),
|
||||||
'method_used': method_used, # Add method tracking
|
'method_used': ocr_method_used, # Track actual OCR method used
|
||||||
'used_fallback': used_fallback, # Track if fallback was used
|
'used_fallback': used_fallback, # Track if fallback was used
|
||||||
'debug_info': {
|
'debug_info': {
|
||||||
'center': center,
|
'center': center,
|
||||||
|
|
@ -934,8 +960,10 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
result['seals'].append(seal_data)
|
result['seals'].append(seal_data)
|
||||||
|
|
||||||
if ocr_result['success']:
|
if ocr_result['success']:
|
||||||
result['institutions'].append(ocr_result['text'])
|
# Clean the institution name before adding
|
||||||
logger.info(f" ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})")
|
cleaned_name = clean_institution_name(ocr_result['text'])
|
||||||
|
result['institutions'].append(cleaned_name)
|
||||||
|
logger.info(f" ✓ Seal #{i} SUCCESS: {cleaned_name[:50]}... (confidence: {ocr_result['score']:.4f})")
|
||||||
else:
|
else:
|
||||||
logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name")
|
logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name")
|
||||||
|
|
||||||
|
|
@ -943,6 +971,56 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# ============ Text Cleaning Functions ============
|
||||||
|
|
||||||
|
def clean_institution_name(text: str) -> str:
|
||||||
|
"""
|
||||||
|
Clean extracted institution name by removing unwanted suffixes.
|
||||||
|
|
||||||
|
Removes common seal-related text that is not part of the institution name:
|
||||||
|
- 检验检测专用章
|
||||||
|
- 检验检测专用
|
||||||
|
- 专用章
|
||||||
|
- 及其他变体
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Raw extracted institution name
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Cleaned institution name
|
||||||
|
"""
|
||||||
|
if not text:
|
||||||
|
return text
|
||||||
|
|
||||||
|
# Define patterns to remove (order matters: most specific first)
|
||||||
|
patterns_to_remove = [
|
||||||
|
'检验检测专用章',
|
||||||
|
'检验检测专用',
|
||||||
|
'检测专用章',
|
||||||
|
'检验专用章',
|
||||||
|
'专用章',
|
||||||
|
'(检验检测)',
|
||||||
|
'(检验检测)',
|
||||||
|
'【检验检测】',
|
||||||
|
'[检验检测]',
|
||||||
|
]
|
||||||
|
|
||||||
|
cleaned = text
|
||||||
|
for pattern in patterns_to_remove:
|
||||||
|
if pattern in cleaned:
|
||||||
|
cleaned = cleaned.replace(pattern, '')
|
||||||
|
logger.debug(f"Removed pattern '{pattern}' from institution name")
|
||||||
|
|
||||||
|
# Strip whitespace
|
||||||
|
cleaned = cleaned.strip()
|
||||||
|
|
||||||
|
# Log if cleaning occurred
|
||||||
|
if cleaned != text:
|
||||||
|
logger.info(f"Cleaned institution name: '{text}' → '{cleaned}'")
|
||||||
|
|
||||||
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
# ============ Similarity and Matching Functions ============
|
# ============ Similarity and Matching Functions ============
|
||||||
|
|
||||||
def calculate_similarity(str1: str, str2: str) -> float:
|
def calculate_similarity(str1: str, str2: str) -> float:
|
||||||
|
|
@ -1122,23 +1200,32 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str,
|
||||||
|
|
||||||
# Select best institution match
|
# Select best institution match
|
||||||
if seal_result['institutions']:
|
if seal_result['institutions']:
|
||||||
|
logger.info(f" Institution Extraction:")
|
||||||
|
logger.info(f" - Expected: {expected_inst if expected_inst else 'N/A'}")
|
||||||
|
logger.info(f" - Found {len(seal_result['institutions'])} institution(s) from seals")
|
||||||
|
|
||||||
# Find best matching institution
|
# Find best matching institution
|
||||||
best_inst = None
|
best_inst = None
|
||||||
best_similarity = 0.0
|
best_similarity = 0.0
|
||||||
|
|
||||||
for inst in seal_result['institutions']:
|
for idx, inst in enumerate(seal_result['institutions']):
|
||||||
if expected_inst and expected_inst != "无":
|
if expected_inst and expected_inst != "无":
|
||||||
sim = calculate_similarity(inst, expected_inst)
|
sim = calculate_similarity(inst, expected_inst)
|
||||||
|
logger.info(f" - Inst #{idx+1}: '{inst[:50]}...' → Similarity: {sim:.1f}%")
|
||||||
if sim > best_similarity:
|
if sim > best_similarity:
|
||||||
best_similarity = sim
|
best_similarity = sim
|
||||||
best_inst = inst
|
best_inst = inst
|
||||||
|
logger.info(f" → New best match! ({sim:.1f}% > {best_similarity:.1f}%)")
|
||||||
elif not best_inst:
|
elif not best_inst:
|
||||||
best_inst = inst
|
best_inst = inst
|
||||||
|
logger.info(f" - Inst #{idx+1}: '{inst[:50]}...' (no expected value for comparison)")
|
||||||
|
|
||||||
# Fallback: if best_inst is still None (all similarities were 0), use first institution
|
# Fallback: if best_inst is still None (all similarities were 0), use first institution
|
||||||
if best_inst is None and seal_result['institutions']:
|
if best_inst is None and seal_result['institutions']:
|
||||||
best_inst = seal_result['institutions'][0]
|
best_inst = seal_result['institutions'][0]
|
||||||
|
logger.warning(f" - All similarities were 0%, using first institution: '{best_inst[:50]}...'")
|
||||||
|
|
||||||
|
logger.info(f" - Selected: '{best_inst[:50]}...' (similarity: {best_similarity:.1f}%)")
|
||||||
result['extracted']['institution'] = best_inst
|
result['extracted']['institution'] = best_inst
|
||||||
|
|
||||||
# Compare institution
|
# Compare institution
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue