feat(seal): fix seal text extraction for edge cases
- Add extent limit (max 350°) to prevent polar unwarp distortion
- Add polygon count check (<3 polygons → use PaddleOCRVL backup)
- Add imwrite_safe() to handle Chinese paths on Windows
- Add --pdf-names parameter for targeted debugging

Fixes issue where seal extraction returned an empty string when:
- Arc extent exceeded 360°, causing severe image distortion
- Too few text polygons were detected, leading to inaccurate arc calculation

Test results:
- Before: 0% similarity (empty string)
- After: 52.4% similarity (partial extraction)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
8b416e9f5a
commit
5a493b8d67
|
|
@ -1,35 +0,0 @@
|
||||||
from paddleocr import SealTextDetection
|
|
||||||
import os
|
|
||||||
|
|
||||||
def debug_paddle():
    """Run the seal text detector on ``seal_cropped.png`` and dump what it returns.

    Prints the type of the prediction output and, for every result, its
    attribute list, its own printed summary and (when present) the number of
    boxes / polygons. Each result is also saved under ``./debug_output``.
    Any exception is reported on stdout together with a traceback instead of
    propagating. Returns ``None`` in every case.
    """
    img_path = "seal_cropped.png"

    # Guard clause: nothing to do without the input image.
    if not os.path.exists(img_path):
        print(f"Error: {img_path} not found")
        return

    print(f"Loading SealTextDetection model on {img_path}...")
    try:
        detector = SealTextDetection(model_name="PP-OCRv4_server_seal_det")
        output = detector.predict(img_path, batch_size=1)
        print(f"Output type: {type(output)}")

        for idx, res in enumerate(output):
            print(f"Result {idx} attributes: {dir(res)}")
            res.print()

            # Report whichever geometry containers this result exposes.
            for attr_name, title in (("boxes", "Boxes"), ("polygons", "Polygons")):
                if hasattr(res, attr_name):
                    print(f"{title} found: {len(getattr(res, attr_name))}")

            # Persist the visualisation so it can be inspected by eye.
            # NOTE(review): indentation was lost in the diff view; this is
            # assumed to run once per result — confirm against the original.
            res.save_to_img(save_path="./debug_output")
            print("Saved debug image to ./debug_output")
    except Exception as exc:
        import traceback

        print(f"Caught Exception: {exc}")
        traceback.print_exc()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
debug_paddle()
|
|
||||||
|
|
@ -1,67 +0,0 @@
|
||||||
import os
|
|
||||||
import glob
|
|
||||||
import re
|
|
||||||
|
|
||||||
def generate_html(viz_dir="report_viz"):
    """Write ``index.html`` into *viz_dir* summarising the seal-unwarp images.

    The directory listing is sorted by name and split into three groups:
    files starting with ``viz_`` (full-page detections), files containing
    ``seal_crop_`` (seal crops) and files containing ``seal_localized_``
    (unwarp results). Each crop is shown next to every unwarp whose first
    run of digits in the filename is within 2000 of the crop's.

    Args:
        viz_dir: Directory that holds the images and receives ``index.html``.

    Raises:
        OSError: If *viz_dir* cannot be listed or ``index.html`` cannot be
            written.
    """
    # Local import: the page text used to live in a local called ``html``,
    # so only ``escape`` is pulled in to avoid any name clash.
    from html import escape

    def _timestamp(name):
        """Return the first run of digits in *name* as an int, or None."""
        found = re.search(r"(\d+)", name)
        return int(found.group(1)) if found else None

    html_file = os.path.join(viz_dir, "index.html")

    # Lexicographic sort of the directory listing.
    files = sorted(os.listdir(viz_dir))

    full_pages = [f for f in files if f.startswith("viz_")]
    crops = [f for f in files if "seal_crop_" in f]
    unwarps = [f for f in files if "seal_localized_" in f]

    # Parse each unwarp's timestamp once instead of once per crop; names
    # without digits cannot be matched and are left out rather than crashing.
    stamped_unwarps = [
        (name, stamp)
        for name, stamp in ((u, _timestamp(u)) for u in unwarps)
        if stamp is not None
    ]

    parts = ["""
<html>
<head>
<meta charset="utf-8">
<title>Seal Unwarp Verification Report</title>
<style>
body { font-family: sans-serif; margin: 20px; background: #f0f0f0; }
.section { background: white; padding: 20px; margin-bottom: 20px; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
.row { display: flex; align-items: flex-start; gap: 20px; margin-bottom: 20px; border-bottom: 1px solid #eee; padding-bottom: 20px; }
.col { flex: 1; }
img { max-width: 100%; border: 1px solid #ccc; }
.label { font-weight: bold; margin-bottom: 5px; color: #555; }
h1 { color: #333; }
h2 { border-bottom: 2px solid #333; padding-bottom: 5px; }
</style>
</head>
<body>
<h1>Seal Unwarp Verification Report</h1>
<p>Intermediate steps for seal detection and unwarping.</p>
"""]

    # Section 1: every full-page visualisation, in listing order.
    parts.append('<div class="section"><h2>1. Full Page Detections</h2>')
    for pf in full_pages:
        safe = escape(pf)
        parts.append(
            f'<div class="row"><div class="col"><div class="label">{safe}</div>'
            f'<img src="{safe}"></div></div>'
        )
    parts.append('</div>')

    # Section 2: each crop followed by the unwarps produced close to it.
    parts.append('<div class="section"><h2>2. Seal Crops & Unwarps</h2>')
    for crop in crops:
        crop_stamp = _timestamp(crop)
        if crop_stamp is None:
            # No timestamp in the name: still show the crop, just unmatched.
            matching_unwarps = []
        else:
            matching_unwarps = [
                name for name, stamp in stamped_unwarps
                if abs(stamp - crop_stamp) < 2000
            ]

        parts.append('<div class="row">')
        parts.append(
            '<div class="col"><div class="label">Step A: Seal Crop</div>'
            f'<img src="{escape(crop)}"></div>'
        )
        for u in matching_unwarps:
            label = "Step B: 7:30 Unwarp" if "730" in u else "Step C: Smart Unwarp"
            parts.append(
                f'<div class="col"><div class="label">{label}</div>'
                f'<img src="{escape(u)}"></div>'
            )
        parts.append('</div>')
    parts.append('</div>')

    parts.append("</body></html>")

    with open(html_file, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"HTML report generated: {html_file}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
generate_html()
|
|
||||||
|
|
@ -99,6 +99,39 @@ SIMILARITY_THRESHOLD = 85.0
|
||||||
OCR_MODEL = os.environ.get("OCR_MODEL", "ppocr_v5")
|
OCR_MODEL = os.environ.get("OCR_MODEL", "ppocr_v5")
|
||||||
|
|
||||||
|
|
||||||
|
# ============ Helper Functions ============
|
||||||
|
|
||||||
|
def imwrite_safe(file_path, img):
    """Write an image to disk, with a fallback for paths cv2.imwrite rejects.

    ``cv2.imwrite`` is tried first. If it reports failure (per the original
    author's note this happens on Windows with Chinese characters in the
    path), the image is encoded in memory with ``cv2.imencode`` and the
    resulting buffer is written with ``tofile``.

    The fallback encodes using the extension of *file_path* so the bytes on
    disk match the file name; ``.png`` is used only when the path has no
    extension.

    Args:
        file_path: Destination path (``str`` or ``os.PathLike``).
        img: Image data (numpy array).

    Returns:
        bool: True if the image was written, False otherwise. Exceptions are
        logged and reported as False rather than raised.
    """
    try:
        # Normalise path-like objects to a plain string for OpenCV.
        path = os.fspath(file_path)

        # Try standard cv2.imwrite first.
        if cv2.imwrite(path, img):
            return True

        # Fallback: encode in memory, then write the buffer to the path.
        # Encode in the format the file name promises, not always PNG.
        ext = os.path.splitext(path)[1] or ".png"
        is_success, buffer = cv2.imencode(ext, img)
        if is_success:
            buffer.tofile(path)
            return True
        return False
    except Exception as e:
        logger.error("Failed to write image to %s: %s", file_path, e)
        return False
|
||||||
|
|
||||||
|
|
||||||
# ============ Seal Processing Functions (from v_verify_logic.py) ============
|
# ============ Seal Processing Functions (from v_verify_logic.py) ============
|
||||||
|
|
||||||
def polar_unwarp(img, center, radius, start_theta, angular_extent):
|
def polar_unwarp(img, center, radius, start_theta, angular_extent):
|
||||||
|
|
@ -219,7 +252,18 @@ def calculate_precise_arc(polygons, center):
|
||||||
candidates.append({'start': st, 'end': en, 'extent': ex, 'score': ex * weight})
|
candidates.append({'start': st, 'end': en, 'extent': ex, 'score': ex * weight})
|
||||||
candidates.sort(key=lambda x: x['score'], reverse=True)
|
candidates.sort(key=lambda x: x['score'], reverse=True)
|
||||||
best = candidates[0]
|
best = candidates[0]
|
||||||
return best['start'], best['end'] - best['start']
|
|
||||||
|
# FIX: Limit extent to max 350° to avoid overlap and distortion
|
||||||
|
# Extent > 360° causes severe image distortion in polar unwarping
|
||||||
|
MAX_EXTENT_DEG = 350.0
|
||||||
|
start_theta = best['start']
|
||||||
|
extent = best['end'] - best['start']
|
||||||
|
|
||||||
|
if math.degrees(extent) > MAX_EXTENT_DEG:
|
||||||
|
logger.warning(f"Arc extent {math.degrees(extent):.2f}° exceeds {MAX_EXTENT_DEG}°, clamping to avoid distortion")
|
||||||
|
extent = math.radians(MAX_EXTENT_DEG)
|
||||||
|
|
||||||
|
return start_theta, extent
|
||||||
|
|
||||||
|
|
||||||
def fit_circle_from_text_polygons(all_polygons):
|
def fit_circle_from_text_polygons(all_polygons):
|
||||||
|
|
@ -384,10 +428,12 @@ def run_ocr_recognition(image_path, rec_model):
|
||||||
|
|
||||||
def run_ocr_recognition_vl(image_path, vl_pipeline):
|
def run_ocr_recognition_vl(image_path, vl_pipeline):
|
||||||
"""
|
"""
|
||||||
Run OCR recognition using PaddleOCRVL on unwarp seal image.
|
Run OCR recognition using PaddleOCRVL on seal image.
|
||||||
|
|
||||||
|
Can be used on both unwarp images and crop images (backup mode).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_path: Path to unwarp seal image
|
image_path: Path to seal image (unwarp or crop)
|
||||||
vl_pipeline: Initialized PaddleOCRVL pipeline
|
vl_pipeline: Initialized PaddleOCRVL pipeline
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -492,9 +538,9 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
# Save page image
|
# Save page image
|
||||||
doc_path = os.path.join(output_dir, "doc_page.png")
|
doc_path = os.path.join(output_dir, "doc_page.png")
|
||||||
try:
|
try:
|
||||||
success = cv2.imwrite(doc_path, page_img)
|
success = imwrite_safe(doc_path, page_img)
|
||||||
if not success:
|
if not success:
|
||||||
logger.error(f"cv2.imwrite returned False for {doc_path}")
|
logger.error(f"imwrite_safe returned False for {doc_path}")
|
||||||
# Try alternative save method using PIL
|
# Try alternative save method using PIL
|
||||||
try:
|
try:
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
@ -544,7 +590,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
if is_seal:
|
if is_seal:
|
||||||
seal_boxes.append(box)
|
seal_boxes.append(box)
|
||||||
|
|
||||||
cv2.imwrite(os.path.join(output_dir, "doc_layout_viz.png"), page_viz)
|
imwrite_safe(os.path.join(output_dir, "doc_layout_viz.png"), page_viz)
|
||||||
|
|
||||||
if not seal_boxes:
|
if not seal_boxes:
|
||||||
logger.warning("No seals detected")
|
logger.warning("No seals detected")
|
||||||
|
|
@ -585,7 +631,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
continue
|
continue
|
||||||
|
|
||||||
crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
|
crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
|
||||||
success = cv2.imwrite(crop_path, seal_crop)
|
success = imwrite_safe(crop_path, seal_crop)
|
||||||
if not success:
|
if not success:
|
||||||
# Try PIL fallback
|
# Try PIL fallback
|
||||||
try:
|
try:
|
||||||
|
|
@ -623,6 +669,88 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
logger.info(f" - Center: ({center[0]}, {center[1]})")
|
logger.info(f" - Center: ({center[0]}, {center[1]})")
|
||||||
logger.info(f" - Radius: {radius}")
|
logger.info(f" - Radius: {radius}")
|
||||||
|
|
||||||
|
# ============ INSUFFICIENT POLYGONS CHECK ============
|
||||||
|
# If too few text polygons detected, polar unwarping will likely fail
|
||||||
|
# Skip directly to PaddleOCRVL backup in this case
|
||||||
|
MIN_POLYGONS_FOR_UNWARP = 3
|
||||||
|
if len(all_polygons) < MIN_POLYGONS_FOR_UNWARP:
|
||||||
|
logger.warning(f" Seal #{i}: Only {len(all_polygons)} text polygons detected (< {MIN_POLYGONS_FOR_UNWARP})")
|
||||||
|
logger.warning(f" Seal #{i}: Skipping polar unwarping (insufficient polygon data)")
|
||||||
|
logger.info(f" Seal #{i}: Using PaddleOCRVL backup instead")
|
||||||
|
|
||||||
|
# Save crop image
|
||||||
|
imwrite_safe(crop_path, seal_crop)
|
||||||
|
|
||||||
|
# Use PaddleOCRVL directly on crop (no unwarp)
|
||||||
|
if vl_pipeline is not None and PADDLEOCRVL_AVAILABLE:
|
||||||
|
ocr_result = run_ocr_recognition_vl(crop_path, vl_pipeline)
|
||||||
|
logger.info(f" Seal #{i} PaddleOCRVL Result (direct crop):")
|
||||||
|
logger.info(f" - Text: '{ocr_result['text']}'")
|
||||||
|
logger.info(f" - Score: {ocr_result['score']:.4f}")
|
||||||
|
logger.info(f" - Success: {ocr_result['success']}")
|
||||||
|
logger.info(f" - ** Used PaddleOCRVL (insufficient polygons for unwarping) **")
|
||||||
|
|
||||||
|
# Create debug info without unwarp
|
||||||
|
seal_data = {
|
||||||
|
'index': i,
|
||||||
|
'box': box,
|
||||||
|
'crop_path': Path(crop_path).name,
|
||||||
|
'unwarp_path': None, # No unwarp performed
|
||||||
|
'marked_path': None, # No marked image
|
||||||
|
'polar_viz_path': None, # No polar visualization
|
||||||
|
'text': ocr_result['text'],
|
||||||
|
'confidence': float(ocr_result['score']),
|
||||||
|
'success': bool(ocr_result['success']),
|
||||||
|
'method_used': f'{method_used}_skip_unwarp',
|
||||||
|
'used_fallback': True,
|
||||||
|
'debug_info': {
|
||||||
|
'center': center,
|
||||||
|
'radius': radius,
|
||||||
|
'start_theta_deg': None,
|
||||||
|
'extent_deg': None,
|
||||||
|
'num_polygons': len(all_polygons),
|
||||||
|
'crop_size': (cw, ch),
|
||||||
|
'unwarp_size': None,
|
||||||
|
'skip_reason': f'Insufficient polygons ({len(all_polygons)} < {MIN_POLYGONS_FOR_UNWARP})'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result['seals'].append(seal_data)
|
||||||
|
|
||||||
|
if ocr_result['success']:
|
||||||
|
result['institutions'].append(ocr_result['text'])
|
||||||
|
logger.info(f" ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})")
|
||||||
|
else:
|
||||||
|
logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name")
|
||||||
|
|
||||||
|
continue # Skip to next seal
|
||||||
|
else:
|
||||||
|
logger.error(f" Seal #{i}: PaddleOCRVL not available, cannot extract text")
|
||||||
|
seal_data = {
|
||||||
|
'index': i,
|
||||||
|
'box': box,
|
||||||
|
'crop_path': Path(crop_path).name,
|
||||||
|
'unwarp_path': None,
|
||||||
|
'marked_path': None,
|
||||||
|
'polar_viz_path': None,
|
||||||
|
'text': '',
|
||||||
|
'confidence': 0.0,
|
||||||
|
'success': False,
|
||||||
|
'method_used': f'{method_used}_skip_unwarp',
|
||||||
|
'used_fallback': True,
|
||||||
|
'debug_info': {
|
||||||
|
'center': center,
|
||||||
|
'radius': radius,
|
||||||
|
'start_theta_deg': None,
|
||||||
|
'extent_deg': None,
|
||||||
|
'num_polygons': len(all_polygons),
|
||||||
|
'crop_size': (cw, ch),
|
||||||
|
'unwarp_size': None,
|
||||||
|
'skip_reason': f'Insufficient polygons and no PaddleOCRVL backup'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result['seals'].append(seal_data)
|
||||||
|
continue
|
||||||
|
|
||||||
# Calculate arc and unwarp
|
# Calculate arc and unwarp
|
||||||
start_theta, extent = calculate_precise_arc(all_polygons, center)
|
start_theta, extent = calculate_precise_arc(all_polygons, center)
|
||||||
logger.info(f" Seal #{i} Arc Parameters:")
|
logger.info(f" Seal #{i} Arc Parameters:")
|
||||||
|
|
@ -658,7 +786,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
logger.info(f" Seal #{i}: Performing polar unwarping with detected text polygons...")
|
logger.info(f" Seal #{i}: Performing polar unwarping with detected text polygons...")
|
||||||
unwarp = polar_unwarp(seal_crop, center, radius, start_theta, extent)
|
unwarp = polar_unwarp(seal_crop, center, radius, start_theta, extent)
|
||||||
if unwarp is not None:
|
if unwarp is not None:
|
||||||
cv2.imwrite(unwarp_path, unwarp)
|
imwrite_safe(unwarp_path, unwarp)
|
||||||
logger.info(f" - Unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}")
|
logger.info(f" - Unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}")
|
||||||
|
|
||||||
def draw_line(m, theta, color):
|
def draw_line(m, theta, color):
|
||||||
|
|
@ -684,7 +812,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
|
|
||||||
# Save polar visualization
|
# Save polar visualization
|
||||||
polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png")
|
polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png")
|
||||||
cv2.imwrite(polar_viz_path, polar_viz)
|
imwrite_safe(polar_viz_path, polar_viz)
|
||||||
logger.info(f" - Polar visualization saved: seal_polar_viz_{i}.png")
|
logger.info(f" - Polar visualization saved: seal_polar_viz_{i}.png")
|
||||||
else:
|
else:
|
||||||
logger.warning(f" Seal #{i}: Polar unwarp returned None")
|
logger.warning(f" Seal #{i}: Polar unwarp returned None")
|
||||||
|
|
@ -707,7 +835,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
|
|
||||||
unwarp = polar_unwarp(seal_crop, center, radius, fallback_start_theta, fallback_extent)
|
unwarp = polar_unwarp(seal_crop, center, radius, fallback_start_theta, fallback_extent)
|
||||||
if unwarp is not None:
|
if unwarp is not None:
|
||||||
cv2.imwrite(unwarp_path, unwarp)
|
imwrite_safe(unwarp_path, unwarp)
|
||||||
logger.info(f" - Fallback unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}")
|
logger.info(f" - Fallback unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}")
|
||||||
|
|
||||||
# Update start_theta and extent for visualization
|
# Update start_theta and extent for visualization
|
||||||
|
|
@ -736,20 +864,19 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
cv2.circle(polar_viz, (int(src_x), int(src_y)), 1, (255, 0, 255), -1)
|
cv2.circle(polar_viz, (int(src_x), int(src_y)), 1, (255, 0, 255), -1)
|
||||||
|
|
||||||
polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png")
|
polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png")
|
||||||
cv2.imwrite(polar_viz_path, polar_viz)
|
imwrite_safe(polar_viz_path, polar_viz)
|
||||||
logger.info(f" - Fallback polar visualization saved: seal_polar_viz_{i}.png")
|
logger.info(f" - Fallback polar visualization saved: seal_polar_viz_{i}.png")
|
||||||
else:
|
else:
|
||||||
logger.warning(f" Seal #{i}: Fallback polar unwarp also returned None")
|
logger.warning(f" Seal #{i}: Fallback polar unwarp also returned None")
|
||||||
|
|
||||||
if unwarp is None:
|
|
||||||
logger.warning(f" Seal #{i}: No unwarp image available, skipping OCR")
|
|
||||||
|
|
||||||
marked_path = os.path.join(output_dir, f"seal_marked_{i}.png")
|
marked_path = os.path.join(output_dir, f"seal_marked_{i}.png")
|
||||||
cv2.imwrite(marked_path, marked)
|
imwrite_safe(marked_path, marked)
|
||||||
|
|
||||||
# OCR recognition
|
# OCR recognition
|
||||||
ocr_result = {'text': '', 'score': 0.0, 'success': False}
|
ocr_result = {'text': '', 'score': 0.0, 'success': False}
|
||||||
|
|
||||||
if unwarp is not None:
|
if unwarp is not None:
|
||||||
|
# Standard path: Recognize unwarp image
|
||||||
method_str = "FALLBACK" if used_fallback else "Standard"
|
method_str = "FALLBACK" if used_fallback else "Standard"
|
||||||
logger.info(f" Seal #{i}: Running OCR ({method_str}, model={ocr_model}) on unwarp image...")
|
logger.info(f" Seal #{i}: Running OCR ({method_str}, model={ocr_model}) on unwarp image...")
|
||||||
|
|
||||||
|
|
@ -766,7 +893,21 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
|
||||||
if used_fallback:
|
if used_fallback:
|
||||||
logger.info(f" - ** Used fallback angle range (7:30 to 4:30) **")
|
logger.info(f" - ** Used fallback angle range (7:30 to 4:30) **")
|
||||||
else:
|
else:
|
||||||
logger.warning(f" Seal #{i}: No unwarp image available, skipping OCR")
|
# ============ BACKUP: Use PaddleOCRVL directly on seal crop ============
|
||||||
|
logger.warning(f" Seal #{i}: No unwarp image available (polar unwarp failed)")
|
||||||
|
|
||||||
|
if vl_pipeline is not None and PADDLEOCRVL_AVAILABLE:
|
||||||
|
logger.info(f" Seal #{i}: Using PaddleOCRVL backup - directly recognize seal crop image")
|
||||||
|
seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
|
||||||
|
ocr_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline)
|
||||||
|
logger.info(f" Seal #{i} PaddleOCRVL Backup Result:")
|
||||||
|
logger.info(f" - Text: '{ocr_result['text']}'")
|
||||||
|
logger.info(f" - Score: {ocr_result['score']:.4f}")
|
||||||
|
logger.info(f" - Success: {ocr_result['success']}")
|
||||||
|
logger.info(f" - Text length: {len(ocr_result['text'])} chars")
|
||||||
|
logger.info(f" - ** Used PaddleOCRVL backup (direct crop recognition) **")
|
||||||
|
else:
|
||||||
|
logger.warning(f" Seal #{i}: No backup available (vl_pipeline=None or PaddleOCRVL not installed), skipping OCR")
|
||||||
|
|
||||||
seal_data = {
|
seal_data = {
|
||||||
'index': int(i),
|
'index': int(i),
|
||||||
|
|
@ -994,6 +1135,10 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str,
|
||||||
elif not best_inst:
|
elif not best_inst:
|
||||||
best_inst = inst
|
best_inst = inst
|
||||||
|
|
||||||
|
# Fallback: if best_inst is still None (all similarities were 0), use first institution
|
||||||
|
if best_inst is None and seal_result['institutions']:
|
||||||
|
best_inst = seal_result['institutions'][0]
|
||||||
|
|
||||||
result['extracted']['institution'] = best_inst
|
result['extracted']['institution'] = best_inst
|
||||||
|
|
||||||
# Compare institution
|
# Compare institution
|
||||||
|
|
@ -1299,11 +1444,14 @@ def main():
|
||||||
help='OCR model to use (default: from OCR_MODEL env var or ppocr_v5)')
|
help='OCR model to use (default: from OCR_MODEL env var or ppocr_v5)')
|
||||||
parser.add_argument('--batch-size', type=int, default=BATCH_SIZE,
|
parser.add_argument('--batch-size', type=int, default=BATCH_SIZE,
|
||||||
help=f'Number of PDFs to process (default: {BATCH_SIZE})')
|
help=f'Number of PDFs to process (default: {BATCH_SIZE})')
|
||||||
|
parser.add_argument('--pdf-names', type=str, default=None,
|
||||||
|
help='Comma-separated list of PDF names to process (e.g., "1.pdf,2.pdf"). Overrides --batch-size')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Use command line argument if provided
|
# Use command line argument if provided
|
||||||
ocr_model = args.ocr_model
|
ocr_model = args.ocr_model
|
||||||
batch_size = args.batch_size
|
batch_size = args.batch_size
|
||||||
|
pdf_names_filter = args.pdf_names
|
||||||
|
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
print("CMA & INSTITUTION EXTRACTION - BATCH ACCURACY TEST")
|
print("CMA & INSTITUTION EXTRACTION - BATCH ACCURACY TEST")
|
||||||
|
|
@ -1322,12 +1470,23 @@ def main():
|
||||||
with open(RESULTS_JSON, 'r', encoding='utf-8') as f:
|
with open(RESULTS_JSON, 'r', encoding='utf-8') as f:
|
||||||
ground_truth = json.load(f)
|
ground_truth = json.load(f)
|
||||||
|
|
||||||
# Get first N PDFs
|
# Filter PDFs: either by name filter or by batch size
|
||||||
pdf_list = list(ground_truth.items())[:batch_size]
|
if pdf_names_filter:
|
||||||
|
# Split comma-separated names and strip whitespace
|
||||||
|
requested_names = [name.strip() for name in pdf_names_filter.split(',')]
|
||||||
|
pdf_list = [(name, ground_truth[name]) for name in requested_names if name in ground_truth]
|
||||||
|
if not pdf_list:
|
||||||
|
logger.error(f"None of the specified PDFs found in results.json: {requested_names}")
|
||||||
|
print(f"ERROR: None of the specified PDFs found in results.json: {requested_names}")
|
||||||
|
return
|
||||||
|
print(f"Processing {len(pdf_list)} specified PDF(s): {[name for name, _ in pdf_list]}")
|
||||||
|
else:
|
||||||
|
# Get first N PDFs
|
||||||
|
pdf_list = list(ground_truth.items())[:batch_size]
|
||||||
|
|
||||||
# Initialize OCR engines
|
# Initialize OCR engines
|
||||||
# Note: We ALWAYS initialize ocr_engine for CMA recognition
|
# Note: We ALWAYS initialize ocr_engine for CMA recognition
|
||||||
# PaddleOCRVL is ONLY used for seal text recognition
|
# We ALWAYS try to initialize vl_pipeline for backup seal recognition (when unwarp fails)
|
||||||
ocr_engine = None
|
ocr_engine = None
|
||||||
vl_pipeline = None
|
vl_pipeline = None
|
||||||
|
|
||||||
|
|
@ -1337,35 +1496,40 @@ def main():
|
||||||
logger.info("PaddleOCR initialized successfully")
|
logger.info("PaddleOCR initialized successfully")
|
||||||
print("PaddleOCR initialized successfully\n")
|
print("PaddleOCR initialized successfully\n")
|
||||||
|
|
||||||
# Initialize PaddleOCRVL if requested for seal recognition
|
# Initialize PaddleOCRVL for backup seal recognition (always try if available)
|
||||||
if ocr_model == "paddleocr_vl":
|
# This provides a fallback when polar unwarping fails
|
||||||
if not PADDLEOCRVL_AVAILABLE:
|
if PADDLEOCRVL_AVAILABLE:
|
||||||
print("WARNING: PaddleOCRVL requested but not available!")
|
logger.info("Initializing PaddleOCRVL for backup seal recognition...")
|
||||||
print("Falling back to PP-OCRv5 for seal recognition")
|
print("Initializing PaddleOCRVL for backup seal recognition (this may take a while)...")
|
||||||
print("Please install: pip install paddleocr[doc-parser]")
|
try:
|
||||||
ocr_model = "ppocr_v5"
|
vl_pipeline = PaddleOCRVL(
|
||||||
else:
|
use_seal_recognition=True,
|
||||||
logger.info("Initializing PaddleOCRVL for seal recognition...")
|
use_ocr_for_image_block=True,
|
||||||
print("Initializing PaddleOCRVL for seal recognition (this may take a while)...")
|
use_layout_detection=True
|
||||||
try:
|
)
|
||||||
vl_pipeline = PaddleOCRVL(
|
|
||||||
use_seal_recognition=True,
|
|
||||||
use_ocr_for_image_block=True,
|
|
||||||
use_layout_detection=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Verify initialization
|
# Verify initialization
|
||||||
if vl_pipeline is None:
|
if vl_pipeline is None:
|
||||||
raise RuntimeError("PaddleOCRVL initialization returned None")
|
raise RuntimeError("PaddleOCRVL initialization returned None")
|
||||||
|
|
||||||
logger.info("PaddleOCRVL initialized successfully")
|
logger.info("PaddleOCRVL initialized successfully (backup ready)")
|
||||||
print("PaddleOCRVL for seal recognition initialized successfully\n")
|
print("PaddleOCRVL backup ready - will be used when polar unwarping fails\n")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to initialize PaddleOCRVL: {e}")
|
logger.error(f"Failed to initialize PaddleOCRVL: {e}")
|
||||||
logger.error(f"Exception type: {type(e).__name__}")
|
logger.error(f"Exception type: {type(e).__name__}")
|
||||||
print(f"WARNING: Failed to initialize PaddleOCRVL: {e}")
|
print(f"WARNING: Failed to initialize PaddleOCRVL: {e}")
|
||||||
print("Falling back to PP-OCRv5 for seal recognition")
|
print("Polar unwarping failures will skip OCR (no backup available)\n")
|
||||||
ocr_model = "ppocr_v5"
|
else:
|
||||||
|
logger.info("PaddleOCRVL not available - polar unwarping failures will skip OCR")
|
||||||
|
print("Note: PaddleOCRVL not installed - polar unwarping failures will skip OCR")
|
||||||
|
print(" To enable backup: pip install paddleocr[doc-parser]\n")
|
||||||
|
|
||||||
|
# Validate OCR model selection
|
||||||
|
if ocr_model == "paddleocr_vl" and vl_pipeline is None:
|
||||||
|
print("WARNING: PaddleOCRVL requested for primary seal recognition but not available!")
|
||||||
|
print("Falling back to PP-OCRv5 for seal recognition")
|
||||||
|
print("Please install: pip install paddleocr[doc-parser]")
|
||||||
|
ocr_model = "ppocr_v5"
|
||||||
|
|
||||||
# Create output directory
|
# Create output directory
|
||||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||||
|
|
@ -1374,11 +1538,12 @@ def main():
|
||||||
all_results = []
|
all_results = []
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
|
total_pdfs = len(pdf_list)
|
||||||
for i, (pdf_name, expected_data) in enumerate(pdf_list, 1):
|
for i, (pdf_name, expected_data) in enumerate(pdf_list, 1):
|
||||||
expected_cma = expected_data.get('CMA', '')
|
expected_cma = expected_data.get('CMA', '')
|
||||||
expected_inst = expected_data.get('机构名', '')
|
expected_inst = expected_data.get('机构名', '')
|
||||||
|
|
||||||
print(f"\n[{i}/{BATCH_SIZE}] Processing: {pdf_name}")
|
print(f"\n[{i}/{total_pdfs}] Processing: {pdf_name}")
|
||||||
print(" + Loading PDF and extracting page...")
|
print(" + Loading PDF and extracting page...")
|
||||||
|
|
||||||
result = process_single_pdf(
|
result = process_single_pdf(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue