report-detect/test_accuracy_batch_full.py

1766 lines
72 KiB
Python
Raw Normal View History

"""
CMA Code Extraction & Institution Name - Batch Accuracy Testing Script (Enhanced)
This script implements comprehensive batch accuracy testing for BOTH:
1. CMA code extraction
2. Institution name extraction from seals
Uses the complete workflow from v_verify_logic.py including:
- Layout detection (Paddlex PP-DocLayout-L)
- Seal detection and refinement
- Polar unwarping
- OCR text recognition for institution names
Author: Claude Code
Date: 2025-02-05
Version: 2.0 (Enhanced with seal/institution extraction)
"""
import os
import sys
import json
import time
import logging
import re
import math
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any
import numpy as np
# Set UTF-8 encoding for Windows console so Chinese log/print output does not
# raise UnicodeEncodeError on consoles that default to a legacy code page.
if sys.platform == 'win32':
    import codecs
    try:
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
        sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict')
    except Exception:
        # Best effort only: streams without a ``.buffer`` (e.g. redirected or
        # replaced streams) are left as they are. ``Exception`` instead of a
        # bare ``except`` so Ctrl-C / SystemExit are never swallowed here.
        pass
# Environment flag set before the OCR libraries are imported below.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to plain Python equivalents.

    Handles NumPy integer, floating and boolean scalars as well as arrays;
    anything else is delegated to ``json.JSONEncoder.default`` (which raises
    ``TypeError`` for unsupported objects).
    """

    def default(self, obj):
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it json.dumps raises TypeError on NumPy booleans.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
try:
import fitz # PyMuPDF
import cv2
from paddleocr import PaddleOCR, SealTextDetection, TextRecognition
try:
from paddleocr import PaddleOCRVL
PADDLEOCRVL_AVAILABLE = True
except ImportError:
PADDLEOCRVL_AVAILABLE = False
print("Warning: PaddleOCRVL not available. Install with: pip install paddleocr[doc-parser]")
import paddlex as px
from Levenshtein import distance as levenshtein_distance
except ImportError as e:
print(f"Error: Required dependency not found: {e}")
print("Please install: pip install python-Levenshtein paddleocr paddlex pymupdf-ng opencv-python numpy")
sys.exit(1)
# Import CMA extraction module
try:
from cma_extraction_final import extract_cma_code_fullpage, imread_unicode
except ImportError:
print("Error: cma_extraction_final.py not found in current directory")
sys.exit(1)
# Configure logging
# Root logger at INFO level; every record goes both to the UTF-8 log file
# 'test_accuracy_full.log' (relative to the working directory) and to the
# default stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('test_accuracy_full.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
# Module-level logger used by all functions in this file.
logger = logging.getLogger(__name__)
# Constants
# Input locations and report directory (relative paths).
# NOTE(review): these are not referenced in the visible part of the file;
# presumably consumed by the batch driver further down - confirm.
PDF_DIR = Path(r"src/test/resources/data/pdfs")
RESULTS_JSON = Path(r"src/test/resources/data/results.json")
OUTPUT_DIR = Path("test_reports_full")
# NOTE(review): presumably the number of PDFs handled per batch - confirm against the driver.
BATCH_SIZE = 20
# Similarity percentage (0-100) at or above which classify_match() reports a
# non-identical pair as a 'partial' match.
SIMILARITY_THRESHOLD = 85.0
# OCR Model Configuration
# Options: "ppocr_v5" (default), "paddleocr_vl"
# Read once at import time from the OCR_MODEL environment variable.
OCR_MODEL = os.environ.get("OCR_MODEL", "ppocr_v5")
# ============ Helper Functions ============
def imwrite_safe(file_path, img):
    """
    Write an image file safely, handling Chinese paths on Windows.

    ``cv2.imwrite`` is tried first. When it reports failure (as it does for
    non-ASCII paths on Windows) the image is encoded in memory with
    ``cv2.imencode`` and written with ``ndarray.tofile``.

    The in-memory encoder uses the extension of ``file_path`` so the bytes on
    disk match the file name (previously the fallback always produced PNG data,
    even for e.g. ``.jpg`` targets). Paths without an extension fall back to
    PNG.

    Args:
        file_path: Destination path (``str`` or path-like object)
        img: Image data (numpy array)
    Returns:
        bool: True if successful, False otherwise. Errors are logged, never raised.
    """
    try:
        # Normalise path-like objects (e.g. pathlib.Path) to a plain string.
        target = os.fspath(file_path)
        # Try standard cv2.imwrite first
        if cv2.imwrite(target, img):
            return True
        # Fallback: Use imencode + tofile for Chinese paths
        extension = os.path.splitext(target)[1] or ".png"
        encoded_ok, buffer = cv2.imencode(extension, img)
        if encoded_ok:
            buffer.tofile(target)
            return True
        return False
    except Exception as e:
        logger.error(f"Failed to write image to {file_path}: {e}")
        return False
# ============ Seal Processing Functions (from v_verify_logic.py) ============
def polar_unwarp(img, center, radius, start_theta, angular_extent):
    """
    Polar unwarp with canvas padding for partial seals.

    The annulus around ``center`` is flattened into a rectangular strip:
    - strip width  = int(angular_extent * radius) columns, one per step of arc
    - strip height = int(0.85 * radius) + int(0.2 * radius) rows; the first row
      is sampled at ``radius + int(0.2 * radius)`` (outside the seal) and each
      following row is one pixel closer to the center
    - the source image is placed on a white canvas that is padded so the
      outermost sampling circle fits even when the seal is cut off at an edge

    Sampling is nearest-pixel by truncation and is done with NumPy index
    arrays instead of a per-pixel Python loop.

    Args:
        img: Source image. Expected (H, W, 3); a 2-D grayscale image is
            replicated to three channels and extra channels beyond the third
            are ignored.
        center: (x, y) of the seal center in ``img`` coordinates.
        radius: Seal radius in pixels.
        start_theta: Start angle in radians (``atan2`` convention on image axes).
        angular_extent: Angle covered by the strip, in radians.
    Returns:
        uint8 array of shape (strip_h, strip_w, 3), or None when the extent or
        the resulting strip size is not positive.
    """
    if angular_extent <= 0:
        return None
    strip_w = int(angular_extent * radius)
    # Sampling range: 85% of the radius inward, 20% of the radius outward.
    inward_range = int(radius * 0.85)
    outward_range = int(radius * 0.2)
    strip_h = inward_range + outward_range
    if strip_w <= 0 or strip_h <= 0:
        return None

    if img.ndim == 2:
        # Grayscale: add a channel axis so it broadcasts onto the 3-channel canvas.
        img = img[:, :, np.newaxis]
    ch, cw = img.shape[:2]

    # Farthest sampling distance from the center; pad so that circle fits.
    max_distance = radius + outward_range
    # ceil + int keeps the canvas shape integral even for float centers/radii.
    pad_top = int(math.ceil(max(0, max_distance - center[1])))
    pad_bottom = int(math.ceil(max(0, max_distance - (ch - center[1]))))
    pad_left = int(math.ceil(max(0, max_distance - center[0])))
    pad_right = int(math.ceil(max(0, max_distance - (cw - center[0]))))

    padded_h = ch + pad_top + pad_bottom
    padded_w = cw + pad_left + pad_right
    padded_canvas = np.full((padded_h, padded_w, 3), 255, dtype=np.uint8)
    padded_canvas[pad_top:pad_top + ch, pad_left:pad_left + cw] = img[:, :, :3]

    # Center expressed in padded-canvas coordinates.
    center_x = center[0] + pad_left
    center_y = center[1] + pad_top

    # One radius per output row (outside -> inside), one angle per output column.
    radii = (radius + outward_range) - np.arange(strip_h, dtype=np.float64)
    thetas = start_theta + angular_extent * (np.arange(strip_w, dtype=np.float64) / strip_w)
    src_x = center_x + radii[:, np.newaxis] * np.cos(thetas)[np.newaxis, :]
    src_y = center_y + radii[:, np.newaxis] * np.sin(thetas)[np.newaxis, :]

    # astype truncates toward zero, matching int() on the scalar values.
    sx = src_x.astype(np.int64)
    sy = src_y.astype(np.int64)
    inside = (sx >= 0) & (sx < padded_w) & (sy >= 0) & (sy < padded_h)

    # Anything that still falls outside the canvas stays white.
    strip = np.full((strip_h, strip_w, 3), 255, dtype=np.uint8)
    strip[inside] = padded_canvas[sy[inside], sx[inside]]
    return strip
def calculate_precise_arc(polygons, center):
    """
    Estimate the angular span occupied by the curved seal text.

    For every polygon the point angles around ``center`` are computed, the
    sequence is rotated so it starts right after its widest circular gap, and
    it is split into runs wherever neighbouring angles are more than 15° apart.
    Runs from all polygons are sorted by start angle and merged when they are
    less than 45° apart. Each merged arc is scored by its extent times a
    Gaussian weight that favours arcs whose midpoint is near -pi/2 (the top of
    the image); the best-scoring arc wins and is clamped to at most 350°.

    Args:
        polygons: Iterable of polygons, each an iterable of (x, y) points.
        center: (x, y) reference point the angles are measured around.
    Returns:
        (start_theta, extent) in radians, or (0.0, 0.0) when no points exist.
    """
    full_turn = 2 * math.pi
    split_gap = math.radians(15)
    clusters = []

    for poly in polygons:
        angles = sorted(
            math.atan2(pt[1] - center[1], pt[0] - center[0]) for pt in poly
        )
        if not angles:
            continue

        # Locate the widest gap on the circle (the last gap wraps around).
        last = len(angles) - 1
        widest = 0
        widest_at = -1
        for idx, angle in enumerate(angles):
            if idx == last:
                gap = angles[0] + full_turn - angle
            else:
                gap = angles[idx + 1] - angle
            if gap > widest:
                widest = gap
                widest_at = idx

        # Rotate so the arc starts just after that gap; wrapped angles get +2*pi.
        if widest_at == last:
            arc = angles
        else:
            head = angles[widest_at + 1:]
            tail = [angle + full_turn for angle in angles[:widest_at + 1]]
            arc = head + tail
        if not arc:
            continue

        # Split into runs of closely spaced angles.
        run_first = run_last = arc[0]
        for angle in arc[1:]:
            if angle - run_last > split_gap:
                clusters.append({'start': run_first, 'end': run_last})
                run_first = angle
            run_last = angle
        clusters.append({'start': run_first, 'end': run_last})

    if not clusters:
        return 0.0, 0.0

    # Merge neighbouring runs that are less than 45° apart.
    clusters.sort(key=lambda item: item['start'])
    merge_gap = math.radians(45)
    merged = []
    active = clusters[0]
    for following in clusters[1:]:
        if following['start'] - active['end'] < merge_gap:
            active['end'] = max(active['end'], following['end'])
        else:
            merged.append(active)
            active = following
    merged.append(active)

    def arc_score(arc_item):
        # Extent weighted by how close the arc midpoint is to the top.
        span = arc_item['end'] - arc_item['start']
        mid = (arc_item['start'] + arc_item['end']) / 2
        dist_to_top = abs(((mid + math.pi/2 + math.pi) % (2*math.pi)) - math.pi)
        weight = math.exp(-0.5 * (dist_to_top / (math.pi/2))**2)
        return span * weight

    # max() keeps the first of equally scored arcs, like a stable descending sort.
    best = max(merged, key=arc_score)

    # Limit extent to max 350° to avoid overlap and distortion:
    # an extent above 360° causes severe distortion in polar unwarping.
    MAX_EXTENT_DEG = 350.0
    start_theta = best['start']
    extent = best['end'] - best['start']
    if math.degrees(extent) > MAX_EXTENT_DEG:
        logger.warning(f"Arc extent {math.degrees(extent):.2f}° exceeds {MAX_EXTENT_DEG}°, clamping to avoid distortion")
        extent = math.radians(MAX_EXTENT_DEG)
    return start_theta, extent
def fit_circle_from_text_polygons(all_polygons):
    """
    Fit a circle to all polygon vertices using linear least squares.

    Equation:  (x - a)^2 + (y - b)^2 = r^2
    Expanded:  x^2 + y^2 - 2ax - 2by + (a^2 + b^2 - r^2) = 0
    Let:       c = a^2 + b^2 - r^2
    Then:      x^2 + y^2 = 2ax + 2by - c
    which is the linear system  [2x, 2y, -1] * [a, b, c]^T = x^2 + y^2.

    Args:
        all_polygons: Sequence of polygons, each a sequence of (x, y) points.
    Returns:
        ((center_x, center_y), radius, rmse) with integer center and radius
        (rounded to the nearest pixel) and ``rmse`` the root-mean-square
        residual of the linear system above (units: pixels squared), or
        (None, None, None) when no circle can be fitted: fewer than 5 points,
        collinear/degenerate points, a non-positive squared radius, or a
        numerical failure.
    """
    if len(all_polygons) == 0:
        return None, None, None
    # Collect all points from polygons
    points = [[float(p[0]), float(p[1])] for poly in all_polygons for p in poly]
    if len(points) < 5:
        return None, None, None
    points = np.array(points)
    # Build linear system  A * [a, b, c]^T = b_vec
    A = np.column_stack([2 * points[:, 0], 2 * points[:, 1], -np.ones(len(points))])
    b_vec = np.sum(points ** 2, axis=1)
    try:
        # Solve least squares
        sol, residuals, rank, _singular_values = np.linalg.lstsq(A, b_vec, rcond=None)
        if rank < 3:
            # Collinear or coincident points: the system is under-determined,
            # so any "fit" would be arbitrary yet show a near-zero residual.
            return None, None, None
        a, b, c = sol
        radius_sq = a**2 + b**2 - c
        if not np.isfinite(radius_sq) or radius_sq <= 0:
            return None, None, None
        radius = np.sqrt(radius_sq)
        # Calculate fitting error (RMSE)
        if len(residuals) > 0:
            rmse = np.sqrt(residuals[0] / len(points))
        else:
            # lstsq did not report residuals; compute them manually
            errors = A @ sol - b_vec
            rmse = np.sqrt(np.mean(errors ** 2))
        # Round instead of truncating so e.g. 49.9999 maps to 50, not 49.
        center = (int(round(a)), int(round(b)))
        return center, int(round(radius)), float(rmse)
    except Exception as e:
        logger.error(f"Circle fitting failed: {e}")
        return None, None, None
def detect_seal_center_dual_method(seal_crop, all_polygons):
    """
    Dual strategy: automatically select the best center detection method.

    Strategy:
    1. Try circle fitting on the detected text polygons
    2. Check fitting quality (RMSE, offset from the crop center, polygon count,
       positive radius)
    3. If the fitting quality is good -> use the fitted center and radius
    4. Otherwise -> use the crop center with radius ``min(w, h) // 2 - 10``

    The crop-center radius is clamped to at least 1 so that very small crops
    never yield a zero or negative radius (which OpenCV drawing calls reject).

    Args:
        seal_crop: Cropped seal image (numpy array, at least 2-D, non-empty).
        all_polygons: Text polygons detected inside the crop.
    Returns:
        center: [x, y] (crop center) or (x, y) (fitted center)
        radius: int - detected radius
        method: str - "crop_center" or "circle_fitting"
    """
    ch, cw = seal_crop.shape[:2]
    # Method 1: Crop center (default method)
    center_crop = [cw // 2, ch // 2]
    radius_crop = max(1, min(cw, ch) // 2 - 10)
    # Method 2: Circle fitting
    center_fit, radius_fit, rmse = fit_circle_from_text_polygons(all_polygons)
    if center_fit is None:
        logger.info(" Circle fitting failed, using crop center")
        return center_crop, radius_crop, "crop_center"
    # Calculate offset between fitted center and crop center
    offset = math.sqrt((center_fit[0] - center_crop[0])**2 +
                       (center_fit[1] - center_crop[1])**2)
    offset_ratio = offset / min(cw, ch)
    # Quality check criteria
    # 1. RMSE should be low (good fit)
    # 2. Offset should not be too large (center should be reasonable)
    # 3. Need enough polygons for reliable fitting
    # 4. Fitted radius must be positive to be usable downstream
    rmse_threshold = 3000
    offset_threshold = 0.2  # 20% of crop size
    min_polygons = 3
    reasons = []
    if rmse >= rmse_threshold:
        reasons.append(f"RMSE too high ({rmse:.2f} >= {rmse_threshold})")
    if offset_ratio >= offset_threshold:
        reasons.append(f"offset too large ({offset_ratio:.2f} >= {offset_threshold})")
    if len(all_polygons) < min_polygons:
        reasons.append(f"not enough polygons ({len(all_polygons)} < {min_polygons})")
    if radius_fit is None or radius_fit <= 0:
        reasons.append(f"non-positive fitted radius ({radius_fit})")
    is_fit_good = (
        not reasons and
        rmse < rmse_threshold and       # also rejects NaN, which passes no '>=' test
        offset_ratio < offset_threshold
    )
    if is_fit_good:
        logger.info(f" Using circle fitting: RMSE={rmse:.2f}, offset_ratio={offset_ratio:.2f}")
        return center_fit, radius_fit, "circle_fitting"
    logger.info(f" Circle fitting unreliable ({', '.join(reasons)}), using crop center")
    return center_crop, radius_crop, "crop_center"
# Layout models are expensive to construct; keep one instance per model name
# for the lifetime of the process instead of reloading it for every page.
_LAYOUT_MODEL_CACHE = {}


def _get_layout_model(model_name="PP-DocLayout-L"):
    """Return the cached PaddleX layout model, creating it on first use."""
    model = _LAYOUT_MODEL_CACHE.get(model_name)
    if model is None:
        model = px.create_model(model_name)
        _LAYOUT_MODEL_CACHE[model_name] = model
    return model


def run_layout_detection(image_path):
    """
    Run Paddlex PP-DocLayout-L for layout analysis.

    Args:
        image_path: Path of the page image to analyse.
    Returns:
        List of dicts with keys 'label' (``label_name``, falling back to
        ``label``, then 'unknown'), 'score' (default 0.0) and 'box' (the
        ``coordinate`` entry of the detection, may be None when absent).
        An empty list is returned when model creation or prediction fails;
        the error is logged.
    """
    try:
        model = _get_layout_model()
        all_regions = []
        for res in model.predict(image_path, batch_size=1):
            for box in res.get('boxes', []):
                all_regions.append({
                    'label': box.get('label_name', box.get('label', 'unknown')),
                    'score': box.get('score', 0.0),
                    'box': box.get('coordinate')
                })
        return all_regions
    except Exception as e:
        logger.error(f"Layout detection failed: {e}")
        return []
def run_ocr_recognition(image_path, rec_model):
    """
    Recognize the text on an unwarped seal image.

    Args:
        image_path: Path of the image to recognize.
        rec_model: Recognition model exposing ``predict(input=..., batch_size=...)``.
    Returns:
        Dict with 'text' (stripped ``rec_text`` of the first prediction),
        'score' (its ``rec_score``, 0.0 when absent) and 'success' (True when
        the text is non-empty). A failure dict with empty text is returned when
        there is no prediction or when recognition raises (the error is logged).
    """
    try:
        predictions = rec_model.predict(input=image_path, batch_size=1)
        if not (predictions and len(predictions) > 0):
            return {'text': '', 'score': 0.0, 'success': False}
        first = predictions[0]
        recognized = first.get('rec_text', '').strip()
        confidence = first.get('rec_score', 0.0)
        return {
            'text': recognized,
            'score': confidence,
            'success': len(recognized) > 0
        }
    except Exception as e:
        logger.error(f"OCR recognition failed: {e}")
        return {'text': '', 'score': 0.0, 'success': False}
def run_ocr_recognition_vl(image_path, vl_pipeline):
    """
    Run OCR recognition using PaddleOCRVL on a seal image.

    Can be used on both unwarp images and crop images (backup mode). The
    pipeline result is exported to JSON inside a private temporary directory
    (removed automatically, also on errors) and the content of the first block
    labelled 'seal' in ``parsing_res_list`` is returned.

    Args:
        image_path: Path to seal image (unwarp or crop)
        vl_pipeline: Initialized PaddleOCRVL pipeline
    Returns:
        Dict with 'text', 'score', 'success' keys. 'score' is 1.0 whenever a
        seal block is found because PaddleOCRVL provides no confidence score;
        on any failure the dict has empty text, score 0.0 and success False.
    """
    import tempfile
    try:
        # Run prediction
        output = vl_pipeline.predict(image_path)
        if not output:
            return {'text': '', 'score': 0.0, 'success': False}
        res = output[0]
        # A unique temp directory avoids clashes between runs sharing a working
        # directory and guarantees cleanup on every exit path.
        with tempfile.TemporaryDirectory(prefix="paddleocr_vl_") as temp_output_dir:
            # Save JSON to extract text
            res.save_to_json(save_path=str(temp_output_dir))
            json_file = Path(temp_output_dir) / f"{Path(image_path).stem}_res.json"
            if not json_file.exists():
                return {'text': '', 'score': 0.0, 'success': False}
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        # Find seal block and extract content
        for block in data.get('parsing_res_list', []):
            if block.get('block_label') == 'seal':
                text = (block.get('block_content') or '').strip()
                return {
                    'text': text,
                    'score': 1.0,  # PaddleOCRVL doesn't provide confidence score
                    'success': len(text) > 0
                }
        return {'text': '', 'score': 0.0, 'success': False}
    except Exception as e:
        logger.error(f"PaddleOCRVL recognition failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return {'text': '', 'score': 0.0, 'success': False}
def _save_bgr_image_with_pil(file_path, bgr_img):
    """Write a BGR image through PIL (fallback writer); raises on failure."""
    from PIL import Image
    rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)
    Image.fromarray(rgb_img).save(file_path)


def _log_ocr_result(header, ocr_result, include_length=True):
    """Log one OCR result dict under the given header line."""
    logger.info(header)
    logger.info(f" - Text: '{ocr_result['text']}'")
    logger.info(f" - Score: {ocr_result['score']:.4f}")
    logger.info(f" - Success: {ocr_result['success']}")
    if include_length:
        logger.info(f" - Text length: {len(ocr_result['text'])} chars")


def _build_seal_record(index, box, ocr_result, method_used, used_fallback,
                       center, radius, num_polygons, crop_size,
                       start_theta=None, extent=None, unwarp=None,
                       has_marked=True, skip_reason=None):
    """Assemble the per-seal result dict with plain Python index/box values."""
    has_unwarp = unwarp is not None
    debug_info = {
        'center': center,
        'radius': radius,
        'start_theta_deg': None if start_theta is None else float(math.degrees(start_theta)),
        'extent_deg': None if extent is None else float(math.degrees(extent)),
        'num_polygons': num_polygons,
        'crop_size': crop_size,
        'unwarp_size': (unwarp.shape[1], unwarp.shape[0]) if has_unwarp else None
    }
    if skip_reason is not None:
        debug_info['skip_reason'] = skip_reason
    return {
        'index': int(index),
        'box': [float(v) for v in box],
        'crop_path': f"seal_crop_{index}.png",
        'unwarp_path': f"seal_unwarp_{index}.png" if has_unwarp else None,
        'marked_path': f"seal_marked_{index}.png" if has_marked else None,
        'polar_viz_path': f"seal_polar_viz_{index}.png" if has_unwarp else None,
        'text': ocr_result['text'],
        'confidence': float(ocr_result['score']),
        'success': bool(ocr_result['success']),
        'method_used': method_used,
        'used_fallback': used_fallback,
        'debug_info': debug_info
    }


def _register_seal_outcome(result, index, seal_record, ocr_result):
    """Append the seal record and, on success, its cleaned institution name."""
    result['seals'].append(seal_record)
    if ocr_result['success']:
        # Clean the institution name before adding
        cleaned_name = clean_institution_name(ocr_result['text'])
        result['institutions'].append(cleaned_name)
        logger.info(f" ✓ Seal #{index} SUCCESS: {cleaned_name[:50]}... (confidence: {ocr_result['score']:.4f})")
    else:
        logger.warning(f" ✗ Seal #{index} FAILED: Could not extract institution name")


def _draw_unwarp_overlays(marked, polar_viz, center, radius, start_theta, extent):
    """Draw the arc boundary rays on ``marked`` and sample dots on ``polar_viz``."""
    origin = (int(center[0]), int(center[1]))
    # Start angle in blue, end angle in red.
    for theta, color in ((start_theta, (255, 0, 0)), (start_theta + extent, (0, 0, 255))):
        tip_x = center[0] + radius * math.cos(theta)
        tip_y = center[1] + radius * math.sin(theta)
        cv2.line(marked, origin, (int(tip_x), int(tip_y)), color, 2)
    # Show where polar samples come from: up to 50 angles on 5 radii.
    height, width = polar_viz.shape[:2]
    num_sample_points = min(50, int(extent * radius))
    for r_idx in range(5):
        r = radius - r_idx * (radius * 0.6 / 5)
        for theta_idx in range(num_sample_points):
            theta = start_theta + extent * (theta_idx / num_sample_points)
            src_x = center[0] + r * math.cos(theta)
            src_y = center[1] + r * math.sin(theta)
            if 0 <= src_x < width and 0 <= src_y < height:
                cv2.circle(polar_viz, (int(src_x), int(src_y)), 1, (255, 0, 255), -1)


def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", vl_pipeline=None):
    """
    Extract seals and recognize institution names from a page image.

    Workflow: save the page, run layout detection, crop every region labelled
    'seal' with score > 0.2 (40 px padding), detect text polygons in the crop,
    estimate the seal center/radius, polar-unwarp the text arc and run OCR on
    the unwarped strip. PaddleOCRVL on the raw crop is used as backup when
    fewer than 3 polygons are found, when unwarping fails, or when OCR on the
    unwarped strip yields no text (only if ``vl_pipeline`` is given and
    PaddleOCRVL is installed). Intermediate images are written to ``output_dir``.

    Because seals with fewer than 3 polygons always take the backup path, the
    former fixed-angle fallback for zero polygons could never run and is not
    part of this implementation; ``used_fallback`` is therefore False for all
    unwarped seals and True for the skip-unwarp path.

    Args:
        page_img: Input page image (BGR numpy array)
        output_dir: Directory to save intermediate results
        ocr_model: OCR model to use ("ppocr_v5" or "paddleocr_vl")
        vl_pipeline: PaddleOCRVL pipeline (required if ocr_model="paddleocr_vl")
    Returns:
        Dict with:
        - 'seals': list of seal results
        - 'institutions': list of recognized (cleaned) institution names
        - 'processing_time': time taken in seconds
    """
    start_time = time.time()
    result = {
        'seals': [],
        'institutions': [],
        'processing_time': 0.0
    }

    def finish():
        # Stamp the elapsed time on every exit path.
        result['processing_time'] = time.time() - start_time
        return result

    # Validate input image
    if page_img is None:
        logger.error("Input page_img is None")
        return finish()
    if not isinstance(page_img, np.ndarray):
        logger.error(f"Input page_img is not numpy array, type: {type(page_img)}")
        return finish()
    if page_img.size == 0:
        logger.error("Input page_img is empty")
        return finish()
    logger.info(f"Input image shape: {page_img.shape}, dtype: {page_img.dtype}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Save page image (layout detection reads it back from disk)
    doc_path = os.path.join(output_dir, "doc_page.png")
    try:
        if not imwrite_safe(doc_path, page_img):
            logger.error(f"imwrite_safe returned False for {doc_path}")
            # Try alternative save method using PIL
            try:
                _save_bgr_image_with_pil(doc_path, page_img)
                logger.info(f"Saved using PIL as fallback: {doc_path}")
                if not os.path.exists(doc_path):
                    logger.error(f"PIL save also failed, file not found: {doc_path}")
                    return finish()
            except Exception as pil_e:
                logger.error(f"PIL fallback also failed: {pil_e}")
                return finish()
    except Exception as e:
        logger.error(f"Failed to save page image: {e}")
        return finish()
    # Verify file exists before proceeding
    if not os.path.exists(doc_path):
        logger.error(f"Page image file not found after save: {doc_path}")
        return finish()

    # Run layout detection
    logger.info("Running layout detection...")
    all_regions = run_layout_detection(doc_path)

    # Extract seal boxes and draw all confident regions (seals red, others green)
    seal_boxes = []
    page_viz = page_img.copy()
    for reg in all_regions:
        box = reg.get('box')
        score = reg.get('score', 0.0)
        if box is None or not score > 0.2:
            # Regions without coordinates cannot be drawn or cropped.
            continue
        is_seal = (reg.get('label') == 'seal')
        x1, y1, x2, y2 = [int(v) for v in box]
        color = (0, 0, 255) if is_seal else (0, 255, 0)
        cv2.rectangle(page_viz, (x1, y1), (x2, y2), color, 2)
        if is_seal:
            seal_boxes.append(box)
    imwrite_safe(os.path.join(output_dir, "doc_layout_viz.png"), page_viz)
    if not seal_boxes:
        logger.warning("No seals detected")
        return finish()

    # Process each seal
    logger.info(f"Processing {len(seal_boxes)} detected seals...")
    det_model = SealTextDetection(model_name="PP-OCRv4_server_seal_det")

    # Initialize OCR model based on selection
    use_vl_primary = False
    if ocr_model == "paddleocr_vl":
        if not PADDLEOCRVL_AVAILABLE:
            logger.error("PaddleOCRVL requested but not available. Falling back to PP-OCRv5.")
            ocr_model = "ppocr_v5"
        elif vl_pipeline is None:
            logger.error("PaddleOCRVL requested but vl_pipeline is None. Falling back to PP-OCRv5.")
            ocr_model = "ppocr_v5"
        else:
            logger.info("Using PaddleOCRVL for seal text recognition")
            use_vl_primary = True
    else:
        logger.info("Using PP-OCRv5_server_rec for seal text recognition")
    # The recognition model is not needed when PaddleOCRVL is the primary engine.
    rec_model = None if use_vl_primary else TextRecognition(model_name="PP-OCRv5_server_rec")

    vl_backup_available = vl_pipeline is not None and PADDLEOCRVL_AVAILABLE
    # Below this many text polygons the arc cannot be estimated reliably.
    MIN_POLYGONS_FOR_UNWARP = 3

    for i, box in enumerate(seal_boxes):
        x1, y1, x2, y2 = [int(v) for v in box]
        pad = 40
        y1_p, y2_p = max(0, y1 - pad), min(page_img.shape[0], y2 + pad)
        x1_p, x2_p = max(0, x1 - pad), min(page_img.shape[1], x2 + pad)
        seal_crop = page_img[y1_p:y2_p, x1_p:x2_p]
        # Validate crop
        if seal_crop.size == 0 or seal_crop.shape[0] == 0 or seal_crop.shape[1] == 0:
            logger.warning(f"Invalid seal crop dimensions: {seal_crop.shape}, skipping seal {i}")
            continue

        crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
        if not imwrite_safe(crop_path, seal_crop):
            # Try PIL fallback
            try:
                _save_bgr_image_with_pil(crop_path, seal_crop)
                logger.info(f"Saved seal crop using PIL fallback: {crop_path}")
            except Exception as pil_e:
                logger.error(f"Failed to save seal crop to {crop_path}: {pil_e}, skipping seal {i}")
                continue
        # Verify file exists
        if not os.path.exists(crop_path):
            logger.error(f"Seal crop file not found after save: {crop_path}, skipping seal {i}")
            continue

        # Detect text polygons
        all_polygons = []
        for res in det_model.predict(crop_path, batch_size=1):
            polys = res.get('dt_polys') if isinstance(res, dict) else None
            # Explicit length test: truthiness of a NumPy array is ambiguous.
            if polys is not None and len(polys) > 0:
                all_polygons.extend(polys)

        ch, cw = seal_crop.shape[:2]
        # ============ DUAL STRATEGY: Choose best center detection method ============
        logger.info(f" Seal #{i} Geometry:")
        logger.info(f" - Crop size: {cw}x{ch}")
        logger.info(f" - Text polygons detected: {len(all_polygons)}")
        center, radius, method_used = detect_seal_center_dual_method(seal_crop, all_polygons)
        logger.info(f" - Method used: {method_used}")
        logger.info(f" - Center: ({center[0]}, {center[1]})")
        logger.info(f" - Radius: {radius}")

        # ============ INSUFFICIENT POLYGONS CHECK ============
        # With too few text polygons polar unwarping will likely fail, so go
        # straight to the PaddleOCRVL backup on the raw crop.
        if len(all_polygons) < MIN_POLYGONS_FOR_UNWARP:
            logger.warning(f" Seal #{i}: Only {len(all_polygons)} text polygons detected (< {MIN_POLYGONS_FOR_UNWARP})")
            logger.warning(f" Seal #{i}: Skipping polar unwarping (insufficient polygon data)")
            logger.info(f" Seal #{i}: Using PaddleOCRVL backup instead")
            skip_method = f'{method_used}_skip_unwarp'
            if vl_backup_available:
                ocr_result = run_ocr_recognition_vl(crop_path, vl_pipeline)
                _log_ocr_result(f" Seal #{i} PaddleOCRVL Result (direct crop):", ocr_result,
                                include_length=False)
                logger.info(" - ** Used PaddleOCRVL (insufficient polygons for unwarping) **")
                seal_data = _build_seal_record(
                    i, box, ocr_result, skip_method, True,
                    center, radius, len(all_polygons), (cw, ch),
                    has_marked=False,
                    skip_reason=f'Insufficient polygons ({len(all_polygons)} < {MIN_POLYGONS_FOR_UNWARP})'
                )
                _register_seal_outcome(result, i, seal_data, ocr_result)
            else:
                logger.error(f" Seal #{i}: PaddleOCRVL not available, cannot extract text")
                seal_data = _build_seal_record(
                    i, box, {'text': '', 'score': 0.0, 'success': False}, skip_method, True,
                    center, radius, len(all_polygons), (cw, ch),
                    has_marked=False,
                    skip_reason='Insufficient polygons and no PaddleOCRVL backup'
                )
                result['seals'].append(seal_data)
            continue  # Skip to next seal

        # Calculate arc and unwarp
        start_theta, extent = calculate_precise_arc(all_polygons, center)
        logger.info(f" Seal #{i} Arc Parameters:")
        logger.info(f" - Start theta: {math.degrees(start_theta):.2f}°")
        # Arc length in pixels is extent (radians) * radius.
        logger.info(f" - Extent: {math.degrees(extent):.2f}° ({extent * radius:.1f} pixels width)")

        marked = seal_crop.copy()
        # Draw all text polygons in green
        for p in all_polygons:
            cv2.polylines(marked, [np.array(p, dtype=np.int32)], True, (0, 255, 0), 2)
        # Draw center point (yellow cross)
        center_x, center_y = int(center[0]), int(center[1])
        cv2.drawMarker(marked, (center_x, center_y), (0, 255, 255),
                       markerType=cv2.MARKER_CROSS, markerSize=20, thickness=2)
        cv2.circle(marked, (center_x, center_y), 5, (0, 255, 255), -1)
        # Draw estimated radius circle (cyan)
        cv2.circle(marked, (center_x, center_y), radius, (255, 255, 0), 2)
        # Base image for the polar sampling visualization
        polar_viz = seal_crop.copy()
        cv2.drawMarker(polar_viz, (center_x, center_y), (0, 255, 255),
                       markerType=cv2.MARKER_CROSS, markerSize=20, thickness=2)
        cv2.circle(polar_viz, (center_x, center_y), radius, (255, 255, 0), 2)

        unwarp_path = os.path.join(output_dir, f"seal_unwarp_{i}.png")
        unwarp = None
        used_fallback = False
        if extent > 0:
            logger.info(f" Seal #{i}: Performing polar unwarping with detected text polygons...")
            unwarp = polar_unwarp(seal_crop, center, radius, start_theta, extent)
            if unwarp is not None:
                imwrite_safe(unwarp_path, unwarp)
                logger.info(f" - Unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}")
                _draw_unwarp_overlays(marked, polar_viz, center, radius, start_theta, extent)
                # Save polar visualization
                polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png")
                imwrite_safe(polar_viz_path, polar_viz)
                logger.info(f" - Polar visualization saved: seal_polar_viz_{i}.png")
            else:
                logger.warning(f" Seal #{i}: Polar unwarp returned None")

        marked_path = os.path.join(output_dir, f"seal_marked_{i}.png")
        imwrite_safe(marked_path, marked)

        # OCR recognition with double verification
        ocr_result = {'text': '', 'score': 0.0, 'success': False}
        ocr_method_used = method_used
        if unwarp is not None:
            # Standard path: Recognize unwarp image
            logger.info(f" Seal #{i}: Running OCR (Standard, model={ocr_model}) on unwarp image...")
            if ocr_model == "paddleocr_vl":
                ocr_result = run_ocr_recognition_vl(unwarp_path, vl_pipeline)
            else:
                ocr_result = run_ocr_recognition(unwarp_path, rec_model)
            ocr_method_used = f"{method_used}_unwarp"
            _log_ocr_result(f" Seal #{i} OCR Result (unwarp):", ocr_result)
            # ============ DOUBLE VERIFICATION ============
            # If unwarp OCR produced no text, try PaddleOCRVL on the raw crop.
            unwarp_failed = not ocr_result['success'] or len(ocr_result['text'].strip()) == 0
            if unwarp_failed and vl_backup_available:
                logger.warning(f" Seal #{i}: Unwarp OCR failed (empty result), trying PaddleOCRVL backup on crop image")
                backup_result = run_ocr_recognition_vl(crop_path, vl_pipeline)
                _log_ocr_result(f" Seal #{i} PaddleOCRVL Backup Result (crop):", backup_result)
                # Use backup result if it's better (non-empty text)
                if backup_result['success'] and len(backup_result['text'].strip()) > 0:
                    logger.info(f" Seal #{i}: ** Using PaddleOCRVL backup result (unwarp failed) **")
                    ocr_result = backup_result
                    ocr_method_used = f"{method_used}_crop_backup"
                else:
                    logger.warning(f" Seal #{i}: ** Both unwarp and crop OCR failed **")
        else:
            # ============ BACKUP: Use PaddleOCRVL directly on seal crop ============
            logger.warning(f" Seal #{i}: No unwarp image available (polar unwarp failed)")
            if vl_backup_available:
                logger.info(f" Seal #{i}: Using PaddleOCRVL backup - directly recognize seal crop image")
                ocr_result = run_ocr_recognition_vl(crop_path, vl_pipeline)
                ocr_method_used = f"{method_used}_crop_backup"
                _log_ocr_result(f" Seal #{i} PaddleOCRVL Backup Result:", ocr_result)
                logger.info(" - ** Used PaddleOCRVL backup (direct crop recognition) **")
            else:
                logger.warning(f" Seal #{i}: No backup available (vl_pipeline=None or PaddleOCRVL not installed), skipping OCR")

        seal_data = _build_seal_record(
            i, box, ocr_result, ocr_method_used, used_fallback,
            center, radius, len(all_polygons), (cw, ch),
            start_theta=start_theta, extent=extent, unwarp=unwarp
        )
        _register_seal_outcome(result, i, seal_data, ocr_result)

    return finish()
# ============ Text Cleaning Functions ============
def clean_institution_name(text: str) -> str:
    """
    Clean an extracted institution name by removing seal boilerplate.

    Every occurrence of the following fragments is removed (anywhere in the
    string, not only at the end), then surrounding whitespace is stripped:
    - 检验检测专用章 / 检验检测专用 / 检测专用章 / 检验专用章 / 专用章
    - 检验检测 enclosed in half-width or full-width parentheses, 【】 or []

    Args:
        text: Raw extracted institution name
    Returns:
        Cleaned institution name; falsy input (None or "") is returned unchanged
    """
    if not text:
        return text
    # Define patterns to remove (order matters: most specific first)
    patterns_to_remove = [
        '检验检测专用章',
        '检验检测专用',
        '检测专用章',
        '检验专用章',
        '专用章',
        '(检验检测)',
        # The list previously contained the half-width form twice; the second
        # entry was presumably the full-width variant - TODO confirm.
        '（检验检测）',
        '【检验检测】',
        '[检验检测]',
    ]
    cleaned = text
    for pattern in patterns_to_remove:
        if pattern in cleaned:
            cleaned = cleaned.replace(pattern, '')
            logger.debug(f"Removed pattern '{pattern}' from institution name")
    # Strip whitespace
    cleaned = cleaned.strip()
    # Log if cleaning occurred
    if cleaned != text:
        logger.info(f"Cleaned institution name: '{text}' -> '{cleaned}'")
    return cleaned
# ============ Similarity and Matching Functions ============
def calculate_similarity(str1: str, str2: str) -> float:
    """
    Return the similarity of two strings as a percentage in [0, 100].

    The score is ``(1 - edit_distance / len(longer string)) * 100`` rounded
    to two decimals, where the edit distance is the Levenshtein distance.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        0.0 when either argument is empty or ``None`` (including when both
        are empty), 100.0 for identical non-empty strings, otherwise the
        rounded percentage.
    """
    # A missing value never counts as similar to anything. Because of this
    # guard both strings are non-empty below, so the longer length is >= 1
    # and the former "max_len == 0" branch could never run.
    if not str1 or not str2:
        return 0.0

    # Identical strings have distance 0; skip the distance computation.
    if str1 == str2:
        return 100.0

    longest = max(len(str1), len(str2))
    edit_dist = levenshtein_distance(str1, str2)
    return round((1 - edit_dist / longest) * 100, 2)
def classify_match(extracted: Optional[str], expected: str) -> Dict[str, Any]:
    """
    Grade an extracted value against its ground-truth value.

    Args:
        extracted: Value produced by the pipeline, or ``None`` when nothing
            was extracted.
        expected: Ground-truth value.

    Returns:
        A dict with three keys:
        ``match_type`` - ``'exact'`` when the similarity is 100.0,
        ``'partial'`` when it is at least ``SIMILARITY_THRESHOLD``,
        otherwise ``'no_match'``;
        ``similarity`` - percentage from ``calculate_similarity``;
        ``edit_distance`` - Levenshtein distance, or ``len(expected)`` when
        nothing was extracted.
    """
    if extracted is not None:
        score = calculate_similarity(extracted, expected)
        distance = levenshtein_distance(extracted, expected)
        if score == 100.0:
            label = 'exact'
        else:
            label = 'partial' if score >= SIMILARITY_THRESHOLD else 'no_match'
    else:
        # Nothing extracted: every expected character would have to be
        # inserted, hence the distance equals the expected length.
        score = 0.0
        distance = len(expected)
        label = 'no_match'

    return dict(match_type=label, similarity=score, edit_distance=distance)
# ============ PDF Processing Functions ============
def extract_pdf_page(pdf_path: str, page_num: int = 0) -> Optional[np.ndarray]:
    """
    Render one PDF page to a 3-channel BGR image for OpenCV.

    The page is rasterised with a 2x scale matrix and the pixel buffer is
    converted from the pixmap's channel layout to BGR.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to render.

    Returns:
        The rendered page as a numpy array, or ``None`` if anything went
        wrong (the error is logged, not raised).
    """
    try:
        # The context manager closes the document even when rendering raises.
        # A batch run opens many PDFs in one process, so leaving handles open
        # accumulates open files.
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            channels = pix.n
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, channels)

        # Convert to BGR format for OpenCV
        if channels == 4:  # RGBA
            img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
        elif channels == 3:  # RGB
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        elif channels == 1:  # Grayscale
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        else:
            logger.warning(f"Unexpected number of channels: {channels}")
            if channels >= 3:
                # Assume the first three channels are RGB and drop the rest.
                img = cv2.cvtColor(np.ascontiguousarray(img[:, :, :3]), cv2.COLOR_RGB2BGR)
            elif channels == 2:
                # Presumably gray + alpha; keep the gray plane so callers
                # still receive a 3-channel image.
                img = cv2.cvtColor(np.ascontiguousarray(img[:, :, 0]), cv2.COLOR_GRAY2BGR)

        return img
    except Exception as e:
        # Best-effort by design: the caller treats None as "extraction failed".
        logger.error(f"Failed to extract page from {pdf_path}: {e}")
        return None
def _select_best_institution(institutions: List[str],
                             expected_inst: Optional[str]) -> Tuple[Optional[str], float]:
    """Pick the seal-derived institution name that best matches the ground truth.

    With a ground-truth name, the candidate with the strictly highest
    similarity wins (ties keep the earlier candidate); if every similarity
    is 0 the first candidate is used. Without a ground-truth name the first
    non-empty candidate is used.

    Returns:
        ``(best_name, best_similarity)``; the similarity stays 0.0 when no
        ground truth was available for comparison.
    """
    best_inst = None
    best_similarity = 0.0

    for idx, inst in enumerate(institutions):
        if expected_inst:
            sim = calculate_similarity(inst, expected_inst)
            logger.info(f" - Inst #{idx+1}: '{inst[:50]}...' → Similarity: {sim:.1f}%")
            if sim > best_similarity:
                # Log before updating so the message shows the score that is
                # being beaten rather than comparing the new score to itself.
                logger.info(f" → New best match! ({sim:.1f}% > {best_similarity:.1f}%)")
                best_similarity = sim
                best_inst = inst
        elif not best_inst:
            best_inst = inst
            logger.info(f" - Inst #{idx+1}: '{inst[:50]}...' (no expected value for comparison)")

    # Fallback: if best_inst is still None (all similarities were 0), use first institution
    if best_inst is None and institutions:
        best_inst = institutions[0]
        logger.warning(f" - All similarities were 0%, using first institution: '{best_inst[:50]}...'")

    return best_inst, best_similarity


def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str,
                       pdf_dir: Path, output_dir: Path, ocr_engine,
                       ocr_model="ppocr_v5", vl_pipeline=None) -> Dict[str, Any]:
    """
    Process a single PDF for CMA and institution extraction.

    Renders the first page, runs CMA extraction and seal/institution
    extraction on it, and compares both against the ground truth.

    Args:
        pdf_name: Name of PDF file
        expected_cma: Expected CMA code from ground truth (empty or None
            means the ground truth has no CMA code)
        expected_inst: Expected institution name from ground truth (empty or
            None means there is nothing to compare against)
        pdf_dir: Directory containing PDFs
        output_dir: Output directory for results; a sub-directory named
            after the PDF is wiped and recreated
        ocr_engine: PaddleOCR instance, passed to extract_cma_code_fullpage
        ocr_model: OCR model to use ("ppocr_v5" or "paddleocr_vl")
        vl_pipeline: PaddleOCRVL pipeline (required if ocr_model="paddleocr_vl")

    Returns:
        Result dictionary with extraction and comparison data. ``status`` is
        'success', 'file_not_found' or 'extraction_failed'. The institution
        comparison stays empty when no institution could be extracted.
    """
    pdf_path = pdf_dir / pdf_name
    pdf_output_dir = output_dir / pdf_name

    result = {
        'pdf_name': pdf_name,
        'expected': {
            'cma': expected_cma,
            'institution': expected_inst
        },
        'extracted': {
            'cma': None,
            'institution': None,
            'cma_confidence': 0.0,
            'cma_success': False,
            'institutions_from_seals': []
        },
        'comparison': {
            'cma': {},
            'institution': {}
        },
        'performance': {
            'total_time': 0.0,
            'cma_time': 0.0,
            'seal_time': 0.0
        },
        'seal_results': [],
        'status': 'success',
        'error': None,
        'file_size': 0
    }

    # Check file exists
    if not pdf_path.exists():
        result['status'] = 'file_not_found'
        result['error'] = f"PDF file not found: {pdf_path}"
        logger.warning(result['error'])
        return result

    result['file_size'] = pdf_path.stat().st_size

    # Clean output directory to ensure fresh processing
    if pdf_output_dir.exists():
        import shutil
        try:
            shutil.rmtree(pdf_output_dir)
            logger.info(f"Cleaned existing output directory: {pdf_output_dir}")
        except OSError as e:
            # Stale artefacts are tolerable; keep going with the old directory.
            logger.warning(f"Failed to clean output directory: {e}")

    # Create fresh output directory
    pdf_output_dir.mkdir(parents=True, exist_ok=True)

    total_start = time.time()

    # Extract page
    logger.info(f"Extracting page 1 from {pdf_name}...")
    page_img = extract_pdf_page(str(pdf_path), page_num=0)
    if page_img is None:
        result['status'] = 'extraction_failed'
        result['error'] = "Failed to extract page from PDF"
        result['performance']['total_time'] = time.time() - total_start
        return result

    # Extract CMA code
    logger.info(f"Running CMA extraction on {pdf_name}...")
    cma_start = time.time()
    cma_result = extract_cma_code_fullpage(page_img, ocr_engine, output_dir=str(pdf_output_dir))
    result['performance']['cma_time'] = time.time() - cma_start
    result['extracted']['cma'] = cma_result['code']
    result['extracted']['cma_confidence'] = cma_result['confidence']
    result['extracted']['cma_success'] = cma_result['success']

    # Compare CMA. A None ground truth (JSON null) is treated like an empty
    # one; passing it on to classify_match would raise a TypeError.
    if not expected_cma:
        result['comparison']['cma']['notes'] = "Ground truth marked as 'None'"
    else:
        result['comparison']['cma'] = classify_match(cma_result['code'], expected_cma)

    # Extract seals and institutions
    logger.info(f"Running seal extraction on {pdf_name}...")
    seal_start = time.time()
    seal_result = extract_seals_and_institutions(page_img, str(pdf_output_dir),
                                                 ocr_model=ocr_model, vl_pipeline=vl_pipeline)
    result['performance']['seal_time'] = time.time() - seal_start
    result['seal_results'] = seal_result['seals']
    result['extracted']['institutions_from_seals'] = seal_result['institutions']

    # Select best institution match
    institutions = seal_result['institutions']
    if institutions:
        logger.info(f" Institution Extraction:")
        logger.info(f" - Expected: {expected_inst if expected_inst else 'N/A'}")
        logger.info(f" - Found {len(institutions)} institution(s) from seals")

        best_inst, best_similarity = _select_best_institution(institutions, expected_inst)
        logger.info(f" - Selected: '{best_inst[:50]}...' (similarity: {best_similarity:.1f}%)")
        result['extracted']['institution'] = best_inst

        # Compare institution
        if expected_inst:
            result['comparison']['institution'] = classify_match(best_inst, expected_inst)
        else:
            result['comparison']['institution']['notes'] = "No expected institution"

    result['performance']['total_time'] = time.time() - total_start
    return result
def generate_individual_report(result: Dict[str, Any], output_dir: Path):
    """
    Write the HTML report (``index.html``) for a single processed PDF.

    Args:
        result: Result dictionary as produced by ``process_single_pdf``
            (reads the 'pdf_name', 'expected', 'extracted', 'comparison',
            'performance' and 'seal_results' entries).
        output_dir: Directory that receives ``index.html``; created if it
            does not exist.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def esc(value: Any) -> str:
        # File names, ground truth and OCR output end up in the page; escape
        # them so stray '<', '&' or quotes cannot break or inject markup.
        return escape(str(value), quote=True)

    cma_cmp = result['comparison'].get('cma', {})
    inst_cmp = result['comparison'].get('institution', {})
    cma_match = cma_cmp.get('match_type', 'no_match')
    cma_sim = cma_cmp.get('similarity', 0)
    inst_match = inst_cmp.get('match_type', 'no_match')
    inst_sim = inst_cmp.get('similarity', 0)

    performance = result['performance']
    total_time = performance['total_time']

    # Colors: green for exact, orange for partial, red for everything else.
    palette = {'exact': '#4caf50', 'partial': '#ff9800'}
    cma_color = palette.get(cma_match, '#f44336')
    inst_color = palette.get(inst_match, '#f44336')

    pdf_name = esc(result['pdf_name'])
    expected_cma = esc(result['expected']['cma'])
    expected_inst = esc(result['expected']['institution'])
    extracted_cma = esc(result['extracted']['cma'] or 'N/A')
    extracted_inst = esc(result['extracted']['institution'] or 'N/A')

    # Build seals HTML: one card per detected seal.
    seal_sections = []
    for seal in result['seal_results']:
        status = "[OK]" if seal['success'] else "[FAIL]"
        text = esc(seal['text'] if seal['text'] else "No text recognized")
        unwarp_path = seal.get('unwarp_path')
        if unwarp_path:
            unwarp_html = f'<img src="{esc(unwarp_path)}" style="max-height: 200px; border: 1px solid #ddd;">'
        else:
            unwarp_html = 'N/A'
        seal_sections.append(f"""
<div style="background: white; padding: 15px; margin-bottom: 20px; border-radius: 6px; border-left: 4px solid #2196F3;">
<h3>Seal #{esc(seal['index'])}</h3>
<p><strong>Recognized Text:</strong> {text}</p>
<p><strong>Confidence:</strong> {seal['confidence']:.2%}</p>
<p><strong>Status:</strong> {status}</p>
<div style="display: flex; gap: 10px; margin-top: 10px;">
<div>
<p style="margin: 0;">Marked:</p>
<img src="{esc(seal['marked_path'])}" style="max-height: 200px; border: 1px solid #ddd;">
</div>
<div>
<p style="margin: 0;">Unwarped:</p>
{unwarp_html}
</div>
</div>
</div>""")

    seals_html = ""
    if seal_sections:
        seals_html = "<h2>Detected Seals and Institution Names</h2>" + "".join(seal_sections)

    page = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>Extraction Report - {pdf_name}</title>
<style>
body {{ font-family: 'Segoe UI', sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }}
.container {{ max-width: 1200px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; }}
h1 {{ color: #333; border-bottom: 3px solid #4caf50; padding-bottom: 10px; }}
.info-grid {{ display: grid; grid-template-columns: repeat(2, 1fr); gap: 20px; margin: 20px 0; }}
.info-box {{ background: #f9f9f9; padding: 15px; border-radius: 6px; }}
.info-box label {{ display: block; font-weight: bold; color: #666; margin-bottom: 5px; }}
.info-box .value {{ font-size: 18px; }}
.cma-box {{ border-left: 4px solid {cma_color}; }}
.inst-box {{ border-left: 4px solid {inst_color}; }}
.similarity {{ text-align: center; margin: 20px 0; }}
.similarity .score {{ font-size: 48px; font-weight: bold; }}
</style>
</head>
<body>
<div class="container">
<h1>CMA & Institution Extraction Report</h1>
<p><strong>PDF:</strong> {pdf_name}</p>
<p><strong>Processing Time:</strong> {total_time:.2f}s</p>
<h2>CMA Code Extraction</h2>
<div class="info-grid">
<div class="info-box cma-box">
<label>Expected CMA</label>
<div class="value">{expected_cma}</div>
</div>
<div class="info-box cma-box">
<label>Extracted CMA</label>
<div class="value">{extracted_cma}</div>
</div>
<div class="info-box">
<label>Match Type</label>
<div class="value" style="color: {cma_color};">{esc(cma_match.upper())}</div>
</div>
<div class="info-box">
<label>Similarity</label>
<div class="value">{cma_sim:.1f}%</div>
</div>
</div>
<h2>Institution Name Extraction</h2>
<div class="info-grid">
<div class="info-box inst-box">
<label>Expected Institution</label>
<div class="value">{expected_inst}</div>
</div>
<div class="info-box inst-box">
<label>Extracted Institution</label>
<div class="value">{extracted_inst}</div>
</div>
<div class="info-box">
<label>Match Type</label>
<div class="value" style="color: {inst_color};">{esc(inst_match.upper())}</div>
</div>
<div class="info-box">
<label>Similarity</label>
<div class="value">{inst_sim:.1f}%</div>
</div>
</div>
<h2>Performance</h2>
<div class="info-grid">
<div class="info-box">
<label>Total Time</label>
<div class="value">{total_time:.2f}s</div>
</div>
<div class="info-box">
<label>CMA Extraction Time</label>
<div class="value">{performance['cma_time']:.2f}s</div>
</div>
<div class="info-box">
<label>Seal Extraction Time</label>
<div class="value">{performance['seal_time']:.2f}s</div>
</div>
<div class="info-box">
<label>Seals Detected</label>
<div class="value">{len(result['seal_results'])}</div>
</div>
</div>
{seals_html}
<h2>Visualizations</h2>
<div style="background: white; padding: 15px; border-radius: 6px;">
<p style="margin: 0 0 10px 0;">CMA Detection:</p>
<img src="cma_detection_fullpage.png" style="max-width: 100%; border: 1px solid #ddd;">
</div>
<div style="background: white; padding: 15px; border-radius: 6px; margin-top: 10px;">
<p style="margin: 0 0 10px 0;">Layout Detection:</p>
<img src="doc_layout_viz.png" style="max-width: 100%; border: 1px solid #ddd;">
</div>
</div>
</body>
</html>"""

    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir / 'index.html', 'w', encoding='utf-8') as f:
        f.write(page)
def generate_summary_report(all_results: List[Dict[str, Any]], output_dir: Path):
    """
    Write the batch summary page (``summary.html``) into ``output_dir``.

    A PDF counts towards the CMA (or institution) statistics when its ground
    truth for that field is neither empty nor ``None``. Anything that is not
    an exact or partial match - including PDFs where nothing was extracted -
    counts as "no match". Accuracy is the share of exact matches.

    Args:
        all_results: Result dictionaries as produced by ``process_single_pdf``.
            May be empty.
        output_dir: Directory that receives ``summary.html``; created if it
            does not exist.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def esc(value: Any) -> str:
        # File names, ground truth and OCR output must not inject markup.
        return escape(str(value), quote=True)

    def shorten(value: Any, limit: int = 30) -> str:
        # Ground truth may be None/empty and extraction may have failed, so
        # fall back to a placeholder; only add the ellipsis when text was cut.
        text = str(value) if value else 'N/A'
        return text[:limit] + '...' if len(text) > limit else text

    def match_type(r: Dict[str, Any], field: str) -> Optional[str]:
        return r['comparison'].get(field, {}).get('match_type')

    # Calculate statistics
    valid_cma = [r for r in all_results if r['expected']['cma'] not in ['', None]]
    valid_inst = [r for r in all_results if r['expected']['institution'] not in ['', None]]

    cma_exact = sum(1 for r in valid_cma if match_type(r, 'cma') == 'exact')
    cma_partial = sum(1 for r in valid_cma if match_type(r, 'cma') == 'partial')
    cma_no = len(valid_cma) - cma_exact - cma_partial

    inst_exact = sum(1 for r in valid_inst if match_type(r, 'institution') == 'exact')
    inst_partial = sum(1 for r in valid_inst if match_type(r, 'institution') == 'partial')
    inst_no = len(valid_inst) - inst_exact - inst_partial

    cma_acc = (cma_exact / len(valid_cma) * 100) if valid_cma else 0
    inst_acc = (inst_exact / len(valid_inst) * 100) if valid_inst else 0

    # np.mean of an empty list is NaN (with a RuntimeWarning); report 0 instead.
    times = [r['performance']['total_time'] for r in all_results]
    avg_time = float(np.mean(times)) if times else 0.0

    header = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>Batch Test Summary - CMA & Institution Extraction</title>
<style>
body {{ font-family: 'Segoe UI', sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }}
.container {{ max-width: 1400px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; }}
h1 {{ color: #333; }}
.summary {{ display: grid; grid-template-columns: repeat(4, 1fr); gap: 20px; margin: 20px 0; }}
.summary-card {{ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 8px; color: white; text-align: center; }}
.summary-card .label {{ font-size: 14px; opacity: 0.9; }}
.summary-card .value {{ font-size: 32px; font-weight: bold; }}
table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
th, td {{ padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }}
th {{ background: #f5f5f5; }}
</style>
</head>
<body>
<div class="container">
<h1>CMA & Institution Extraction - Batch Test Summary</h1>
<p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<h2>CMA Code Results</h2>
<div class="summary">
<div class="summary-card" style="background: linear-gradient(135deg, #4caf50 0%, #45a049 100%);">
<div class="label">Exact Match</div>
<div class="value">{cma_exact}/{len(valid_cma)}</div>
</div>
<div class="summary-card" style="background: linear-gradient(135deg, #ff9800 0%, #f57c00 100%);">
<div class="label">Partial Match</div>
<div class="value">{cma_partial}/{len(valid_cma)}</div>
</div>
<div class="summary-card" style="background: linear-gradient(135deg, #f44336 0%, #d32f2f 100%);">
<div class="label">No Match</div>
<div class="value">{cma_no}/{len(valid_cma)}</div>
</div>
<div class="summary-card" style="background: linear-gradient(135deg, #2196F3 0%, #1976D2 100%);">
<div class="label">Accuracy</div>
<div class="value">{cma_acc:.1f}%</div>
</div>
</div>
<h2>Institution Name Results</h2>
<div class="summary">
<div class="summary-card" style="background: linear-gradient(135deg, #4caf50 0%, #45a049 100%);">
<div class="label">Exact Match</div>
<div class="value">{inst_exact}/{len(valid_inst)}</div>
</div>
<div class="summary-card" style="background: linear-gradient(135deg, #ff9800 0%, #f57c00 100%);">
<div class="label">Partial Match</div>
<div class="value">{inst_partial}/{len(valid_inst)}</div>
</div>
<div class="summary-card" style="background: linear-gradient(135deg, #f44336 0%, #d32f2f 100%);">
<div class="label">No Match</div>
<div class="value">{inst_no}/{len(valid_inst)}</div>
</div>
<div class="summary-card" style="background: linear-gradient(135deg, #2196F3 0%, #1976D2 100%);">
<div class="label">Accuracy</div>
<div class="value">{inst_acc:.1f}%</div>
</div>
</div>
<h2>Performance</h2>
<p>Average processing time: {avg_time:.1f}s per PDF</p>
<h2>Complete Results</h2>
<table>
<thead>
<tr>
<th>PDF</th>
<th>Expected CMA</th>
<th>Extracted CMA</th>
<th>CMA Match</th>
<th>Expected Inst</th>
<th>Extracted Inst</th>
<th>Inst Match</th>
<th>Seals</th>
<th>Time</th>
</tr>
</thead>
<tbody>"""

    symbols = {'exact': '[OK]', 'partial': '[PARTIAL]', 'no_match': '[FAIL]'}
    rows = []
    for r in all_results:
        # A missing comparison (nothing extracted / file missing) is shown as a failure.
        cma_symbol = symbols.get(match_type(r, 'cma') or 'no_match', '[?]')
        inst_symbol = symbols.get(match_type(r, 'institution') or 'no_match', '[?]')
        rows.append(f"""
<tr>
<td>{esc(r['pdf_name'])}</td>
<td>{esc(r['expected']['cma'])}</td>
<td>{esc(r['extracted']['cma'] or 'N/A')}</td>
<td>{cma_symbol}</td>
<td>{esc(shorten(r['expected']['institution']))}</td>
<td>{esc(shorten(r['extracted']['institution']))}</td>
<td>{inst_symbol}</td>
<td>{len(r['seal_results'])}</td>
<td>{r['performance']['total_time']:.1f}s</td>
</tr>""")

    footer = """
</tbody>
</table>
</div>
</body>
</html>"""

    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir / 'summary.html', 'w', encoding='utf-8') as f:
        f.write(header + "".join(rows) + footer)
def _match_stats(results: List[Dict[str, Any]], field: str) -> Dict[str, Any]:
    """Count exact/partial/no-match outcomes for ``field`` ('cma' or 'institution').

    Every result whose ground truth for ``field`` is neither empty nor None
    is scored; results without a recorded match type (nothing extracted,
    file missing) count as no-match. This is the same rule
    generate_summary_report applies, so the console output, the JSON report
    and summary.html show the same figures.
    """
    scored = [r for r in results if r['expected'][field] not in ('', None)]
    kinds = [r['comparison'].get(field, {}).get('match_type') for r in scored]
    total = len(scored)
    exact = kinds.count('exact')
    partial = kinds.count('partial')
    return {
        'total': total,
        'exact': exact,
        'partial': partial,
        'no_match': total - exact - partial,
        'accuracy': _percent(exact, total),
    }


def _percent(count: int, total: int) -> float:
    """Return ``count / total`` as a percentage, or 0.0 when ``total`` is zero."""
    return (count / total * 100) if total else 0.0


def main():
    """
    Run the batch accuracy test from the command line.

    Loads the ground truth from RESULTS_JSON, selects the PDFs to process
    (either the names given via --pdf-names or the first --batch-size
    entries), runs process_single_pdf on each, and writes per-PDF HTML
    reports, summary.html and test_report.json below OUTPUT_DIR. Progress
    and the final statistics are printed to the console.
    """
    # Parse command line arguments
    import argparse
    parser = argparse.ArgumentParser(description='CMA & Institution Extraction - Batch Accuracy Test')
    parser.add_argument('--ocr-model', type=str, default=OCR_MODEL,
                        choices=['ppocr_v5', 'paddleocr_vl'],
                        help='OCR model to use (default: from OCR_MODEL env var or ppocr_v5)')
    parser.add_argument('--batch-size', type=int, default=BATCH_SIZE,
                        help=f'Number of PDFs to process (default: {BATCH_SIZE})')
    parser.add_argument('--pdf-names', type=str, default=None,
                        help='Comma-separated list of PDF names to process (e.g., "1.pdf,2.pdf"). Overrides --batch-size')
    args = parser.parse_args()

    # Use command line argument if provided
    ocr_model = args.ocr_model
    batch_size = args.batch_size
    pdf_names_filter = args.pdf_names

    print("=" * 80)
    print("CMA & INSTITUTION EXTRACTION - BATCH ACCURACY TEST")
    print("=" * 80)
    print(f"OCR Model: {ocr_model.upper()}")
    if not pdf_names_filter:
        # --pdf-names overrides --batch-size, so only announce the batch
        # size when it is actually used.
        print(f"Processing first {batch_size} PDFs from results.json...")
    print(f"PDF directory: {PDF_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")
    print()

    # Load ground truth
    if not RESULTS_JSON.exists():
        logger.error(f"Ground truth file not found: {RESULTS_JSON}")
        return
    with open(RESULTS_JSON, 'r', encoding='utf-8') as f:
        ground_truth = json.load(f)

    # Filter PDFs: either by name filter or by batch size
    if pdf_names_filter:
        # Split comma-separated names and strip whitespace
        requested_names = [name.strip() for name in pdf_names_filter.split(',') if name.strip()]
        pdf_list = [(name, ground_truth[name]) for name in requested_names if name in ground_truth]
        if not pdf_list:
            logger.error(f"None of the specified PDFs found in results.json: {requested_names}")
            print(f"ERROR: None of the specified PDFs found in results.json: {requested_names}")
            return
        # Names without ground truth are dropped; say so instead of silently
        # processing fewer files than requested.
        unknown_names = [name for name in requested_names if name not in ground_truth]
        if unknown_names:
            logger.warning(f"Requested PDFs not found in results.json (skipped): {unknown_names}")
            print(f"WARNING: Requested PDFs not found in results.json (skipped): {unknown_names}")
        print(f"Processing {len(pdf_list)} specified PDF(s): {[name for name, _ in pdf_list]}")
    else:
        # Get first N PDFs
        pdf_list = list(ground_truth.items())[:batch_size]

    if not pdf_list:
        # Nothing to do; avoid loading the OCR models for an empty batch.
        logger.error("No PDFs selected for processing")
        print("ERROR: No PDFs selected for processing")
        return

    # Initialize OCR engines
    # Note: We ALWAYS initialize ocr_engine for CMA recognition
    # We ALWAYS try to initialize vl_pipeline for backup seal recognition (when unwarp fails)
    vl_pipeline = None

    logger.info("Initializing PaddleOCR engine for CMA recognition...")
    print("Initializing PaddleOCR engine (required for CMA extraction)...")
    ocr_engine = PaddleOCR(use_angle_cls=True, lang='ch')
    logger.info("PaddleOCR initialized successfully")
    print("PaddleOCR initialized successfully\n")

    # Initialize PaddleOCRVL for backup seal recognition (always try if available)
    # This provides a fallback when polar unwarping fails
    if PADDLEOCRVL_AVAILABLE:
        logger.info("Initializing PaddleOCRVL for backup seal recognition...")
        print("Initializing PaddleOCRVL for backup seal recognition (this may take a while)...")
        try:
            vl_pipeline = PaddleOCRVL(
                use_seal_recognition=True,
                use_ocr_for_image_block=True,
                use_layout_detection=True
            )
            logger.info("PaddleOCRVL initialized successfully (backup ready)")
            print("PaddleOCRVL backup ready - will be used when polar unwarping fails\n")
        except Exception as e:
            # The backup engine is optional: report the failure and continue
            # without it.
            vl_pipeline = None
            logger.error(f"Failed to initialize PaddleOCRVL: {e}")
            logger.error(f"Exception type: {type(e).__name__}")
            print(f"WARNING: Failed to initialize PaddleOCRVL: {e}")
            print("Polar unwarping failures will skip OCR (no backup available)\n")
    else:
        logger.info("PaddleOCRVL not available - polar unwarping failures will skip OCR")
        print("Note: PaddleOCRVL not installed - polar unwarping failures will skip OCR")
        print(" To enable backup: pip install paddleocr[doc-parser]\n")

    # Validate OCR model selection
    if ocr_model == "paddleocr_vl" and vl_pipeline is None:
        print("WARNING: PaddleOCRVL requested for primary seal recognition but not available!")
        print("Falling back to PP-OCRv5 for seal recognition")
        print("Please install: pip install paddleocr[doc-parser]")
        ocr_model = "ppocr_v5"

    # Create output directory (including missing parents)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Process each PDF
    symbols = {'exact': '[OK]', 'partial': '[PARTIAL]', 'no_match': '[FAIL]'}
    all_results = []
    start_time = time.time()
    total_pdfs = len(pdf_list)

    for i, (pdf_name, expected_data) in enumerate(pdf_list, 1):
        expected_cma = expected_data.get('CMA', '')
        expected_inst = expected_data.get('机构名', '')

        print(f"\n[{i}/{total_pdfs}] Processing: {pdf_name}")
        print(" + Loading PDF and extracting page...")
        result = process_single_pdf(
            pdf_name, expected_cma, expected_inst,
            PDF_DIR, OUTPUT_DIR, ocr_engine,
            ocr_model=ocr_model, vl_pipeline=vl_pipeline
        )
        all_results.append(result)

        # Print result summary
        if result['status'] == 'file_not_found':
            print(f" + [!] File not found, skipping")
        else:
            cma_match = result['comparison']['cma'].get('match_type', 'unknown')
            cma_sim = result['comparison']['cma'].get('similarity', 0)
            cma_symbol = symbols.get(cma_match, '[?]')
            print(f" + CMA Extraction:")
            print(f" + Extracted: {result['extracted']['cma'] or 'N/A'}")
            print(f" + Expected: {expected_cma}")
            print(f" + Match: {cma_symbol} {cma_match.upper()} ({cma_sim:.1f}%)")

            extracted_inst = result['extracted']['institution']
            if extracted_inst:
                inst_match = result['comparison']['institution'].get('match_type', 'unknown')
                inst_sim = result['comparison']['institution'].get('similarity', 0)
                inst_symbol = symbols.get(inst_match, '[?]')
                print(f" + Institution Extraction:")
                print(f" + Extracted: {extracted_inst[:50]}...")
                # The ground truth may be None (JSON null); slicing it would raise.
                print(f" + Expected: {(expected_inst or '')[:50]}...")
                print(f" + Match: {inst_symbol} {inst_match.upper()} ({inst_sim:.1f}%)")

            print(f" + Seals detected: {len(result['seal_results'])}")
            print(f" + Completed in {result['performance']['total_time']:.2f}s")

            # Generate individual report
            generate_individual_report(result, OUTPUT_DIR / pdf_name)

        # Interim results every 5
        if i % 5 == 0:
            interim_cma = _match_stats(all_results, 'cma')
            interim_inst = _match_stats(all_results, 'institution')
            print()
            print("=" * 80)
            # Report against the number of PDFs actually queued, which can
            # differ from BATCH_SIZE (--batch-size / --pdf-names).
            print(f"INTERIM RESULTS ({i}/{total_pdfs} completed)")
            print("=" * 80)
            print(f"CMA Accuracy: {interim_cma['accuracy']:.1f}% ({interim_cma['exact']}/{interim_cma['total']} exact)")
            print(f"Institution Accuracy: {interim_inst['accuracy']:.1f}% ({interim_inst['exact']}/{interim_inst['total']} exact)")
            print("=" * 80)
            print()

    total_time = time.time() - start_time

    # Calculate final statistics
    cma = _match_stats(all_results, 'cma')
    inst = _match_stats(all_results, 'institution')
    times = [r['performance']['total_time'] for r in all_results]
    # np.mean of an empty list is NaN; report 0.0 instead.
    avg_processing_time = float(np.mean(times)) if times else 0.0

    # Generate summary report
    print("\nGenerating summary report...")
    generate_summary_report(all_results, OUTPUT_DIR)

    # Save JSON
    json_output = {
        'summary': {
            'total_processed': len(all_results),
            'cma': {
                'exact': cma['exact'],
                'partial': cma['partial'],
                'no_match': cma['no_match'],
                'accuracy': cma['accuracy'] / 100
            },
            'institution': {
                'exact': inst['exact'],
                'partial': inst['partial'],
                'no_match': inst['no_match'],
                'accuracy': inst['accuracy'] / 100
            },
            'avg_processing_time': avg_processing_time
        },
        'results': all_results
    }
    with open(OUTPUT_DIR / 'test_report.json', 'w', encoding='utf-8') as f:
        json.dump(json_output, f, ensure_ascii=False, indent=2, cls=NumpyEncoder)

    # Print final summary. Percentages go through _percent so that a batch
    # without any scorable CMA/institution ground truth prints 0.0% instead
    # of raising ZeroDivisionError.
    print("\n" + "=" * 80)
    print("BATCH TEST COMPLETED - FINAL RESULTS")
    print("=" * 80)
    print(f"Total Processed: {len(all_results)}")
    print()
    print("CMA Code Results:")
    print(f" Exact Match: {cma['exact']}/{cma['total']} ({_percent(cma['exact'], cma['total']):.1f}%)")
    print(f" Partial Match: {cma['partial']}/{cma['total']} ({_percent(cma['partial'], cma['total']):.1f}%)")
    print(f" No Match: {cma['no_match']}/{cma['total']} ({_percent(cma['no_match'], cma['total']):.1f}%)")
    print(f" ** CMA Accuracy: {cma['accuracy']:.1f}% **")
    print()
    print("Institution Name Results:")
    print(f" Exact Match: {inst['exact']}/{inst['total']} ({_percent(inst['exact'], inst['total']):.1f}%)")
    print(f" Partial Match: {inst['partial']}/{inst['total']} ({_percent(inst['partial'], inst['total']):.1f}%)")
    print(f" No Match: {inst['no_match']}/{inst['total']} ({_percent(inst['no_match'], inst['total']):.1f}%)")
    print(f" ** Institution Accuracy: {inst['accuracy']:.1f}% **")
    print()
    print("Performance:")
    print(f" Total Time: {total_time:.1f}s ({total_time/60:.1f}min)")
    avg_wall_time = total_time / len(all_results) if all_results else 0.0
    print(f" Average Time: {avg_wall_time:.1f}s per PDF")
    print()
    print("Reports Generated:")
    print(f" - {OUTPUT_DIR / 'summary.html'}")
    print(f" - {OUTPUT_DIR / 'test_report.json'}")
    print(f" - Individual reports: {OUTPUT_DIR / '{pdf_name}/'}")
    print()
    print("=" * 80)
# Script entry point: run the batch test only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()