2026-02-07 14:03:10 +08:00
|
|
|
"""
|
|
|
|
|
CMA Code Extraction & Institution Name - Batch Accuracy Testing Script (Enhanced)
|
|
|
|
|
|
|
|
|
|
This script implements comprehensive batch accuracy testing for BOTH:
|
|
|
|
|
1. CMA code extraction
|
|
|
|
|
2. Institution name extraction from seals
|
|
|
|
|
|
|
|
|
|
Uses the complete workflow from v_verify_logic.py including:
|
|
|
|
|
- Layout detection (Paddlex PP-DocLayout-L)
|
|
|
|
|
- Seal detection and refinement
|
|
|
|
|
- Polar unwarping
|
|
|
|
|
- OCR text recognition for institution names
|
|
|
|
|
|
|
|
|
|
Author: Claude Code
|
|
|
|
|
Date: 2025-02-05
|
|
|
|
|
Version: 2.0 (Enhanced with seal/institution extraction)
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
import json
|
|
|
|
|
import time
|
|
|
|
|
import logging
|
|
|
|
|
import re
|
|
|
|
|
import math
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from typing import Dict, List, Tuple, Optional, Any
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
|
|
# Set UTF-8 encoding for Windows console
# On Windows the standard streams are wrapped in UTF-8 writers so that
# non-ASCII text (e.g. Chinese institution names) can be printed.
if sys.platform == 'win32':
    import codecs
    try:
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
        sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict')
    except Exception:
        # Best effort only: if a stream has no binary ``buffer`` (for example
        # it was redirected or replaced), keep the existing streams.
        # ``Exception`` (instead of a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        pass
|
|
|
|
|
|
|
|
|
|
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python types.

    Handles NumPy booleans, integers, floating-point scalars and arrays.
    Anything else is delegated to ``json.JSONEncoder.default``, which raises
    ``TypeError`` for unsupported objects.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``."""
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
import fitz # PyMuPDF
|
|
|
|
|
import cv2
|
|
|
|
|
from paddleocr import PaddleOCR, SealTextDetection, TextRecognition
|
|
|
|
|
try:
|
|
|
|
|
from paddleocr import PaddleOCRVL
|
|
|
|
|
PADDLEOCRVL_AVAILABLE = True
|
|
|
|
|
except ImportError:
|
|
|
|
|
PADDLEOCRVL_AVAILABLE = False
|
|
|
|
|
print("Warning: PaddleOCRVL not available. Install with: pip install paddleocr[doc-parser]")
|
|
|
|
|
import paddlex as px
|
|
|
|
|
from Levenshtein import distance as levenshtein_distance
|
|
|
|
|
except ImportError as e:
|
|
|
|
|
print(f"Error: Required dependency not found: {e}")
|
|
|
|
|
print("Please install: pip install python-Levenshtein paddleocr paddlex pymupdf-ng opencv-python numpy")
|
|
|
|
|
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
# Import CMA extraction module
|
|
|
|
|
try:
|
|
|
|
|
from cma_extraction_final import extract_cma_code_fullpage, imread_unicode
|
|
|
|
|
except ImportError:
|
|
|
|
|
print("Error: cma_extraction_final.py not found in current directory")
|
|
|
|
|
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
# Configure logging
# INFO level; each record is written both to the UTF-8 encoded file
# 'test_accuracy_full.log' (path is relative to the working directory) and to
# a default StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('test_accuracy_full.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
# Constants
# NOTE(review): the consumers of these values are outside the visible part of
# the file; the descriptions below are inferred from the names - confirm.
PDF_DIR = Path(r"src/test/resources/data/pdfs")  # presumably the input PDF folder
RESULTS_JSON = Path(r"src/test/resources/data/results.json")  # presumably the expected results
OUTPUT_DIR = Path("test_reports_full")  # presumably where reports are written
BATCH_SIZE = 20
SIMILARITY_THRESHOLD = 85.0  # presumably a percentage - TODO confirm

# OCR Model Configuration
# Options: "ppocr_v5" (default), "paddleocr_vl"
# Read from the OCR_MODEL environment variable; falls back to "ppocr_v5".
OCR_MODEL = os.environ.get("OCR_MODEL", "ppocr_v5")
|
|
|
|
|
|
|
|
|
|
|
2026-02-07 23:13:03 +08:00
|
|
|
# ============ Helper Functions ============
|
|
|
|
|
|
|
|
|
|
def imwrite_safe(file_path, img):
    """
    Write an image file safely, handling Chinese paths on Windows.

    On Windows, cv2.imwrite fails with Chinese paths. This function first
    tries cv2.imwrite and, if that reports failure, encodes the image in
    memory with cv2.imencode and writes the bytes with ndarray.tofile.

    The in-memory encoding uses the extension of ``file_path`` (falling back
    to ".png" when the path has none), so the bytes on disk match the file
    name instead of always being PNG.

    Args:
        file_path: Destination path (str or os.PathLike)
        img: Image data (numpy array)

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # OpenCV expects a plain string, so normalise Path objects first.
        path_str = os.fspath(file_path)

        # Try standard cv2.imwrite first
        if cv2.imwrite(path_str, img):
            return True

        # Fallback: encode in memory, then let NumPy write the file.
        extension = os.path.splitext(path_str)[1] or ".png"
        is_success, buffer = cv2.imencode(extension, img)
        if is_success:
            buffer.tofile(path_str)
            return True
        return False
    except Exception as e:
        logger.error(f"Failed to write image to {file_path}: {e}")
        return False
|
|
|
|
|
|
|
|
|
|
|
2026-02-07 14:03:10 +08:00
|
|
|
# ============ Seal Processing Functions (from v_verify_logic.py) ============
|
|
|
|
|
|
|
|
|
|
def polar_unwarp(img, center, radius, start_theta, angular_extent):
    """
    Polar unwarp with canvas padding for partial seals.

    The ring around ``center`` is flattened into a rectangular strip:

    - The image is placed on a white padded canvas so that every sampling
      point lies inside the canvas even when the seal is cut off at an edge.
    - Row 0 of the strip is sampled at ``radius + 20%`` (outside the circle);
      each following row moves one pixel toward the center, covering 85% of
      the radius inward.
    - Columns sweep the angle from ``start_theta`` over ``angular_extent``.

    Args:
        img: Source image; it is copied into a 3-channel uint8 canvas, so it
            is assumed to be an H x W x 3 array.
        center: (x, y) of the seal center in ``img`` pixel coordinates.
        radius: Seal radius in pixels.
        start_theta: Start angle in radians (passed to cos/sin).
        angular_extent: Swept angle in radians; must be positive.

    Returns:
        uint8 array of shape (strip_h, strip_w, 3), or None when the extent
        or the computed strip size is not positive.
    """
    if angular_extent <= 0:
        return None

    strip_w = int(angular_extent * radius)

    # Sampling range relative to the circle:
    # - Inward: 85% of the radius (toward the center)
    # - Outward: 20% beyond the radius (away from the center)
    inward_range = int(radius * 0.85)
    outward_range = int(radius * 0.2)
    strip_h = inward_range + outward_range

    if strip_w <= 0 or strip_h <= 0:
        return None

    ch, cw = img.shape[:2]

    # Largest distance from the center that will be sampled.
    max_distance = radius + outward_range

    # Padding needed on each side. ceil + int keeps the values usable as
    # array dimensions even when center/radius are fractional.
    pad_top = max(0, int(math.ceil(max_distance - center[1])))
    pad_bottom = max(0, int(math.ceil(max_distance - (ch - center[1]))))
    pad_left = max(0, int(math.ceil(max_distance - center[0])))
    pad_right = max(0, int(math.ceil(max_distance - (cw - center[0]))))

    # White canvas with the original image placed at the padding offset.
    padded_h = ch + pad_top + pad_bottom
    padded_w = cw + pad_left + pad_right
    padded_canvas = np.full((padded_h, padded_w, 3), 255, dtype=np.uint8)
    padded_canvas[pad_top:pad_top + ch, pad_left:pad_left + cw] = img

    # Center position expressed in padded-canvas coordinates.
    center_padded = [center[0] + pad_left, center[1] + pad_top]

    # One radius per strip row (outside -> inside), one angle per column.
    radii = (radius + outward_range) - np.arange(strip_h, dtype=np.float64)
    thetas = start_theta + angular_extent * (np.arange(strip_w, dtype=np.float64) / strip_w)

    src_x = center_padded[0] + radii[:, None] * np.cos(thetas)[None, :]
    src_y = center_padded[1] + radii[:, None] * np.sin(thetas)[None, :]

    # astype truncates toward zero, matching int() on a Python float.
    sx = src_x.astype(np.int64)
    sy = src_y.astype(np.int64)
    inside = (sx >= 0) & (sx < padded_w) & (sy >= 0) & (sy < padded_h)

    # Anything that still falls outside the canvas stays white.
    strip = np.full((strip_h, strip_w, 3), 255, dtype=np.uint8)
    strip[inside] = padded_canvas[sy[inside], sx[inside]]

    return strip
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_precise_arc(polygons, center):
    """
    Calculate the start angle and angular extent of the seal text arc.

    For every polygon the angles of its points around ``center`` are sorted
    and re-based so that the arc starts right after the widest angular gap.
    Runs of angles separated by more than 15 degrees become separate clusters;
    clusters closer than 45 degrees are then merged. Each merged arc is scored
    by its extent multiplied by a Gaussian weight computed from the angle of
    its midpoint, and the best-scoring arc is returned.

    NOTE(review): angles of different polygons may be re-based onto different
    2*pi branches before they are merged - confirm that this is intended.

    Args:
        polygons: Iterable of polygons, each an iterable of (x, y) points.
        center: (x, y) reference point for the angles.

    Returns:
        (start_theta, extent) in radians; (0.0, 0.0) when no angles exist.
        The extent is clamped to 350 degrees.
    """
    gap_thresh = math.radians(15)
    merge_thresh = math.radians(45)

    initial_clusters = []
    for poly in polygons:
        angles = sorted(math.atan2(pt[1] - center[1], pt[0] - center[0]) for pt in poly)
        if not angles:
            continue

        # Find the widest gap between neighbouring angles (wrap-around included).
        last = len(angles) - 1
        widest_gap = 0
        widest_idx = -1
        for idx, angle in enumerate(angles):
            if idx == last:
                gap = angles[0] + 2*math.pi - angle
            else:
                gap = angles[idx + 1] - angle
            if gap > widest_gap:
                widest_gap = gap
                widest_idx = idx

        # Rotate the sequence so that it begins right after the widest gap.
        if widest_idx == last:
            arc = angles
        else:
            arc = angles[widest_idx + 1:] + [a + 2*math.pi for a in angles[:widest_idx + 1]]
        if not arc:
            continue

        # Split the arc wherever two neighbours are more than gap_thresh apart.
        run_start = arc[0]
        previous = arc[0]
        for angle in arc[1:]:
            if angle - previous > gap_thresh:
                initial_clusters.append({'start': run_start, 'end': previous})
                run_start = angle
            previous = angle
        initial_clusters.append({'start': run_start, 'end': previous})

    if not initial_clusters:
        return 0.0, 0.0

    # Merge clusters whose angular distance is below merge_thresh.
    initial_clusters.sort(key=lambda cluster: cluster['start'])
    merged = []
    active = initial_clusters[0]
    for following in initial_clusters[1:]:
        if following['start'] - active['end'] < merge_thresh:
            active['end'] = max(active['end'], following['end'])
        else:
            merged.append(active)
            active = following
    merged.append(active)

    # Score every merged arc: extent times a weight based on its midpoint.
    candidates = []
    for arc_range in merged:
        st, en = arc_range['start'], arc_range['end']
        ex = en - st
        mid = (st + en) / 2
        dist_to_top = abs(((mid + math.pi/2 + math.pi) % (2*math.pi)) - math.pi)
        weight = math.exp(-0.5 * (dist_to_top / (math.pi/2))**2)
        candidates.append({'start': st, 'end': en, 'extent': ex, 'score': ex * weight})

    # max() returns the first best-scoring entry, like a stable descending sort.
    best = max(candidates, key=lambda cand: cand['score'])

    # Limit extent to max 350 degrees to avoid overlap and distortion in
    # polar unwarping.
    MAX_EXTENT_DEG = 350.0
    start_theta = best['start']
    extent = best['end'] - best['start']

    if math.degrees(extent) > MAX_EXTENT_DEG:
        logger.warning(f"Arc extent {math.degrees(extent):.2f}° exceeds {MAX_EXTENT_DEG}°, clamping to avoid distortion")
        extent = math.radians(MAX_EXTENT_DEG)

    return start_theta, extent
|
2026-02-07 14:03:10 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def fit_circle_from_text_polygons(all_polygons):
    """
    Fit a circle to the points of text polygons using least squares.

    Equation: (x - a)² + (y - b)² = r²
    Expanded: x² + y² - 2ax - 2by + (a² + b² - r²) = 0
    Let:      c = a² + b² - r²
    Then:     x² + y² = 2ax + 2by - c

    This is a linear system: [2x, 2y, -1] * [a, b, c]ᵀ = x² + y²

    Args:
        all_polygons: Sequence of polygons, each an iterable of (x, y) points.

    Returns:
        ((center_x, center_y), radius, rmse) with integer center and radius
        rounded to the nearest pixel; ``rmse`` is the root mean square of the
        linear-system residuals (in squared coordinate units, not pixels).
        Returns (None, None, None) when there are fewer than 5 points, when
        the points are degenerate (e.g. collinear), or when the fit fails.
    """
    if all_polygons is None or len(all_polygons) == 0:
        return None, None, None

    # Collect all points from polygons
    points = [[float(p[0]), float(p[1])] for poly in all_polygons for p in poly]
    if len(points) < 5:
        return None, None, None

    points = np.array(points)

    # Build linear system A * [a, b, c]ᵀ = b_vec
    A = np.column_stack([2 * points[:, 0], 2 * points[:, 1], -np.ones(len(points))])
    b_vec = np.sum(points ** 2, axis=1)

    try:
        sol, _residuals, rank, _singular_values = np.linalg.lstsq(A, b_vec, rcond=None)
    except (np.linalg.LinAlgError, ValueError) as e:
        logger.error(f"Circle fitting failed: {e}")
        return None, None, None

    # A rank-deficient system (for example collinear points) has no unique
    # circle; lstsq would return a meaningless minimum-norm solution.
    if rank < 3:
        return None, None, None

    a, b, c = sol
    radius_sq = a**2 + b**2 - c
    # A non-positive or non-finite r² means the solution is not a real circle.
    if not np.isfinite(radius_sq) or radius_sq <= 0:
        return None, None, None
    radius = math.sqrt(radius_sq)

    # Fitting error (RMSE) of the linear system.
    errors = A @ sol - b_vec
    rmse = float(np.sqrt(np.mean(errors ** 2)))

    # Round to the nearest pixel; plain int() would truncate toward zero and
    # bias the result by up to one pixel.
    return (int(round(a)), int(round(b))), int(round(radius)), rmse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def detect_seal_center_dual_method(seal_crop, all_polygons):
    """
    Pick the seal center and radius from two candidate estimates.

    The geometric center of the crop is the default estimate. A circle fitted
    to the text polygons replaces it only when all of these hold:

    - the fit RMSE is below 3000,
    - the fitted center is less than 20% of the shorter crop side away from
      the crop center,
    - at least 3 polygons were supplied.

    Args:
        seal_crop: Cropped seal image (only its height and width are used).
        all_polygons: Text polygons detected inside the crop.

    Returns:
        center: [x, y] crop center, or the fitted center
        radius: int - radius belonging to the chosen center
        method: str - "crop_center" or "circle_fitting"
    """
    crop_h, crop_w = seal_crop.shape[:2]
    shorter_side = min(crop_w, crop_h)

    # Method 1: crop center (default estimate)
    center_crop = [crop_w // 2, crop_h // 2]
    radius_crop = shorter_side // 2 - 10

    # Method 2: circle fitting
    center_fit, radius_fit, rmse = fit_circle_from_text_polygons(all_polygons)
    if center_fit is None:
        logger.info(" Circle fitting failed, using crop center")
        return center_crop, radius_crop, "crop_center"

    # Distance between both center estimates, relative to the crop size.
    dx = center_fit[0] - center_crop[0]
    dy = center_fit[1] - center_crop[1]
    offset_ratio = math.sqrt(dx**2 + dy**2) / shorter_side

    # Acceptance limits for the fitted circle.
    rmse_threshold = 3000
    offset_threshold = 0.2  # 20% of crop size
    min_polygons = 3
    polygon_count = len(all_polygons)

    if rmse < rmse_threshold and offset_ratio < offset_threshold and polygon_count >= min_polygons:
        logger.info(f" Using circle fitting: RMSE={rmse:.2f}, offset_ratio={offset_ratio:.2f}")
        return center_fit, radius_fit, "circle_fitting"

    # Explain in the log why the fitted circle was rejected.
    reasons = []
    if rmse >= rmse_threshold:
        reasons.append(f"RMSE too high ({rmse:.2f} >= {rmse_threshold})")
    if offset_ratio >= offset_threshold:
        reasons.append(f"offset too large ({offset_ratio:.2f} >= {offset_threshold})")
    if polygon_count < min_polygons:
        reasons.append(f"not enough polygons ({polygon_count} < {min_polygons})")
    logger.info(f" Circle fitting unreliable ({', '.join(reasons)}), using crop center")
    return center_crop, radius_crop, "crop_center"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Layout models keyed by model name, so the model is built only once per
# process instead of once per page.
_LAYOUT_MODEL_CACHE = {}


def _get_layout_model(model_name="PP-DocLayout-L"):
    """Return the cached PaddleX model for ``model_name``, creating it on first use."""
    model = _LAYOUT_MODEL_CACHE.get(model_name)
    if model is None:
        model = px.create_model(model_name)
        _LAYOUT_MODEL_CACHE[model_name] = model
    return model


def run_layout_detection(image_path):
    """
    Run Paddlex PP-DocLayout-L for layout analysis.

    Args:
        image_path: Path of the page image to analyse.

    Returns:
        List of dicts with keys 'label', 'score' and 'box' (the detection's
        'coordinate' value). Detections without a coordinate are skipped.
        Returns an empty list when detection fails; the error is logged.
    """
    try:
        model = _get_layout_model("PP-DocLayout-L")
        output = model.predict(image_path, batch_size=1)
        all_regions = []
        for res in output:
            for box in res.get('boxes', []):
                coords = box.get('coordinate')
                if coords is None:
                    # Without coordinates the region cannot be drawn or cropped.
                    continue
                all_regions.append({
                    'label': box.get('label_name', box.get('label', 'unknown')),
                    'score': box.get('score', 0.0),
                    'box': coords
                })
        return all_regions
    except Exception as e:
        logger.error(f"Layout detection failed: {e}")
        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_ocr_recognition(image_path, rec_model):
    """
    Run OCR recognition on an unwarped seal image.

    Args:
        image_path: Path of the image handed to ``rec_model.predict``.
        rec_model: Recognition model exposing ``predict(input=..., batch_size=...)``.

    Returns:
        Dict with 'text' (stripped recognised text), 'score' and 'success'
        (True when the text is non-empty). Only the first prediction is used.
        On empty output or any exception the text is '' and success is False.
    """
    try:
        predictions = rec_model.predict(input=image_path, batch_size=1)
        if not (predictions and len(predictions) > 0):
            return {'text': '', 'score': 0.0, 'success': False}

        first = predictions[0]
        recognized = first.get('rec_text', '').strip()
        confidence = first.get('rec_score', 0.0)
        return {
            'text': recognized,
            'score': confidence,
            'success': len(recognized) > 0
        }
    except Exception as e:
        logger.error(f"OCR recognition failed: {e}")
        return {'text': '', 'score': 0.0, 'success': False}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_ocr_recognition_vl(image_path, vl_pipeline):
    """
    Run OCR recognition using PaddleOCRVL on a seal image.

    Can be used on both unwarp images and crop images (backup mode).

    The first prediction is saved as JSON into a unique temporary directory
    created under the current working directory. The file
    "<image stem>_res.json" is then read back and the content of the first
    block labelled 'seal' in 'parsing_res_list' is returned. The temporary
    directory is always removed, including when an error occurs.

    Args:
        image_path: Path to seal image (unwarp or crop)
        vl_pipeline: Initialized PaddleOCRVL pipeline

    Returns:
        Dict with 'text', 'score', 'success' keys. 'score' is 1.0 whenever a
        seal block is found and 0.0 otherwise.
    """
    import shutil
    import tempfile
    import traceback

    temp_output_dir = None
    try:
        # Run prediction
        output = vl_pipeline.predict(image_path)
        if not (output and len(output) > 0):
            return {'text': '', 'score': 0.0, 'success': False}

        # A unique directory per call avoids clashes between concurrent runs
        # and stale files left over from earlier images.
        temp_output_dir = Path(tempfile.mkdtemp(prefix="temp_paddleocr_vl_", dir="."))

        # Save JSON to extract text
        output[0].save_to_json(save_path=str(temp_output_dir))
        json_file = temp_output_dir / f"{Path(image_path).stem}_res.json"
        if not json_file.exists():
            return {'text': '', 'score': 0.0, 'success': False}

        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Find the first seal block and extract its content
        for block in data.get('parsing_res_list', []):
            if block.get('block_label') == 'seal':
                text = (block.get('block_content') or '').strip()
                return {
                    'text': text,
                    'score': 1.0,  # PaddleOCRVL doesn't provide confidence score
                    'success': len(text) > 0
                }

        return {'text': '', 'score': 0.0, 'success': False}

    except Exception as e:
        logger.error(f"PaddleOCRVL recognition failed: {e}")
        logger.error(traceback.format_exc())
        return {'text': '', 'score': 0.0, 'success': False}

    finally:
        # Clean up temp files on every exit path.
        if temp_output_dir is not None:
            shutil.rmtree(temp_output_dir, ignore_errors=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", vl_pipeline=None):
|
|
|
|
|
"""
|
|
|
|
|
Extract seals and recognize institution names from page image.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
page_img: Input page image
|
|
|
|
|
output_dir: Directory to save intermediate results
|
|
|
|
|
ocr_model: OCR model to use ("ppocr_v5" or "paddleocr_vl")
|
|
|
|
|
vl_pipeline: PaddleOCRVL pipeline (required if ocr_model="paddleocr_vl")
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
Dict with:
|
|
|
|
|
- 'seals': list of seal results
|
|
|
|
|
- 'institutions': list of recognized institution names
|
|
|
|
|
- 'processing_time': time taken
|
|
|
|
|
"""
|
|
|
|
|
start_time = time.time()
|
|
|
|
|
result = {
|
|
|
|
|
'seals': [],
|
|
|
|
|
'institutions': [],
|
|
|
|
|
'processing_time': 0.0
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Validate input image
|
|
|
|
|
if page_img is None:
|
|
|
|
|
logger.error("Input page_img is None")
|
|
|
|
|
result['processing_time'] = time.time() - start_time
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
if not isinstance(page_img, np.ndarray):
|
|
|
|
|
logger.error(f"Input page_img is not numpy array, type: {type(page_img)}")
|
|
|
|
|
result['processing_time'] = time.time() - start_time
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
if page_img.size == 0:
|
|
|
|
|
logger.error("Input page_img is empty")
|
|
|
|
|
result['processing_time'] = time.time() - start_time
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
logger.info(f"Input image shape: {page_img.shape}, dtype: {page_img.dtype}")
|
|
|
|
|
|
|
|
|
|
# Create output directory if it doesn't exist
|
|
|
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
# Save page image
|
|
|
|
|
doc_path = os.path.join(output_dir, "doc_page.png")
|
|
|
|
|
try:
|
2026-02-07 23:13:03 +08:00
|
|
|
success = imwrite_safe(doc_path, page_img)
|
2026-02-07 14:03:10 +08:00
|
|
|
if not success:
|
2026-02-07 23:13:03 +08:00
|
|
|
logger.error(f"imwrite_safe returned False for {doc_path}")
|
2026-02-07 14:03:10 +08:00
|
|
|
# Try alternative save method using PIL
|
|
|
|
|
try:
|
|
|
|
|
from PIL import Image
|
|
|
|
|
img_rgb = cv2.cvtColor(page_img, cv2.COLOR_BGR2RGB)
|
|
|
|
|
pil_img = Image.fromarray(img_rgb)
|
|
|
|
|
pil_img.save(doc_path)
|
|
|
|
|
logger.info(f"Saved using PIL as fallback: {doc_path}")
|
|
|
|
|
|
|
|
|
|
# Verify PIL save worked
|
|
|
|
|
if not os.path.exists(doc_path):
|
|
|
|
|
logger.error(f"PIL save also failed, file not found: {doc_path}")
|
|
|
|
|
result['processing_time'] = time.time() - start_time
|
|
|
|
|
return result
|
|
|
|
|
except Exception as pil_e:
|
|
|
|
|
logger.error(f"PIL fallback also failed: {pil_e}")
|
|
|
|
|
result['processing_time'] = time.time() - start_time
|
|
|
|
|
return result
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logger.error(f"Failed to save page image: {e}")
|
|
|
|
|
result['processing_time'] = time.time() - start_time
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
# Verify file exists before proceeding
|
|
|
|
|
if not os.path.exists(doc_path):
|
|
|
|
|
logger.error(f"Page image file not found after save: {doc_path}")
|
|
|
|
|
result['processing_time'] = time.time() - start_time
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
# Run layout detection
|
|
|
|
|
logger.info("Running layout detection...")
|
|
|
|
|
all_regions = run_layout_detection(doc_path)
|
|
|
|
|
|
|
|
|
|
# Extract seal boxes
|
|
|
|
|
seal_boxes = []
|
|
|
|
|
page_viz = page_img.copy()
|
|
|
|
|
for reg in all_regions:
|
|
|
|
|
box = reg.get('box')
|
|
|
|
|
label = reg.get('label')
|
|
|
|
|
score = reg.get('score', 0.0)
|
|
|
|
|
is_seal = (label == 'seal')
|
|
|
|
|
|
|
|
|
|
if score > 0.2:
|
|
|
|
|
x1, y1, x2, y2 = [int(v) for v in box]
|
|
|
|
|
color = (0, 0, 255) if is_seal else (0, 255, 0)
|
|
|
|
|
cv2.rectangle(page_viz, (x1, y1), (x2, y2), color, 2)
|
|
|
|
|
|
|
|
|
|
if is_seal:
|
|
|
|
|
seal_boxes.append(box)
|
|
|
|
|
|
2026-02-07 23:13:03 +08:00
|
|
|
imwrite_safe(os.path.join(output_dir, "doc_layout_viz.png"), page_viz)
|
2026-02-07 14:03:10 +08:00
|
|
|
|
|
|
|
|
if not seal_boxes:
|
|
|
|
|
logger.warning("No seals detected")
|
|
|
|
|
result['processing_time'] = time.time() - start_time
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
# Process each seal
|
|
|
|
|
logger.info(f"Processing {len(seal_boxes)} detected seals...")
|
|
|
|
|
det_model = SealTextDetection(model_name="PP-OCRv4_server_seal_det")
|
|
|
|
|
|
|
|
|
|
# Initialize OCR model based on selection
|
|
|
|
|
if ocr_model == "paddleocr_vl":
|
|
|
|
|
if not PADDLEOCRVL_AVAILABLE:
|
|
|
|
|
logger.error("PaddleOCRVL requested but not available. Falling back to PP-OCRv5.")
|
|
|
|
|
ocr_model = "ppocr_v5"
|
|
|
|
|
rec_model = TextRecognition(model_name="PP-OCRv5_server_rec")
|
|
|
|
|
elif vl_pipeline is None:
|
|
|
|
|
logger.error("PaddleOCRVL requested but vl_pipeline is None. Falling back to PP-OCRv5.")
|
|
|
|
|
ocr_model = "ppocr_v5"
|
|
|
|
|
rec_model = TextRecognition(model_name="PP-OCRv5_server_rec")
|
|
|
|
|
else:
|
|
|
|
|
logger.info("Using PaddleOCRVL for seal text recognition")
|
|
|
|
|
rec_model = None # Not used for PaddleOCRVL
|
|
|
|
|
else:
|
|
|
|
|
logger.info("Using PP-OCRv5_server_rec for seal text recognition")
|
|
|
|
|
rec_model = TextRecognition(model_name="PP-OCRv5_server_rec")
|
|
|
|
|
|
|
|
|
|
for i, box in enumerate(seal_boxes):
|
|
|
|
|
x1, y1, x2, y2 = [int(v) for v in box]
|
|
|
|
|
pad = 40
|
|
|
|
|
y1_p, y2_p = max(0, y1-pad), min(page_img.shape[0], y2+pad)
|
|
|
|
|
x1_p, x2_p = max(0, x1-pad), min(page_img.shape[1], x2+pad)
|
|
|
|
|
seal_crop = page_img[y1_p:y2_p, x1_p:x2_p]
|
|
|
|
|
|
|
|
|
|
# Validate crop
|
|
|
|
|
if seal_crop.size == 0 or seal_crop.shape[0] == 0 or seal_crop.shape[1] == 0:
|
|
|
|
|
logger.warning(f"Invalid seal crop dimensions: {seal_crop.shape}, skipping seal {i}")
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
|
2026-02-07 23:13:03 +08:00
|
|
|
success = imwrite_safe(crop_path, seal_crop)
|
2026-02-07 14:03:10 +08:00
|
|
|
if not success:
|
|
|
|
|
# Try PIL fallback
|
|
|
|
|
try:
|
|
|
|
|
from PIL import Image
|
|
|
|
|
crop_rgb = cv2.cvtColor(seal_crop, cv2.COLOR_BGR2RGB)
|
|
|
|
|
pil_img = Image.fromarray(crop_rgb)
|
|
|
|
|
pil_img.save(crop_path)
|
|
|
|
|
logger.info(f"Saved seal crop using PIL fallback: {crop_path}")
|
|
|
|
|
except Exception as pil_e:
|
|
|
|
|
logger.error(f"Failed to save seal crop to {crop_path}: {pil_e}, skipping seal {i}")
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Verify file exists
|
|
|
|
|
if not os.path.exists(crop_path):
|
|
|
|
|
logger.error(f"Seal crop file not found after save: {crop_path}, skipping seal {i}")
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Detect text polygons
|
|
|
|
|
output = det_model.predict(crop_path, batch_size=1)
|
|
|
|
|
all_polygons = []
|
|
|
|
|
for res in output:
|
|
|
|
|
polys = res.get('dt_polys') if isinstance(res, dict) else None
|
|
|
|
|
if polys:
|
|
|
|
|
all_polygons.extend(polys)
|
|
|
|
|
|
|
|
|
|
ch, cw = seal_crop.shape[:2]
|
|
|
|
|
|
|
|
|
|
# ============ DUAL STRATEGY: Choose best center detection method ============
|
|
|
|
|
logger.info(f" Seal #{i} Geometry:")
|
|
|
|
|
logger.info(f" - Crop size: {cw}x{ch}")
|
|
|
|
|
logger.info(f" - Text polygons detected: {len(all_polygons)}")
|
|
|
|
|
|
|
|
|
|
center, radius, method_used = detect_seal_center_dual_method(seal_crop, all_polygons)
|
|
|
|
|
logger.info(f" - Method used: {method_used}")
|
|
|
|
|
logger.info(f" - Center: ({center[0]}, {center[1]})")
|
|
|
|
|
logger.info(f" - Radius: {radius}")
|
|
|
|
|
|
2026-02-07 23:13:03 +08:00
|
|
|
# ============ INSUFFICIENT POLYGONS CHECK ============
|
|
|
|
|
# If too few text polygons detected, polar unwarping will likely fail
|
|
|
|
|
# Skip directly to PaddleOCRVL backup in this case
|
|
|
|
|
MIN_POLYGONS_FOR_UNWARP = 3
|
|
|
|
|
if len(all_polygons) < MIN_POLYGONS_FOR_UNWARP:
|
|
|
|
|
logger.warning(f" Seal #{i}: Only {len(all_polygons)} text polygons detected (< {MIN_POLYGONS_FOR_UNWARP})")
|
|
|
|
|
logger.warning(f" Seal #{i}: Skipping polar unwarping (insufficient polygon data)")
|
|
|
|
|
logger.info(f" Seal #{i}: Using PaddleOCRVL backup instead")
|
|
|
|
|
|
|
|
|
|
# Save crop image
|
|
|
|
|
imwrite_safe(crop_path, seal_crop)
|
|
|
|
|
|
|
|
|
|
# Use PaddleOCRVL directly on crop (no unwarp)
|
|
|
|
|
if vl_pipeline is not None and PADDLEOCRVL_AVAILABLE:
|
|
|
|
|
ocr_result = run_ocr_recognition_vl(crop_path, vl_pipeline)
|
|
|
|
|
logger.info(f" Seal #{i} PaddleOCRVL Result (direct crop):")
|
|
|
|
|
logger.info(f" - Text: '{ocr_result['text']}'")
|
|
|
|
|
logger.info(f" - Score: {ocr_result['score']:.4f}")
|
|
|
|
|
logger.info(f" - Success: {ocr_result['success']}")
|
|
|
|
|
logger.info(f" - ** Used PaddleOCRVL (insufficient polygons for unwarping) **")
|
|
|
|
|
|
|
|
|
|
# Create debug info without unwarp
|
|
|
|
|
seal_data = {
|
|
|
|
|
'index': i,
|
|
|
|
|
'box': box,
|
|
|
|
|
'crop_path': Path(crop_path).name,
|
|
|
|
|
'unwarp_path': None, # No unwarp performed
|
|
|
|
|
'marked_path': None, # No marked image
|
|
|
|
|
'polar_viz_path': None, # No polar visualization
|
|
|
|
|
'text': ocr_result['text'],
|
|
|
|
|
'confidence': float(ocr_result['score']),
|
|
|
|
|
'success': bool(ocr_result['success']),
|
|
|
|
|
'method_used': f'{method_used}_skip_unwarp',
|
|
|
|
|
'used_fallback': True,
|
|
|
|
|
'debug_info': {
|
|
|
|
|
'center': center,
|
|
|
|
|
'radius': radius,
|
|
|
|
|
'start_theta_deg': None,
|
|
|
|
|
'extent_deg': None,
|
|
|
|
|
'num_polygons': len(all_polygons),
|
|
|
|
|
'crop_size': (cw, ch),
|
|
|
|
|
'unwarp_size': None,
|
|
|
|
|
'skip_reason': f'Insufficient polygons ({len(all_polygons)} < {MIN_POLYGONS_FOR_UNWARP})'
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
result['seals'].append(seal_data)
|
|
|
|
|
|
|
|
|
|
if ocr_result['success']:
|
|
|
|
|
result['institutions'].append(ocr_result['text'])
|
|
|
|
|
logger.info(f" ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})")
|
|
|
|
|
else:
|
|
|
|
|
logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name")
|
|
|
|
|
|
|
|
|
|
continue # Skip to next seal
|
|
|
|
|
else:
|
|
|
|
|
logger.error(f" Seal #{i}: PaddleOCRVL not available, cannot extract text")
|
|
|
|
|
seal_data = {
|
|
|
|
|
'index': i,
|
|
|
|
|
'box': box,
|
|
|
|
|
'crop_path': Path(crop_path).name,
|
|
|
|
|
'unwarp_path': None,
|
|
|
|
|
'marked_path': None,
|
|
|
|
|
'polar_viz_path': None,
|
|
|
|
|
'text': '',
|
|
|
|
|
'confidence': 0.0,
|
|
|
|
|
'success': False,
|
|
|
|
|
'method_used': f'{method_used}_skip_unwarp',
|
|
|
|
|
'used_fallback': True,
|
|
|
|
|
'debug_info': {
|
|
|
|
|
'center': center,
|
|
|
|
|
'radius': radius,
|
|
|
|
|
'start_theta_deg': None,
|
|
|
|
|
'extent_deg': None,
|
|
|
|
|
'num_polygons': len(all_polygons),
|
|
|
|
|
'crop_size': (cw, ch),
|
|
|
|
|
'unwarp_size': None,
|
|
|
|
|
'skip_reason': f'Insufficient polygons and no PaddleOCRVL backup'
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
result['seals'].append(seal_data)
|
|
|
|
|
continue
|
|
|
|
|
|
2026-02-07 14:03:10 +08:00
|
|
|
# Calculate arc and unwarp
|
|
|
|
|
start_theta, extent = calculate_precise_arc(all_polygons, center)
|
|
|
|
|
logger.info(f" Seal #{i} Arc Parameters:")
|
|
|
|
|
logger.info(f" - Start theta: {math.degrees(start_theta):.2f}°")
|
|
|
|
|
logger.info(f" - Extent: {math.degrees(extent):.2f}° ({math.degrees(extent)*radius:.1f} pixels width)")
|
|
|
|
|
|
|
|
|
|
marked = seal_crop.copy()
|
|
|
|
|
|
|
|
|
|
# Draw all text polygons in green
|
|
|
|
|
for p in all_polygons:
|
|
|
|
|
cv2.polylines(marked, [np.array(p, dtype=np.int32)], True, (0, 255, 0), 2)
|
|
|
|
|
|
|
|
|
|
# Draw center point (yellow cross)
|
|
|
|
|
center_x, center_y = int(center[0]), int(center[1])
|
|
|
|
|
cv2.drawMarker(marked, (center_x, center_y), (0, 255, 255),
|
|
|
|
|
markerType=cv2.MARKER_CROSS, markerSize=20, thickness=2)
|
|
|
|
|
cv2.circle(marked, (center_x, center_y), 5, (0, 255, 255), -1)
|
|
|
|
|
|
|
|
|
|
# Draw estimated radius circle (cyan)
|
|
|
|
|
cv2.circle(marked, (center_x, center_y), radius, (255, 255, 0), 2)
|
|
|
|
|
|
|
|
|
|
# Draw polar sampling visualization
|
|
|
|
|
polar_viz = seal_crop.copy()
|
|
|
|
|
cv2.drawMarker(polar_viz, (center_x, center_y), (0, 255, 255),
|
|
|
|
|
markerType=cv2.MARKER_CROSS, markerSize=20, thickness=2)
|
|
|
|
|
cv2.circle(polar_viz, (center_x, center_y), radius, (255, 255, 0), 2)
|
|
|
|
|
|
|
|
|
|
unwarp_path = os.path.join(output_dir, f"seal_unwarp_{i}.png")
|
|
|
|
|
unwarp = None
|
|
|
|
|
used_fallback = False
|
|
|
|
|
|
|
|
|
|
if extent > 0:
|
|
|
|
|
logger.info(f" Seal #{i}: Performing polar unwarping with detected text polygons...")
|
|
|
|
|
unwarp = polar_unwarp(seal_crop, center, radius, start_theta, extent)
|
|
|
|
|
if unwarp is not None:
|
2026-02-07 23:13:03 +08:00
|
|
|
imwrite_safe(unwarp_path, unwarp)
|
2026-02-07 14:03:10 +08:00
|
|
|
logger.info(f" - Unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}")
|
|
|
|
|
|
|
|
|
|
def draw_line(m, theta, color):
|
|
|
|
|
x = center[0] + radius * math.cos(theta)
|
|
|
|
|
y = center[1] + radius * math.sin(theta)
|
|
|
|
|
cv2.line(m, (int(center[0]), int(center[1])), (int(x), int(y)), color, 2)
|
|
|
|
|
|
|
|
|
|
# Draw start angle line (blue)
|
|
|
|
|
draw_line(marked, start_theta, (255, 0, 0))
|
|
|
|
|
# Draw end angle line (red)
|
|
|
|
|
draw_line(marked, start_theta + extent, (0, 0, 255))
|
|
|
|
|
|
|
|
|
|
# Draw sampling points on polar_viz (show where polar samples come from)
|
|
|
|
|
num_sample_points = min(50, int(extent * radius)) # Show up to 50 sample points
|
|
|
|
|
for r_idx in range(5): # 5 different radii
|
|
|
|
|
r = radius - r_idx * (radius * 0.6 / 5)
|
|
|
|
|
for theta_idx in range(num_sample_points):
|
|
|
|
|
theta = start_theta + extent * (theta_idx / num_sample_points)
|
|
|
|
|
src_x = center[0] + r * math.cos(theta)
|
|
|
|
|
src_y = center[1] + r * math.sin(theta)
|
|
|
|
|
if 0 <= src_x < cw and 0 <= src_y < ch:
|
|
|
|
|
cv2.circle(polar_viz, (int(src_x), int(src_y)), 1, (255, 0, 255), -1)
|
|
|
|
|
|
|
|
|
|
# Save polar visualization
|
|
|
|
|
polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png")
|
2026-02-07 23:13:03 +08:00
|
|
|
imwrite_safe(polar_viz_path, polar_viz)
|
2026-02-07 14:03:10 +08:00
|
|
|
logger.info(f" - Polar visualization saved: seal_polar_viz_{i}.png")
|
|
|
|
|
else:
|
|
|
|
|
logger.warning(f" Seal #{i}: Polar unwarp returned None")
|
|
|
|
|
|
|
|
|
|
# ============ FALLBACK: Use fixed angle range when no text detected ============
|
|
|
|
|
if unwarp is None and extent <= 0 and len(all_polygons) == 0:
|
|
|
|
|
logger.warning(f" Seal #{i}: No text polygons detected, using fallback angle range (7:30 to 4:30 clockwise)")
|
|
|
|
|
used_fallback = True
|
|
|
|
|
|
|
|
|
|
# 7:30 direction (left-bottom) to 4:30 direction (right-bottom) clockwise
|
|
|
|
|
# In standard math angle (0 = 3 o'clock, CCW):
|
|
|
|
|
# 7:30 = 225 degrees = 3.927 rad
|
|
|
|
|
# 4:30 = 135 degrees = 2.356 rad
|
|
|
|
|
# Clockwise from 7:30 to 4:30 covers 270 degrees
|
|
|
|
|
# We start at 4:30 (135 degrees) and go counter-clockwise 270 degrees
|
|
|
|
|
fallback_start_theta = math.radians(135) # 4:30 position
|
|
|
|
|
fallback_extent = math.radians(270) # 270 degree coverage
|
|
|
|
|
|
|
|
|
|
logger.info(f" Seal #{i}: Fallback - Start: 135.00° (4:30), Extent: 270.00°")
|
|
|
|
|
|
|
|
|
|
unwarp = polar_unwarp(seal_crop, center, radius, fallback_start_theta, fallback_extent)
|
|
|
|
|
if unwarp is not None:
|
2026-02-07 23:13:03 +08:00
|
|
|
imwrite_safe(unwarp_path, unwarp)
|
2026-02-07 14:03:10 +08:00
|
|
|
logger.info(f" - Fallback unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}")
|
|
|
|
|
|
|
|
|
|
# Update start_theta and extent for visualization
|
|
|
|
|
start_theta = fallback_start_theta
|
|
|
|
|
extent = fallback_extent
|
|
|
|
|
|
|
|
|
|
def draw_line(m, theta, color):
|
|
|
|
|
x = center[0] + radius * math.cos(theta)
|
|
|
|
|
y = center[1] + radius * math.sin(theta)
|
|
|
|
|
cv2.line(m, (int(center[0]), int(center[1])), (int(x), int(y)), color, 2)
|
|
|
|
|
|
|
|
|
|
# Draw start angle line (blue) - 4:30 position
|
|
|
|
|
draw_line(marked, start_theta, (255, 0, 0))
|
|
|
|
|
# Draw end angle line (red) - 7:30 position
|
|
|
|
|
draw_line(marked, start_theta + extent, (0, 0, 255))
|
|
|
|
|
|
|
|
|
|
# Draw sampling points
|
|
|
|
|
num_sample_points = 50
|
|
|
|
|
for r_idx in range(5):
|
|
|
|
|
r = radius - r_idx * (radius * 0.6 / 5)
|
|
|
|
|
for theta_idx in range(num_sample_points):
|
|
|
|
|
theta = start_theta + extent * (theta_idx / num_sample_points)
|
|
|
|
|
src_x = center[0] + r * math.cos(theta)
|
|
|
|
|
src_y = center[1] + r * math.sin(theta)
|
|
|
|
|
if 0 <= src_x < cw and 0 <= src_y < ch:
|
|
|
|
|
cv2.circle(polar_viz, (int(src_x), int(src_y)), 1, (255, 0, 255), -1)
|
|
|
|
|
|
|
|
|
|
polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png")
|
2026-02-07 23:13:03 +08:00
|
|
|
imwrite_safe(polar_viz_path, polar_viz)
|
2026-02-07 14:03:10 +08:00
|
|
|
logger.info(f" - Fallback polar visualization saved: seal_polar_viz_{i}.png")
|
|
|
|
|
else:
|
|
|
|
|
logger.warning(f" Seal #{i}: Fallback polar unwarp also returned None")
|
|
|
|
|
|
|
|
|
|
marked_path = os.path.join(output_dir, f"seal_marked_{i}.png")
|
2026-02-07 23:13:03 +08:00
|
|
|
imwrite_safe(marked_path, marked)
|
2026-02-07 14:03:10 +08:00
|
|
|
|
|
|
|
|
# OCR recognition
|
|
|
|
|
ocr_result = {'text': '', 'score': 0.0, 'success': False}
|
2026-02-07 23:13:03 +08:00
|
|
|
|
2026-02-07 14:03:10 +08:00
|
|
|
if unwarp is not None:
|
2026-02-07 23:13:03 +08:00
|
|
|
# Standard path: Recognize unwarp image
|
2026-02-07 14:03:10 +08:00
|
|
|
method_str = "FALLBACK" if used_fallback else "Standard"
|
|
|
|
|
logger.info(f" Seal #{i}: Running OCR ({method_str}, model={ocr_model}) on unwarp image...")
|
|
|
|
|
|
|
|
|
|
if ocr_model == "paddleocr_vl":
|
|
|
|
|
ocr_result = run_ocr_recognition_vl(unwarp_path, vl_pipeline)
|
|
|
|
|
else:
|
|
|
|
|
ocr_result = run_ocr_recognition(unwarp_path, rec_model)
|
|
|
|
|
|
|
|
|
|
logger.info(f" Seal #{i} OCR Result:")
|
|
|
|
|
logger.info(f" - Text: '{ocr_result['text']}'")
|
|
|
|
|
logger.info(f" - Score: {ocr_result['score']:.4f}")
|
|
|
|
|
logger.info(f" - Success: {ocr_result['success']}")
|
|
|
|
|
logger.info(f" - Text length: {len(ocr_result['text'])} chars")
|
|
|
|
|
if used_fallback:
|
|
|
|
|
logger.info(f" - ** Used fallback angle range (7:30 to 4:30) **")
|
|
|
|
|
else:
|
2026-02-07 23:13:03 +08:00
|
|
|
# ============ BACKUP: Use PaddleOCRVL directly on seal crop ============
|
|
|
|
|
logger.warning(f" Seal #{i}: No unwarp image available (polar unwarp failed)")
|
|
|
|
|
|
|
|
|
|
if vl_pipeline is not None and PADDLEOCRVL_AVAILABLE:
|
|
|
|
|
logger.info(f" Seal #{i}: Using PaddleOCRVL backup - directly recognize seal crop image")
|
|
|
|
|
seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
|
|
|
|
|
ocr_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline)
|
|
|
|
|
logger.info(f" Seal #{i} PaddleOCRVL Backup Result:")
|
|
|
|
|
logger.info(f" - Text: '{ocr_result['text']}'")
|
|
|
|
|
logger.info(f" - Score: {ocr_result['score']:.4f}")
|
|
|
|
|
logger.info(f" - Success: {ocr_result['success']}")
|
|
|
|
|
logger.info(f" - Text length: {len(ocr_result['text'])} chars")
|
|
|
|
|
logger.info(f" - ** Used PaddleOCRVL backup (direct crop recognition) **")
|
|
|
|
|
else:
|
|
|
|
|
logger.warning(f" Seal #{i}: No backup available (vl_pipeline=None or PaddleOCRVL not installed), skipping OCR")
|
2026-02-07 14:03:10 +08:00
|
|
|
|
|
|
|
|
seal_data = {
|
|
|
|
|
'index': int(i),
|
|
|
|
|
'box': [float(v) for v in box],
|
|
|
|
|
'crop_path': f"seal_crop_{i}.png",
|
|
|
|
|
'unwarp_path': f"seal_unwarp_{i}.png" if unwarp is not None else None,
|
|
|
|
|
'marked_path': f"seal_marked_{i}.png",
|
|
|
|
|
'polar_viz_path': f"seal_polar_viz_{i}.png" if unwarp is not None else None,
|
|
|
|
|
'text': ocr_result['text'],
|
|
|
|
|
'confidence': float(ocr_result['score']),
|
|
|
|
|
'success': bool(ocr_result['success']),
|
|
|
|
|
'method_used': method_used, # Add method tracking
|
|
|
|
|
'used_fallback': used_fallback, # Track if fallback was used
|
|
|
|
|
'debug_info': {
|
|
|
|
|
'center': center,
|
|
|
|
|
'radius': radius,
|
|
|
|
|
'start_theta_deg': float(math.degrees(start_theta)),
|
|
|
|
|
'extent_deg': float(math.degrees(extent)),
|
|
|
|
|
'num_polygons': len(all_polygons),
|
|
|
|
|
'crop_size': (cw, ch),
|
|
|
|
|
'unwarp_size': (unwarp.shape[1], unwarp.shape[0]) if unwarp is not None else None
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
result['seals'].append(seal_data)
|
|
|
|
|
|
|
|
|
|
if ocr_result['success']:
|
|
|
|
|
result['institutions'].append(ocr_result['text'])
|
|
|
|
|
logger.info(f" ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})")
|
|
|
|
|
else:
|
|
|
|
|
logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name")
|
|
|
|
|
|
|
|
|
|
result['processing_time'] = time.time() - start_time
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ============ Similarity and Matching Functions ============
|
|
|
|
|
|
|
|
|
|
def calculate_similarity(str1: str, str2: str) -> float:
    """Return the Levenshtein-based similarity of two strings as a percentage.

    The score is ``(1 - edit_distance / longer_length) * 100``, rounded to two
    decimals. If either argument is empty or ``None`` the score is 0.0; this
    also applies when both are empty.
    """
    if not (str1 and str2):
        return 0.0

    # Both strings are non-empty past the guard, so the divisor is at least 1.
    longest = max(len(str1), len(str2))
    distance = levenshtein_distance(str1, str2)
    return round((1 - distance / longest) * 100, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def classify_match(extracted: Optional[str], expected: str) -> Dict[str, Any]:
    """Grade an extracted value against the expected one.

    Args:
        extracted: Value produced by the pipeline, or None when nothing was
            extracted.
        expected: Ground-truth value to compare against.

    Returns:
        Dict with 'match_type' ('exact', 'partial' or 'no_match'),
        'similarity' (percentage from calculate_similarity) and
        'edit_distance' (from levenshtein_distance). A None extraction is
        reported as 'no_match' with similarity 0.0 and an edit distance of
        len(expected).
    """
    if extracted is None:
        return dict(match_type='no_match', similarity=0.0, edit_distance=len(expected))

    score = calculate_similarity(extracted, expected)
    distance = levenshtein_distance(extracted, expected)

    # 'exact' requires a full score; anything at or above the module-level
    # threshold still counts as a partial hit.
    label = (
        'exact' if score == 100.0
        else 'partial' if score >= SIMILARITY_THRESHOLD
        else 'no_match'
    )
    return dict(match_type=label, similarity=score, edit_distance=distance)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ============ PDF Processing Functions ============
|
|
|
|
|
|
|
|
|
|
def extract_pdf_page(pdf_path: str, page_num: int = 0) -> Optional[np.ndarray]:
    """Render one page of a PDF into an OpenCV-style BGR image.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to render.

    Returns:
        The rendered page as a uint8 array converted to BGR, or None when
        opening, rendering or converting fails (the error is logged).
        Pixmaps with an unexpected channel count below 3 are returned
        without colour conversion.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc.load_page(page_num)
            # Render with a 2x scaling matrix in both directions.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

            # Convert to BGR format for OpenCV
            if pix.n == 4:  # RGBA
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            elif pix.n == 3:  # RGB
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            elif pix.n == 1:  # Grayscale
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            else:
                logger.warning(f"Unexpected number of channels: {pix.n}")
                # Assume the first three channels are RGB and convert
                if pix.n >= 3:
                    img = img[:, :, :3]
                    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

            return img
        finally:
            # Always release the document so batch runs do not accumulate
            # open file handles, whether rendering succeeded or raised.
            doc.close()
    except Exception as e:
        logger.error(f"Failed to extract page from {pdf_path}: {e}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str,
                       pdf_dir: Path, output_dir: Path, ocr_engine,
                       ocr_model="ppocr_v5", vl_pipeline=None) -> Dict[str, Any]:
    """
    Process a single PDF for CMA and institution extraction.

    Args:
        pdf_name: Name of PDF file
        expected_cma: Expected CMA code from ground truth ("无" or None means
            there is no ground truth to compare against)
        expected_inst: Expected institution name from ground truth (empty,
            None or "无" means there is nothing to compare against)
        pdf_dir: Directory containing PDFs
        output_dir: Output directory for results; a sub-directory named after
            the PDF is wiped and recreated on every call
        ocr_engine: OCR engine instance, passed to extract_cma_code_fullpage
        ocr_model: OCR model to use ("ppocr_v5" or "paddleocr_vl")
        vl_pipeline: PaddleOCRVL pipeline (required if ocr_model="paddleocr_vl")

    Returns:
        Result dictionary with extraction and comparison data. 'status' is
        'success', 'file_not_found' or 'extraction_failed'; in the two failure
        cases 'error' carries the message and the extraction fields keep their
        defaults. The institution comparison is only filled in when at least
        one seal produced an institution name.
    """
    pdf_path = pdf_dir / pdf_name
    pdf_output_dir = output_dir / pdf_name

    result = {
        'pdf_name': pdf_name,
        'expected': {
            'cma': expected_cma,
            'institution': expected_inst,
        },
        'extracted': {
            'cma': None,
            'institution': None,
            'cma_confidence': 0.0,
            'cma_success': False,
            'institutions_from_seals': [],
        },
        'comparison': {
            'cma': {},
            'institution': {},
        },
        'performance': {
            'total_time': 0.0,
            'cma_time': 0.0,
            'seal_time': 0.0,
        },
        'seal_results': [],
        'status': 'success',
        'error': None,
        'file_size': 0,
    }

    # Check file exists
    if not pdf_path.exists():
        result['status'] = 'file_not_found'
        result['error'] = f"PDF file not found: {pdf_path}"
        logger.warning(result['error'])
        return result

    result['file_size'] = pdf_path.stat().st_size

    # Clean output directory to ensure fresh processing
    if pdf_output_dir.exists():
        import shutil
        try:
            shutil.rmtree(pdf_output_dir)
            logger.info(f"Cleaned existing output directory: {pdf_output_dir}")
        except OSError as e:
            # Best effort: stale files may remain, but processing continues.
            logger.warning(f"Failed to clean output directory: {e}")

    # Create fresh output directory
    pdf_output_dir.mkdir(parents=True, exist_ok=True)
    total_start = time.time()

    # Extract page
    logger.info(f"Extracting page 1 from {pdf_name}...")
    page_img = extract_pdf_page(str(pdf_path), page_num=0)
    if page_img is None:
        result['status'] = 'extraction_failed'
        result['error'] = "Failed to extract page from PDF"
        # Record the time spent so failed PDFs do not count as 0 s in averages.
        result['performance']['total_time'] = time.time() - total_start
        return result

    # Extract CMA code
    logger.info(f"Running CMA extraction on {pdf_name}...")
    cma_start = time.time()
    cma_result = extract_cma_code_fullpage(page_img, ocr_engine, output_dir=str(pdf_output_dir))
    result['performance']['cma_time'] = time.time() - cma_start

    result['extracted']['cma'] = cma_result['code']
    result['extracted']['cma_confidence'] = cma_result['confidence']
    result['extracted']['cma_success'] = cma_result['success']

    # Compare CMA. None is treated like "无" (no ground truth), matching how
    # the summary report filters its valid entries; classify_match cannot
    # handle a None expected value.
    if expected_cma in ("无", None):
        result['comparison']['cma']['notes'] = "Ground truth marked as 'None'"
    else:
        result['comparison']['cma'] = classify_match(cma_result['code'], expected_cma)

    # Extract seals and institutions
    logger.info(f"Running seal extraction on {pdf_name}...")
    seal_start = time.time()
    seal_result = extract_seals_and_institutions(page_img, str(pdf_output_dir),
                                                 ocr_model=ocr_model, vl_pipeline=vl_pipeline)
    result['performance']['seal_time'] = time.time() - seal_start

    result['seal_results'] = seal_result['seals']
    result['extracted']['institutions_from_seals'] = seal_result['institutions']

    # Select best institution match
    candidates = seal_result['institutions']
    if candidates:
        has_expected_inst = bool(expected_inst) and expected_inst != "无"
        best_inst = None

        if has_expected_inst:
            # Keep the candidate with the strictly highest similarity.
            best_similarity = 0.0
            for inst in candidates:
                sim = calculate_similarity(inst, expected_inst)
                if sim > best_similarity:
                    best_similarity = sim
                    best_inst = inst
        else:
            # Nothing to compare against: take the first non-empty candidate.
            best_inst = next((inst for inst in candidates if inst), None)

        # Fallback: if best_inst is still None (all similarities were 0), use first institution
        if best_inst is None:
            best_inst = candidates[0]

        result['extracted']['institution'] = best_inst

        # Compare institution
        if has_expected_inst:
            result['comparison']['institution'] = classify_match(best_inst, expected_inst)
        else:
            result['comparison']['institution']['notes'] = "No expected institution"

    result['performance']['total_time'] = time.time() - total_start

    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_individual_report(result: Dict[str, Any], output_dir: Path):
    """Generate individual HTML report for a single PDF.

    Writes ``index.html`` into ``output_dir`` (created if missing). Image
    references in the page are relative file names, so the report expects the
    seal and visualization images to sit next to it.

    Args:
        result: Per-PDF result dictionary; reads 'pdf_name', 'expected',
            'extracted', 'comparison', 'performance' and 'seal_results'.
        output_dir: Directory that receives ``index.html``.
    """
    # Imported locally so the block stays self-contained.
    from html import escape

    def esc(value) -> str:
        """HTML-escape any value; OCR text and file names are untrusted."""
        return escape(str(value))

    pdf_name = result['pdf_name']
    expected_cma = result['expected']['cma']
    expected_inst = result['expected']['institution']
    extracted_cma = result['extracted']['cma']
    extracted_inst = result['extracted']['institution']

    cma_cmp = result['comparison'].get('cma', {})
    inst_cmp = result['comparison'].get('institution', {})
    cma_match = cma_cmp.get('match_type', 'no_match')
    cma_sim = cma_cmp.get('similarity', 0)
    inst_match = inst_cmp.get('match_type', 'no_match')
    inst_sim = inst_cmp.get('similarity', 0)

    total_time = result['performance']['total_time']
    cma_time = result['performance']['cma_time']
    seal_time = result['performance']['seal_time']

    # Colors: green for exact, orange for partial, red otherwise
    match_colors = {'exact': '#4caf50', 'partial': '#ff9800'}
    cma_color = match_colors.get(cma_match, '#f44336')
    inst_color = match_colors.get(inst_match, '#f44336')

    # Build seals HTML
    img_style = 'style="max-height: 200px; border: 1px solid #ddd;"'
    seal_blocks = []
    for seal in result['seal_results']:
        status = "[OK]" if seal['success'] else "[FAIL]"
        text = esc(seal['text']) if seal['text'] else "No text recognized"
        unwarp_path = seal.get('unwarp_path')
        if unwarp_path:
            unwarp_html = f'<img src="{esc(unwarp_path)}" {img_style}>'
        else:
            unwarp_html = 'N/A'
        seal_blocks.append(f"""
<div style="background: white; padding: 15px; margin-bottom: 20px; border-radius: 6px; border-left: 4px solid #2196F3;">
    <h3>Seal #{esc(seal['index'])}</h3>
    <p><strong>Recognized Text:</strong> {text}</p>
    <p><strong>Confidence:</strong> {seal['confidence']:.2%}</p>
    <p><strong>Status:</strong> {status}</p>
    <div style="display: flex; gap: 10px; margin-top: 10px;">
        <div>
            <p style="margin: 0;">Marked:</p>
            <img src="{esc(seal['marked_path'])}" {img_style}>
        </div>
        <div>
            <p style="margin: 0;">Unwarped:</p>
            {unwarp_html}
        </div>
    </div>
</div>""")

    seals_html = ""
    if seal_blocks:
        seals_html = "<h2>Detected Seals and Institution Names</h2>" + "".join(seal_blocks)

    page_html = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>Extraction Report - {esc(pdf_name)}</title>
    <style>
        body {{ font-family: 'Segoe UI', sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }}
        .container {{ max-width: 1200px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; }}
        h1 {{ color: #333; border-bottom: 3px solid #4caf50; padding-bottom: 10px; }}
        .info-grid {{ display: grid; grid-template-columns: repeat(2, 1fr); gap: 20px; margin: 20px 0; }}
        .info-box {{ background: #f9f9f9; padding: 15px; border-radius: 6px; }}
        .info-box label {{ display: block; font-weight: bold; color: #666; margin-bottom: 5px; }}
        .info-box .value {{ font-size: 18px; }}
        .cma-box {{ border-left: 4px solid {cma_color}; }}
        .inst-box {{ border-left: 4px solid {inst_color}; }}
        .similarity {{ text-align: center; margin: 20px 0; }}
        .similarity .score {{ font-size: 48px; font-weight: bold; }}
    </style>
</head>
<body>
    <div class="container">
        <h1>CMA & Institution Extraction Report</h1>
        <p><strong>PDF:</strong> {esc(pdf_name)}</p>
        <p><strong>Processing Time:</strong> {total_time:.2f}s</p>

        <h2>CMA Code Extraction</h2>
        <div class="info-grid">
            <div class="info-box cma-box">
                <label>Expected CMA</label>
                <div class="value">{esc(expected_cma)}</div>
            </div>
            <div class="info-box cma-box">
                <label>Extracted CMA</label>
                <div class="value">{esc(extracted_cma) if extracted_cma else 'N/A'}</div>
            </div>
            <div class="info-box">
                <label>Match Type</label>
                <div class="value" style="color: {cma_color};">{esc(cma_match.upper())}</div>
            </div>
            <div class="info-box">
                <label>Similarity</label>
                <div class="value">{cma_sim:.1f}%</div>
            </div>
        </div>

        <h2>Institution Name Extraction</h2>
        <div class="info-grid">
            <div class="info-box inst-box">
                <label>Expected Institution</label>
                <div class="value">{esc(expected_inst)}</div>
            </div>
            <div class="info-box inst-box">
                <label>Extracted Institution</label>
                <div class="value">{esc(extracted_inst) if extracted_inst else 'N/A'}</div>
            </div>
            <div class="info-box">
                <label>Match Type</label>
                <div class="value" style="color: {inst_color};">{esc(inst_match.upper())}</div>
            </div>
            <div class="info-box">
                <label>Similarity</label>
                <div class="value">{inst_sim:.1f}%</div>
            </div>
        </div>

        <h2>Performance</h2>
        <div class="info-grid">
            <div class="info-box">
                <label>Total Time</label>
                <div class="value">{total_time:.2f}s</div>
            </div>
            <div class="info-box">
                <label>CMA Extraction Time</label>
                <div class="value">{cma_time:.2f}s</div>
            </div>
            <div class="info-box">
                <label>Seal Extraction Time</label>
                <div class="value">{seal_time:.2f}s</div>
            </div>
            <div class="info-box">
                <label>Seals Detected</label>
                <div class="value">{len(result['seal_results'])}</div>
            </div>
        </div>

        {seals_html}

        <h2>Visualizations</h2>
        <div style="background: white; padding: 15px; border-radius: 6px;">
            <p style="margin: 0 0 10px 0;">CMA Detection:</p>
            <img src="cma_detection_fullpage.png" style="max-width: 100%; border: 1px solid #ddd;">
        </div>
        <div style="background: white; padding: 15px; border-radius: 6px; margin-top: 10px;">
            <p style="margin: 0 0 10px 0;">Layout Detection:</p>
            <img src="doc_layout_viz.png" style="max-width: 100%; border: 1px solid #ddd;">
        </div>
    </div>
</body>
</html>"""

    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir / 'index.html', 'w', encoding='utf-8') as f:
        f.write(page_html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_summary_report(all_results: List[Dict[str, Any]], output_dir: Path):
    """Generate summary HTML report.

    Writes ``summary.html`` into ``output_dir`` (created if missing) with
    exact/partial/no-match counts, exact-match accuracy and a per-PDF table.

    Results whose expected value is "无" or None are excluded from the
    corresponding accuracy statistics but still appear in the table. An empty
    ``all_results`` produces a report with zero counts and a 0.0 s average.

    Args:
        all_results: Result dictionaries as returned by process_single_pdf.
        output_dir: Directory that receives ``summary.html``.
    """
    # Imported locally so the block stays self-contained.
    from html import escape

    def esc(value) -> str:
        """HTML-escape any value; file names and OCR text are untrusted."""
        return escape(str(value))

    def clip(value, limit: int = 30) -> str:
        """Escape a cell value, adding '...' only when it was actually cut."""
        text = str(value)
        if len(text) > limit:
            text = text[:limit] + '...'
        return escape(text)

    def count_matches(rows, field: str, kind: str) -> int:
        """Count rows whose comparison for `field` has the given match type."""
        return sum(1 for r in rows if r['comparison'][field].get('match_type') == kind)

    # Calculate statistics
    no_truth = ('无', None)
    valid_cma = [r for r in all_results if r['expected']['cma'] not in no_truth]
    valid_inst = [r for r in all_results if r['expected']['institution'] not in no_truth]

    cma_exact = count_matches(valid_cma, 'cma', 'exact')
    cma_partial = count_matches(valid_cma, 'cma', 'partial')
    cma_no = len(valid_cma) - cma_exact - cma_partial

    inst_exact = count_matches(valid_inst, 'institution', 'exact')
    inst_partial = count_matches(valid_inst, 'institution', 'partial')
    inst_no = len(valid_inst) - inst_exact - inst_partial

    cma_acc = (cma_exact / len(valid_cma) * 100) if valid_cma else 0
    inst_acc = (inst_exact / len(valid_inst) * 100) if valid_inst else 0

    # Guard the empty batch: a mean over no values would be NaN.
    times = [r['performance']['total_time'] for r in all_results]
    avg_time = sum(times) / len(times) if times else 0.0

    parts = [f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>Batch Test Summary - CMA & Institution Extraction</title>
    <style>
        body {{ font-family: 'Segoe UI', sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }}
        .container {{ max-width: 1400px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; }}
        h1 {{ color: #333; }}
        .summary {{ display: grid; grid-template-columns: repeat(4, 1fr); gap: 20px; margin: 20px 0; }}
        .summary-card {{ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 8px; color: white; text-align: center; }}
        .summary-card .label {{ font-size: 14px; opacity: 0.9; }}
        .summary-card .value {{ font-size: 32px; font-weight: bold; }}
        table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
        th, td {{ padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }}
        th {{ background: #f5f5f5; }}
    </style>
</head>
<body>
    <div class="container">
        <h1>CMA & Institution Extraction - Batch Test Summary</h1>
        <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>

        <h2>CMA Code Results</h2>
        <div class="summary">
            <div class="summary-card" style="background: linear-gradient(135deg, #4caf50 0%, #45a049 100%);">
                <div class="label">Exact Match</div>
                <div class="value">{cma_exact}/{len(valid_cma)}</div>
            </div>
            <div class="summary-card" style="background: linear-gradient(135deg, #ff9800 0%, #f57c00 100%);">
                <div class="label">Partial Match</div>
                <div class="value">{cma_partial}/{len(valid_cma)}</div>
            </div>
            <div class="summary-card" style="background: linear-gradient(135deg, #f44336 0%, #d32f2f 100%);">
                <div class="label">No Match</div>
                <div class="value">{cma_no}/{len(valid_cma)}</div>
            </div>
            <div class="summary-card" style="background: linear-gradient(135deg, #2196F3 0%, #1976D2 100%);">
                <div class="label">Accuracy</div>
                <div class="value">{cma_acc:.1f}%</div>
            </div>
        </div>

        <h2>Institution Name Results</h2>
        <div class="summary">
            <div class="summary-card" style="background: linear-gradient(135deg, #4caf50 0%, #45a049 100%);">
                <div class="label">Exact Match</div>
                <div class="value">{inst_exact}/{len(valid_inst)}</div>
            </div>
            <div class="summary-card" style="background: linear-gradient(135deg, #ff9800 0%, #f57c00 100%);">
                <div class="label">Partial Match</div>
                <div class="value">{inst_partial}/{len(valid_inst)}</div>
            </div>
            <div class="summary-card" style="background: linear-gradient(135deg, #f44336 0%, #d32f2f 100%);">
                <div class="label">No Match</div>
                <div class="value">{inst_no}/{len(valid_inst)}</div>
            </div>
            <div class="summary-card" style="background: linear-gradient(135deg, #2196F3 0%, #1976D2 100%);">
                <div class="label">Accuracy</div>
                <div class="value">{inst_acc:.1f}%</div>
            </div>
        </div>

        <h2>Performance</h2>
        <p>Average processing time: {avg_time:.1f}s per PDF</p>

        <h2>Complete Results</h2>
        <table>
            <thead>
                <tr>
                    <th>PDF</th>
                    <th>Expected CMA</th>
                    <th>Extracted CMA</th>
                    <th>CMA Match</th>
                    <th>Expected Inst</th>
                    <th>Extracted Inst</th>
                    <th>Inst Match</th>
                    <th>Seals</th>
                    <th>Time</th>
                </tr>
            </thead>
            <tbody>"""]

    symbols = {'exact': '[OK]', 'partial': '[PARTIAL]', 'no_match': '[FAIL]'}
    for r in all_results:
        comparison = r['comparison']
        cma_symbol = symbols.get(comparison.get('cma', {}).get('match_type', 'no_match'), '[?]')
        inst_symbol = symbols.get(comparison.get('institution', {}).get('match_type', 'no_match'), '[?]')
        seals_count = len(r['seal_results'])

        # Collected in a list and joined once instead of repeated `+=`.
        parts.append(f"""
                <tr>
                    <td>{esc(r['pdf_name'])}</td>
                    <td>{esc(r['expected']['cma'])}</td>
                    <td>{esc(r['extracted']['cma'] or 'N/A')}</td>
                    <td>{cma_symbol}</td>
                    <td>{clip(r['expected']['institution'] or 'N/A')}</td>
                    <td>{clip(r['extracted']['institution'] or 'N/A')}</td>
                    <td>{inst_symbol}</td>
                    <td>{seals_count}</td>
                    <td>{r['performance']['total_time']:.1f}s</td>
                </tr>""")

    parts.append("""
            </tbody>
        </table>
    </div>
</body>
</html>""")

    # Create the directory first, as generate_individual_report does.
    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir / 'summary.html', 'w', encoding='utf-8') as f:
        f.write("".join(parts))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _safe_pct(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
    return (part / whole * 100) if whole else 0.0


def _batch_match_stats(results, field, require_extracted=False):
    """Tally exact/partial/no-match counts for one extracted field.

    Args:
        results: Per-PDF result dicts collected in ``main``'s processing loop.
        field: Key to evaluate, ``'cma'`` or ``'institution'``.
        require_extracted: When true, results whose extracted value is falsy
            are left out of the denominator.

    Returns:
        Dict with ``total``, ``exact``, ``partial`` and ``no_match`` counts
        plus ``accuracy`` (exact matches as a percentage of ``total``; 0.0
        when ``total`` is 0).
    """
    # '无' ("none") or None as the expected value excludes the PDF from the
    # statistics for this field.
    valid = [
        r for r in results
        if r['expected'][field] not in ['无', None]
        and (not require_extracted or r['extracted'][field])
    ]
    # .get(field, {}) mirrors the summary report, which tolerates results
    # whose comparison dict lacks the field.
    match_types = [r['comparison'].get(field, {}).get('match_type') for r in valid]
    exact = match_types.count('exact')
    partial = match_types.count('partial')
    total = len(valid)
    return {
        'total': total,
        'exact': exact,
        'partial': partial,
        'no_match': total - exact - partial,
        'accuracy': _safe_pct(exact, total),
    }


def main():
    """Run the batch accuracy test from the command line.

    Parses ``--ocr-model``, ``--batch-size`` and ``--pdf-names``, loads the
    ground truth from ``RESULTS_JSON``, initializes the OCR engines, runs
    ``process_single_pdf`` on every selected PDF and prints per-PDF, interim
    (every 5 PDFs) and final statistics. Reports are written through
    ``generate_individual_report`` / ``generate_summary_report`` and the raw
    results are dumped to ``OUTPUT_DIR / 'test_report.json'``.

    Returns:
        None. Returns early (after logging an error) when the ground-truth
        file is missing or none of the names given via ``--pdf-names`` exist
        in it.
    """
    # Parse command line arguments
    import argparse

    parser = argparse.ArgumentParser(description='CMA & Institution Extraction - Batch Accuracy Test')
    parser.add_argument('--ocr-model', type=str, default=OCR_MODEL,
                        choices=['ppocr_v5', 'paddleocr_vl'],
                        help='OCR model to use (default: from OCR_MODEL env var or ppocr_v5)')
    parser.add_argument('--batch-size', type=int, default=BATCH_SIZE,
                        help=f'Number of PDFs to process (default: {BATCH_SIZE})')
    parser.add_argument('--pdf-names', type=str, default=None,
                        help='Comma-separated list of PDF names to process (e.g., "1.pdf,2.pdf"). Overrides --batch-size')
    args = parser.parse_args()

    # Use command line argument if provided
    ocr_model = args.ocr_model
    batch_size = args.batch_size
    pdf_names_filter = args.pdf_names

    print("=" * 80)
    print("CMA & INSTITUTION EXTRACTION - BATCH ACCURACY TEST")
    print("=" * 80)
    print(f"OCR Model: {ocr_model.upper()}")
    if not pdf_names_filter:
        # --pdf-names overrides --batch-size, so only announce the batch size
        # when it actually decides what gets processed.
        print(f"Processing first {batch_size} PDFs from results.json...")
    print(f"PDF directory: {PDF_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")
    print()

    # Load ground truth
    if not RESULTS_JSON.exists():
        logger.error(f"Ground truth file not found: {RESULTS_JSON}")
        return

    with open(RESULTS_JSON, 'r', encoding='utf-8') as f:
        ground_truth = json.load(f)

    # Filter PDFs: either by name filter or by batch size
    if pdf_names_filter:
        # Split comma-separated names, strip whitespace, drop empty entries
        # (e.g. from a trailing comma).
        requested_names = [name.strip() for name in pdf_names_filter.split(',') if name.strip()]
        pdf_list = [(name, ground_truth[name]) for name in requested_names if name in ground_truth]
        if not pdf_list:
            logger.error(f"None of the specified PDFs found in results.json: {requested_names}")
            print(f"ERROR: None of the specified PDFs found in results.json: {requested_names}")
            return
        missing_names = [name for name in requested_names if name not in ground_truth]
        if missing_names:
            # Surface partially-unknown requests instead of silently dropping them.
            logger.warning(f"Specified PDFs not found in results.json (skipped): {missing_names}")
            print(f"WARNING: Specified PDFs not found in results.json (skipped): {missing_names}")
        print(f"Processing {len(pdf_list)} specified PDF(s): {[name for name, _ in pdf_list]}")
    else:
        # Get first N PDFs
        pdf_list = list(ground_truth.items())[:batch_size]

    # Initialize OCR engines
    # Note: We ALWAYS initialize ocr_engine for CMA recognition
    # We ALWAYS try to initialize vl_pipeline for backup seal recognition (when unwarp fails)
    vl_pipeline = None

    logger.info("Initializing PaddleOCR engine for CMA recognition...")
    print("Initializing PaddleOCR engine (required for CMA extraction)...")
    ocr_engine = PaddleOCR(use_angle_cls=True, lang='ch')
    logger.info("PaddleOCR initialized successfully")
    print("PaddleOCR initialized successfully\n")

    # Initialize PaddleOCRVL for backup seal recognition (always try if available)
    # This provides a fallback when polar unwarping fails
    if PADDLEOCRVL_AVAILABLE:
        logger.info("Initializing PaddleOCRVL for backup seal recognition...")
        print("Initializing PaddleOCRVL for backup seal recognition (this may take a while)...")
        try:
            vl_pipeline = PaddleOCRVL(
                use_seal_recognition=True,
                use_ocr_for_image_block=True,
                use_layout_detection=True
            )

            # Verify initialization
            if vl_pipeline is None:
                raise RuntimeError("PaddleOCRVL initialization returned None")
        except Exception as e:
            # Best effort: the backup engine is optional, so report and go on.
            vl_pipeline = None
            logger.error(f"Failed to initialize PaddleOCRVL: {e}")
            logger.error(f"Exception type: {type(e).__name__}")
            print(f"WARNING: Failed to initialize PaddleOCRVL: {e}")
            print("Polar unwarping failures will skip OCR (no backup available)\n")
        else:
            logger.info("PaddleOCRVL initialized successfully (backup ready)")
            print("PaddleOCRVL backup ready - will be used when polar unwarping fails\n")
    else:
        logger.info("PaddleOCRVL not available - polar unwarping failures will skip OCR")
        print("Note: PaddleOCRVL not installed - polar unwarping failures will skip OCR")
        print(" To enable backup: pip install paddleocr[doc-parser]\n")

    # Validate OCR model selection
    if ocr_model == "paddleocr_vl" and vl_pipeline is None:
        print("WARNING: PaddleOCRVL requested for primary seal recognition but not available!")
        print("Falling back to PP-OCRv5 for seal recognition")
        print("Please install: pip install paddleocr[doc-parser]")
        ocr_model = "ppocr_v5"

    # Create output directory (including missing parents)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Console markers for each comparison outcome; unknown types print '[?]'.
    match_symbols = {'exact': '[OK]', 'partial': '[PARTIAL]', 'no_match': '[FAIL]'}

    # Process each PDF
    all_results = []
    start_time = time.time()
    total_pdfs = len(pdf_list)

    for i, (pdf_name, expected_data) in enumerate(pdf_list, 1):
        expected_cma = expected_data.get('CMA', '')
        expected_inst = expected_data.get('机构名', '')

        print(f"\n[{i}/{total_pdfs}] Processing: {pdf_name}")
        print(" + Loading PDF and extracting page...")

        result = process_single_pdf(
            pdf_name, expected_cma, expected_inst,
            PDF_DIR, OUTPUT_DIR, ocr_engine,
            ocr_model=ocr_model, vl_pipeline=vl_pipeline
        )

        all_results.append(result)

        # Print result summary
        if result['status'] == 'file_not_found':
            print(" + [!] File not found, skipping")
        else:
            cma_cmp = result['comparison'].get('cma', {})
            cma_match = cma_cmp.get('match_type', 'unknown')
            cma_sim = cma_cmp.get('similarity', 0)
            cma_symbol = match_symbols.get(cma_match, '[?]')

            print(" + CMA Extraction:")
            print(f" + Extracted: {result['extracted']['cma'] or 'N/A'}")
            print(f" + Expected: {expected_cma}")
            print(f" + Match: {cma_symbol} {cma_match.upper()} ({cma_sim:.1f}%)")

            if result['extracted']['institution']:
                inst_cmp = result['comparison'].get('institution', {})
                inst_match = inst_cmp.get('match_type', 'unknown')
                inst_sim = inst_cmp.get('similarity', 0)
                inst_symbol = match_symbols.get(inst_match, '[?]')
                print(" + Institution Extraction:")
                print(f" + Extracted: {result['extracted']['institution'][:50]}...")
                print(f" + Expected: {expected_inst[:50]}...")
                print(f" + Match: {inst_symbol} {inst_match.upper()} ({inst_sim:.1f}%)")

            print(f" + Seals detected: {len(result['seal_results'])}")
            print(f" + Completed in {result['performance']['total_time']:.2f}s")

        # Generate individual report
        # NOTE(review): placed at loop level (runs for every result, including
        # file_not_found) - confirm against the original indentation.
        generate_individual_report(result, OUTPUT_DIR / pdf_name)

        # Interim results every 5
        if i % 5 == 0:
            cma_stats = _batch_match_stats(all_results, 'cma')
            inst_stats = _batch_match_stats(all_results, 'institution', require_extracted=True)

            print()
            print("=" * 80)
            # Report progress against the PDFs actually selected, not the
            # module-level default batch size.
            print(f"INTERIM RESULTS ({i}/{total_pdfs} completed)")
            print("=" * 80)
            print(f"CMA Accuracy: {cma_stats['accuracy']:.1f}% ({cma_stats['exact']}/{cma_stats['total']} exact)")
            print(f"Institution Accuracy: {inst_stats['accuracy']:.1f}% ({inst_stats['exact']}/{inst_stats['total']} exact)")
            print("=" * 80)
            print()

    total_time = time.time() - start_time

    # Calculate final statistics
    cma_stats = _batch_match_stats(all_results, 'cma')
    inst_stats = _batch_match_stats(all_results, 'institution', require_extracted=True)

    # Generate summary report
    print("\nGenerating summary report...")
    generate_summary_report(all_results, OUTPUT_DIR)

    # Save JSON (accuracy is stored as a 0-1 fraction, not a percentage)
    # np.mean([]) would yield NaN plus a RuntimeWarning, so guard the empty case.
    avg_processing_time = (
        np.mean([r['performance']['total_time'] for r in all_results])
        if all_results else 0.0
    )
    json_output = {
        'summary': {
            'total_processed': len(all_results),
            'cma': {
                'exact': cma_stats['exact'],
                'partial': cma_stats['partial'],
                'no_match': cma_stats['no_match'],
                'accuracy': cma_stats['accuracy'] / 100
            },
            'institution': {
                'exact': inst_stats['exact'],
                'partial': inst_stats['partial'],
                'no_match': inst_stats['no_match'],
                'accuracy': inst_stats['accuracy'] / 100
            },
            'avg_processing_time': avg_processing_time
        },
        'results': all_results
    }

    with open(OUTPUT_DIR / 'test_report.json', 'w', encoding='utf-8') as f:
        json.dump(json_output, f, ensure_ascii=False, indent=2, cls=NumpyEncoder)

    # Print final summary
    print("\n" + "=" * 80)
    print("BATCH TEST COMPLETED - FINAL RESULTS")
    print("=" * 80)
    print(f"Total Processed: {len(all_results)}")
    print()
    print("CMA Code Results:")
    print(f" Exact Match: {cma_stats['exact']}/{cma_stats['total']} ({_safe_pct(cma_stats['exact'], cma_stats['total']):.1f}%)")
    print(f" Partial Match: {cma_stats['partial']}/{cma_stats['total']} ({_safe_pct(cma_stats['partial'], cma_stats['total']):.1f}%)")
    print(f" No Match: {cma_stats['no_match']}/{cma_stats['total']} ({_safe_pct(cma_stats['no_match'], cma_stats['total']):.1f}%)")
    print(f" ** CMA Accuracy: {cma_stats['accuracy']:.1f}% **")
    print()
    print("Institution Name Results:")
    print(f" Exact Match: {inst_stats['exact']}/{inst_stats['total']} ({_safe_pct(inst_stats['exact'], inst_stats['total']):.1f}%)")
    print(f" Partial Match: {inst_stats['partial']}/{inst_stats['total']} ({_safe_pct(inst_stats['partial'], inst_stats['total']):.1f}%)")
    print(f" No Match: {inst_stats['no_match']}/{inst_stats['total']} ({_safe_pct(inst_stats['no_match'], inst_stats['total']):.1f}%)")
    print(f" ** Institution Accuracy: {inst_stats['accuracy']:.1f}% **")
    print()
    print("Performance:")
    print(f" Total Time: {total_time:.1f}s ({total_time/60:.1f}min)")
    avg_wall_time = total_time / len(all_results) if all_results else 0.0
    print(f" Average Time: {avg_wall_time:.1f}s per PDF")
    print()
    print("Reports Generated:")
    print(f" - {OUTPUT_DIR / 'summary.html'}")
    print(f" - {OUTPUT_DIR / 'test_report.json'}")
    # '{pdf_name}/' is a literal placeholder shown to the user, not a lookup.
    print(f" - Individual reports: {OUTPUT_DIR / '{pdf_name}/'}")
    print()
    print("=" * 80)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the batch accuracy test only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|