diff --git a/debug_paddle.py b/debug_paddle.py
deleted file mode 100644
index ddcbf37..0000000
--- a/debug_paddle.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from paddleocr import SealTextDetection
-import os
-
-def debug_paddle():
- img_path = "seal_cropped.png"
- if not os.path.exists(img_path):
- print(f"Error: {img_path} not found")
- return
-
- print(f"Loading SealTextDetection model on {img_path}...")
- try:
- model = SealTextDetection(model_name="PP-OCRv4_server_seal_det")
- output = model.predict(img_path, batch_size=1)
-
- print(f"Output type: {type(output)}")
- for i, res in enumerate(output):
- print(f"Result {i} attributes: {dir(res)}")
- res.print()
- # Try to see if it has boxes or polygons
- if hasattr(res, 'boxes'):
- print(f"Boxes found: {len(res.boxes)}")
- if hasattr(res, 'polygons'):
- print(f"Polygons found: {len(res.polygons)}")
-
- # Save to see what it does
- res.save_to_img(save_path="./debug_output")
- print("Saved debug image to ./debug_output")
-
- except Exception as e:
- print(f"Caught Exception: {e}")
- import traceback
- traceback.print_exc()
-
-if __name__ == "__main__":
- debug_paddle()
diff --git a/generate_viz_report.py b/generate_viz_report.py
deleted file mode 100644
index 7f4ef51..0000000
--- a/generate_viz_report.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import os
-import glob
-import re
-
-def generate_html(viz_dir="report_viz"):
- html_file = os.path.join(viz_dir, "index.html")
-
- # Sort files by timestamp
- files = sorted(os.listdir(viz_dir))
-
- full_pages = [f for f in files if f.startswith("viz_")]
- crops = [f for f in files if "seal_crop_" in f]
- unwarps = [f for f in files if "seal_localized_" in f]
-
- html = """
-
-
- Seal Unwarp Verification Report
-
-
-
- Seal Unwarp Verification Report
- Intermediate steps for seal detection and unwarping.
- """
-
- # Group by similarity in timestamp (they might not be identical)
- # Actually, let's just show them in sequence.
-
- html += '1. Full Page Detections
'
- for pf in full_pages:
- html += f'
{pf}

'
- html += '
'
-
- html += '2. Seal Crops & Unwarps
'
- # Match crops with unwarps by proximity in sorted list or timestamp extraction
- for crop in crops:
- ts = re.search(r"(\d+)", crop).group(1)
- # Find unwarps that happened shortly after this crop
- matching_unwarps = [u for u in unwarps if abs(int(re.search(r"(\d+)", u).group(1)) - int(ts)) < 2000]
-
- html += '
'
- html += f'
Step A: Seal Crop

'
-
- for u in matching_unwarps:
- label = "Step B: 7:30 Unwarp" if "730" in u else "Step C: Smart Unwarp"
- html += f'
{label}

'
-
- html += '
'
- html += '
'
-
- html += ""
-
- with open(html_file, "w", encoding="utf-8") as f:
- f.write(html)
- print(f"HTML report generated: {html_file}")
-
-if __name__ == "__main__":
- generate_html()
diff --git a/test_accuracy_batch_full.py b/test_accuracy_batch_full.py
index efbee14..343897f 100644
--- a/test_accuracy_batch_full.py
+++ b/test_accuracy_batch_full.py
@@ -99,6 +99,39 @@ SIMILARITY_THRESHOLD = 85.0
OCR_MODEL = os.environ.get("OCR_MODEL", "ppocr_v5")
+# ============ Helper Functions ============
+
+def imwrite_safe(file_path, img):
+    """Write an image to disk, tolerating non-ASCII paths on Windows.
+
+    ``cv2.imwrite`` can return False for paths containing non-ASCII (for
+    example Chinese) characters on Windows. When that happens the image is
+    encoded in memory with ``cv2.imencode`` and written via ``ndarray.tofile``.
+    The fallback encodes with the extension of ``file_path`` (".png" if the
+    path has none) so the bytes on disk match the file name.
+
+    Args:
+        file_path: Destination path (str or os.PathLike).
+        img: Image data (numpy array).
+
+    Returns:
+        bool: True if the image was written, False otherwise.
+    """
+    try:
+        file_path = os.fspath(file_path)
+        if cv2.imwrite(file_path, img):
+            return True
+        # Fallback: encode in memory, then let numpy write the bytes.
+        ext = os.path.splitext(file_path)[1] or ".png"
+        is_success, buffer = cv2.imencode(ext, img)
+        if is_success:
+            buffer.tofile(file_path)
+        return bool(is_success)
+    except Exception as e:
+        logger.error(f"Failed to write image to {file_path}: {e}")
+        return False
+
+
# ============ Seal Processing Functions (from v_verify_logic.py) ============
def polar_unwarp(img, center, radius, start_theta, angular_extent):
@@ -219,7 +252,18 @@ def calculate_precise_arc(polygons, center):
candidates.append({'start': st, 'end': en, 'extent': ex, 'score': ex * weight})
candidates.sort(key=lambda x: x['score'], reverse=True)
best = candidates[0]
- return best['start'], best['end'] - best['start']
+
+ # FIX: Limit extent to max 350° to avoid overlap and distortion
+ # Extent > 360° causes severe image distortion in polar unwarping
+ MAX_EXTENT_DEG = 350.0
+ start_theta = best['start']
+ extent = best['end'] - best['start']
+
+ if math.degrees(extent) > MAX_EXTENT_DEG:
+ logger.warning(f"Arc extent {math.degrees(extent):.2f}° exceeds {MAX_EXTENT_DEG}°, clamping to avoid distortion")
+ extent = math.radians(MAX_EXTENT_DEG)
+
+ return start_theta, extent
def fit_circle_from_text_polygons(all_polygons):
@@ -384,10 +428,12 @@ def run_ocr_recognition(image_path, rec_model):
def run_ocr_recognition_vl(image_path, vl_pipeline):
"""
- Run OCR recognition using PaddleOCRVL on unwarp seal image.
+ Run OCR recognition using PaddleOCRVL on seal image.
+
+ Can be used on both unwarp images and crop images (backup mode).
Args:
- image_path: Path to unwarp seal image
+ image_path: Path to seal image (unwarp or crop)
vl_pipeline: Initialized PaddleOCRVL pipeline
Returns:
@@ -492,9 +538,9 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
# Save page image
doc_path = os.path.join(output_dir, "doc_page.png")
try:
- success = cv2.imwrite(doc_path, page_img)
+ success = imwrite_safe(doc_path, page_img)
if not success:
- logger.error(f"cv2.imwrite returned False for {doc_path}")
+ logger.error(f"imwrite_safe returned False for {doc_path}")
# Try alternative save method using PIL
try:
from PIL import Image
@@ -544,7 +590,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
if is_seal:
seal_boxes.append(box)
- cv2.imwrite(os.path.join(output_dir, "doc_layout_viz.png"), page_viz)
+ imwrite_safe(os.path.join(output_dir, "doc_layout_viz.png"), page_viz)
if not seal_boxes:
logger.warning("No seals detected")
@@ -585,7 +631,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
continue
crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
- success = cv2.imwrite(crop_path, seal_crop)
+ success = imwrite_safe(crop_path, seal_crop)
if not success:
# Try PIL fallback
try:
@@ -623,6 +669,88 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
logger.info(f" - Center: ({center[0]}, {center[1]})")
logger.info(f" - Radius: {radius}")
+ # ============ INSUFFICIENT POLYGONS CHECK ============
+ # If too few text polygons detected, polar unwarping will likely fail
+ # Skip directly to PaddleOCRVL backup in this case
+ MIN_POLYGONS_FOR_UNWARP = 3
+ if len(all_polygons) < MIN_POLYGONS_FOR_UNWARP:
+ logger.warning(f" Seal #{i}: Only {len(all_polygons)} text polygons detected (< {MIN_POLYGONS_FOR_UNWARP})")
+ logger.warning(f" Seal #{i}: Skipping polar unwarping (insufficient polygon data)")
+ logger.info(f" Seal #{i}: Using PaddleOCRVL backup instead")
+
+ # Save crop image
+ imwrite_safe(crop_path, seal_crop)
+
+ # Use PaddleOCRVL directly on crop (no unwarp)
+ if vl_pipeline is not None and PADDLEOCRVL_AVAILABLE:
+ ocr_result = run_ocr_recognition_vl(crop_path, vl_pipeline)
+ logger.info(f" Seal #{i} PaddleOCRVL Result (direct crop):")
+ logger.info(f" - Text: '{ocr_result['text']}'")
+ logger.info(f" - Score: {ocr_result['score']:.4f}")
+ logger.info(f" - Success: {ocr_result['success']}")
+ logger.info(f" - ** Used PaddleOCRVL (insufficient polygons for unwarping) **")
+
+ # Create debug info without unwarp
+ seal_data = {
+ 'index': i,
+ 'box': box,
+ 'crop_path': Path(crop_path).name,
+ 'unwarp_path': None, # No unwarp performed
+ 'marked_path': None, # No marked image
+ 'polar_viz_path': None, # No polar visualization
+ 'text': ocr_result['text'],
+ 'confidence': float(ocr_result['score']),
+ 'success': bool(ocr_result['success']),
+ 'method_used': f'{method_used}_skip_unwarp',
+ 'used_fallback': True,
+ 'debug_info': {
+ 'center': center,
+ 'radius': radius,
+ 'start_theta_deg': None,
+ 'extent_deg': None,
+ 'num_polygons': len(all_polygons),
+ 'crop_size': (cw, ch),
+ 'unwarp_size': None,
+ 'skip_reason': f'Insufficient polygons ({len(all_polygons)} < {MIN_POLYGONS_FOR_UNWARP})'
+ }
+ }
+ result['seals'].append(seal_data)
+
+ if ocr_result['success']:
+ result['institutions'].append(ocr_result['text'])
+ logger.info(f" ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})")
+ else:
+ logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name")
+
+ continue # Skip to next seal
+ else:
+ logger.error(f" Seal #{i}: PaddleOCRVL not available, cannot extract text")
+ seal_data = {
+ 'index': i,
+ 'box': box,
+ 'crop_path': Path(crop_path).name,
+ 'unwarp_path': None,
+ 'marked_path': None,
+ 'polar_viz_path': None,
+ 'text': '',
+ 'confidence': 0.0,
+ 'success': False,
+ 'method_used': f'{method_used}_skip_unwarp',
+ 'used_fallback': True,
+ 'debug_info': {
+ 'center': center,
+ 'radius': radius,
+ 'start_theta_deg': None,
+ 'extent_deg': None,
+ 'num_polygons': len(all_polygons),
+ 'crop_size': (cw, ch),
+ 'unwarp_size': None,
+ 'skip_reason': f'Insufficient polygons and no PaddleOCRVL backup'
+ }
+ }
+ result['seals'].append(seal_data)
+ continue
+
# Calculate arc and unwarp
start_theta, extent = calculate_precise_arc(all_polygons, center)
logger.info(f" Seal #{i} Arc Parameters:")
@@ -658,7 +786,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
logger.info(f" Seal #{i}: Performing polar unwarping with detected text polygons...")
unwarp = polar_unwarp(seal_crop, center, radius, start_theta, extent)
if unwarp is not None:
- cv2.imwrite(unwarp_path, unwarp)
+ imwrite_safe(unwarp_path, unwarp)
logger.info(f" - Unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}")
def draw_line(m, theta, color):
@@ -684,7 +812,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
# Save polar visualization
polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png")
- cv2.imwrite(polar_viz_path, polar_viz)
+ imwrite_safe(polar_viz_path, polar_viz)
logger.info(f" - Polar visualization saved: seal_polar_viz_{i}.png")
else:
logger.warning(f" Seal #{i}: Polar unwarp returned None")
@@ -707,7 +835,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
unwarp = polar_unwarp(seal_crop, center, radius, fallback_start_theta, fallback_extent)
if unwarp is not None:
- cv2.imwrite(unwarp_path, unwarp)
+ imwrite_safe(unwarp_path, unwarp)
logger.info(f" - Fallback unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}")
# Update start_theta and extent for visualization
@@ -736,20 +864,19 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
cv2.circle(polar_viz, (int(src_x), int(src_y)), 1, (255, 0, 255), -1)
polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png")
- cv2.imwrite(polar_viz_path, polar_viz)
+ imwrite_safe(polar_viz_path, polar_viz)
logger.info(f" - Fallback polar visualization saved: seal_polar_viz_{i}.png")
else:
logger.warning(f" Seal #{i}: Fallback polar unwarp also returned None")
- if unwarp is None:
- logger.warning(f" Seal #{i}: No unwarp image available, skipping OCR")
-
marked_path = os.path.join(output_dir, f"seal_marked_{i}.png")
- cv2.imwrite(marked_path, marked)
+ imwrite_safe(marked_path, marked)
# OCR recognition
ocr_result = {'text': '', 'score': 0.0, 'success': False}
+
if unwarp is not None:
+ # Standard path: Recognize unwarp image
method_str = "FALLBACK" if used_fallback else "Standard"
logger.info(f" Seal #{i}: Running OCR ({method_str}, model={ocr_model}) on unwarp image...")
@@ -766,7 +893,21 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v
if used_fallback:
logger.info(f" - ** Used fallback angle range (7:30 to 4:30) **")
else:
- logger.warning(f" Seal #{i}: No unwarp image available, skipping OCR")
+ # ============ BACKUP: Use PaddleOCRVL directly on seal crop ============
+ logger.warning(f" Seal #{i}: No unwarp image available (polar unwarp failed)")
+
+ if vl_pipeline is not None and PADDLEOCRVL_AVAILABLE:
+ logger.info(f" Seal #{i}: Using PaddleOCRVL backup - directly recognize seal crop image")
+ seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
+ ocr_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline)
+ logger.info(f" Seal #{i} PaddleOCRVL Backup Result:")
+ logger.info(f" - Text: '{ocr_result['text']}'")
+ logger.info(f" - Score: {ocr_result['score']:.4f}")
+ logger.info(f" - Success: {ocr_result['success']}")
+ logger.info(f" - Text length: {len(ocr_result['text'])} chars")
+ logger.info(f" - ** Used PaddleOCRVL backup (direct crop recognition) **")
+ else:
+ logger.warning(f" Seal #{i}: No backup available (vl_pipeline=None or PaddleOCRVL not installed), skipping OCR")
seal_data = {
'index': int(i),
@@ -994,6 +1135,10 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str,
elif not best_inst:
best_inst = inst
+ # Fallback: if best_inst is still None (all similarities were 0), use first institution
+ if best_inst is None and seal_result['institutions']:
+ best_inst = seal_result['institutions'][0]
+
result['extracted']['institution'] = best_inst
# Compare institution
@@ -1299,11 +1444,14 @@ def main():
help='OCR model to use (default: from OCR_MODEL env var or ppocr_v5)')
parser.add_argument('--batch-size', type=int, default=BATCH_SIZE,
help=f'Number of PDFs to process (default: {BATCH_SIZE})')
+ parser.add_argument('--pdf-names', type=str, default=None,
+ help='Comma-separated list of PDF names to process (e.g., "1.pdf,2.pdf"). Overrides --batch-size')
args = parser.parse_args()
# Use command line argument if provided
ocr_model = args.ocr_model
batch_size = args.batch_size
+ pdf_names_filter = args.pdf_names
print("=" * 80)
print("CMA & INSTITUTION EXTRACTION - BATCH ACCURACY TEST")
@@ -1322,12 +1470,23 @@ def main():
with open(RESULTS_JSON, 'r', encoding='utf-8') as f:
ground_truth = json.load(f)
- # Get first N PDFs
- pdf_list = list(ground_truth.items())[:batch_size]
+ # Filter PDFs: either by name filter or by batch size
+ if pdf_names_filter:
+ # Split comma-separated names and strip whitespace
+ requested_names = [name.strip() for name in pdf_names_filter.split(',')]
+ pdf_list = [(name, ground_truth[name]) for name in requested_names if name in ground_truth]
+ if not pdf_list:
+ logger.error(f"None of the specified PDFs found in results.json: {requested_names}")
+ print(f"ERROR: None of the specified PDFs found in results.json: {requested_names}")
+ return
+ print(f"Processing {len(pdf_list)} specified PDF(s): {[name for name, _ in pdf_list]}")
+ else:
+ # Get first N PDFs
+ pdf_list = list(ground_truth.items())[:batch_size]
# Initialize OCR engines
# Note: We ALWAYS initialize ocr_engine for CMA recognition
- # PaddleOCRVL is ONLY used for seal text recognition
+ # We ALWAYS try to initialize vl_pipeline for backup seal recognition (when unwarp fails)
ocr_engine = None
vl_pipeline = None
@@ -1337,35 +1496,40 @@ def main():
logger.info("PaddleOCR initialized successfully")
print("PaddleOCR initialized successfully\n")
- # Initialize PaddleOCRVL if requested for seal recognition
- if ocr_model == "paddleocr_vl":
- if not PADDLEOCRVL_AVAILABLE:
- print("WARNING: PaddleOCRVL requested but not available!")
- print("Falling back to PP-OCRv5 for seal recognition")
- print("Please install: pip install paddleocr[doc-parser]")
- ocr_model = "ppocr_v5"
- else:
- logger.info("Initializing PaddleOCRVL for seal recognition...")
- print("Initializing PaddleOCRVL for seal recognition (this may take a while)...")
- try:
- vl_pipeline = PaddleOCRVL(
- use_seal_recognition=True,
- use_ocr_for_image_block=True,
- use_layout_detection=True
- )
+ # Initialize PaddleOCRVL for backup seal recognition (always try if available)
+ # This provides a fallback when polar unwarping fails
+ if PADDLEOCRVL_AVAILABLE:
+ logger.info("Initializing PaddleOCRVL for backup seal recognition...")
+ print("Initializing PaddleOCRVL for backup seal recognition (this may take a while)...")
+ try:
+ vl_pipeline = PaddleOCRVL(
+ use_seal_recognition=True,
+ use_ocr_for_image_block=True,
+ use_layout_detection=True
+ )
- # Verify initialization
- if vl_pipeline is None:
- raise RuntimeError("PaddleOCRVL initialization returned None")
+ # Verify initialization
+ if vl_pipeline is None:
+ raise RuntimeError("PaddleOCRVL initialization returned None")
- logger.info("PaddleOCRVL initialized successfully")
- print("PaddleOCRVL for seal recognition initialized successfully\n")
- except Exception as e:
- logger.error(f"Failed to initialize PaddleOCRVL: {e}")
- logger.error(f"Exception type: {type(e).__name__}")
- print(f"WARNING: Failed to initialize PaddleOCRVL: {e}")
- print("Falling back to PP-OCRv5 for seal recognition")
- ocr_model = "ppocr_v5"
+ logger.info("PaddleOCRVL initialized successfully (backup ready)")
+ print("PaddleOCRVL backup ready - will be used when polar unwarping fails\n")
+ except Exception as e:
+ logger.error(f"Failed to initialize PaddleOCRVL: {e}")
+ logger.error(f"Exception type: {type(e).__name__}")
+ print(f"WARNING: Failed to initialize PaddleOCRVL: {e}")
+ print("Polar unwarping failures will skip OCR (no backup available)\n")
+ else:
+ logger.info("PaddleOCRVL not available - polar unwarping failures will skip OCR")
+ print("Note: PaddleOCRVL not installed - polar unwarping failures will skip OCR")
+ print(" To enable backup: pip install paddleocr[doc-parser]\n")
+
+ # Validate OCR model selection
+ if ocr_model == "paddleocr_vl" and vl_pipeline is None:
+ print("WARNING: PaddleOCRVL requested for primary seal recognition but not available!")
+ print("Falling back to PP-OCRv5 for seal recognition")
+ print("Please install: pip install paddleocr[doc-parser]")
+ ocr_model = "ppocr_v5"
# Create output directory
OUTPUT_DIR.mkdir(exist_ok=True)
@@ -1374,11 +1538,12 @@ def main():
all_results = []
start_time = time.time()
+ total_pdfs = len(pdf_list)
for i, (pdf_name, expected_data) in enumerate(pdf_list, 1):
expected_cma = expected_data.get('CMA', '')
expected_inst = expected_data.get('机构名', '')
- print(f"\n[{i}/{BATCH_SIZE}] Processing: {pdf_name}")
+ print(f"\n[{i}/{total_pdfs}] Processing: {pdf_name}")
print(" + Loading PDF and extracting page...")
result = process_single_pdf(