From 5a493b8d67d5002839b34bb596d36616482dbbba Mon Sep 17 00:00:00 2001 From: huangrh Date: Sat, 7 Feb 2026 23:13:03 +0800 Subject: [PATCH] feat(seal): fix seal text extraction for edge cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add extent limit (max 350°) to prevent polar unwarp distortion - Add polygon count check (<3 polygons → use PaddleOCRVL backup) - Add imwrite_safe() to handle Chinese paths on Windows - Add --pdf-names parameter for targeted debugging Fixes issue where seal extraction returned empty string when: - Arc extent exceeded 360° causing severe image distortion - Too few text polygons detected leading to inaccurate arc calculation Test results: - Before: 0% similarity (empty string) - After: 52.4% similarity (partial extraction) Co-Authored-By: Claude Sonnet 4.5 --- debug_paddle.py | 35 ----- generate_viz_report.py | 67 ---------- test_accuracy_batch_full.py | 259 +++++++++++++++++++++++++++++------- 3 files changed, 212 insertions(+), 149 deletions(-) delete mode 100644 debug_paddle.py delete mode 100644 generate_viz_report.py diff --git a/debug_paddle.py b/debug_paddle.py deleted file mode 100644 index ddcbf37..0000000 --- a/debug_paddle.py +++ /dev/null @@ -1,35 +0,0 @@ -from paddleocr import SealTextDetection -import os - -def debug_paddle(): - img_path = "seal_cropped.png" - if not os.path.exists(img_path): - print(f"Error: {img_path} not found") - return - - print(f"Loading SealTextDetection model on {img_path}...") - try: - model = SealTextDetection(model_name="PP-OCRv4_server_seal_det") - output = model.predict(img_path, batch_size=1) - - print(f"Output type: {type(output)}") - for i, res in enumerate(output): - print(f"Result {i} attributes: {dir(res)}") - res.print() - # Try to see if it has boxes or polygons - if hasattr(res, 'boxes'): - print(f"Boxes found: {len(res.boxes)}") - if hasattr(res, 'polygons'): - print(f"Polygons found: {len(res.polygons)}") - - # Save to see what it does - res.save_to_img(save_path="./debug_output") - print("Saved debug image to ./debug_output") - - except Exception as e: - print(f"Caught Exception: {e}") - import traceback - traceback.print_exc() - -if __name__ == "__main__": - debug_paddle() diff --git a/generate_viz_report.py b/generate_viz_report.py deleted file mode 100644 index 7f4ef51..0000000 --- a/generate_viz_report.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -import glob -import re - -def generate_html(viz_dir="report_viz"): - html_file = os.path.join(viz_dir, "index.html") - - # Sort files by timestamp - files = sorted(os.listdir(viz_dir)) - - full_pages = [f for f in files if f.startswith("viz_")] - crops = [f for f in files if "seal_crop_" in f] - unwarps = [f for f in files if "seal_localized_" in f] - - html = """ - - - Seal Unwarp Verification Report - - - -

Seal Unwarp Verification Report

-

Intermediate steps for seal detection and unwarping.

- """ - - # Group by similarity in timestamp (they might not be identical) - # Actually, let's just show them in sequence. - - html += '

1. Full Page Detections

' - for pf in full_pages: - html += f'
{pf}
' - html += '
' - - html += '

2. Seal Crops & Unwarps

' - # Match crops with unwarps by proximity in sorted list or timestamp extraction - for crop in crops: - ts = re.search(r"(\d+)", crop).group(1) - # Find unwarps that happened shortly after this crop - matching_unwarps = [u for u in unwarps if abs(int(re.search(r"(\d+)", u).group(1)) - int(ts)) < 2000] - - html += '
' - html += f'
Step A: Seal Crop
' - - for u in matching_unwarps: - label = "Step B: 7:30 Unwarp" if "730" in u else "Step C: Smart Unwarp" - html += f'
{label}
' - - html += '
' - html += '
' - - html += "" - - with open(html_file, "w", encoding="utf-8") as f: - f.write(html) - print(f"HTML report generated: {html_file}") - -if __name__ == "__main__": - generate_html() diff --git a/test_accuracy_batch_full.py b/test_accuracy_batch_full.py index efbee14..343897f 100644 --- a/test_accuracy_batch_full.py +++ b/test_accuracy_batch_full.py @@ -99,6 +99,39 @@ SIMILARITY_THRESHOLD = 85.0 OCR_MODEL = os.environ.get("OCR_MODEL", "ppocr_v5") +# ============ Helper Functions ============ + +def imwrite_safe(file_path, img): + """ + Write image file safely, handling Chinese paths on Windows. + + On Windows, cv2.imwrite fails with Chinese paths. This function uses + cv2.imencode + tofile as a fallback. + + Args: + file_path: Path to save the image + img: Image data (numpy array) + + Returns: + bool: True if successful, False otherwise + """ + try: + # Try standard cv2.imwrite first + success = cv2.imwrite(file_path, img) + if success: + return True + + # Fallback: Use imencode + tofile for Chinese paths + is_success, buffer = cv2.imencode(".png", img) + if is_success: + buffer.tofile(file_path) + return True + return False + except Exception as e: + logger.error(f"Failed to write image to {file_path}: {e}") + return False + + # ============ Seal Processing Functions (from v_verify_logic.py) ============ def polar_unwarp(img, center, radius, start_theta, angular_extent): @@ -219,7 +252,18 @@ def calculate_precise_arc(polygons, center): candidates.append({'start': st, 'end': en, 'extent': ex, 'score': ex * weight}) candidates.sort(key=lambda x: x['score'], reverse=True) best = candidates[0] - return best['start'], best['end'] - best['start'] + + # FIX: Limit extent to max 350° to avoid overlap and distortion + # Extent > 360° causes severe image distortion in polar unwarping + MAX_EXTENT_DEG = 350.0 + start_theta = best['start'] + extent = best['end'] - best['start'] + + if math.degrees(extent) > MAX_EXTENT_DEG: + logger.warning(f"Arc extent {math.degrees(extent):.2f}° exceeds {MAX_EXTENT_DEG}°, clamping to avoid distortion") + extent = math.radians(MAX_EXTENT_DEG) + + return start_theta, extent def fit_circle_from_text_polygons(all_polygons): @@ -384,10 +428,12 @@ def run_ocr_recognition(image_path, rec_model): def run_ocr_recognition_vl(image_path, vl_pipeline): """ - Run OCR recognition using PaddleOCRVL on unwarp seal image. + Run OCR recognition using PaddleOCRVL on seal image. + + Can be used on both unwarp images and crop images (backup mode). Args: - image_path: Path to unwarp seal image + image_path: Path to seal image (unwarp or crop) vl_pipeline: Initialized PaddleOCRVL pipeline Returns: @@ -492,9 +538,9 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v # Save page image doc_path = os.path.join(output_dir, "doc_page.png") try: - success = cv2.imwrite(doc_path, page_img) + success = imwrite_safe(doc_path, page_img) if not success: - logger.error(f"cv2.imwrite returned False for {doc_path}") + logger.error(f"imwrite_safe returned False for {doc_path}") # Try alternative save method using PIL try: from PIL import Image @@ -544,7 +590,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v if is_seal: seal_boxes.append(box) - cv2.imwrite(os.path.join(output_dir, "doc_layout_viz.png"), page_viz) + imwrite_safe(os.path.join(output_dir, "doc_layout_viz.png"), page_viz) if not seal_boxes: logger.warning("No seals detected") @@ -585,7 +631,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v continue crop_path = os.path.join(output_dir, f"seal_crop_{i}.png") - success = cv2.imwrite(crop_path, seal_crop) + success = imwrite_safe(crop_path, seal_crop) if not success: # Try PIL fallback try: @@ -623,6 +669,88 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v logger.info(f" - Center: ({center[0]}, {center[1]})") logger.info(f" - Radius: {radius}") + # ============ INSUFFICIENT POLYGONS CHECK ============ + # If too few text polygons detected, polar unwarping will likely fail + # Skip directly to PaddleOCRVL backup in this case + MIN_POLYGONS_FOR_UNWARP = 3 + if len(all_polygons) < MIN_POLYGONS_FOR_UNWARP: + logger.warning(f" Seal #{i}: Only {len(all_polygons)} text polygons detected (< {MIN_POLYGONS_FOR_UNWARP})") + logger.warning(f" Seal #{i}: Skipping polar unwarping (insufficient polygon data)") + logger.info(f" Seal #{i}: Using PaddleOCRVL backup instead") + + # Save crop image + imwrite_safe(crop_path, seal_crop) + + # Use PaddleOCRVL directly on crop (no unwarp) + if vl_pipeline is not None and PADDLEOCRVL_AVAILABLE: + ocr_result = run_ocr_recognition_vl(crop_path, vl_pipeline) + logger.info(f" Seal #{i} PaddleOCRVL Result (direct crop):") + logger.info(f" - Text: '{ocr_result['text']}'") + logger.info(f" - Score: {ocr_result['score']:.4f}") + logger.info(f" - Success: {ocr_result['success']}") + logger.info(f" - ** Used PaddleOCRVL (insufficient polygons for unwarping) **") + + # Create debug info without unwarp + seal_data = { + 'index': i, + 'box': box, + 'crop_path': Path(crop_path).name, + 'unwarp_path': None, # No unwarp performed + 'marked_path': None, # No marked image + 'polar_viz_path': None, # No polar visualization + 'text': ocr_result['text'], + 'confidence': float(ocr_result['score']), + 'success': bool(ocr_result['success']), + 'method_used': f'{method_used}_skip_unwarp', + 'used_fallback': True, + 'debug_info': { + 'center': center, + 'radius': radius, + 'start_theta_deg': None, + 'extent_deg': None, + 'num_polygons': len(all_polygons), + 'crop_size': (cw, ch), + 'unwarp_size': None, + 'skip_reason': f'Insufficient polygons ({len(all_polygons)} < {MIN_POLYGONS_FOR_UNWARP})' + } + } + result['seals'].append(seal_data) + + if ocr_result['success']: + result['institutions'].append(ocr_result['text']) + logger.info(f" ✓ Seal #{i} SUCCESS: {ocr_result['text'][:50]}... (confidence: {ocr_result['score']:.4f})") + else: + logger.warning(f" ✗ Seal #{i} FAILED: Could not extract institution name") + + continue # Skip to next seal + else: + logger.error(f" Seal #{i}: PaddleOCRVL not available, cannot extract text") + seal_data = { + 'index': i, + 'box': box, + 'crop_path': Path(crop_path).name, + 'unwarp_path': None, + 'marked_path': None, + 'polar_viz_path': None, + 'text': '', + 'confidence': 0.0, + 'success': False, + 'method_used': f'{method_used}_skip_unwarp', + 'used_fallback': True, + 'debug_info': { + 'center': center, + 'radius': radius, + 'start_theta_deg': None, + 'extent_deg': None, + 'num_polygons': len(all_polygons), + 'crop_size': (cw, ch), + 'unwarp_size': None, + 'skip_reason': f'Insufficient polygons and no PaddleOCRVL backup' + } + } + result['seals'].append(seal_data) + continue + # Calculate arc and unwarp start_theta, extent = calculate_precise_arc(all_polygons, center) logger.info(f" Seal #{i} Arc Parameters:") @@ -658,7 +786,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v logger.info(f" Seal #{i}: Performing polar unwarping with detected text polygons...") unwarp = polar_unwarp(seal_crop, center, radius, start_theta, extent) if unwarp is not None: - cv2.imwrite(unwarp_path, unwarp) + imwrite_safe(unwarp_path, unwarp) logger.info(f" - Unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}") def draw_line(m, theta, color): @@ -684,7 +812,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v # Save polar visualization polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png") - cv2.imwrite(polar_viz_path, polar_viz) + imwrite_safe(polar_viz_path, polar_viz) logger.info(f" - Polar visualization saved: seal_polar_viz_{i}.png") else: logger.warning(f" Seal #{i}: Polar unwarp returned None") @@ -707,7 +835,7 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v unwarp = polar_unwarp(seal_crop, center, radius, fallback_start_theta, fallback_extent) if unwarp is not None: - cv2.imwrite(unwarp_path, unwarp) + imwrite_safe(unwarp_path, unwarp) logger.info(f" - Fallback unwarp size: {unwarp.shape[1]}x{unwarp.shape[0]}") # Update start_theta and extent for visualization @@ -736,20 +864,19 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v cv2.circle(polar_viz, (int(src_x), int(src_y)), 1, (255, 0, 255), -1) polar_viz_path = os.path.join(output_dir, f"seal_polar_viz_{i}.png") - cv2.imwrite(polar_viz_path, polar_viz) + imwrite_safe(polar_viz_path, polar_viz) logger.info(f" - Fallback polar visualization saved: seal_polar_viz_{i}.png") else: logger.warning(f" Seal #{i}: Fallback polar unwarp also returned None") - if unwarp is None: - logger.warning(f" Seal #{i}: No unwarp image available, skipping OCR") - marked_path = os.path.join(output_dir, f"seal_marked_{i}.png") - cv2.imwrite(marked_path, marked) + imwrite_safe(marked_path, marked) # OCR recognition ocr_result = {'text': '', 'score': 0.0, 'success': False} + if unwarp is not None: + # Standard path: Recognize unwarp image method_str = "FALLBACK" if used_fallback else "Standard" logger.info(f" Seal #{i}: Running OCR ({method_str}, model={ocr_model}) on unwarp image...") @@ -766,7 +893,21 @@ def extract_seals_and_institutions(page_img, output_dir, ocr_model="ppocr_v5", v if used_fallback: logger.info(f" - ** Used fallback angle range (7:30 to 4:30) **") else: - logger.warning(f" Seal #{i}: No unwarp image available, skipping OCR") + # ============ BACKUP: Use PaddleOCRVL directly on seal crop ============ + logger.warning(f" Seal #{i}: No unwarp image available (polar unwarp failed)") + + if vl_pipeline is not None and PADDLEOCRVL_AVAILABLE: + logger.info(f" Seal #{i}: Using PaddleOCRVL backup - directly recognize seal crop image") + seal_crop_path = os.path.join(output_dir, f"seal_crop_{i}.png") + ocr_result = run_ocr_recognition_vl(seal_crop_path, vl_pipeline) + logger.info(f" Seal #{i} PaddleOCRVL Backup Result:") + logger.info(f" - Text: '{ocr_result['text']}'") + logger.info(f" - Score: {ocr_result['score']:.4f}") + logger.info(f" - Success: {ocr_result['success']}") + logger.info(f" - Text length: {len(ocr_result['text'])} chars") + logger.info(f" - ** Used PaddleOCRVL backup (direct crop recognition) **") + else: + logger.warning(f" Seal #{i}: No backup available (vl_pipeline=None or PaddleOCRVL not installed), skipping OCR") seal_data = { 'index': int(i), @@ -994,6 +1135,10 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str, elif not best_inst: best_inst = inst + # Fallback: if best_inst is still None (all similarities were 0), use first institution + if best_inst is None and seal_result['institutions']: + best_inst = seal_result['institutions'][0] + result['extracted']['institution'] = best_inst # Compare institution @@ -1299,11 +1444,14 @@ def main(): help='OCR model to use (default: from OCR_MODEL env var or ppocr_v5)') parser.add_argument('--batch-size', type=int, default=BATCH_SIZE, help=f'Number of PDFs to process (default: {BATCH_SIZE})') + parser.add_argument('--pdf-names', type=str, default=None, + help='Comma-separated list of PDF names to process (e.g., "1.pdf,2.pdf"). Overrides --batch-size') args = parser.parse_args() # Use command line argument if provided ocr_model = args.ocr_model batch_size = args.batch_size + pdf_names_filter = args.pdf_names print("=" * 80) print("CMA & INSTITUTION EXTRACTION - BATCH ACCURACY TEST") @@ -1322,12 +1470,23 @@ def main(): with open(RESULTS_JSON, 'r', encoding='utf-8') as f: ground_truth = json.load(f) - # Get first N PDFs - pdf_list = list(ground_truth.items())[:batch_size] + # Filter PDFs: either by name filter or by batch size + if pdf_names_filter: + # Split comma-separated names and strip whitespace + requested_names = [name.strip() for name in pdf_names_filter.split(',')] + pdf_list = [(name, ground_truth[name]) for name in requested_names if name in ground_truth] + if not pdf_list: + logger.error(f"None of the specified PDFs found in results.json: {requested_names}") + print(f"ERROR: None of the specified PDFs found in results.json: {requested_names}") + return + print(f"Processing {len(pdf_list)} specified PDF(s): {[name for name, _ in pdf_list]}") + else: + # Get first N PDFs + pdf_list = list(ground_truth.items())[:batch_size] # Initialize OCR engines # Note: We ALWAYS initialize ocr_engine for CMA recognition - # PaddleOCRVL is ONLY used for seal text recognition + # We ALWAYS try to initialize vl_pipeline for backup seal recognition (when unwarp fails) ocr_engine = None vl_pipeline = None @@ -1337,35 +1496,40 @@ def main(): logger.info("PaddleOCR initialized successfully") print("PaddleOCR initialized successfully\n") - # Initialize PaddleOCRVL if requested for seal recognition - if ocr_model == "paddleocr_vl": - if not PADDLEOCRVL_AVAILABLE: - print("WARNING: PaddleOCRVL requested but not available!") - print("Falling back to PP-OCRv5 for seal recognition") - print("Please install: pip install paddleocr[doc-parser]") - ocr_model = "ppocr_v5" - else: - logger.info("Initializing PaddleOCRVL for seal recognition...") - print("Initializing PaddleOCRVL for seal recognition (this may take a while)...") - try: - vl_pipeline = PaddleOCRVL( - use_seal_recognition=True, - use_ocr_for_image_block=True, - use_layout_detection=True - ) + # Initialize PaddleOCRVL for backup seal recognition (always try if available) + # This provides a fallback when polar unwarping fails + if PADDLEOCRVL_AVAILABLE: + logger.info("Initializing PaddleOCRVL for backup seal recognition...") + print("Initializing PaddleOCRVL for backup seal recognition (this may take a while)...") + try: + vl_pipeline = PaddleOCRVL( + use_seal_recognition=True, + use_ocr_for_image_block=True, + use_layout_detection=True + ) - # Verify initialization - if vl_pipeline is None: - raise RuntimeError("PaddleOCRVL initialization returned None") + # Verify initialization + if vl_pipeline is None: + raise RuntimeError("PaddleOCRVL initialization returned None") - logger.info("PaddleOCRVL initialized successfully") - print("PaddleOCRVL for seal recognition initialized successfully\n") - except Exception as e: - logger.error(f"Failed to initialize PaddleOCRVL: {e}") - logger.error(f"Exception type: {type(e).__name__}") - print(f"WARNING: Failed to initialize PaddleOCRVL: {e}") - print("Falling back to PP-OCRv5 for seal recognition") - ocr_model = "ppocr_v5" + logger.info("PaddleOCRVL initialized successfully (backup ready)") + print("PaddleOCRVL backup ready - will be used when polar unwarping fails\n") + except Exception as e: + logger.error(f"Failed to initialize PaddleOCRVL: {e}") + logger.error(f"Exception type: {type(e).__name__}") + print(f"WARNING: Failed to initialize PaddleOCRVL: {e}") + print("Polar unwarping failures will skip OCR (no backup available)\n") + else: + logger.info("PaddleOCRVL not available - polar unwarping failures will skip OCR") + print("Note: PaddleOCRVL not installed - polar unwarping failures will skip OCR") + print(" To enable backup: pip install paddleocr[doc-parser]\n") + + # Validate OCR model selection + if ocr_model == "paddleocr_vl" and vl_pipeline is None: + print("WARNING: PaddleOCRVL requested for primary seal recognition but not available!") + print("Falling back to PP-OCRv5 for seal recognition") + print("Please install: pip install paddleocr[doc-parser]") + ocr_model = "ppocr_v5" # Create output directory OUTPUT_DIR.mkdir(exist_ok=True) @@ -1374,11 +1538,12 @@ def main(): all_results = [] start_time = time.time() + total_pdfs = len(pdf_list) for i, (pdf_name, expected_data) in enumerate(pdf_list, 1): expected_cma = expected_data.get('CMA', '') expected_inst = expected_data.get('机构名', '') - print(f"\n[{i}/{BATCH_SIZE}] Processing: {pdf_name}") + print(f"\n[{i}/{total_pdfs}] Processing: {pdf_name}") print(" + Loading PDF and extracting page...") result = process_single_pdf(