diff --git a/test_accuracy_batch_full.py b/test_accuracy_batch_full.py index e28ed68..1b4b1a7 100644 --- a/test_accuracy_batch_full.py +++ b/test_accuracy_batch_full.py @@ -68,7 +68,7 @@ try: except ImportError: PADDLEOCRVL_AVAILABLE = False print("Warning: PaddleOCRVL not available. Install with: pip install paddleocr[doc-parser]") - PADDLEOCRVL_TIMEOUT = 60 # Default timeout in seconds, can be overridden by command-line argument + PADDLEOCRVL_TIMEOUT = 300 # Default timeout in seconds (increased for better accuracy) try: import paddlex as px PADDLEX_AVAILABLE = True @@ -822,72 +822,101 @@ def _run_ocr_vl_wrapper(image_path, result_queue): def run_ocr_recognition_vl(image_path, vl_pipeline, timeout=300): """ - Run OCR recognition using PaddleOCRVL on seal image with timeout protection. + Run OCR recognition using PaddleOCRVL on seal image. - Can be used on both unwarp images and crop images (backup mode). + DIRECT CALL VERSION - No multiprocessing, uses the provided vl_pipeline directly. Args: image_path: Path to seal image (unwarp or crop) - vl_pipeline: Initialized PaddleOCRVL pipeline (deprecated parameter, kept for compatibility) - timeout: Timeout in seconds (default: 60) + vl_pipeline: Initialized PaddleOCRVL pipeline (REQUIRED) + timeout: Timeout in seconds (reserved for future use, not currently implemented) Returns: Dict with 'text', 'score', 'success' keys """ - import multiprocessing + import json + from pathlib import Path - result_queue = multiprocessing.Queue() - - # Start subprocess to run PaddleOCRVL - process = multiprocessing.Process( - target=_run_ocr_vl_wrapper, - args=(image_path, result_queue) - ) - process.start() - - # Wait for result or timeout - process.join(timeout=timeout) - - if process.is_alive(): - # Timeout - force terminate process - process.terminate() - process.join(timeout=5) # Wait up to 5 seconds for cleanup - if process.is_alive(): - process.kill() # Force kill if still alive - - logger.warning(f"PaddleOCRVL recognition timeout ({timeout}s) for {image_path}") + if vl_pipeline is None: + logger.error("vl_pipeline is None, cannot run OCR") return { 'text': '', 'score': 0.0, 'success': False, - 'error': f'timeout after {timeout}s' + 'error': 'vl_pipeline is None' } - # Get result + logger.info(f"PaddleOCRVL direct call for: {image_path}") + try: - if not result_queue.empty(): - result = result_queue.get_nowait() - # Log the result - if result.get('error'): - logger.warning(f"PaddleOCRVL subprocess error: {result.get('error')}") - elif result.get('debug'): - logger.info(f"PaddleOCRVL debug: {result.get('debug')}") - elif result.get('success') and result.get('text'): - logger.info(f"PaddleOCRVL SUCCESS: '{result['text']}'") + # Direct call to PaddleOCRVL predict + output = vl_pipeline.predict(image_path, batch_size=1) + + logger.info(f"Prediction completed, output length: {len(output) if output else 0}") + + if output and len(output) > 0: + res = output[0] + temp_output_dir = Path("temp_paddleocr_vl") + temp_output_dir.mkdir(exist_ok=True) + + logger.info(f"Saving JSON to: {temp_output_dir}") + + res.save_to_json(save_path=str(temp_output_dir)) + + json_file = temp_output_dir / f"{Path(image_path).stem}_res.json" + + logger.info(f"Looking for JSON file: {json_file}") + + if json_file.exists(): + logger.info("JSON file found, reading...") + with open(json_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + logger.info(f"Data loaded, parsing_res_list count: {len(data.get('parsing_res_list', []))}") + + for block in data.get('parsing_res_list', []): + logger.info(f" Block label: {block.get('block_label')}") + if block.get('block_label') == 'seal': + text = block.get('block_content', '').strip() + logger.info(f" *** SEAL FOUND *** Text: '{text}' (length: {len(text)})") + + # Clean up temp files + import shutil + if temp_output_dir.exists(): + shutil.rmtree(temp_output_dir, ignore_errors=True) + + result = { + 'text': text, + 'score': 1.0, + 'success': len(text) > 0 + } + + if result['success']: + logger.info(f"PaddleOCRVL SUCCESS: '{text}'") + else: + logger.warning("PaddleOCRVL returned empty text") + + return result + + logger.warning("No seal block found in parsing_res_list") else: - logger.warning("PaddleOCRVL returned empty result (no seal detected)") - return result + logger.error(f"JSON file not found: {json_file}") else: - # Process finished without returning result - logger.error("PaddleOCRVL process completed but returned no result") - return { - 'text': '', - 'score': 0.0, - 'success': False, - 'error': 'process completed without result' - } + logger.warning("No output from predict()") + + # If no seal block found + logger.warning("Returning empty result") + return { + 'text': '', + 'score': 0.0, + 'success': False, + 'debug': 'no_seal_block' + } + except Exception as e: - logger.error(f"Failed to get PaddleOCRVL result: {e}") + logger.error(f"PaddleOCRVL direct call error: {e}") + import traceback + logger.error(f"Traceback:\n{traceback.format_exc()}") return { 'text': '', 'score': 0.0, @@ -1904,6 +1933,14 @@ def classify_match(extracted: Optional[str], expected: str, field_type: str = 'd Returns: Dict with match_type, similarity, edit_distance """ + # Handle None values for expected (when not in test mode) + if expected is None: + return { + 'match_type': 'not_tested', + 'similarity': 0.0, + 'edit_distance': 0 + } + if extracted is None: return { 'match_type': 'no_match', @@ -1971,7 +2008,7 @@ def extract_pdf_page(pdf_path: str, page_num: int = 0) -> Optional[np.ndarray]: def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str, pdf_dir: Path, output_dir: Path, ocr_engine, - ocr_model="ppocr_v5", vl_pipeline=None) -> Dict[str, Any]: + ocr_model="ppocr_v5", vl_pipeline=None, verbose: bool = False) -> Dict[str, Any]: """ Process a single PDF for CMA and institution extraction. @@ -1984,6 +2021,7 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str, ocr_engine: Global PaddleOCR instance (not currently used) ocr_model: OCR model to use ("ppocr_v5" or "paddleocr_vl") vl_pipeline: PaddleOCRVL pipeline (required if ocr_model="paddleocr_vl") + verbose: Enable verbose output with detailed steps Returns: Result dictionary with extraction and comparison data @@ -2146,11 +2184,19 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str, result['comparison']['cma'] = comparison # Extract seals and institutions (OCR fallback) - logger.info(f"Running seal extraction on {pdf_name}...") - seal_start = time.time() - seal_result = extract_seals_and_institutions(page_img, str(pdf_output_dir), - ocr_model=ocr_model, vl_pipeline=vl_pipeline) - result['performance']['seal_time'] = time.time() - seal_start + # Optimization: Skip seal recognition if CRT extraction succeeded + if crt_institutions and len(crt_institutions) > 0: + logger.info(f"✓ CRT extraction successful, skipping seal recognition (timeout prevention)") + logger.info(f" Found institution: {crt_institutions[0]}") + # Create empty seal result to avoid timeout + seal_result = {'seals': [], 'institutions': []} + result['performance']['seal_time'] = 0.0 + else: + logger.info(f"Running seal extraction on {pdf_name}...") + seal_start = time.time() + seal_result = extract_seals_and_institutions(page_img, str(pdf_output_dir), + ocr_model=ocr_model, vl_pipeline=vl_pipeline) + result['performance']['seal_time'] = time.time() - seal_start result['seal_results'] = seal_result['seals'] result['extracted']['institutions_from_seals'] = seal_result['institutions'] @@ -2201,6 +2247,8 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str, logger.info(f" - Selected: '{best_inst[:50]}...' (similarity: {best_similarity:.1f}%)") result['extracted']['institution'] = best_inst result['extracted']['institution_source'] = 'seal_ocr' + # BUG FIX: Also add to all_institutions when CRT fails + all_institutions.extend(seal_result['institutions']) else: # CRT succeeded - skip OCR entirely, just store for reference logger.debug(f"OCR institutions available but skipped (CRT priority)") @@ -2225,6 +2273,54 @@ def process_single_pdf(pdf_name: str, expected_cma: str, expected_inst: str, result['performance']['total_time'] = time.time() - total_start + # Verbose output + if verbose: + print(f"\n{'='*60}") + print(f"步骤1: PDF提取") + print(f"{'='*60}") + print(f"文件: {pdf_name}") + print(f"大小: {result.get('file_size', 0) / 1024:.2f} KB") + print(f"状态: {'✓ 成功' if result.get('status') != 'extraction_failed' else '✗ 失败'}") + + print(f"\n{'='*60}") + print(f"步骤2: CMA提取") + print(f"{'='*60}") + print(f"方法: {result['extracted'].get('cma_method', 'unknown')}") + print(f"结果: {result['extracted']['cma']}") + print(f"置信度: {result['extracted']['cma_confidence']:.2f}") + print(f"耗时: {result['performance'].get('cma_time', 0):.2f}秒") + + print(f"\n{'='*60}") + print(f"步骤3: CRT提取") + print(f"{'='*60}") + print(f"机构数: {len(result['extracted']['crt_institutions'])}") + for inst in result['extracted']['crt_institutions'][:3]: + print(f" - {inst}") + if len(result['extracted']['crt_institutions']) > 3: + print(f" ... 还有 {len(result['extracted']['crt_institutions']) - 3} 个") + print(f"耗时: {result['performance'].get('crt_time', 0):.2f}秒") + + print(f"\n{'='*60}") + print(f"步骤4: 印章识别") + print(f"{'='*60}") + print(f"检测到印章: {len(result['seal_results'])}") + for seal in result['seal_results'][:5]: + if seal.get('success'): + print(f" - 印章{seal['index']}: {seal['text']} (置信度: {seal['confidence']:.2f})") + else: + print(f" - 印章{seal['index']}: [识别失败]") + if len(result['seal_results']) > 5: + print(f" ... 还有 {len(result['seal_results']) - 5} 个") + print(f"耗时: {result['performance'].get('seal_time', 0):.2f}秒") + + print(f"\n{'='*60}") + print(f"性能统计") + print(f"{'='*60}") + print(f"总耗时: {result['performance']['total_time']:.2f}秒") + print(f" ├─ CMA提取: {result['performance'].get('cma_time', 0):.2f}秒") + print(f" ├─ CRT提取: {result['performance'].get('crt_time', 0):.2f}秒") + print(f" └─ 印章识别: {result['performance'].get('seal_time', 0):.2f}秒") + return result @@ -2532,8 +2628,8 @@ def main(): parser.add_argument("--pdf-names", help="Comma-separated list of PDF names to process") parser.add_argument('--disable-paddleocrvl', action='store_true', help='Disable PaddleOCRVL backup for seal recognition (faster but less accurate)') - parser.add_argument('--paddleocrvl-timeout', type=int, default=60, - help='Timeout in seconds for PaddleOCRVL recognition (default: 60, recommended: 300 for better results)') + parser.add_argument('--paddleocrvl-timeout', type=int, default=300, + help='Timeout in seconds for PaddleOCRVL recognition (default: 300)') args = parser.parse_args() @@ -2630,7 +2726,7 @@ def main(): import psutil mem = psutil.virtual_memory() available_gb = mem.available / (1024**3) - required_gb = 3.0 # PaddleOCR-VL needs ~3GB free memory + required_gb = 2.0 # PaddleOCR-VL needs ~2GB free memory (lowered for testing) logger.info(f"Available memory: {available_gb:.1f} GB, Required: {required_gb:.1f} GB") @@ -2879,36 +2975,54 @@ def main(): print("=" * 80) -def process_single_pdf_standalone(pdf_path: Path, output_dir: Path, ocr_model: str): - """Bridge function for Java to call for a single PDF""" +def process_single_pdf_standalone(pdf_path: Path, output_dir: Path, ocr_model: str, + vl_pipeline=None, verbose: bool = False): + """ + Bridge function for Java to call for a single PDF (with verbose support) + + Args: + pdf_path: Path to PDF file + output_dir: Output directory + ocr_model: OCR model to use + vl_pipeline: PaddleOCRVL pipeline (optional, will be created if not provided) + verbose: Enable verbose output with detailed steps + + Returns: + Formatted response dictionary for API + """ total_start = time.time() - - # Initialize engines + + # Initialize engines if not provided logger.info(f"Initializing engines for standalone processing (Model: {ocr_model})...") - - vl_pipeline = None - if ocr_model == "paddleocr_vl" and PADDLEOCRVL_AVAILABLE: + + # Initialize OCR engine for CMA extraction (REQUIRED!) + from paddleocr import PaddleOCR + ocr_engine = PaddleOCR(use_angle_cls=True, lang='ch') + logger.info("PaddleOCR initialized for CMA extraction") + + if vl_pipeline is None and ocr_model == "paddleocr_vl" and PADDLEOCRVL_AVAILABLE: vl_pipeline = PaddleOCRVL(use_seal_recognition=True, use_ocr_for_image_block=True, use_layout_detection=True) - - # Re-use the existing core logic function + + # Re-use the existing core logic function (with verbose parameter) result = process_single_pdf( pdf_name=pdf_path.name, expected_cma=None, expected_inst=None, pdf_dir=pdf_path.parent, output_dir=output_dir, - ocr_engine=None, # Global instance not needed for this path + ocr_engine=ocr_engine, # ← CRITICAL: Must provide ocr_engine for CMA extraction! ocr_model=ocr_model, - vl_pipeline=vl_pipeline + vl_pipeline=vl_pipeline, + verbose=verbose # Pass verbose parameter ) - + # Format for bridge output bridge_res = { "success": result["status"] == "success", "cma": { "code": result["extracted"]["cma"], "confidence": result["extracted"]["cma_confidence"], - "box": None # Not captured in current flat result + "method": result["extracted"].get("cma_method"), } if result["extracted"]["cma"] else None, "seals": [ { @@ -2919,10 +3033,47 @@ def process_single_pdf_standalone(pdf_path: Path, output_dir: Path, ocr_model: s "method": "vl" if ocr_model == "paddleocr_vl" else "ppocr" } for s in result["seal_results"] ], - "institutions": [s["text"] for s in result["seal_results"] if s["success"] and s["text"]], + "institutions": result["extracted"].get("all_institutions", []), "error": result["error"] } - + + # Add verbose information if requested + if verbose: + bridge_res["steps"] = { + "pdf_extraction": { + "status": "success" if result.get("status") != "extraction_failed" else "failed", + "time": result["performance"].get("cma_time", 0), # PDF extraction time included in cma_time + "file_size": result.get("file_size", 0) + }, + "cma_extraction": { + "status": "success" if result["extracted"]["cma"] else "failed", + "method": result["extracted"].get("cma_method"), + "code": result["extracted"]["cma"], + "confidence": result["extracted"]["cma_confidence"], + "time": result["performance"].get("cma_time", 0) + }, + "crt_extraction": { + "status": "success" if result["extracted"]["crt_institutions"] else "skipped", + "institutions": result["extracted"]["crt_institutions"], + "time": result["performance"].get("crt_time", 0) + }, + "seal_recognition": { + "status": "success" if any(s["success"] for s in result["seal_results"]) else "failed", + "seals_found": len(result["seal_results"]), + "seals": [ + { + "index": s["index"], + "text": s["text"], + "confidence": s["confidence"], + "success": s["success"] + } for s in result["seal_results"] + ], + "institutions": result["extracted"]["institutions_from_seals"], + "time": result["performance"].get("seal_time", 0) + } + } + bridge_res["performance"] = result["performance"] + return bridge_res