#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple test to check if PaddleOCRVL wrapper is working.

The OCR pipeline is run in a child process so that a hang inside it can be
bounded by a timeout and terminated from the parent.
"""

import json
import multiprocessing
import queue
import shutil
import sys
import time
import traceback
from pathlib import Path

# Seal crop that the check runs OCR on.
_SEAL_IMAGE_PATH = Path("test_reports_full/1.pdf/seal_crop_0.png")
# Scratch directory the child writes the pipeline's JSON result into.
_TEMP_OUTPUT_DIR = Path("temp_paddleocr_vl_test")
# Upper bound on how long the parent waits for the child's result.
_PROCESS_TIMEOUT_SECONDS = 120
# Grace period given to the child to exit (or die) before escalating.
_EXIT_GRACE_SECONDS = 5
# How often the parent re-checks child liveness while waiting for a result.
_POLL_INTERVAL_SECONDS = 0.5


def _log(msg):
    """Print *msg* with a subprocess prefix and flush stdout immediately."""
    print(f"[Subprocess] {msg}")
    sys.stdout.flush()


def _find_seal_text(blocks):
    """Return the stripped content of the first ``seal`` block, or ``None``.

    Every block is logged with a 50-character content preview. A missing or
    null ``block_content`` is treated as an empty string so that ``strip()``
    cannot fail on ``None``.
    """
    for i, block in enumerate(blocks):
        label = block.get('block_label', 'unknown')
        content = block.get('block_content') or ''
        preview = content[:50] if content else '(empty)'
        _log(f" Block {i}: {label} - '{preview}...'")
        if label == 'seal':
            text = content.strip()
            _log(f" *** SEAL FOUND: '{text}' ***")
            return text
    return None


# Module-level wrapper function (required for Windows multiprocessing)
def _run_ocr_vl_wrapper(image_path, result_queue):
    """Wrapper function to run PaddleOCRVL in a subprocess.

    Runs the pipeline on *image_path*, saves the first result as JSON into a
    scratch directory, and looks for a block labelled ``seal`` in the JSON's
    ``parsing_res_list``.

    Exactly one dict is put on *result_queue*. It always has ``text`` (str)
    and ``success`` (bool) and may carry ``debug`` (``'no_output'``,
    ``'no_json'`` or ``'no_seal'``) or ``error`` (exception message).

    Args:
        image_path: Path of the image to run OCR on, as a string.
        result_queue: Queue-like object with a ``put`` method.
    """
    try:
        _log("Starting...")
        # Imported here so the heavy dependency is only loaded in the child.
        from paddleocr import PaddleOCRVL
        _log("Import successful, initializing pipeline...")

        # Re-initialize pipeline in subprocess (required)
        vl_pipeline = PaddleOCRVL(
            use_seal_recognition=True,
            use_ocr_for_image_block=True,
            use_layout_detection=True,
        )
        _log("Pipeline initialized, starting prediction...")

        start_time = time.time()
        output = vl_pipeline.predict(image_path, batch_size=1)
        elapsed = time.time() - start_time
        _log(
            f"Prediction completed in {elapsed:.1f}s, "
            f"output length: {len(output) if output else 0}"
        )

        if not output:
            _log("No output from predict()")
            result_queue.put({'text': '', 'success': False, 'debug': 'no_output'})
            return

        blocks = None
        try:
            _TEMP_OUTPUT_DIR.mkdir(exist_ok=True)
            output[0].save_to_json(save_path=str(_TEMP_OUTPUT_DIR))

            # The result file is expected to be named after the image stem.
            json_file = _TEMP_OUTPUT_DIR / f"{Path(image_path).stem}_res.json"
            _log(f"Looking for JSON: {json_file}")
            if json_file.exists():
                _log("JSON found, reading...")
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                blocks = data.get('parsing_res_list', [])
                _log(f"Found {len(blocks)} blocks")
        finally:
            # Clean up on every path, not only when a seal was found.
            shutil.rmtree(_TEMP_OUTPUT_DIR, ignore_errors=True)

        if blocks is None:
            _log(f"JSON not found: {json_file}")
            result_queue.put({'text': '', 'success': False, 'debug': 'no_json'})
            return

        text = _find_seal_text(blocks)
        if text is None:
            _log("No seal block found")
            result_queue.put({'text': '', 'success': False, 'debug': 'no_seal'})
            return

        result_queue.put({'text': text, 'success': len(text) > 0})
    except Exception as e:
        # Top-level boundary of the child: report the failure to the parent
        # instead of dying silently.
        _log(f"ERROR: {e}")
        _log(f"Traceback:\n{traceback.format_exc()}")
        result_queue.put({'text': '', 'success': False, 'error': str(e)})


def _wait_for_result(process, result_queue, timeout):
    """Return the child's result dict, or ``None`` if none arrives.

    The queue is read *before* the process is joined, and the child's
    liveness is polled so that a crashed child is noticed promptly rather
    than after the full *timeout*.
    """
    deadline = time.monotonic() + timeout
    while True:
        try:
            return result_queue.get(timeout=_POLL_INTERVAL_SECONDS)
        except queue.Empty:
            pass
        if not process.is_alive():
            # The child may have queued its result just before exiting.
            try:
                return result_queue.get(timeout=_POLL_INTERVAL_SECONDS)
            except queue.Empty:
                return None
        if time.monotonic() >= deadline:
            return None


def _stop_process(process):
    """Terminate *process*, escalating to kill if it does not exit."""
    process.terminate()
    process.join(timeout=_EXIT_GRACE_SECONDS)
    if process.is_alive():
        process.kill()
        process.join(timeout=_EXIT_GRACE_SECONDS)


def test():
    """Run PaddleOCRVL on the seal image in a subprocess and report the result.

    Returns:
        True if the subprocess reported success with non-empty seal text;
        False if the image is missing, the subprocess timed out or returned
        nothing, or no seal text was recognised.
    """
    print("Testing PaddleOCRVL with existing seal image...")

    # Find a seal image
    seal_image = _SEAL_IMAGE_PATH
    if not seal_image.exists():
        print(f"Seal image not found: {seal_image}")
        return False

    print(f"Using image: {seal_image}")
    print(f"Image size: {seal_image.stat().st_size} bytes")

    # Run the test
    result_queue = multiprocessing.Queue()
    print("Starting subprocess...")
    process = multiprocessing.Process(
        target=_run_ocr_vl_wrapper,
        args=(str(seal_image), result_queue),
    )

    start_time = time.time()
    process.start()

    # Wait up to _PROCESS_TIMEOUT_SECONDS for the result.
    result = _wait_for_result(process, result_queue, _PROCESS_TIMEOUT_SECONDS)

    if result is None and process.is_alive():
        print("TIMEOUT: Process still running, terminating...")
        _stop_process(process)
        print("Process terminated")
        return False

    # Reap the child; if it hangs during teardown, stop it but keep the
    # result it already delivered.
    process.join(timeout=_EXIT_GRACE_SECONDS)
    if process.is_alive():
        _stop_process(process)

    elapsed = time.time() - start_time
    print(f"Process completed in {elapsed:.1f}s")

    # Get result
    if result is None:
        print("No result returned from process")
        print(f"Process exit code: {process.exitcode}")
        return False

    print("\nResult:")
    print(f" Text: '{result.get('text', '')}'")
    print(f" Success: {result.get('success', False)}")
    if result.get('error'):
        print(f" Error: {result.get('error')}")
    if result.get('debug'):
        print(f" Debug: {result.get('debug')}")

    return bool(result.get('success', False) and result.get('text', ''))


# This is a script-style check that returns a bool and needs PaddleOCR plus a
# fixture image; keep pytest from collecting it as a unit test.
test.__test__ = False


if __name__ == "__main__":
    success = test()
    print("\n" + "=" * 60)
    if success:
        print("SUCCESS: PaddleOCRVL is working!")
        sys.exit(0)
    else:
        print("FAILED: PaddleOCRVL test failed")
        sys.exit(1)