165 lines
5.0 KiB
Python
165 lines
5.0 KiB
Python
|
|
#!/usr/bin/env python
|
||
|
|
# -*- coding: utf-8 -*-
|
||
|
|
"""
|
||
|
|
Simple test to check if PaddleOCRVL wrapper is working.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import multiprocessing
import queue
import sys
import time
from pathlib import Path
|
||
|
|
|
||
|
|
# Module-level wrapper function (required for Windows multiprocessing)
|
||
|
|
def _find_seal_text(blocks, log):
    """Return the stripped content of the first 'seal' block, or None if absent.

    Every block is logged via ``log`` as it is inspected. A block whose
    'block_content' is missing or null is treated as having empty content.
    """
    for i, block in enumerate(blocks):
        label = block.get('block_label', 'unknown')
        # The key can be present with a null value; coalesce so that the
        # slicing and .strip() below never see None.
        content = block.get('block_content') or ''
        preview = content[:50] if content else '(empty)'
        log(f" Block {i}: {label} - '{preview}...'")

        if label == 'seal':
            text = content.strip()
            log(f" *** SEAL FOUND: '{text}' ***")
            return text
    return None


def _run_ocr_vl_wrapper(image_path, result_queue):
    """Run PaddleOCRVL on one image and report the seal text via a queue.

    Intended to be used as the target of a subprocess. Exactly one dict is
    put on ``result_queue``:

    * ``{'text': <str>, 'success': <bool>}`` when a block labelled 'seal' is
      found (``success`` is True only if the stripped text is non-empty);
    * ``{'text': '', 'success': False, 'debug': 'no_seal'}`` when the result
      JSON is missing or contains no seal block;
    * ``{'text': '', 'success': False, 'debug': 'no_output'}`` when
      ``predict()`` returns nothing;
    * ``{'text': '', 'success': False, 'error': <str>}`` on any exception.

    Args:
        image_path: Path of the image passed to ``PaddleOCRVL.predict``.
        result_queue: Object with a ``put`` method that receives the result.
    """
    # Defined before the try block so the except handler can always use it.
    def log(msg):
        print(f"[Subprocess] {msg}")
        sys.stdout.flush()

    import json
    import shutil
    import traceback

    # Set only once the directory is about to be created, so the cleanup in
    # ``finally`` never touches a directory this call did not use.
    temp_output_dir = None

    try:
        log("Starting...")

        from paddleocr import PaddleOCRVL

        log("Import successful, initializing pipeline...")

        # The pipeline is constructed here, inside the subprocess, rather
        # than being passed in from the parent.
        vl_pipeline = PaddleOCRVL(
            use_seal_recognition=True,
            use_ocr_for_image_block=True,
            use_layout_detection=True,
        )

        log("Pipeline initialized, starting prediction...")

        start_time = time.time()
        output = vl_pipeline.predict(image_path, batch_size=1)
        elapsed = time.time() - start_time

        log(f"Prediction completed in {elapsed:.1f}s, output length: {len(output) if output else 0}")

        if not output:
            log("No output from predict()")
            result_queue.put({'text': '', 'success': False, 'debug': 'no_output'})
            return

        res = output[0]

        # The result is written to disk as JSON and read back from there.
        temp_output_dir = Path("temp_paddleocr_vl_test")
        temp_output_dir.mkdir(exist_ok=True)
        res.save_to_json(save_path=str(temp_output_dir))

        json_file = temp_output_dir / f"{Path(image_path).stem}_res.json"
        log(f"Looking for JSON: {json_file}")

        seal_text = None
        if json_file.exists():
            log("JSON found, reading...")
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            blocks = data.get('parsing_res_list') or []
            log(f"Found {len(blocks)} blocks")
            seal_text = _find_seal_text(blocks, log)

        if seal_text is None:
            log("No seal block found")
            result_queue.put({'text': '', 'success': False, 'debug': 'no_seal'})
        else:
            result_queue.put({
                'text': seal_text,
                'success': len(seal_text) > 0,
            })

    except Exception as e:
        log(f"ERROR: {e}")
        log(f"Traceback:\n{traceback.format_exc()}")
        result_queue.put({
            'text': '',
            'success': False,
            'error': str(e),
        })
    finally:
        # Remove the scratch directory on every exit path (seal found, no
        # seal, missing JSON, or exception), not only on success.
        if temp_output_dir is not None:
            shutil.rmtree(temp_output_dir, ignore_errors=True)
|
||
|
|
|
||
|
|
|
||
|
|
def _terminate_process(process):
    """Stop a still-running child: terminate, wait briefly, then kill."""
    process.terminate()
    process.join(timeout=5)
    if process.is_alive():
        process.kill()


def test(seal_image=Path("test_reports_full/1.pdf/seal_crop_0.png"), timeout=120):
    """Run PaddleOCRVL on a seal image in a subprocess and report the outcome.

    Args:
        seal_image: Path (``str`` or ``Path``) of the image to recognise.
            Defaults to the sample seal crop used by the original script.
        timeout: Maximum number of seconds to wait for the subprocess to
            deliver a result.

    Returns:
        True only when the subprocess reports success with non-empty text.
        False when the image is missing, the subprocess times out, exits
        without producing a result, or reports a failure.
    """
    print("Testing PaddleOCRVL with existing seal image...")

    seal_image = Path(seal_image)
    if not seal_image.exists():
        print(f"Seal image not found: {seal_image}")
        return False

    print(f"Using image: {seal_image}")
    print(f"Image size: {seal_image.stat().st_size} bytes")

    result_queue = multiprocessing.Queue()

    print("Starting subprocess...")
    process = multiprocessing.Process(
        target=_run_ocr_vl_wrapper,
        args=(str(seal_image), result_queue),
    )

    start_time = time.monotonic()
    process.start()

    # Read the result BEFORE joining: the multiprocessing docs warn that a
    # child which has put items on a queue may not exit until they are
    # consumed, and that Queue.empty() is not reliable. Polling in short
    # slices also lets us notice a child that died without reporting.
    deadline = start_time + timeout
    result = None
    timed_out = False
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            timed_out = True
            break
        try:
            result = result_queue.get(timeout=min(1.0, remaining))
            break
        except queue.Empty:
            if process.is_alive():
                continue
            # Child has exited; allow one last short wait in case its
            # result was still in flight when the previous get timed out.
            try:
                result = result_queue.get(timeout=0.5)
            except queue.Empty:
                pass
            break

    elapsed = time.monotonic() - start_time

    if timed_out:
        print("TIMEOUT: Process still running, terminating...")
        _terminate_process(process)
        print("Process terminated")
        return False

    # The result (if any) has been consumed, so joining is now safe.
    process.join(timeout=5)
    if process.is_alive():
        _terminate_process(process)

    print(f"Process completed in {elapsed:.1f}s")

    if result is None:
        print("No result returned from process")
        return False

    print(f"\nResult:")
    print(f" Text: '{result.get('text', '')}'")
    print(f" Success: {result.get('success', False)}")
    if result.get('error'):
        print(f" Error: {result.get('error')}")
    if result.get('debug'):
        print(f" Debug: {result.get('debug')}")

    return bool(result.get('success', False)) and len(result.get('text', '')) > 0
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Run the check once and translate its boolean outcome into the
    # process exit status (0 on success, 1 on failure).
    passed = test()
    print("\n" + "=" * 60)
    if not passed:
        print("FAILED: PaddleOCRVL test failed")
        sys.exit(1)
    print("SUCCESS: PaddleOCRVL is working!")
    sys.exit(0)
|