report-detect/archive/temp_scripts/test_vl_simple.py

165 lines
5.0 KiB
Python
Raw Permalink Normal View History

chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 14:35:06 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple test to check if PaddleOCRVL wrapper is working.
"""
import sys
import time
from pathlib import Path
import multiprocessing
# Module-level wrapper function (required for Windows multiprocessing)
def _run_ocr_vl_wrapper(image_path, result_queue):
    """Run PaddleOCRVL on one image and report the first seal block's text.

    Intended as a ``multiprocessing.Process`` target: it lives at module level
    and builds its own pipeline instead of receiving one from the parent.

    Args:
        image_path: Path (as a string) of the image passed to ``predict``.
        result_queue: Queue that receives one result dict with the keys
            ``text`` and ``success``, plus ``debug`` (``'no_seal'`` or
            ``'no_output'``) or ``error`` (the exception message) on failure.

    Exceptions derived from ``Exception`` are not propagated; they are logged
    together with their traceback and reported through ``result_queue``.
    """
    import json
    import shutil
    import traceback

    # Defined before the ``try`` so the ``except`` branch can always use it.
    def log(msg):
        """Print a tagged message and flush it immediately."""
        print(f"[Subprocess] {msg}")
        sys.stdout.flush()

    temp_output_dir = Path("temp_paddleocr_vl_test")
    # Only remove the directory if this call actually got far enough to use it.
    temp_dir_used = False

    try:
        log("Starting...")
        from paddleocr import PaddleOCRVL
        log("Import successful, initializing pipeline...")

        # Re-initialize pipeline in subprocess (required)
        vl_pipeline = PaddleOCRVL(
            use_seal_recognition=True,
            use_ocr_for_image_block=True,
            use_layout_detection=True,
        )
        log("Pipeline initialized, starting prediction...")

        start_time = time.time()
        output = vl_pipeline.predict(image_path, batch_size=1)
        elapsed = time.time() - start_time
        log(f"Prediction completed in {elapsed:.1f}s, output length: {len(output) if output else 0}")

        if not output:
            log("No output from predict()")
            result_queue.put({'text': '', 'success': False, 'debug': 'no_output'})
            return

        res = output[0]

        # Save to JSON, then read the parsed blocks back from that file.
        temp_dir_used = True
        temp_output_dir.mkdir(exist_ok=True)
        res.save_to_json(save_path=str(temp_output_dir))
        json_file = temp_output_dir / f"{Path(image_path).stem}_res.json"
        log(f"Looking for JSON: {json_file}")

        if json_file.exists():
            log("JSON found, reading...")
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            blocks = data.get('parsing_res_list', [])
            log(f"Found {len(blocks)} blocks")
            for i, block in enumerate(blocks):
                label = block.get('block_label', 'unknown')
                content = block.get('block_content', '')
                log(f" Block {i}: {label} - '{content[:50] if content else '(empty)'}...'")
                if label == 'seal':
                    # ``block_content`` may be null in the JSON; treat as empty.
                    text = (content or '').strip()
                    log(f" *** SEAL FOUND: '{text}' ***")
                    result_queue.put({
                        'text': text,
                        'success': len(text) > 0
                    })
                    return

        log("No seal block found")
        result_queue.put({'text': '', 'success': False, 'debug': 'no_seal'})
    except Exception as e:
        log(f"ERROR: {e}")
        log(f"Traceback:\n{traceback.format_exc()}")
        result_queue.put({
            'text': '',
            'success': False,
            'error': str(e)
        })
    finally:
        # Clean up on every exit path (seal found, no seal, or exception),
        # not just when a seal block was found.
        if temp_dir_used:
            shutil.rmtree(temp_output_dir, ignore_errors=True)
def test(image_path="test_reports_full/1.pdf/seal_crop_0.png", timeout=120):
    """Run PaddleOCRVL on a seal crop in a subprocess and report the outcome.

    Args:
        image_path: Seal image to recognise. Defaults to the crop this script
            has always used.
        timeout: Seconds to wait for the subprocess before terminating it.

    Returns:
        True only if the subprocess reported success with non-empty text.
        False if the image is missing, the subprocess timed out, it returned
        no result, or it reported a failure.
    """
    import queue  # only needed for the queue.Empty raised by Queue.get()

    print("Testing PaddleOCRVL with existing seal image...")

    # Find a seal image
    seal_image = Path(image_path)
    if not seal_image.exists():
        print(f"Seal image not found: {seal_image}")
        return False
    print(f"Using image: {seal_image}")
    print(f"Image size: {seal_image.stat().st_size} bytes")

    # Run the test
    result_queue = multiprocessing.Queue()
    print("Starting subprocess...")
    process = multiprocessing.Process(
        target=_run_ocr_vl_wrapper,
        args=(str(seal_image), result_queue),
    )
    start_time = time.time()
    process.start()

    # Joining before reading the queue is safe here only because the child
    # puts a single small dict, which fits in the pipe buffer.
    process.join(timeout=timeout)
    elapsed = time.time() - start_time

    # Check for a timeout first so "completed" is never printed for a process
    # that is in fact still running.
    if process.is_alive():
        print("TIMEOUT: Process still running, terminating...")
        process.terminate()
        process.join(timeout=5)
        if process.is_alive():
            process.kill()
            process.join(timeout=5)  # reap the killed child
        print("Process terminated")
        return False
    print(f"Process completed in {elapsed:.1f}s")

    # Get result. Queue.empty() is documented as unreliable, so do a short
    # blocking get and treat queue.Empty as "no result".
    try:
        result = result_queue.get(timeout=1.0)
    except queue.Empty:
        print("No result returned from process")
        return False

    print("\nResult:")
    print(f" Text: '{result.get('text', '')}'")
    print(f" Success: {result.get('success', False)}")
    if result.get('error'):
        print(f" Error: {result.get('error')}")
    if result.get('debug'):
        print(f" Debug: {result.get('debug')}")
    return bool(result.get('success', False)) and len(result.get('text', '')) > 0
if __name__ == "__main__":
    # Script entry point: run the check once and turn its boolean outcome
    # into a process exit status (0 on success, 1 on failure).
    passed = test()
    print("\n" + "=" * 60)
    if not passed:
        print("FAILED: PaddleOCRVL test failed")
        sys.exit(1)
    print("SUCCESS: PaddleOCRVL is working!")
    sys.exit(0)