report-detect/archive/temp_scripts/test_paddleocrvl_direct.py

158 lines
4.9 KiB
Python
Raw Permalink Normal View History

chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 14:35:06 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Direct test of PaddleOCRVL to verify it works correctly.
"""
import json
import sys
import time
import traceback
from pathlib import Path
def _find_seal_image(search_dirs):
    """Return the first ``*seal*.png`` found under *search_dirs*, or ``None``.

    Directories are tried in the given order; entries that are not existing
    directories are skipped.
    """
    for directory in search_dirs:
        if not directory.is_dir():
            continue
        # next() stops at the first match instead of walking the whole tree.
        match = next(directory.glob("**/*seal*.png"), None)
        if match is not None:
            return match
    return None


def _create_synthetic_seal_image(target):
    """Draw a seal-like placeholder (red circle plus label), save it to *target*.

    Returns *target* so the caller can use it as the test image path.
    """
    # Imported lazily: Pillow is only needed when no real sample image exists.
    from PIL import Image, ImageDraw, ImageFont

    img = Image.new('RGB', (400, 400), color='white')
    draw = ImageDraw.Draw(img)
    # Red circle outline stands in for the seal border.
    draw.ellipse([50, 50, 350, 350], outline='red', width=5)
    try:
        # Microsoft YaHei supports Chinese glyphs but only ships with Windows.
        font = ImageFont.truetype("msyh.ttc", 30)
    except OSError:
        # Font file missing or unreadable: fall back to Pillow's built-in font.
        font = ImageFont.load_default()
    try:
        draw.text((200, 200), "测试机构名称", fill='black', font=font, anchor='mm')
    except UnicodeEncodeError:
        # NOTE(review): older Pillow releases use a Latin-1 bitmap default font
        # that cannot encode the Chinese label; keep the outline-only image
        # rather than aborting the whole test.
        print("Note: fallback font cannot render the label; image has outline only")
    img.save(target)
    return target


def _report_parsing_results(data):
    """Print every parsed block in *data* and return True if any is a seal.

    *data* is the dict loaded from the result JSON; blocks are read from its
    ``parsing_res_list`` key (treated as empty when absent).
    """
    blocks = data.get('parsing_res_list', [])
    print(f"\nParsing results ({len(blocks)} blocks):")
    seal_count = 0
    for number, block in enumerate(blocks, start=1):
        label = block.get('block_label', 'unknown')
        content = block.get('block_content', '')
        print(f" Block {number}: {label}")
        if content:
            # Only show an ellipsis when the preview was actually truncated.
            preview = content if len(content) <= 100 else content[:100] + "..."
            print(f" Content: '{preview}'")
        if label == 'seal':
            seal_count += 1
            print(" *** SEAL DETECTED ***")
            print(f" Full text: '{content}'")

    if seal_count:
        print(f"\nOK SUCCESS: Found {seal_count} seal(s)")
        return True
    print("\nFAIL FAIL: No seal blocks detected")
    return False


def _save_and_check_result(result, image_stem):
    """Save *result* as JSON, reload it, and return True if a seal block exists.

    The JSON is expected at ``test_paddleocrvl_output/<image_stem>_res.json``
    after ``result.save_to_json`` runs; a missing file counts as failure.
    """
    out_dir = Path("test_paddleocrvl_output")
    out_dir.mkdir(exist_ok=True)
    result.save_to_json(save_path=str(out_dir))
    json_file = out_dir / f"{image_stem}_res.json"
    print(f"\nJSON saved to: {json_file}")
    if not json_file.exists():
        print("\nFAIL JSON file not created")
        return False
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return _report_parsing_results(data)


def test_paddleocrvl_direct():
    """Test PaddleOCRVL directly without multiprocessing.

    Steps: import PaddleOCRVL, build the pipeline with seal recognition
    enabled, pick a ``*seal*.png`` from the known sample directories (or
    generate ``test_seal.png`` when none exists), run one prediction, save
    the result as JSON and look for a block labelled ``seal``.

    Progress and failures are printed to stdout; nothing is raised for
    import, initialization, prediction, or result-inspection errors.

    Returns:
        bool: True when at least one seal block was detected, False on any
        failure (missing package, pipeline error, empty output, missing
        JSON, or no seal block).
    """
    banner = "=" * 80
    print(banner)
    print("PaddleOCRVL Direct Test")
    print(banner)

    try:
        from paddleocr import PaddleOCRVL
    except ImportError as e:
        print(f"FAIL Failed to import PaddleOCRVL: {e}")
        print(" Install with: pip install paddleocr[doc-parser]")
        return False
    print("OK PaddleOCRVL import successful")

    print("\nInitializing PaddleOCRVL pipeline...")
    try:
        vl_pipeline = PaddleOCRVL(
            use_seal_recognition=True,
            use_ocr_for_image_block=True,
            use_layout_detection=True,
        )
    except Exception as e:  # top-level boundary of a diagnostic script
        print(f"FAIL Failed to initialize pipeline: {e}")
        traceback.print_exc()
        return False
    print("OK Pipeline initialized successfully")

    # Prefer a real sample whose filename contains "seal"; otherwise synthesize one.
    test_image = _find_seal_image([
        Path("test_reports_full"),
        Path("bridge_output"),
        Path("temp_paddleocr_vl"),
    ])
    if test_image is None:
        print("\nNo test image found. Creating a simple test...")
        test_image = _create_synthetic_seal_image(Path("test_seal.png"))
        print(f"Created test image: {test_image}")

    print(f"\nTesting with image: {test_image}")
    print(f"Image size: {test_image.stat().st_size} bytes")

    print("\nRunning prediction (this may take 10-30 seconds)...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long-running prediction.
    start = time.perf_counter()
    try:
        raw_output = vl_pipeline.predict(str(test_image), batch_size=1)
        # Materialize inside the try so a lazily evaluated result still
        # fails (and is timed) here rather than during inspection.
        output = list(raw_output) if raw_output is not None else []
    except Exception as e:
        elapsed = time.perf_counter() - start
        print(f"\nFAIL Prediction failed after {elapsed:.1f} seconds: {e}")
        traceback.print_exc()
        return False
    elapsed = time.perf_counter() - start
    print(f"OK Prediction completed in {elapsed:.1f} seconds")
    print(f"Output length: {len(output)}")

    if not output:
        print("\nFAIL No output from predict()")
        return False

    # Kept separate from the prediction try-block so save/parse errors are
    # not misreported as prediction failures.
    try:
        return _save_and_check_result(output[0], test_image.stem)
    except Exception as e:
        print(f"\nFAIL Could not inspect prediction result: {e}")
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Run the check once, then translate its boolean verdict into a closing
    # message and the process exit status (0 = pass, 1 = fail).
    passed = test_paddleocrvl_direct()
    print("\n" + "=" * 80)
    verdict = (
        "PaddleOCRVL is working correctly!"
        if passed
        else "PaddleOCRVL test failed!"
    )
    print(verdict)
    sys.exit(0 if passed else 1)