report-detect/archive/temp_scripts/test_paddleocrvl_direct.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Direct test of PaddleOCRVL to verify it works correctly.
"""

import sys
from pathlib import Path

def _find_seal_image(search_dirs):
    """Return the first PNG whose file name contains "seal", or ``None``.

    Directories are tried in the order given and missing ones are skipped.
    Matches inside a directory are sorted so that the same file is chosen on
    every run (glob results come back in no guaranteed order).
    """
    for directory in search_dirs:
        if not directory.exists():
            continue
        matches = sorted(directory.glob("**/*seal*.png"))
        if matches:
            return matches[0]
    return None


def _create_synthetic_seal_image(target):
    """Draw a seal-like placeholder (red ring plus a text label) and save it.

    Pillow is imported lazily because it is only needed on this fallback path.
    Returns *target* so the caller can use the result directly.
    """
    from PIL import Image, ImageDraw, ImageFont

    img = Image.new('RGB', (400, 400), color='white')
    draw = ImageDraw.Draw(img)

    # Red ring standing in for a seal outline.
    draw.ellipse([50, 50, 350, 350], outline='red', width=5)

    try:
        # Prefer a font that can render the Chinese label.
        font = ImageFont.truetype("msyh.ttc", 30)
    except (OSError, ImportError):
        # Font file missing/unreadable, or FreeType support unavailable.
        # Narrowed from a bare ``except`` so Ctrl-C is no longer swallowed.
        font = ImageFont.load_default()

    draw.text((200, 200), "测试机构名称", fill='black', font=font, anchor='mm')
    img.save(target)
    return target


def _report_seal_blocks(blocks):
    """Print a summary of every parsed block and return the 'seal' ones."""
    print(f"\nParsing results ({len(blocks)} blocks):")

    seal_blocks = []
    for number, block in enumerate(blocks, start=1):
        label = block.get('block_label', 'unknown')
        content = block.get('block_content', '')
        print(f"  Block {number}: {label}")
        if content:
            # Only show an ellipsis when the preview really is truncated.
            ellipsis = "..." if len(content) > 100 else ""
            print(f"    Content: '{content[:100]}{ellipsis}'")

        if label == 'seal':
            print("    *** SEAL DETECTED ***")
            print(f"    Full text: '{content}'")
            seal_blocks.append(block)

    return seal_blocks


def test_paddleocrvl_direct():
    """Run a PaddleOCRVL smoke test in the current process.

    Steps: import ``PaddleOCRVL``, build the pipeline with seal recognition
    enabled, pick a seal image from the known output directories (or draw a
    synthetic one), run ``predict`` on it, save the first result as JSON and
    look for blocks labelled ``'seal'`` in ``parsing_res_list``.

    Returns:
        bool: ``True`` when at least one seal block is reported; ``False`` on
        any failure (import, initialization, image creation, prediction,
        missing JSON output, or no seal detected). Progress and errors are
        printed to stdout.
    """
    import json
    import time
    import traceback

    print("=" * 80)
    print("PaddleOCRVL Direct Test")
    print("=" * 80)

    try:
        from paddleocr import PaddleOCRVL
        print("OK PaddleOCRVL import successful")

    except ImportError as e:
        print(f"FAIL Failed to import PaddleOCRVL: {e}")
        print("  Install with: pip install paddleocr[doc-parser]")
        return False

    # Initialize
    print("\nInitializing PaddleOCRVL pipeline...")
    try:
        vl_pipeline = PaddleOCRVL(
            use_seal_recognition=True,
            use_ocr_for_image_block=True,
            use_layout_detection=True
        )
        print("OK Pipeline initialized successfully")

    except Exception as e:
        print(f"FAIL Failed to initialize pipeline: {e}")
        traceback.print_exc()
        return False

    # Look for an existing seal image (file name contains "seal").
    test_dirs = [
        Path("test_reports_full"),
        Path("bridge_output"),
        Path("temp_paddleocr_vl"),
    ]
    test_image = _find_seal_image(test_dirs)

    if test_image is None:
        print("\nNo test image found. Creating a simple test...")
        try:
            test_image = _create_synthetic_seal_image(Path("test_seal.png"))
        except Exception as e:
            # Report image-creation failures like the other stages instead of
            # crashing with an unhandled exception (e.g. Pillow missing or the
            # working directory not writable).
            print(f"FAIL Failed to create test image: {e}")
            traceback.print_exc()
            return False
        print(f"Created test image: {test_image}")

    print(f"\nTesting with image: {test_image}")
    print(f"Image size: {test_image.stat().st_size} bytes")

    # Run prediction
    print("\nRunning prediction (this may take 10-30 seconds)...")
    start = time.perf_counter()

    try:
        output = vl_pipeline.predict(str(test_image), batch_size=1)
        # NOTE(review): predict() is expected to return a list; materialize it
        # so an iterator/generator return value is handled as well — confirm
        # against the installed paddleocr version.
        results = list(output) if output is not None else []
        elapsed = time.perf_counter() - start

        print(f"OK Prediction completed in {elapsed:.1f} seconds")
        print(f"Output length: {len(results)}")

        if not results:
            print("\nFAIL No output from predict()")
            return False

        res = results[0]

        # Save to JSON
        temp_dir = Path("test_paddleocrvl_output")
        temp_dir.mkdir(exist_ok=True)
        res.save_to_json(save_path=str(temp_dir))

        # This script expects the result at "<image stem>_res.json".
        json_file = temp_dir / f"{test_image.stem}_res.json"
        if not json_file.exists():
            print(f"\nFAIL JSON file not created: {json_file}")
            return False
        print(f"\nJSON saved to: {json_file}")

        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        seal_blocks = _report_seal_blocks(data.get('parsing_res_list', []))
        if seal_blocks:
            print(f"\nOK SUCCESS: Found {len(seal_blocks)} seal(s)")
            return True

        print("\nFAIL FAIL: No seal blocks detected")
        return False

    except Exception as e:
        elapsed = time.perf_counter() - start
        print(f"\nFAIL Prediction failed after {elapsed:.1f} seconds: {e}")
        traceback.print_exc()
        return False


if __name__ == "__main__":
    # Run the smoke test and mirror its outcome in the process exit status
    # (0 = pipeline works, 1 = any failure).
    passed = test_paddleocrvl_direct()
    print("\n" + "=" * 80)
    verdict = (
        "PaddleOCRVL is working correctly!"
        if passed
        else "PaddleOCRVL test failed!"
    )
    print(verdict)
    sys.exit(0 if passed else 1)
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`#!/usr/bin/env python`
			`# -*- coding: utf-8 -*-`
			`"""`
			`Direct test of PaddleOCRVL to verify it works correctly.`
			`"""`

			`import sys`
			`from pathlib import Path`

			`def test_paddleocrvl_direct():`
			`"""Test PaddleOCRVL directly without multiprocessing."""`
			`print("=" * 80)`
			`print("PaddleOCRVL Direct Test")`
			`print("=" * 80)`

			`try:`
			`from paddleocr import PaddleOCRVL`
			`print("OK PaddleOCRVL import successful")`

			`except ImportError as e:`
			`print(f"FAIL Failed to import PaddleOCRVL: {e}")`
			`print(" Install with: pip install paddleocr[doc-parser]")`
			`return False`

			`# Initialize`
			`print("\nInitializing PaddleOCRVL pipeline...")`
			`try:`
			`vl_pipeline = PaddleOCRVL(`
			`use_seal_recognition=True,`
			`use_ocr_for_image_block=True,`
			`use_layout_detection=True`
			`)`
			`print("OK Pipeline initialized successfully")`

			`except Exception as e:`
			`print(f"FAIL Failed to initialize pipeline: {e}")`
			`import traceback`
			`traceback.print_exc()`
			`return False`

			`# Find a test image`
			`test_dirs = [`
			`Path("test_reports_full"),`
			`Path("bridge_output"),`
			`Path("temp_paddleocr_vl"),`
			`]`

			`test_image = None`
			`for test_dir in test_dirs:`
			`if test_dir.exists():`
			`# Find any PNG file`
			`png_files = list(test_dir.glob("**/*seal*.png"))`
			`if png_files:`
			`test_image = png_files[0]`
			`break`

			`if not test_image:`
			`print("\nNo test image found. Creating a simple test...")`

			`# Create a simple test image with text`
			`from PIL import Image, ImageDraw, ImageFont`
			`img = Image.new('RGB', (400, 400), color='white')`
			`draw = ImageDraw.Draw(img)`

			`# Draw a red circle (seal-like)`
			`draw.ellipse([50, 50, 350, 350], outline='red', width=5)`

			`# Add text`
			`try:`
			`# Try to use a font that supports Chinese`
			`font = ImageFont.truetype("msyh.ttc", 30)`
			`except:`
			`font = ImageFont.load_default()`

			`text = "测试机构名称"`
			`draw.text((200, 200), text, fill='black', font=font, anchor='mm')`

			`test_image = Path("test_seal.png")`
			`img.save(test_image)`
			`print(f"Created test image: {test_image}")`

			`print(f"\nTesting with image: {test_image}")`
			`print(f"Image size: {test_image.stat().st_size} bytes")`

			`# Run prediction`
			`print("\nRunning prediction (this may take 10-30 seconds)...")`
			`import time`
			`start = time.time()`

			`try:`
			`output = vl_pipeline.predict(str(test_image), batch_size=1)`
			`elapsed = time.time() - start`

			`print(f"OK Prediction completed in {elapsed:.1f} seconds")`
			`print(f"Output length: {len(output) if output else 0}")`

			`if output and len(output) > 0:`
			`res = output[0]`

			`# Save to JSON`
			`temp_dir = Path("test_paddleocrvl_output")`
			`temp_dir.mkdir(exist_ok=True)`
			`res.save_to_json(save_path=str(temp_dir))`

			`json_file = temp_dir / f"{test_image.stem}_res.json"`
			`print(f"\nJSON saved to: {json_file}")`

			`if json_file.exists():`
			`import json`
			`with open(json_file, 'r', encoding='utf-8') as f:`
			`data = json.load(f)`

			`print(f"\nParsing results ({len(data.get('parsing_res_list', []))} blocks):")`

			`for i, block in enumerate(data.get('parsing_res_list', [])):`
			`label = block.get('block_label', 'unknown')`
			`content = block.get('block_content', '')`
			`print(f" Block {i+1}: {label}")`
			`if content:`
			`print(f" Content: '{content[:100]}...'")`

			`if label == 'seal':`
			`print(f" *** SEAL DETECTED ***")`
			`print(f" Full text: '{content}'")`

			`# Check if seal was found`
			`seal_blocks = [b for b in data.get('parsing_res_list', []) if b.get('block_label') == 'seal']`
			`if seal_blocks:`
			`print(f"\nOK SUCCESS: Found {len(seal_blocks)} seal(s)")`
			`return True`
			`else:`
			`print(f"\nFAIL FAIL: No seal blocks detected")`
			`return False`
			`else:`
			`print(f"\nFAIL JSON file not created")`
			`return False`
			`else:`
			`print(f"\nFAIL No output from predict()")`
			`return False`

			`except Exception as e:`
			`elapsed = time.time() - start`
			`print(f"\nFAIL Prediction failed after {elapsed:.1f} seconds: {e}")`
			`import traceback`
			`traceback.print_exc()`
			`return False`


			`if __name__ == "__main__":`
			`success = test_paddleocrvl_direct()`
			`print("\n" + "=" * 80)`
			`if success:`
			`print("PaddleOCRVL is working correctly!")`
			`sys.exit(0)`
			`else:`
			`print("PaddleOCRVL test failed!")`
			`sys.exit(1)`