report-detect/archive/temp_scripts/test_vl_simple.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple test to check if PaddleOCRVL wrapper is working.
"""

import sys
import time
from pathlib import Path
import multiprocessing

# Module-level wrapper function (required for Windows multiprocessing)
def _run_ocr_vl_wrapper(image_path, result_queue):
    """Run PaddleOCRVL on one image and report the first seal block's text.

    Used as a ``multiprocessing.Process`` target, so it communicates only
    through ``result_queue`` and prints progress with a ``[Subprocess]``
    prefix.

    Args:
        image_path: Path of the image passed to ``PaddleOCRVL.predict``.
        result_queue: Object with a ``put`` method. One dict is put on it:
            ``{'text': str, 'success': bool}`` for the first block labelled
            ``'seal'`` (``success`` is True only for non-empty text),
            ``{'text': '', 'success': False, 'debug': 'no_seal'}`` or
            ``{..., 'debug': 'no_output'}`` when nothing usable is found, and
            ``{'text': '', 'success': False, 'error': str}`` on any exception.
    """
    # Defined before the try block so the except handler can always call it.
    def log(msg):
        print(f"[Subprocess] {msg}")
        sys.stdout.flush()

    try:
        log("Starting...")

        from paddleocr import PaddleOCRVL

        log("Import successful, initializing pipeline...")

        # Re-initialize pipeline in subprocess (required)
        vl_pipeline = PaddleOCRVL(
            use_seal_recognition=True,
            use_ocr_for_image_block=True,
            use_layout_detection=True
        )

        log("Pipeline initialized, starting prediction...")

        start_time = time.time()
        output = vl_pipeline.predict(image_path, batch_size=1)
        elapsed = time.time() - start_time

        log(f"Prediction completed in {elapsed:.1f}s, output length: {len(output) if output else 0}")

        if not output:
            log("No output from predict()")
            result_queue.put({'text': '', 'success': False, 'debug': 'no_output'})
            return

        import json
        import shutil

        # The result is round-tripped through a JSON file in a scratch
        # directory; the directory is removed on every path (seal found,
        # no seal, missing JSON, or an exception while saving/parsing).
        temp_output_dir = Path("temp_paddleocr_vl_test")
        temp_output_dir.mkdir(exist_ok=True)
        blocks = []
        try:
            output[0].save_to_json(save_path=str(temp_output_dir))

            json_file = temp_output_dir / f"{Path(image_path).stem}_res.json"

            log(f"Looking for JSON: {json_file}")

            if json_file.exists():
                log("JSON found, reading...")
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # A null 'parsing_res_list' is treated like a missing one.
                blocks = data.get('parsing_res_list') or []
                log(f"Found {len(blocks)} blocks")
        finally:
            shutil.rmtree(temp_output_dir, ignore_errors=True)

        for i, block in enumerate(blocks):
            label = block.get('block_label', 'unknown')
            content = block.get('block_content', '')
            preview = content[:50] if content else '(empty)'
            log(f"  Block {i}: {label} - '{preview}...'")

            if label == 'seal':
                # A seal block may carry null content; report it as empty
                # text instead of failing on None.strip().
                text = (content or '').strip()
                log(f"  *** SEAL FOUND: '{text}' ***")

                result_queue.put({
                    'text': text,
                    'success': len(text) > 0
                })
                return

        log("No seal block found")
        result_queue.put({'text': '', 'success': False, 'debug': 'no_seal'})

    except Exception as e:
        # Top-level boundary of the subprocess: report the failure to the
        # parent instead of letting the child die without a result.
        import traceback
        log(f"ERROR: {e}")
        log(f"Traceback:\n{traceback.format_exc()}")
        result_queue.put({
            'text': '',
            'success': False,
            'error': str(e)
        })


def _stop_process(process, grace_seconds=5):
    """Terminate ``process`` if it is still alive, escalating to ``kill()``."""
    if not process.is_alive():
        return
    process.terminate()
    process.join(timeout=grace_seconds)
    if process.is_alive():
        process.kill()
        process.join(timeout=grace_seconds)


def test():
    """Run the PaddleOCRVL wrapper in a subprocess on a fixed seal image.

    Returns:
        True only when the subprocess reports success with non-empty text.
        False when the sample image is missing, when no result arrives
        within the time limit, or when the subprocess reports a failure.
    """
    import queue  # local import: only needed for the queue.Empty exception

    timeout_seconds = 120

    print("Testing PaddleOCRVL with existing seal image...")

    # Find a seal image
    seal_image = Path("test_reports_full/1.pdf/seal_crop_0.png")
    if not seal_image.exists():
        print(f"Seal image not found: {seal_image}")
        return False

    print(f"Using image: {seal_image}")
    print(f"Image size: {seal_image.stat().st_size} bytes")

    # Run the test
    result_queue = multiprocessing.Queue()

    print("Starting subprocess...")
    process = multiprocessing.Process(
        target=_run_ocr_vl_wrapper,
        args=(str(seal_image), result_queue)
    )

    start_time = time.time()
    process.start()

    # Read the result BEFORE joining: a child that has put data on a
    # multiprocessing.Queue may not exit until that data is consumed, and
    # Queue.empty() is not a reliable way to detect a pending item.
    # Poll in short slices so a child that dies without reporting is noticed
    # without waiting for the full time limit.
    result = None
    deadline = start_time + timeout_seconds
    while result is None:
        try:
            result = result_queue.get(timeout=0.5)
        except queue.Empty:
            if not process.is_alive() or time.time() >= deadline:
                break

    if result is None and not process.is_alive():
        # The child may have reported and exited between the last get() and
        # the liveness check; give the queue one more chance.
        try:
            result = result_queue.get(timeout=1)
        except queue.Empty:
            pass

    elapsed = time.time() - start_time

    if result is None and process.is_alive():
        print("TIMEOUT: Process still running, terminating...")
        _stop_process(process)
        print("Process terminated")
        return False

    # The outcome is known; reap the child and do not let a slow shutdown
    # keep it around.
    process.join(timeout=5)
    _stop_process(process)

    print(f"Process completed in {elapsed:.1f}s")

    # Get result
    if result is None:
        print("No result returned from process")
        return False

    print(f"\nResult:")
    print(f"  Text: '{result.get('text', '')}'")
    print(f"  Success: {result.get('success', False)}")
    if result.get('error'):
        print(f"  Error: {result.get('error')}")
    if result.get('debug'):
        print(f"  Debug: {result.get('debug')}")
    return bool(result.get('success', False)) and len(result.get('text', '')) > 0


if __name__ == "__main__":
    # Script entry point: run the check once and map its boolean outcome
    # onto the process exit status (0 on success, 1 on failure).
    success = test()
    print("\n" + "=" * 60)
    if not success:
        print("FAILED: PaddleOCRVL test failed")
        sys.exit(1)
    print("SUCCESS: PaddleOCRVL is working!")
    sys.exit(0)
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`#!/usr/bin/env python`
			`# -*- coding: utf-8 -*-`
			`"""`
			`Simple test to check if PaddleOCRVL wrapper is working.`
			`"""`

			`import sys`
			`import time`
			`from pathlib import Path`
			`import multiprocessing`

			`# Module-level wrapper function (required for Windows multiprocessing)`
			`def _run_ocr_vl_wrapper(image_path, result_queue):`
			`"""Wrapper function to run PaddleOCRVL in a subprocess."""`
			`try:`
			`# Helper to print to console`
			`def log(msg):`
			`print(f"[Subprocess] {msg}")`
			`sys.stdout.flush()`

			`log("Starting...")`

			`from paddleocr import PaddleOCRVL`

			`log("Import successful, initializing pipeline...")`

			`# Re-initialize pipeline in subprocess (required)`
			`vl_pipeline = PaddleOCRVL(`
			`use_seal_recognition=True,`
			`use_ocr_for_image_block=True,`
			`use_layout_detection=True`
			`)`

			`log("Pipeline initialized, starting prediction...")`

			`start_time = time.time()`
			`output = vl_pipeline.predict(image_path, batch_size=1)`
			`elapsed = time.time() - start_time`

			`log(f"Prediction completed in {elapsed:.1f}s, output length: {len(output) if output else 0}")`

			`if output and len(output) > 0:`
			`res = output[0]`

			`# Save to JSON`
			`import json`
			`temp_output_dir = Path("temp_paddleocr_vl_test")`
			`temp_output_dir.mkdir(exist_ok=True)`

			`res.save_to_json(save_path=str(temp_output_dir))`

			`json_file = temp_output_dir / f"{Path(image_path).stem}_res.json"`

			`log(f"Looking for JSON: {json_file}")`

			`if json_file.exists():`
			`log("JSON found, reading...")`
			`with open(json_file, 'r', encoding='utf-8') as f:`
			`data = json.load(f)`

			`blocks = data.get('parsing_res_list', [])`
			`log(f"Found {len(blocks)} blocks")`

			`for i, block in enumerate(blocks):`
			`label = block.get('block_label', 'unknown')`
			`content = block.get('block_content', '')`
			`log(f" Block {i}: {label} - '{content[:50] if content else '(empty)'}...'")`

			`if label == 'seal':`
			`text = content.strip()`
			`log(f"  *** SEAL FOUND: '{text}' ***")`

			`# Clean up`
			`import shutil`
			`if temp_output_dir.exists():`
			`shutil.rmtree(temp_output_dir, ignore_errors=True)`

			`result_queue.put({`
			`'text': text,`
			`'success': len(text) > 0`
			`})`
			`return`

			`log("No seal block found")`
			`result_queue.put({'text': '', 'success': False, 'debug': 'no_seal'})`
			`else:`
			`log("No output from predict()")`
			`result_queue.put({'text': '', 'success': False, 'debug': 'no_output'})`

			`except Exception as e:`
			`import traceback`
			`log(f"ERROR: {e}")`
			`log(f"Traceback:\n{traceback.format_exc()}")`
			`result_queue.put({`
			`'text': '',`
			`'success': False,`
			`'error': str(e)`
			`})`


			`def test():`
			`print("Testing PaddleOCRVL with existing seal image...")`

			`# Find a seal image`
			`seal_image = Path("test_reports_full/1.pdf/seal_crop_0.png")`
			`if not seal_image.exists():`
			`print(f"Seal image not found: {seal_image}")`
			`return False`

			`print(f"Using image: {seal_image}")`
			`print(f"Image size: {seal_image.stat().st_size} bytes")`

			`# Run the test`
			`result_queue = multiprocessing.Queue()`

			`print("Starting subprocess...")`
			`process = multiprocessing.Process(`
			`target=_run_ocr_vl_wrapper,`
			`args=(str(seal_image), result_queue)`
			`)`

			`start_time = time.time()`
			`process.start()`

			`# Wait up to 120 seconds`
			`process.join(timeout=120)`
			`elapsed = time.time() - start_time`

			`print(f"Process completed in {elapsed:.1f}s")`

			`if process.is_alive():`
			`print("TIMEOUT: Process still running, terminating...")`
			`process.terminate()`
			`process.join(timeout=5)`
			`if process.is_alive():`
			`process.kill()`
			`print("Process terminated")`
			`return False`

			`# Get result`
			`if not result_queue.empty():`
			`result = result_queue.get_nowait()`
			`print(f"\nResult:")`
			`print(f" Text: '{result.get('text', '')}'")`
			`print(f" Success: {result.get('success', False)}")`
			`if result.get('error'):`
			`print(f" Error: {result.get('error')}")`
			`if result.get('debug'):`
			`print(f" Debug: {result.get('debug')}")`
			`return result.get('success', False) and len(result.get('text', '')) > 0`
			`else:`
			`print("No result returned from process")`
			`return False`


			`if __name__ == "__main__":`
			`success = test()`
			`print("\n" + "=" * 60)`
			`if success:`
			`print("SUCCESS: PaddleOCRVL is working!")`
			`sys.exit(0)`
			`else:`
			`print("FAILED: PaddleOCRVL test failed")`
			`sys.exit(1)`