report-detect/archive/temp_scripts/quick_validation_test.py

"""
Quick validation test for CMA template matching improvements.
Tests a subset of PDFs to verify the improvements.
"""
import sys
import os
import json
import logging
import fitz
import numpy as np
import cv2
from pathlib import Path

# Log bare messages (no level/timestamp prefix) at INFO and above.
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)

# Put the directory containing this script (not its parent) at the front of
# sys.path so the sibling-module import below can be resolved from it.
# NOTE(review): this only works if cma_extraction_template_primary.py lives
# next to this script — confirm after any file moves.
sys.path.insert(0, os.path.dirname(__file__))

# Project-local CMA extraction entry point.
from cma_extraction_template_primary import extract_cma_code_fullpage

# Set before importing paddleocr; presumably the library reads this flag at
# import/initialisation time to skip its model source check — TODO confirm.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
from paddleocr import PaddleOCR

# Relative paths: both are resolved against the current working directory,
# not against this script's location.
PDF_DIR = Path("src/test/resources/data/pdfs")
RESULTS_FILE = Path("src/test/resources/data/results.json")

def main():
    """Run the quick CMA validation over a fixed subset of PDFs.

    Loads the expected results from ``RESULTS_FILE``, renders the first page
    of every listed PDF that exists under ``PDF_DIR``, passes the rendered
    image to ``extract_cma_code_fullpage`` and compares the returned ``code``
    with the expected ``cma`` value. Per-file outcomes and a summary table
    are written to the module logger.

    Returns:
        tuple[int, int]: ``(passed, total)`` where ``total`` counts only the
        PDFs that were actually found and processed.

    Raises:
        RuntimeError: If none of the listed PDFs exist, so no accuracy can
            be computed (previously this surfaced as a ZeroDivisionError).
    """
    # Load expected results
    with open(RESULTS_FILE, 'r', encoding='utf-8') as f:
        expected_results = json.load(f)

    # Test specific PDFs
    test_pdfs = [
        "WTS2025-21283.pdf",
        "YDQ23_001838.pdf",
        "YDQ23_001850.pdf",
        "YDQ25_001875.pdf",
        "YDQ25_002294.pdf",
        "1.pdf",
    ]

    # Initialize OCR
    logger.info("Initializing PaddleOCR...")
    ocr = PaddleOCR(lang='ch')

    results = []

    logger.info("=" * 80)
    logger.info("QUICK VALIDATION TEST FOR CMA TEMPLATE MATCHING")
    logger.info("=" * 80)

    # PDF user space is 72 units per inch, so this factor renders at 300 DPI.
    zoom = 300 / 72

    for pdf_name in test_pdfs:
        pdf_path = PDF_DIR / pdf_name
        if not pdf_path.exists():
            logger.warning(f"PDF not found: {pdf_name}")
            continue

        logger.info(f"\nProcessing: {pdf_name}")
        logger.info("-" * 80)

        # Render the first page to PNG bytes. The context manager guarantees
        # the document is closed even if rendering raises.
        with fitz.open(str(pdf_path)) as doc:
            pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            img_data = pix.tobytes("png")

        # Decode the PNG bytes into an OpenCV colour image.
        img_array = np.frombuffer(img_data, dtype=np.uint8)
        page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)

        # Get expected CMA (None when the PDF has no entry or no 'cma' key)
        expected_cma = expected_results.get(pdf_name, {}).get('cma')

        # Process with template matching
        result = extract_cma_code_fullpage(page_img, ocr, None)

        # A file passes only if extraction reports success AND the code matches.
        extracted_cma = result.get('code')
        is_match = bool(result.get('success', False) and extracted_cma == expected_cma)

        logger.info(f"  Expected CMA: {expected_cma}")
        logger.info(f"  Extracted CMA: {extracted_cma}")
        logger.info(f"  Status: {'✓ PASS' if is_match else '✗ FAIL'}")

        results.append({
            'pdf': pdf_name,
            'expected': expected_cma,
            'extracted': extracted_cma,
            'success': is_match,
        })

    # Without any processed PDF the accuracy below would divide by zero;
    # fail with an explicit, actionable message instead.
    if not results:
        raise RuntimeError(
            f"None of the {len(test_pdfs)} test PDFs were found under {PDF_DIR}"
        )

    # Summary
    logger.info("\n" + "=" * 80)
    logger.info("SUMMARY")
    logger.info("=" * 80)

    passed = sum(1 for r in results if r['success'])
    total = len(results)

    for r in results:
        status = "✓ PASS" if r['success'] else "✗ FAIL"
        # str() keeps the ':15s' format spec valid even for non-string codes.
        extracted_text = str(r['extracted'] or 'None')
        logger.info(f"{status} | {r['pdf']:30s} | {extracted_text:15s} (expected: {r['expected']})")

    logger.info("-" * 80)
    logger.info(f"Accuracy: {passed}/{total} ({passed/total*100:.1f}%)")
    logger.info("=" * 80)

    return passed, total

if __name__ == "__main__":
    # Exit status: 0 only when every processed PDF matched, 1 on any mismatch
    # or on an unexpected error inside main().
    try:
        passed, total = main()
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # replacing the separate logger.error + traceback.print_exc() pair.
        logger.exception(f"Test failed: {e}")
        sys.exit(1)
    sys.exit(0 if passed == total else 1)
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`"""`
			`Quick validation test for CMA template matching improvements.`
			`Tests a subset of PDFs to verify the improvements.`
			`"""`
			`import sys`
			`import os`
			`import json`
			`import logging`
			`import fitz`
			`import numpy as np`
			`import cv2`
			`from pathlib import Path`

			`logging.basicConfig(level=logging.INFO, format='%(message)s')`
			`logger = logging.getLogger(__name__)`

			`# Add parent dir to path`
			`sys.path.insert(0, os.path.dirname(__file__))`

			`# Import from our module`
			`from cma_extraction_template_primary import extract_cma_code_fullpage`

			`# Disable model source check`
			`os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"`
			`from paddleocr import PaddleOCR`

			`PDF_DIR = Path("src/test/resources/data/pdfs")`
			`RESULTS_FILE = Path("src/test/resources/data/results.json")`

			`def main():`
			`# Load expected results`
			`with open(RESULTS_FILE, 'r', encoding='utf-8') as f:`
			`expected_results = json.load(f)`

			`# Test specific PDFs`
			`test_pdfs = [`
			`"WTS2025-21283.pdf",`
			`"YDQ23_001838.pdf",`
			`"YDQ23_001850.pdf",`
			`"YDQ25_001875.pdf",`
			`"YDQ25_002294.pdf",`
			`"1.pdf",`
			`]`

			`# Initialize OCR`
			`logger.info("Initializing PaddleOCR...")`
			`ocr = PaddleOCR(lang='ch')`

			`results = []`

			`logger.info("=" * 80)`
			`logger.info("QUICK VALIDATION TEST FOR CMA TEMPLATE MATCHING")`
			`logger.info("=" * 80)`

			`for pdf_name in test_pdfs:`
			`pdf_path = PDF_DIR / pdf_name`
			`if not pdf_path.exists():`
			`logger.warning(f"PDF not found: {pdf_name}")`
			`continue`

			`logger.info(f"\nProcessing: {pdf_name}")`
			`logger.info("-" * 80)`

			`# Extract first page`
			`doc = fitz.open(str(pdf_path))`
			`page = doc[0]`
			`mat = fitz.Matrix(300 / 72, 300 / 72)`
			`pix = page.get_pixmap(matrix=mat)`
			`img_data = pix.tobytes("png")`
			`img_array = np.frombuffer(img_data, dtype=np.uint8)`
			`page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)`
			`doc.close()`

			`# Get expected CMA`
			`expected_cma = expected_results.get(pdf_name, {}).get('cma')`

			`# Process with template matching`
			`result = extract_cma_code_fullpage(page_img, ocr, None)`

			`# Record result`
			`success = result.get('success', False)`
			`extracted_cma = result.get('code')`

			`logger.info(f" Expected CMA: {expected_cma}")`
			`logger.info(f" Extracted CMA: {extracted_cma}")`
			`logger.info(f" Status: {'✓ PASS' if (success and extracted_cma == expected_cma) else '✗ FAIL'}")`

			`results.append({`
			`'pdf': pdf_name,`
			`'expected': expected_cma,`
			`'extracted': extracted_cma,`
			`'success': success and extracted_cma == expected_cma`
			`})`

			`# Summary`
			`logger.info("\n" + "=" * 80)`
			`logger.info("SUMMARY")`
			`logger.info("=" * 80)`

			`passed = sum(1 for r in results if r['success'])`
			`total = len(results)`

			`for r in results:`
			`status = "✓ PASS" if r['success'] else "✗ FAIL"`
			`logger.info(f"{status} \| {r['pdf']:30s} \| {r['extracted'] or 'None':15s} (expected: {r['expected']})")`

			`logger.info("-" * 80)`
			`logger.info(f"Accuracy: {passed}/{total} ({passed/total*100:.1f}%)")`
			`logger.info("=" * 80)`

			`return passed, total`

			`if __name__ == "__main__":`
			`try:`
			`passed, total = main()`
			`sys.exit(0 if passed == total else 1)`
			`except Exception as e:`
			`logger.error(f"Test failed: {e}")`
			`import traceback`
			`traceback.print_exc()`
			`sys.exit(1)`