""" Quick validation test for CMA template matching improvements. Tests a subset of PDFs to verify the improvements. """ import sys import os import json import logging import fitz import numpy as np import cv2 from pathlib import Path logging.basicConfig(level=logging.INFO, format='%(message)s') logger = logging.getLogger(__name__) # Add parent dir to path sys.path.insert(0, os.path.dirname(__file__)) # Import from our module from cma_extraction_template_primary import extract_cma_code_fullpage # Disable model source check os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True" from paddleocr import PaddleOCR PDF_DIR = Path("src/test/resources/data/pdfs") RESULTS_FILE = Path("src/test/resources/data/results.json") def main(): # Load expected results with open(RESULTS_FILE, 'r', encoding='utf-8') as f: expected_results = json.load(f) # Test specific PDFs test_pdfs = [ "WTS2025-21283.pdf", "YDQ23_001838.pdf", "YDQ23_001850.pdf", "YDQ25_001875.pdf", "YDQ25_002294.pdf", "1.pdf", ] # Initialize OCR logger.info("Initializing PaddleOCR...") ocr = PaddleOCR(lang='ch') results = [] logger.info("=" * 80) logger.info("QUICK VALIDATION TEST FOR CMA TEMPLATE MATCHING") logger.info("=" * 80) for pdf_name in test_pdfs: pdf_path = PDF_DIR / pdf_name if not pdf_path.exists(): logger.warning(f"PDF not found: {pdf_name}") continue logger.info(f"\nProcessing: {pdf_name}") logger.info("-" * 80) # Extract first page doc = fitz.open(str(pdf_path)) page = doc[0] mat = fitz.Matrix(300 / 72, 300 / 72) pix = page.get_pixmap(matrix=mat) img_data = pix.tobytes("png") img_array = np.frombuffer(img_data, dtype=np.uint8) page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) doc.close() # Get expected CMA expected_cma = expected_results.get(pdf_name, {}).get('cma') # Process with template matching result = extract_cma_code_fullpage(page_img, ocr, None) # Record result success = result.get('success', False) extracted_cma = result.get('code') logger.info(f" Expected CMA: {expected_cma}") logger.info(f" Extracted CMA: {extracted_cma}") logger.info(f" Status: {'✓ PASS' if (success and extracted_cma == expected_cma) else '✗ FAIL'}") results.append({ 'pdf': pdf_name, 'expected': expected_cma, 'extracted': extracted_cma, 'success': success and extracted_cma == expected_cma }) # Summary logger.info("\n" + "=" * 80) logger.info("SUMMARY") logger.info("=" * 80) passed = sum(1 for r in results if r['success']) total = len(results) for r in results: status = "✓ PASS" if r['success'] else "✗ FAIL" logger.info(f"{status} | {r['pdf']:30s} | {r['extracted'] or 'None':15s} (expected: {r['expected']})") logger.info("-" * 80) logger.info(f"Accuracy: {passed}/{total} ({passed/total*100:.1f}%)") logger.info("=" * 80) return passed, total if __name__ == "__main__": try: passed, total = main() sys.exit(0 if passed == total else 1) except Exception as e: logger.error(f"Test failed: {e}") import traceback traceback.print_exc() sys.exit(1)