122 lines
3.4 KiB
Python
122 lines
3.4 KiB
Python
"""
|
|
Quick validation test for CMA template matching improvements.
|
|
Tests a subset of PDFs to verify the improvements.
|
|
"""
|
|
import sys
|
|
import os
|
|
import json
|
|
import logging
|
|
import fitz
|
|
import numpy as np
|
|
import cv2
|
|
from pathlib import Path
|
|
|
|
# Log bare messages (no level or timestamp prefix) at INFO and above.
logging.basicConfig(format='%(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Put this script's own directory first on the module search path so the
# project module imported below can be resolved.
sys.path.insert(0, os.path.dirname(__file__))

# Project module under test.
from cma_extraction_template_primary import extract_cma_code_fullpage

# Disable the model source check. Set ahead of the paddleocr import
# (presumably the flag is read at import time -- TODO confirm).
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

from paddleocr import PaddleOCR

# Fixture locations; both are relative to the current working directory.
PDF_DIR = Path("src/test/resources/data/pdfs")
RESULTS_FILE = Path("src/test/resources/data/results.json")
|
|
|
|
def _render_first_page(pdf_path, dpi=300):
    """Render the first page of ``pdf_path`` to an image array for OpenCV.

    The page is rasterized with a ``dpi / 72`` zoom matrix, encoded as PNG
    and decoded with ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    The document is opened as a context manager so it is closed even when
    rendering raises.
    """
    zoom = dpi / 72
    with fitz.open(str(pdf_path)) as doc:
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_bytes = pix.tobytes("png")
    return cv2.imdecode(np.frombuffer(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)


def main():
    """Run CMA extraction on a fixed subset of PDFs and report accuracy.

    Expected values are read from ``RESULTS_FILE`` (looked up as
    ``expected_results[pdf_name]['cma']``). Each listed PDF found under
    ``PDF_DIR`` has its first page rendered and passed to
    ``extract_cma_code_fullpage``; a PDF passes when the result reports
    ``success`` and its ``code`` equals the expected value. PDFs that do not
    exist are logged and skipped, and are not counted in the total.

    Returns:
        tuple: ``(passed, total)`` counts over the PDFs that were processed.

    Raises:
        FileNotFoundError: if ``RESULTS_FILE`` is missing, or if none of the
            listed PDFs exist (this case previously surfaced as a
            ``ZeroDivisionError`` while computing the accuracy).
    """
    # Load expected results
    with open(RESULTS_FILE, 'r', encoding='utf-8') as f:
        expected_results = json.load(f)

    # Test specific PDFs
    test_pdfs = [
        "WTS2025-21283.pdf",
        "YDQ23_001838.pdf",
        "YDQ23_001850.pdf",
        "YDQ25_001875.pdf",
        "YDQ25_002294.pdf",
        "1.pdf",
    ]

    # Initialize OCR
    logger.info("Initializing PaddleOCR...")
    ocr = PaddleOCR(lang='ch')

    results = []

    logger.info("=" * 80)
    logger.info("QUICK VALIDATION TEST FOR CMA TEMPLATE MATCHING")
    logger.info("=" * 80)

    for pdf_name in test_pdfs:
        pdf_path = PDF_DIR / pdf_name
        if not pdf_path.exists():
            logger.warning(f"PDF not found: {pdf_name}")
            continue

        logger.info(f"\nProcessing: {pdf_name}")
        logger.info("-" * 80)

        # Extract first page
        page_img = _render_first_page(pdf_path)

        # Get expected CMA
        expected_cma = expected_results.get(pdf_name, {}).get('cma')

        # Process with template matching
        result = extract_cma_code_fullpage(page_img, ocr, None)

        # Record result; the pass condition is computed once and reused for
        # both the per-file log line and the summary.
        extracted_cma = result.get('code')
        is_match = bool(result.get('success', False) and extracted_cma == expected_cma)

        logger.info(f" Expected CMA: {expected_cma}")
        logger.info(f" Extracted CMA: {extracted_cma}")
        logger.info(f" Status: {'✓ PASS' if is_match else '✗ FAIL'}")

        results.append({
            'pdf': pdf_name,
            'expected': expected_cma,
            'extracted': extracted_cma,
            'success': is_match,
        })

    # Nothing processed means nothing was validated: fail loudly instead of
    # dividing by zero in the accuracy line below.
    if not results:
        raise FileNotFoundError(f"None of the test PDFs were found in {PDF_DIR}")

    # Summary
    logger.info("\n" + "=" * 80)
    logger.info("SUMMARY")
    logger.info("=" * 80)

    passed = sum(1 for r in results if r['success'])
    total = len(results)

    for r in results:
        status = "✓ PASS" if r['success'] else "✗ FAIL"
        # str() keeps the ':15s' format spec valid for non-string codes.
        extracted_text = str(r['extracted'] or 'None')
        logger.info(f"{status} | {r['pdf']:30s} | {extracted_text:15s} (expected: {r['expected']})")

    logger.info("-" * 80)
    logger.info(f"Accuracy: {passed}/{total} ({passed/total*100:.1f}%)")
    logger.info("=" * 80)

    return passed, total
|
|
|
|
if __name__ == "__main__":
    # Exit status is 0 only when every processed PDF matched; any mismatch
    # or unexpected error yields 1.
    try:
        n_passed, n_total = main()
    except Exception as exc:
        logger.error(f"Test failed: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(0 if n_passed == n_total else 1)
|