"""Quick test to verify the new fallback mechanism works.

Renders the first page of one known-failing PDF to an image, runs the CMA
template-matching extraction on it, and prints the outcome.

Everything runs at module top level, so importing or executing this file
performs the whole test.
"""

import sys
import os
import fitz
import numpy as np
import cv2
from pathlib import Path

# Set before the project / paddleocr imports below, preserving the original
# ordering. NOTE(review): presumably this is read at import time - confirm.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

# Force reimport to get latest changes: drop any cached copies of the
# project modules so the imports below load them fresh.
for _stale_module in ("test_accuracy_batch_full", "cma_extraction_template_primary"):
    sys.modules.pop(_stale_module, None)

from test_accuracy_batch_full import process_cma_template_extraction, extract_pdf_page
from paddleocr import PaddleOCR

# Test with one of the failing PDFs
pdf_name = "财政部关于请协助提供相关材料的函_pages4-9.pdf"
pdf_path = Path("src/test/resources/data/pdfs") / pdf_name

print(f"Testing: {pdf_name}")
print("=" * 80)

# Extract page. The document is closed in a ``finally`` so the handle is
# released even if rendering or decoding raises.
doc = fitz.open(str(pdf_path))
try:
    page = doc[0]
    # Scale factor 300 / 72 on both axes (72 being the PDF point-per-inch
    # base), i.e. the page is rasterized at roughly 300 DPI.
    mat = fitz.Matrix(300 / 72, 300 / 72)
    pix = page.get_pixmap(matrix=mat)
    img_data = pix.tobytes("png")
    img_array = np.frombuffer(img_data, dtype=np.uint8)
    page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
finally:
    doc.close()

# cv2.imdecode reports failure by returning None rather than raising; fail
# with a clear message instead of an AttributeError on ``.shape`` below.
if page_img is None:
    raise RuntimeError(f"Failed to decode rendered page image for: {pdf_path}")

print(f"Image size: {page_img.shape}")

# Initialize OCR
print("\nInitializing PaddleOCR...")
ocr = PaddleOCR(lang='ch')

# Run template matching extraction
print("\nRunning template matching extraction...")
result = process_cma_template_extraction(page_img, ocr, output_dir="test_output")

print("\n" + "=" * 80)
print("RESULT")
print("=" * 80)
print(f"Success: {result['success']}")
print(f"CMA Code: {result.get('code', 'N/A')}")
print(f"Confidence: {result.get('confidence', 0):.2f}")
print("=" * 80)