""" Test full-page fallback for CMA extraction """ import sys, os os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True" # Clear cache for module in list(sys.modules.keys()): if 'cma_extraction' in module: del sys.modules[module] import fitz, numpy as np, cv2 from paddleocr import PaddleOCR # Import with reload import importlib import cma_extraction_template_primary importlib.reload(cma_extraction_template_primary) from cma_extraction_template_primary import extract_cma_from_roi pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf" print("=" * 80) print("TESTING FULL-PAGE FALLBACK") print("=" * 80) # Extract page doc = fitz.open(pdf_path) page = doc[0] mat = fitz.Matrix(300 / 72, 300 / 72) pix = page.get_pixmap(matrix=mat) img_data = pix.tobytes("png") img_array = np.frombuffer(img_data, dtype=np.uint8) page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) doc.close() print(f"\nPage size: {page_img.shape}") # Initialize OCR print("\nInitializing OCR...") ocr = PaddleOCR(lang='ch') # Test full-page extraction print("\nRunning extract_cma_from_roi on FULL PAGE...") result = extract_cma_from_roi(page_img, ocr, output_dir="test_fullpage_debug") print("\n" + "=" * 80) print("RESULT") print("=" * 80) print(f"Success: {result['success']}") print(f"CMA Code: {result.get('code')}") print(f"Confidence: {result.get('confidence')}") if result.get('code'): if result['code'] == '210020349096': print("\n✓ SUCCESS: Found correct CMA code!") elif result['code'] == '440023010130': print("\n✗ FAILED: Found 440023010130 instead") else: print(f"\n? UNEXPECTED: Found {result['code']}") else: print("\n✗ FAILED: No CMA code found") print(f"Reason: {result.get('reason', 'Unknown')}") print("=" * 80)