"""
Simple test to see what CMA code is extracted.

Renders the first page of a PDF to an image, runs the CMA extraction
routine on it and prints the extracted fields, followed by a verdict
against two known reference codes for the bundled sample document.

Usage:
    python <this_script> [pdf_path]

When no path is given, DEFAULT_PDF_PATH is processed.
"""
import sys
import os

# Set before the OCR-related imports below. Presumably read by the OCR stack
# at import time -- NOTE(review): confirm before reordering these lines.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

# Clear cache: drop already-loaded extraction/accuracy modules so the import
# further down picks up the current source (matters in long-lived sessions).
for module in list(sys.modules.keys()):
    if 'cma_extraction' in module or 'test_accuracy' in module:
        del sys.modules[module]

import fitz
import numpy as np
import cv2
from paddleocr import PaddleOCR

# Import CMA extraction
from cma_extraction_template_primary import extract_cma_code_fullpage, imread_unicode

DEFAULT_PDF_PATH = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
OUTPUT_DIR = "test_debug"
RENDER_DPI = 300

# Reference values for the bundled sample PDF.
EXPECTED_CMA_CODE = '210020349096'
REPORT_NUMBER = '440023010130'  # known false positive: the report number

SEPARATOR = "=" * 80


def render_first_page(pdf_path, dpi=RENDER_DPI):
    """Render page 0 of *pdf_path* and return it as an OpenCV colour image.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution; the zoom factor is ``dpi / 72``.

    Returns:
        The image array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    Raises:
        RuntimeError: If the rendered PNG cannot be decoded by OpenCV.
    """
    doc = fitz.open(pdf_path)
    try:
        zoom = dpi / 72
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_bytes = pix.tobytes("png")
    finally:
        # Release the file handle even when rendering fails.
        doc.close()

    page_img = cv2.imdecode(np.frombuffer(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
    if page_img is None:
        # imdecode signals failure by returning None instead of raising.
        raise RuntimeError(f"Could not decode rendered page of {pdf_path}")
    return page_img


def print_result(result):
    """Print the extraction result fields and a verdict on the code found."""
    print("\n" + SEPARATOR)
    print("RESULT")
    print(SEPARATOR)
    print(f"Success: {result.get('success')}")
    print(f"CMA Code: {result.get('code')}")
    print(f"Confidence: {result.get('confidence')}")
    print(f"Method: {result.get('method')}")
    print(f"Position: {result.get('position')}")
    print(f"Box: {result.get('box')}")

    code = result.get('code')
    if not code:
        return
    if code == EXPECTED_CMA_CODE:
        print("\n✓ CORRECT CMA CODE EXTRACTED!")
    elif code == REPORT_NUMBER:
        print(f"\n✗ WRONG CODE ({REPORT_NUMBER}) - This is the report number, not CMA!")
    else:
        print(f"\n? UNEXPECTED CODE: {code}")


def main(argv=None):
    """Run the extraction on one PDF and print what was found.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first entry, if present, overrides
            DEFAULT_PDF_PATH.
    """
    args = sys.argv[1:] if argv is None else argv
    pdf_path = args[0] if args else DEFAULT_PDF_PATH

    print(f"Processing: {pdf_path}")
    print(SEPARATOR)

    # Extract page
    page_img = render_first_page(pdf_path)
    print(f"Page size: {page_img.shape}")

    # Initialize OCR
    print("\nInitializing OCR...")
    ocr = PaddleOCR(lang='ch')

    # Extract CMA
    print("\nExtracting CMA code...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    result = extract_cma_code_fullpage(page_img, ocr, output_dir=OUTPUT_DIR)

    print_result(result)


if __name__ == "__main__":
    main()