66 lines
1.7 KiB
Python
66 lines
1.7 KiB
Python
"""
Simple smoke test: render the first page of a sample PDF, run the CMA code
extraction on it, and print which code was extracted.
"""
|
|
import sys
|
|
import os
|
|
|
|
# This flag has to be in the environment before PaddleOCR is imported further
# down.  NOTE(review): presumably it disables PaddleOCR's model-source
# check at import/initialisation time -- confirm against the PaddleOCR docs.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

# Drop every already-loaded module whose name mentions the extraction code or
# the accuracy test, so the import of the extraction module below is not
# served from the module cache.  Iterate over a snapshot because entries are
# deleted from sys.modules inside the loop.
for loaded_name in tuple(sys.modules):
    is_stale = any(
        marker in loaded_name
        for marker in ('cma_extraction', 'test_accuracy')
    )
    if not is_stale:
        continue
    del sys.modules[loaded_name]
|
|
|
|
import fitz
|
|
import numpy as np
|
|
import cv2
|
|
from paddleocr import PaddleOCR
|
|
|
|
# Import CMA extraction
|
|
from cma_extraction_template_primary import extract_cma_code_fullpage, imread_unicode
|
|
|
|
# Sample report used for this check; the path is relative to the current
# working directory.
pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
print(f"Processing: {pdf_path}")
print("=" * 80)

# Render the first page to PNG bytes.  PDF user space is 72 units per inch,
# so a zoom of 300 / 72 on both axes rasterises the page at 300 DPI.
# The context manager closes the document even if rendering raises, which the
# previous open()/close() pair did not guarantee.
with fitz.open(pdf_path) as doc:
    page = doc[0]
    mat = fitz.Matrix(300 / 72, 300 / 72)
    pix = page.get_pixmap(matrix=mat)
    img_data = pix.tobytes("png")

# Decode the PNG bytes into an OpenCV colour image (numpy array).
img_array = np.frombuffer(img_data, dtype=np.uint8)
page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
if page_img is None:
    # cv2.imdecode signals failure by returning None instead of raising;
    # fail here with a clear message rather than on `.shape` below.
    raise RuntimeError(f"Could not decode rendered page image for {pdf_path}")

print(f"Page size: {page_img.shape}")
|
|
|
|
# --- OCR engine ------------------------------------------------------------
# Build the PaddleOCR engine with lang='ch'; the progress line is printed
# first so it appears before any output from the engine start-up.
print("\nInitializing OCR...")
ocr = PaddleOCR(lang='ch')

# --- CMA extraction --------------------------------------------------------
print("\nExtracting CMA code...")

# Directory handed to the extractor as `output_dir`; it is created up front
# and left alone if it already exists.
# NOTE(review): presumably the extractor writes debug artefacts here --
# confirm in cma_extraction_template_primary.
output_dir = "test_debug"
os.makedirs(output_dir, exist_ok=True)

# Run the full-page extraction on the rendered page image.
result = extract_cma_code_fullpage(
    page_img,
    ocr,
    output_dir=output_dir,
)
|
|
|
|
# Banner around the "RESULT" heading.
banner = "=" * 80
print("\n" + banner)
print("RESULT")
print(banner)

# One line per field of the extraction result; a missing key prints as None.
print(f"Success: {result.get('success')}")
print(f"CMA Code: {result.get('code')}")
print(f"Confidence: {result.get('confidence')}")
print(f"Method: {result.get('method')}")
print(f"Position: {result.get('position')}")
print(f"Box: {result.get('box')}")

# Verdict: compare the extracted code with the two values known for this
# sample PDF.  Nothing is printed here when no code was extracted.
extracted_code = result.get('code')
if extracted_code == '210020349096':
    print("\n✓ CORRECT CMA CODE EXTRACTED!")
elif extracted_code == '440023010130':
    print("\n✗ WRONG CODE (440023010130) - This is the report number, not CMA!")
elif extracted_code:
    print(f"\n? UNEXPECTED CODE: {result['code']}")