"""
Test full-page fallback for CMA extraction.

Renders the first page of a PDF to an image, runs ``extract_cma_from_roi``
on the whole page and prints whether the returned CMA code matches the
expected one.

Usage:
    python <this script> [path/to/file.pdf]

When no path is given, ``DEFAULT_PDF_PATH`` is used.
"""

import importlib
import os
import sys

# Set before paddleocr is imported, exactly as the original script did.
# NOTE(review): presumably the OCR stack reads this at import/initialisation
# time to skip a model-source check -- confirm against the library docs.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

DEFAULT_PDF_PATH = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
DEBUG_OUTPUT_DIR = "test_fullpage_debug"
EXPECTED_CODE = "210020349096"      # the code this test reports as correct
KNOWN_WRONG_CODE = "440023010130"   # a wrong code the test singles out
RENDER_DPI = 300
BANNER = "=" * 80


def purge_cached_modules(fragment):
    """Remove every entry from ``sys.modules`` whose name contains *fragment*.

    This lets a re-run inside a long-lived interpreter pick up edits to the
    extraction module; in a fresh process it is a no-op.
    """
    # Snapshot the keys first: sys.modules must not change size mid-iteration.
    for module_name in list(sys.modules):
        if fragment in module_name:
            del sys.modules[module_name]


def render_first_page(pdf_path, dpi=RENDER_DPI):
    """Render page 0 of *pdf_path* and return it as an OpenCV colour image.

    Args:
        pdf_path: Path of the PDF to open.
        dpi: Target resolution of the rendering.

    Returns:
        The image array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    Raises:
        RuntimeError: If OpenCV cannot decode the rendered page.
    """
    # Deferred so the module stays importable without the imaging stack and
    # so the environment variable above is set before any of them load.
    import fitz
    import numpy as np
    import cv2

    zoom = dpi / 72  # PyMuPDF renders at 72 dpi by default
    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        pixmap = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_bytes = pixmap.tobytes("png")

    page_img = cv2.imdecode(
        np.frombuffer(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR
    )
    if page_img is None:
        # imdecode signals failure by returning None instead of raising.
        raise RuntimeError(f"Could not decode rendered page of {pdf_path}")
    return page_img


def format_result_report(result):
    """Build the lines of the RESULT section for an extraction *result*.

    Args:
        result: Mapping returned by ``extract_cma_from_roi``; the keys
            ``success``, ``code``, ``confidence`` and ``reason`` are read,
            all of them optionally.

    Returns:
        A list of strings, one per ``print`` call, ending with the banner.
    """
    code = result.get("code")
    lines = [
        "\n" + BANNER,
        "RESULT",
        BANNER,
        # .get() keeps the report printable even if a key is missing.
        f"Success: {result.get('success')}",
        f"CMA Code: {code}",
        f"Confidence: {result.get('confidence')}",
    ]

    if not code:
        lines.append("\n✗ FAILED: No CMA code found")
        lines.append(f"Reason: {result.get('reason', 'Unknown')}")
    elif code == EXPECTED_CODE:
        lines.append("\n✓ SUCCESS: Found correct CMA code!")
    elif code == KNOWN_WRONG_CODE:
        lines.append(f"\n✗ FAILED: Found {KNOWN_WRONG_CODE} instead")
    else:
        lines.append(f"\n? UNEXPECTED: Found {code}")

    lines.append(BANNER)
    return lines


def main(argv=None):
    """Run the full-page CMA extraction check and print the outcome.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, overrides
            ``DEFAULT_PDF_PATH``.
    """
    if argv is None:
        argv = sys.argv[1:]
    pdf_path = argv[0] if argv else DEFAULT_PDF_PATH

    # Clear cache
    purge_cached_modules("cma_extraction")

    print(BANNER)
    print("TESTING FULL-PAGE FALLBACK")
    print(BANNER)

    # Extract page
    page_img = render_first_page(pdf_path)
    print(f"\nPage size: {page_img.shape}")

    # Deferred imports: they must come after the environment variable is set
    # and after the stale extraction modules have been purged.
    from paddleocr import PaddleOCR

    # Import with reload (kept from the original script; after the purge
    # above the import is already fresh, so the reload re-executes the module).
    import cma_extraction_template_primary
    importlib.reload(cma_extraction_template_primary)
    from cma_extraction_template_primary import extract_cma_from_roi

    # Initialize OCR
    print("\nInitializing OCR...")
    ocr = PaddleOCR(lang='ch')

    # Test full-page extraction
    print("\nRunning extract_cma_from_roi on FULL PAGE...")
    result = extract_cma_from_roi(page_img, ocr, output_dir=DEBUG_OUTPUT_DIR)

    for line in format_result_report(result):
        print(line)


if __name__ == "__main__":
    main()