# report-detect/archive/temp_scripts/test_fullpage_fallback.py
#
# 67 lines
# 1.7 KiB
# Python

"""
Test full-page fallback for CMA extraction
"""
import sys, os
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
# Clear cache
for module in list(sys.modules.keys()):
if 'cma_extraction' in module:
del sys.modules[module]
import fitz, numpy as np, cv2
from paddleocr import PaddleOCR
# Import with reload
import importlib
import cma_extraction_template_primary
importlib.reload(cma_extraction_template_primary)
from cma_extraction_template_primary import extract_cma_from_roi
pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
print("=" * 80)
print("TESTING FULL-PAGE FALLBACK")
print("=" * 80)
# Extract page
doc = fitz.open(pdf_path)
page = doc[0]
mat = fitz.Matrix(300 / 72, 300 / 72)
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
img_array = np.frombuffer(img_data, dtype=np.uint8)
page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
doc.close()
print(f"\nPage size: {page_img.shape}")
# Initialize OCR
print("\nInitializing OCR...")
ocr = PaddleOCR(lang='ch')
# Test full-page extraction
print("\nRunning extract_cma_from_roi on FULL PAGE...")
result = extract_cma_from_roi(page_img, ocr, output_dir="test_fullpage_debug")
print("\n" + "=" * 80)
print("RESULT")
print("=" * 80)
print(f"Success: {result['success']}")
print(f"CMA Code: {result.get('code')}")
print(f"Confidence: {result.get('confidence')}")
if result.get('code'):
if result['code'] == '210020349096':
print("\n✓ SUCCESS: Found correct CMA code!")
elif result['code'] == '440023010130':
print("\n✗ FAILED: Found 440023010130 instead")
else:
print(f"\n? UNEXPECTED: Found {result['code']}")
else:
print("\n✗ FAILED: No CMA code found")
print(f"Reason: {result.get('reason', 'Unknown')}")
print("=" * 80)