"""
Quick test to verify the new fallback mechanism works.
"""
|
|
import sys
|
|
import os
|
|
import fitz
|
|
import numpy as np
|
|
import cv2
|
|
from pathlib import Path
|
|
|
|
# Set before the paddleocr import below; presumably the flag is read at
# import/initialization time -- TODO confirm.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

# Force reimport to get latest changes: evict any cached copies of the project
# modules so the import statements below load them fresh.
for _stale_module in ("test_accuracy_batch_full", "cma_extraction_template_primary"):
    sys.modules.pop(_stale_module, None)

from test_accuracy_batch_full import process_cma_template_extraction, extract_pdf_page
from paddleocr import PaddleOCR
|
|
|
|
# Sample under test: one of the failing PDFs, resolved relative to the
# current working directory.
pdf_name = "财政部关于请协助提供相关材料的函_pages4-9.pdf"
pdf_path = Path("src/test/resources/data/pdfs", pdf_name)

# Announce the file being tested, followed by an 80-column separator line.
print(f"Testing: {pdf_name}", "=" * 80, sep="\n")
|
|
|
|
# Extract page: render the first page of the PDF and decode it into an
# OpenCV color image.
#
# The document is opened in a context manager so it is closed even when
# page lookup or rendering raises (the original only closed it on success).
with fitz.open(str(pdf_path)) as doc:
    page = doc[0]
    # Scale factor 300 / 72 on both axes (PDF units are 1/72 inch, so this
    # renders at 300 DPI).
    mat = fitz.Matrix(300 / 72, 300 / 72)
    pix = page.get_pixmap(matrix=mat)
    # Serialize to PNG bytes while the document is still open.
    img_data = pix.tobytes("png")

img_array = np.frombuffer(img_data, dtype=np.uint8)
page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
if page_img is None:
    # cv2.imdecode reports failure by returning None instead of raising;
    # fail here with a clear message rather than on `.shape` below.
    raise RuntimeError(f"Failed to decode rendered page image for {pdf_path}")

print(f"Image size: {page_img.shape}")
|
|
|
|
# Separator line reused for the result banner.
banner = "=" * 80

# Build the OCR engine with the 'ch' language setting.
print("\nInitializing PaddleOCR...")
ocr = PaddleOCR(lang="ch")

# Run the template matching extraction on the rendered page image.
print("\nRunning template matching extraction...")
result = process_cma_template_extraction(page_img, ocr, output_dir="test_output")

# Report the outcome. 'success' is read with plain indexing, so a missing key
# raises KeyError after the header is printed; 'code' and 'confidence' fall
# back to defaults when absent.
print(f"\n{banner}\nRESULT\n{banner}")
print(f"Success: {result['success']}")
print(f"CMA Code: {result.get('code', 'N/A')}")
print(f"Confidence: {result.get('confidence', 0):.2f}")
print(banner)
|