report-detect/archive/temp_scripts/test_single_pdf.py

56 lines
1.5 KiB
Python

"""
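Quick test to verify the new fallback mechanism works.

Renders the first page of one previously failing PDF with a 300/72 zoom
matrix and runs the CMA template-matching extraction on it, printing the
outcome.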
"""
import sys
import os
import fitz
import numpy as np
import cv2
from pathlib import Path
# Must be set before paddleocr is imported further down in this script.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

# Force reimport to get latest changes: drop any cached copies of the project
# modules so the imports that follow load them fresh. pop() with a default is
# a no-op when the module was never imported.
for _cached_name in ("test_accuracy_batch_full", "cma_extraction_template_primary"):
    sys.modules.pop(_cached_name, None)
from test_accuracy_batch_full import process_cma_template_extraction, extract_pdf_page
from paddleocr import PaddleOCR
# Target document: one of the PDFs that was failing extraction.
pdf_name = "财政部关于请协助提供相关材料的函_pages4-9.pdf"
pdf_path = Path("src/test/resources/data/pdfs").joinpath(pdf_name)

# Banner so the run is easy to spot in console output.
print("Testing: {}".format(pdf_name))
print(80 * "=")
# Extract page: rasterise the first page and decode it into an OpenCV image.
# The context manager guarantees the document is closed even if rendering
# raises part-way through (the original only closed it on the success path).
with fitz.open(str(pdf_path)) as doc:
    page = doc[0]
    # Zoom by 300/72 on both axes (presumably 72 DPI base -> ~300 DPI output).
    mat = fitz.Matrix(300 / 72, 300 / 72)
    pix = page.get_pixmap(matrix=mat)
    # Copy the pixels out as PNG bytes while the document is still open.
    img_data = pix.tobytes("png")

img_array = np.frombuffer(img_data, dtype=np.uint8)
page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
# cv2.imdecode signals failure by returning None instead of raising; fail
# here with a clear message rather than an AttributeError on .shape below.
if page_img is None:
    raise RuntimeError(f"Could not decode rendered page image for {pdf_path}")
print(f"Image size: {page_img.shape}")
# Build the OCR engine (configured with lang='ch').
print("\nInitializing PaddleOCR...")
ocr = PaddleOCR(lang="ch")

# Hand the rendered page and the OCR engine to the template-matching extractor.
print("\nRunning template matching extraction...")
result = process_cma_template_extraction(
    page_img,
    ocr,
    output_dir="test_output",
)
# Print the extraction outcome framed by separator rules.
print("\n{}".format("=" * 80))
print("RESULT")
print(80 * "=")
# 'success' is read with a strict lookup; the other fields fall back to defaults.
print("Success: {}".format(result['success']))
print("CMA Code: {}".format(result.get('code', 'N/A')))
print("Confidence: {:.2f}".format(result.get('confidence', 0)))
print(80 * "=")