# report-detect/archive/temp_scripts/test_cma_simple.py
#
# 149 lines
# 4.4 KiB
# Python

"""
Simple test script to debug CMA extraction issues.
"""
import os
import sys
import logging
from pathlib import Path
# Configure logging once, at DEBUG level, so the whole run is traceable.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)

# Third-party dependencies: abort with a clear message if any is missing.
try:
    import fitz  # PyMuPDF
    import cv2
    import numpy as np
    from paddleocr import PaddleOCR
except ImportError as err:
    logger.error(f"Required dependency not found: {err}")
    sys.exit(1)

# CMA extraction module: reported separately from the dependencies above.
try:
    from cma_extraction_final import extract_cma_code_fullpage
except ImportError as err:
    logger.error(f"Cannot import cma_extraction_final.py: {err}")
    sys.exit(1)
else:
    logger.info("Using cma_extraction_final.py")
def extract_pdf_page(pdf_path: str, page_num: int = 0):
    """Render one page of a PDF as an image array for OpenCV.

    The page is rasterised with a 2x2 scaling matrix and the raw samples are
    viewed as a ``(height, width, channels)`` uint8 array. Pixmaps with 4, 3
    or 1 channels are converted to BGR; any other channel count is returned
    as-is, without conversion.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Index of the page to render (0 is the first page).

    Returns:
        The page image as a numpy array, or ``None`` if opening, rendering
        or converting the page failed (the error is logged).
    """
    # Channel count -> OpenCV conversion code that yields BGR.
    to_bgr = {
        4: cv2.COLOR_RGBA2BGR,  # RGBA
        3: cv2.COLOR_RGB2BGR,   # RGB
        1: cv2.COLOR_GRAY2BGR,  # Grayscale
    }
    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
            conversion = to_bgr.get(pix.n)
            if conversion is not None:
                img = cv2.cvtColor(img, conversion)
        finally:
            # Always release the document, even when rendering or conversion
            # fails; previously an exception skipped the close() call.
            doc.close()
        return img
    except Exception as e:
        logger.error(f"Failed to extract page from {pdf_path}: {e}")
        return None
def main():
    """Run the CMA extraction debug test against one sample PDF.

    Steps:
      1. Initialise PaddleOCR.
      2. Pick the first PDF (in sorted name order) from
         ``src/test/resources/data/pdfs`` and render its first page.
      3. Call ``extract_cma_code_fullpage`` on the page image and print the
         result.

    Progress is printed to stdout and mirrored to the logger. Any failure
    terminates the process with exit status 1.
    """
    # Disable model source check for faster loading
    os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

    print("=" * 80)
    print("CMA EXTRACTION DEBUG TEST")
    print("=" * 80)

    # Initialize PaddleOCR
    print("\n[1/3] Initializing PaddleOCR...")
    logger.info("Initializing PaddleOCR...")
    try:
        ocr_engine = PaddleOCR(use_angle_cls=True, lang='ch')
        print("✓ PaddleOCR initialized successfully\n")
    except Exception as e:
        logger.error(f"Failed to initialize PaddleOCR: {e}")
        print(f"✗ Failed to initialize PaddleOCR: {e}\n")
        sys.exit(1)

    # Get PDF path
    pdf_dir = Path("src/test/resources/data/pdfs")
    if not pdf_dir.exists():
        logger.error(f"PDF directory not found: {pdf_dir}")
        print(f"✗ PDF directory not found: {pdf_dir}\n")
        sys.exit(1)

    # Test with first PDF. glob() yields entries in filesystem order, so sort
    # to make the chosen file reproducible across runs and machines.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))
    if not pdf_files:
        logger.error("No PDF files found")
        print("✗ No PDF files found\n")
        sys.exit(1)

    test_pdf = pdf_files[0]
    print(f"[2/3] Testing with PDF: {test_pdf.name}")
    logger.info(f"Testing with PDF: {test_pdf}")

    # Extract page
    print(" - Extracting first page...")
    page_img = extract_pdf_page(str(test_pdf), page_num=0)
    if page_img is None:
        logger.error("Failed to extract page")
        print(" ✗ Failed to extract page\n")
        sys.exit(1)

    h, w = page_img.shape[:2]
    print(f" ✓ Page extracted: {w}x{h}\n")

    # Extract CMA
    print("[3/3] Running CMA extraction...")
    logger.info("Running CMA extraction...")
    try:
        cma_result = extract_cma_code_fullpage(
            page_img,
            ocr_engine,
            output_dir="cma_debug_output"
        )

        print("\n" + "=" * 80)
        print("RESULT")
        print("=" * 80)
        print(f"Success: {cma_result['success']}")
        if cma_result['success']:
            print(f"CMA Code: {cma_result['code']}")
            print(f"Confidence: {cma_result['confidence']:.4f}")
            # Position and box are optional keys; print them only when present.
            if cma_result.get('position'):
                print(f"Position: {cma_result['position']}")
            if cma_result.get('box'):
                print(f"Box: {cma_result['box']}")
        else:
            print("No CMA code found")
        print("=" * 80 + "\n")

        logger.info(f"CMA extraction completed: success={cma_result['success']}")
        if cma_result['success']:
            logger.info(f"CMA code: {cma_result['code']} (confidence: {cma_result['confidence']:.4f})")
    except Exception as e:
        logger.error(f"CMA extraction failed with exception: {e}")
        print("✗ CMA extraction failed with exception:\n")
        print(f" {type(e).__name__}: {e}\n")
        # Print full traceback
        import traceback
        traceback.print_exc()
        sys.exit(1)
# Run the debug test only when executed as a script, not when imported.
if __name__ == "__main__":
    main()