# report-detect/archive/temp_scripts/test_cma_simple.py

"""
Simple test script to debug CMA extraction issues.
"""
import os
import sys
import logging
from pathlib import Path

# Root logging at DEBUG verbosity with timestamped, level-tagged records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)

# Third-party stack: abort with exit status 1 if any package is missing.
try:
    import fitz  # PyMuPDF
    import cv2
    import numpy as np
    from paddleocr import PaddleOCR
except ImportError as missing_dep:
    logger.error(f"Required dependency not found: {missing_dep}")
    sys.exit(1)

# CMA extraction module: reported with its own message so a missing
# extraction module is distinguishable from a missing third-party package.
try:
    from cma_extraction_final import extract_cma_code_fullpage
except ImportError as import_err:
    logger.error(f"Cannot import cma_extraction_final.py: {import_err}")
    sys.exit(1)
else:
    logger.info("Using cma_extraction_final.py")


def extract_pdf_page(pdf_path: str, page_num: int = 0) -> "np.ndarray | None":
    """Render one page of a PDF to an image array usable with OpenCV.

    The page is rasterised with a 2x scaling matrix on both axes. The raw
    pixel buffer is viewed as a ``uint8`` array of shape
    ``(height, width, channels)`` and converted to BGR channel order when it
    has 1, 3 or 4 channels; any other channel count is returned unconverted.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Index of the page to load (``0`` is the first page).

    Returns:
        The rendered page as a ``numpy`` array, or ``None`` if opening,
        rendering or converting the page failed (the error is logged).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

            # Convert to BGR format for OpenCV
            if pix.n == 4:  # RGBA
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            elif pix.n == 3:  # RGB
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            elif pix.n == 1:  # Grayscale
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        finally:
            # Always release the document, including when page loading or
            # rendering raises (e.g. an out-of-range page index).
            doc.close()
        return img
    except Exception as e:
        logger.error(f"Failed to extract page from {pdf_path}: {e}")
        return None


def main():
    """Run a one-shot CMA extraction debug test against a sample PDF.

    Steps:
        1. Initialise PaddleOCR (angle classifier on, language ``'ch'``).
        2. Pick the alphabetically first ``*.pdf`` in
           ``src/test/resources/data/pdfs`` (resolved against the current
           working directory) and render its first page.
        3. Pass the page image to ``extract_cma_code_fullpage`` and print
           the returned ``success`` / ``code`` / ``confidence`` fields plus
           the optional ``position`` and ``box`` fields.

    Progress is printed to stdout and mirrored to the module logger. The
    process exits with status 1 if OCR initialisation fails, the PDF
    directory or PDF files are missing, the page cannot be rendered, or the
    extraction step raises. A run that completes but finds no CMA code
    returns normally.
    """
    # Disable model source check for faster loading
    # NOTE(review): paddleocr has already been imported at module level by the
    # time this runs; if the library reads this variable at import time the
    # assignment has no effect - confirm, and move it above the import if so.
    os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

    print("=" * 80)
    print("CMA EXTRACTION DEBUG TEST")
    print("=" * 80)

    # Initialize PaddleOCR
    print("\n[1/3] Initializing PaddleOCR...")
    logger.info("Initializing PaddleOCR...")
    try:
        ocr_engine = PaddleOCR(use_angle_cls=True, lang='ch')
        print("✓ PaddleOCR initialized successfully\n")
    except Exception as e:
        logger.error(f"Failed to initialize PaddleOCR: {e}")
        print(f"✗ Failed to initialize PaddleOCR: {e}\n")
        sys.exit(1)

    # Get PDF path
    pdf_dir = Path("src/test/resources/data/pdfs")
    if not pdf_dir.exists():
        logger.error(f"PDF directory not found: {pdf_dir}")
        print(f"✗ PDF directory not found: {pdf_dir}\n")
        sys.exit(1)

    # Test with first PDF. glob() yields entries in no particular order, so
    # sort to make the chosen file reproducible across runs and machines.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))
    if not pdf_files:
        logger.error("No PDF files found")
        print("✗ No PDF files found\n")
        sys.exit(1)

    test_pdf = pdf_files[0]
    print(f"[2/3] Testing with PDF: {test_pdf.name}")
    logger.info(f"Testing with PDF: {test_pdf}")

    # Extract page
    print("  - Extracting first page...")
    page_img = extract_pdf_page(str(test_pdf), page_num=0)
    if page_img is None:
        logger.error("Failed to extract page")
        print("  ✗ Failed to extract page\n")
        sys.exit(1)

    h, w = page_img.shape[:2]
    print(f"  ✓ Page extracted: {w}x{h}\n")

    # Extract CMA
    print("[3/3] Running CMA extraction...")
    logger.info("Running CMA extraction...")

    try:
        # output_dir is handed to the extractor as-is; presumably it is where
        # debug artefacts are written - verify against cma_extraction_final.
        cma_result = extract_cma_code_fullpage(
            page_img,
            ocr_engine,
            output_dir="cma_debug_output"
        )

        print("\n" + "=" * 80)
        print("RESULT")
        print("=" * 80)
        print(f"Success: {cma_result['success']}")
        if cma_result['success']:
            print(f"CMA Code: {cma_result['code']}")
            print(f"Confidence: {cma_result['confidence']:.4f}")
            # position and box are optional: only printed when present and truthy.
            if cma_result.get('position'):
                print(f"Position: {cma_result['position']}")
            if cma_result.get('box'):
                print(f"Box: {cma_result['box']}")
        else:
            print("No CMA code found")
        print("=" * 80 + "\n")

        logger.info(f"CMA extraction completed: success={cma_result['success']}")
        if cma_result['success']:
            logger.info(f"CMA code: {cma_result['code']} (confidence: {cma_result['confidence']:.4f})")

    except Exception as e:
        # Also reached when the result dict lacks an expected key or holds a
        # non-numeric confidence, since the reporting code sits inside the try.
        logger.error(f"CMA extraction failed with exception: {e}")
        print("✗ CMA extraction failed with exception:\n")
        print(f"  {type(e).__name__}: {e}\n")

        # Print full traceback
        import traceback
        traceback.print_exc()
        sys.exit(1)


# Run the debug test only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()