""" Simple test script to debug CMA extraction issues. """ import os import sys import logging from pathlib import Path # Set up logging logging.basicConfig( level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) try: import fitz # PyMuPDF import cv2 import numpy as np from paddleocr import PaddleOCR # Import CMA extraction module try: from cma_extraction_final import extract_cma_code_fullpage logger.info("Using cma_extraction_final.py") except ImportError as e: logger.error(f"Cannot import cma_extraction_final.py: {e}") sys.exit(1) except ImportError as e: logger.error(f"Required dependency not found: {e}") sys.exit(1) def extract_pdf_page(pdf_path: str, page_num: int = 0): """Extract a page from PDF as image""" try: doc = fitz.open(pdf_path) page = doc.load_page(page_num) pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n) # Convert to BGR format for OpenCV if pix.n == 4: # RGBA img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR) elif pix.n == 3: # RGB img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) elif pix.n == 1: # Grayscale img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) doc.close() return img except Exception as e: logger.error(f"Failed to extract page from {pdf_path}: {e}") return None def main(): # Disable model source check for faster loading os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True" print("=" * 80) print("CMA EXTRACTION DEBUG TEST") print("=" * 80) # Initialize PaddleOCR print("\n[1/3] Initializing PaddleOCR...") logger.info("Initializing PaddleOCR...") try: ocr_engine = PaddleOCR(use_angle_cls=True, lang='ch') print("✓ PaddleOCR initialized successfully\n") except Exception as e: logger.error(f"Failed to initialize PaddleOCR: {e}") print(f"✗ Failed to initialize PaddleOCR: {e}\n") sys.exit(1) # Get PDF path pdf_dir = Path("src/test/resources/data/pdfs") if not pdf_dir.exists(): logger.error(f"PDF directory not found: {pdf_dir}") print(f"✗ PDF directory not found: {pdf_dir}\n") sys.exit(1) # Test with first PDF pdf_files = list(pdf_dir.glob("*.pdf")) if not pdf_files: logger.error("No PDF files found") print("✗ No PDF files found\n") sys.exit(1) test_pdf = pdf_files[0] print(f"[2/3] Testing with PDF: {test_pdf.name}") logger.info(f"Testing with PDF: {test_pdf}") # Extract page print(" - Extracting first page...") page_img = extract_pdf_page(str(test_pdf), page_num=0) if page_img is None: logger.error("Failed to extract page") print(" ✗ Failed to extract page\n") sys.exit(1) h, w = page_img.shape[:2] print(f" ✓ Page extracted: {w}x{h}\n") # Extract CMA print(f"[3/3] Running CMA extraction...") logger.info("Running CMA extraction...") try: cma_result = extract_cma_code_fullpage( page_img, ocr_engine, output_dir="cma_debug_output" ) print("\n" + "=" * 80) print("RESULT") print("=" * 80) print(f"Success: {cma_result['success']}") if cma_result['success']: print(f"CMA Code: {cma_result['code']}") print(f"Confidence: {cma_result['confidence']:.4f}") if cma_result.get('position'): print(f"Position: {cma_result['position']}") if cma_result.get('box'): print(f"Box: {cma_result['box']}") else: print("No CMA code found") print("=" * 80 + "\n") logger.info(f"CMA extraction completed: success={cma_result['success']}") if cma_result['success']: logger.info(f"CMA code: {cma_result['code']} (confidence: {cma_result['confidence']:.4f})") except Exception as e: logger.error(f"CMA extraction failed with exception: {e}") print(f"✗ CMA extraction failed with exception:\n") print(f" {type(e).__name__}: {e}\n") # Print full traceback import traceback traceback.print_exc() sys.exit(1) if __name__ == "__main__": main()