149 lines
4.4 KiB
Python
149 lines
4.4 KiB
Python
"""
Simple test script to debug CMA extraction issues.
"""
|
|
import os
|
|
import sys
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
# Debug script: configure the root logger verbosely (DEBUG and above),
# with a timestamp and the level name in front of every message.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.DEBUG)

# Module-level logger used by everything below.
logger = logging.getLogger(__name__)
|
|
|
|
# Third-party dependencies. If any of them is missing, log which one and
# stop: nothing in this script can work without them.
try:
    import fitz  # PyMuPDF
    import cv2
    import numpy as np
    from paddleocr import PaddleOCR
except ImportError as e:
    logger.error(f"Required dependency not found: {e}")
    sys.exit(1)

# Project-local CMA extraction module. It is guarded separately so that a
# failure here is reported with its own, more specific message.
try:
    from cma_extraction_final import extract_cma_code_fullpage
except ImportError as e:
    logger.error(f"Cannot import cma_extraction_final.py: {e}")
    sys.exit(1)
else:
    logger.info("Using cma_extraction_final.py")
|
|
|
|
|
|
def extract_pdf_page(pdf_path: str, page_num: int = 0):
    """Render one page of a PDF into an image array for OpenCV.

    The page is rasterised with a 2x zoom matrix and the pixel buffer is
    wrapped in a NumPy ``uint8`` array of shape ``(height, width, channels)``.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Index of the page to load (passed to ``load_page``).

    Returns:
        The page image converted to BGR when the pixmap has 4, 3 or 1
        channels; for any other channel count the unconverted array is
        returned. ``None`` is returned if any step fails (the error is
        logged, never raised).
    """
    try:
        # The context manager guarantees the document is closed even when
        # loading or rendering the page raises; previously the handle was
        # only closed on the success path.
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

            # Convert to BGR format for OpenCV
            if pix.n == 4:  # RGBA
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            elif pix.n == 3:  # RGB
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            elif pix.n == 1:  # Grayscale
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

        return img
    except Exception as e:
        # Best-effort helper: callers check for None instead of handling
        # exceptions themselves.
        logger.error(f"Failed to extract page from {pdf_path}: {e}")
        return None
|
|
|
|
|
|
def main():
    """Run a single end-to-end CMA extraction and print the outcome.

    Steps:
      1. Initialise PaddleOCR.
      2. Pick the alphabetically first ``*.pdf`` under
         ``src/test/resources/data/pdfs`` and render its first page.
      3. Call ``extract_cma_code_fullpage`` on that image and print the
         returned result.

    Progress is written to stdout and mirrored to the logger. Every failure
    path reports the problem and terminates the process with exit status 1.
    """
    # Disable model source check for faster loading
    # NOTE(review): this is set after paddleocr has already been imported at
    # module level; confirm the library reads the variable lazily.
    os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

    print("=" * 80)
    print("CMA EXTRACTION DEBUG TEST")
    print("=" * 80)

    # Initialize PaddleOCR
    print("\n[1/3] Initializing PaddleOCR...")
    logger.info("Initializing PaddleOCR...")
    try:
        ocr_engine = PaddleOCR(use_angle_cls=True, lang='ch')
        print("✓ PaddleOCR initialized successfully\n")
    except Exception as e:
        logger.error(f"Failed to initialize PaddleOCR: {e}")
        print(f"✗ Failed to initialize PaddleOCR: {e}\n")
        sys.exit(1)

    # Get PDF path
    pdf_dir = Path("src/test/resources/data/pdfs")
    if not pdf_dir.exists():
        logger.error(f"PDF directory not found: {pdf_dir}")
        print(f"✗ PDF directory not found: {pdf_dir}\n")
        sys.exit(1)

    # Test with first PDF. glob() yields entries in no guaranteed order, so
    # sort to make "first" the same file on every run and every machine.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))
    if not pdf_files:
        logger.error("No PDF files found")
        print("✗ No PDF files found\n")
        sys.exit(1)

    test_pdf = pdf_files[0]
    print(f"[2/3] Testing with PDF: {test_pdf.name}")
    logger.info(f"Testing with PDF: {test_pdf}")

    # Extract page
    print(" - Extracting first page...")
    page_img = extract_pdf_page(str(test_pdf), page_num=0)
    if page_img is None:
        logger.error("Failed to extract page")
        print(" ✗ Failed to extract page\n")
        sys.exit(1)

    h, w = page_img.shape[:2]
    print(f" ✓ Page extracted: {w}x{h}\n")

    # Extract CMA
    print("[3/3] Running CMA extraction...")
    logger.info("Running CMA extraction...")

    try:
        cma_result = extract_cma_code_fullpage(
            page_img,
            ocr_engine,
            output_dir="cma_debug_output"
        )

        # The result is read as a mapping with 'success', 'code' and
        # 'confidence' keys, plus optional 'position' and 'box' entries.
        print("\n" + "=" * 80)
        print("RESULT")
        print("=" * 80)
        print(f"Success: {cma_result['success']}")
        if cma_result['success']:
            print(f"CMA Code: {cma_result['code']}")
            print(f"Confidence: {cma_result['confidence']:.4f}")
            if cma_result.get('position'):
                print(f"Position: {cma_result['position']}")
            if cma_result.get('box'):
                print(f"Box: {cma_result['box']}")
        else:
            print("No CMA code found")
        print("=" * 80 + "\n")

        logger.info(f"CMA extraction completed: success={cma_result['success']}")
        if cma_result['success']:
            logger.info(f"CMA code: {cma_result['code']} (confidence: {cma_result['confidence']:.4f})")

    except Exception as e:
        # Covers both failures inside the extractor and a malformed result
        # (e.g. a missing key) while printing the report above.
        logger.error(f"CMA extraction failed with exception: {e}")
        print("✗ CMA extraction failed with exception:\n")
        print(f" {type(e).__name__}: {e}\n")

        # Print full traceback
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
|
|
|
|
|
# Script entry point: run the debug test only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|