report-detect/archive/temp_scripts/test_template_matching_unit.py

"""
Unit tests for CMA template matching improvements.

This module validates incremental improvements to the template matching algorithm
against known failure cases.
"""
import unittest
import cv2
import numpy as np
import logging
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# Constants
CMA_LOGO_PATH = Path("template/CMA_Logo.png")
PDF_DIR = Path("src/test/resources/data/pdfs")
RESULTS_FILE = Path("src/test/resources/data/results.json")

# Test cases with expected CMA codes
TEST_CASES = {
    "WTS2025-21283.pdf": "220020349627",
    "YDQ23_001838.pdf": "210020349096",
    "YDQ23_001850.pdf": "210020349096",
    "YDQ25_001875.pdf": "240020349096",
    "YDQ25_002294.pdf": "240020349096",
}

# Success cases (should match with high confidence)
SUCCESS_CASES = {
    "1.pdf": "181122170342",
    "YDQ25_001845.pdf": "240020349096",
}


def imread_unicode(path, flags=cv2.IMREAD_COLOR):
    """cv2.imread replacement that supports paths with non-ASCII characters."""
    try:
        data = np.fromfile(str(path), dtype=np.uint8)
        img = cv2.imdecode(data, flags)
        return img
    except Exception as e:
        logger.error(f"Failed to read image {path}: {e}")
        return None


def extract_pdf_page(pdf_path, page_num=0):
    """Extract a page from PDF as image."""
    import fitz
    try:
        doc = fitz.open(str(pdf_path))
        if page_num >= doc.page_count:
            doc.close()
            return None
        page = doc[page_num]

        # Render at 300 DPI for better quality
        mat = fitz.Matrix(300 / 72, 300 / 72)
        pix = page.get_pixmap(matrix=mat)
        img_data = pix.tobytes("png")
        img_array = np.frombuffer(img_data, dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)

        doc.close()
        return img
    except Exception as e:
        logger.error(f"Failed to extract page from {pdf_path}: {e}")
        return None


def match_template_old(page_img, template, method=cv2.TM_CCOEFF_NORMED):
    """Original matching method: TM_CCOEFF_NORMED"""
    if len(page_img.shape) == 3:
        page_gray = cv2.cvtColor(page_img, cv2.COLOR_BGR2GRAY)
    else:
        page_gray = page_img

    if len(template.shape) == 3:
        template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
    else:
        template_gray = template

    result = cv2.matchTemplate(page_gray, template_gray, method=method)
    if result is None:
        return None

    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    match_center = (
        max_loc[0] + template_gray.shape[1] // 2,
        max_loc[1] + template_gray.shape[0] // 2
    )

    return {
        'max_val': float(max_val),
        'match_center': match_center,
        'match_loc': max_loc,
        'method': 'TM_CCOEFF_NORMED'
    }


def match_template_new(page_img, template, method=cv2.TM_CCORR_NORMED):
    """Improved matching method: TM_CCORR_NORMED"""
    if len(page_img.shape) == 3:
        page_gray = cv2.cvtColor(page_img, cv2.COLOR_BGR2GRAY)
    else:
        page_gray = page_img

    if len(template.shape) == 3:
        template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
    else:
        template_gray = template

    result = cv2.matchTemplate(page_gray, template_gray, method=method)
    if result is None:
        return None

    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    match_center = (
        max_loc[0] + template_gray.shape[1] // 2,
        max_loc[1] + template_gray.shape[0] // 2
    )

    return {
        'max_val': float(max_val),
        'match_center': match_center,
        'match_loc': max_loc,
        'method': 'TM_CCORR_NORMED'
    }


class TestTemplateMatching(unittest.TestCase):
    """Test cases for template matching improvements."""

    @classmethod
    def setUpClass(cls):
        """Load template once for all tests."""
        cls.template = imread_unicode(CMA_LOGO_PATH, cv2.IMREAD_COLOR)
        if cls.template is None:
            raise unittest.SkipTest(f"Could not load template from {CMA_LOGO_PATH}")
        logger.info(f"Loaded template: {cls.template.shape}")

    def test_specific_failures(self):
        """Test known failure cases (confidence 0.32-0.39)."""
        results = {}

        for pdf_name, expected_cma in TEST_CASES.items():
            pdf_path = PDF_DIR / pdf_name
            if not pdf_path.exists():
                self.skipTest(f"PDF not found: {pdf_path}")

            with self.subTest(pdf=pdf_name):
                img = extract_pdf_page(pdf_path)
                self.assertIsNotNone(img, f"Failed to extract page from {pdf_name}")

                # Test old method
                result_old = match_template_old(img, self.template)
                self.assertIsNotNone(result_old, f"Old method returned None for {pdf_name}")

                # Test new method
                result_new = match_template_new(img, self.template)
                self.assertIsNotNone(result_new, f"New method returned None for {pdf_name}")

                # Log results
                logger.info(f"{pdf_name}:")
                logger.info(f"  Old ({result_old['method']}): {result_old['max_val']:.3f}")
                logger.info(f"  New ({result_new['method']}): {result_new['max_val']:.3f}")

                # Store results
                results[pdf_name] = {
                    'expected_cma': expected_cma,
                    'old_confidence': result_old['max_val'],
                    'new_confidence': result_new['max_val'],
                }

                # Verify new method doesn't decrease confidence significantly
                # Allow small decrease (0.02) but overall should improve
                self.assertGreaterEqual(
                    result_new['max_val'],
                    result_old['max_val'] - 0.02,
                    f"{pdf_name}: New method should not significantly decrease confidence"
                )

        # Print summary
        logger.info("\n" + "=" * 60)
        logger.info("FAILURE CASES SUMMARY")
        logger.info("=" * 60)
        for pdf_name, data in results.items():
            logger.info(f"{pdf_name}:")
            logger.info(f"  Expected CMA: {data['expected_cma']}")
            logger.info(f"  Old: {data['old_confidence']:.3f}")
            logger.info(f"  New: {data['new_confidence']:.3f}")
            logger.info(f"  Improvement: {data['new_confidence'] - data['old_confidence']:+.3f}")

    def test_success_cases(self):
        """Test known success cases (should match with high confidence)."""
        results = {}

        for pdf_name, expected_cma in SUCCESS_CASES.items():
            pdf_path = PDF_DIR / pdf_name
            if not pdf_path.exists():
                self.skipTest(f"PDF not found: {pdf_path}")

            with self.subTest(pdf=pdf_name):
                img = extract_pdf_page(pdf_path)
                self.assertIsNotNone(img, f"Failed to extract page from {pdf_name}")

                # Test both methods
                result_old = match_template_old(img, self.template)
                result_new = match_template_new(img, self.template)

                self.assertIsNotNone(result_old)
                self.assertIsNotNone(result_new)

                # Log results
                logger.info(f"{pdf_name}:")
                logger.info(f"  Old: {result_old['max_val']:.3f}")
                logger.info(f"  New: {result_new['max_val']:.3f}")

                results[pdf_name] = {
                    'expected_cma': expected_cma,
                    'old_confidence': result_old['max_val'],
                    'new_confidence': result_new['max_val'],
                }

                # Both methods should find the template with high confidence
                self.assertGreater(
                    result_old['max_val'],
                    0.30,
                    f"{pdf_name}: Old method should find template with confidence > 0.30"
                )
                self.assertGreater(
                    result_new['max_val'],
                    0.30,
                    f"{pdf_name}: New method should find template with confidence > 0.30"
                )

        # Print summary
        logger.info("\n" + "=" * 60)
        logger.info("SUCCESS CASES SUMMARY")
        logger.info("=" * 60)
        for pdf_name, data in results.items():
            logger.info(f"{pdf_name}:")
            logger.info(f"  Expected CMA: {data['expected_cma']}")
            logger.info(f"  Old: {data['old_confidence']:.3f}")
            logger.info(f"  New: {data['new_confidence']:.3f}")

    def test_threshold_comparison(self):
        """Test how changing threshold affects match detection."""
        # Test various thresholds
        thresholds = [0.25, 0.30, 0.35, 0.40]

        for threshold in thresholds:
            detected = 0
            total = 0

            for pdf_name in list(TEST_CASES.keys()) + list(SUCCESS_CASES.keys()):
                pdf_path = PDF_DIR / pdf_name
                if not pdf_path.exists():
                    continue

                img = extract_pdf_page(pdf_path)
                if img is None:
                    continue

                total += 1
                result_new = match_template_new(img, self.template)

                if result_new and result_new['max_val'] >= threshold:
                    detected += 1

            logger.info(f"Threshold {threshold:.2f}: {detected}/{total} detected ({detected/total*100:.1f}%)")


if __name__ == '__main__':
    # Run tests with verbose output
    unittest.main(verbosity=2)