report-detect/archive/temp_scripts/test_template_matching_unit.py

"""
Unit tests for CMA template matching improvements.

This module validates incremental improvements to the template matching algorithm
against known failure cases.
"""
import unittest
import cv2
import numpy as np
import logging
from pathlib import Path

# Configure logging
# Root logger is set to INFO with a compact "LEVEL: message" format; the
# module-level logger below is what the helpers and tests write to.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# Constants
# All paths are relative, so they resolve against the current working
# directory at run time.
# Template image that is searched for on each rendered PDF page.
CMA_LOGO_PATH = Path("template/CMA_Logo.png")
# Directory containing the sample PDFs named in the dicts below.
PDF_DIR = Path("src/test/resources/data/pdfs")
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm whether it is still needed.
RESULTS_FILE = Path("src/test/resources/data/results.json")

# Test cases with expected CMA codes
# Maps PDF file name -> expected CMA code. These are exercised by
# test_specific_failures as the known low-confidence cases. The codes are
# only logged in the summaries; no test compares them against an extracted
# value.
TEST_CASES = {
    "WTS2025-21283.pdf": "220020349627",
    "YDQ23_001838.pdf": "210020349096",
    "YDQ23_001850.pdf": "210020349096",
    "YDQ25_001875.pdf": "240020349096",
    "YDQ25_002294.pdf": "240020349096",
}

# Success cases (should match with high confidence)
# Same shape as TEST_CASES (PDF file name -> expected CMA code); exercised by
# test_success_cases.
SUCCESS_CASES = {
    "1.pdf": "181122170342",
    "YDQ25_001845.pdf": "240020349096",
}


def imread_unicode(path, flags=cv2.IMREAD_COLOR):
    """Load an image without handing the file path to OpenCV.

    The file is read into a byte buffer with NumPy and decoded in memory, so
    paths containing non-ASCII characters can be used.

    Args:
        path: Location of the image file; converted with ``str()``.
        flags: Decode flags passed straight to ``cv2.imdecode``.

    Returns:
        Whatever ``cv2.imdecode`` produces for the file's bytes, or ``None``
        if reading or decoding raised (the error is logged).
    """
    try:
        raw_bytes = np.fromfile(str(path), dtype=np.uint8)
        decoded = cv2.imdecode(raw_bytes, flags)
    except Exception as e:
        logger.error(f"Failed to read image {path}: {e}")
        return None
    else:
        return decoded


def extract_pdf_page(pdf_path, page_num=0, dpi=300):
    """Render one page of a PDF and return it as an OpenCV image.

    The page is rasterised by PyMuPDF, serialised to PNG bytes and decoded
    with ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    Args:
        pdf_path: Path of the PDF file; converted with ``str()``.
        page_num: Zero-based index of the page to render.
        dpi: Rendering resolution. PDF user space is 72 units per inch, so
            the page is scaled by ``dpi / 72`` on both axes. Defaults to 300,
            the value that was previously hard-coded.

    Returns:
        The decoded page image, or ``None`` when ``page_num`` is past the
        last page or when opening/rendering raised (the error is logged).
    """
    import fitz  # PyMuPDF

    doc = None
    try:
        doc = fitz.open(str(pdf_path))
        if page_num >= doc.page_count:
            return None
        page = doc[page_num]

        # Scale from 72 units/inch to the requested resolution.
        zoom = dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_bytes = pix.tobytes("png")
        img_array = np.frombuffer(png_bytes, dtype=np.uint8)
        return cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    except Exception as e:
        logger.error(f"Failed to extract page from {pdf_path}: {e}")
        return None
    finally:
        # Close the document on every path, including rendering failures,
        # so the file handle is never leaked.
        if doc is not None:
            doc.close()


# Attribute names of OpenCV's template-matching modes, used to label results
# with the mode that was actually applied.
_TM_METHOD_NAMES = (
    'TM_SQDIFF',
    'TM_SQDIFF_NORMED',
    'TM_CCORR',
    'TM_CCORR_NORMED',
    'TM_CCOEFF',
    'TM_CCOEFF_NORMED',
)


def _tm_method_name(method):
    """Return the ``cv2.TM_*`` attribute name for *method*, else ``str(method)``."""
    for name in _TM_METHOD_NAMES:
        if getattr(cv2, name, None) == method:
            return name
    return str(method)


def _to_gray(img):
    """Return *img* as single-channel: 2-D input is passed through unchanged."""
    if len(img.shape) != 3:
        return img
    if img.shape[2] == 4:
        # Images carrying an alpha channel need the BGRA conversion code.
        return cv2.cvtColor(img, cv2.COLOR_BGRA2GRAY)
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)


def _match_template(page_img, template, method):
    """Run ``cv2.matchTemplate`` on grayscale inputs and describe the peak."""
    page_gray = _to_gray(page_img)
    template_gray = _to_gray(template)

    tpl_h, tpl_w = template_gray.shape[:2]
    page_h, page_w = page_gray.shape[:2]
    if tpl_h > page_h or tpl_w > page_w:
        # matchTemplate cannot slide a template that is larger than the
        # image; report "no result" instead of letting OpenCV raise.
        logger.warning(
            f"Template ({tpl_w}x{tpl_h}) is larger than page ({page_w}x{page_h})"
        )
        return None

    result = cv2.matchTemplate(page_gray, template_gray, method=method)
    if result is None:
        return None

    _, max_val, _, max_loc = cv2.minMaxLoc(result)
    # max_loc is the top-left corner of the best window; shift by half the
    # template size to get its centre.
    match_center = (max_loc[0] + tpl_w // 2, max_loc[1] + tpl_h // 2)

    return {
        'max_val': float(max_val),
        'match_center': match_center,
        'match_loc': max_loc,
        'method': _tm_method_name(method),
    }


def match_template_old(page_img, template, method=cv2.TM_CCOEFF_NORMED):
    """Original matching method: TM_CCOEFF_NORMED.

    Args:
        page_img: Page image; 3-D arrays are converted to grayscale, 2-D
            arrays are used as they are.
        template: Template image, handled the same way as ``page_img``.
        method: ``cv2.TM_*`` comparison mode. The maximum of the response
            map is always reported, so the TM_SQDIFF* modes (where lower is
            better) are not meaningful here.

    Returns:
        ``None`` if the template is larger than the page or OpenCV returns
        no result; otherwise a dict with keys ``'max_val'`` (float peak
        response), ``'match_center'`` and ``'match_loc'`` ((x, y) centre and
        top-left corner of the best window) and ``'method'`` (name of the
        mode that was used).
    """
    return _match_template(page_img, template, method)


def match_template_new(page_img, template, method=cv2.TM_CCORR_NORMED):
    """Improved matching method: TM_CCORR_NORMED.

    Args:
        page_img: Page image; 3-D arrays are converted to grayscale, 2-D
            arrays are used as they are.
        template: Template image, handled the same way as ``page_img``.
        method: ``cv2.TM_*`` comparison mode. The maximum of the response
            map is always reported, so the TM_SQDIFF* modes (where lower is
            better) are not meaningful here.

    Returns:
        ``None`` if the template is larger than the page or OpenCV returns
        no result; otherwise a dict with keys ``'max_val'`` (float peak
        response), ``'match_center'`` and ``'match_loc'`` ((x, y) centre and
        top-left corner of the best window) and ``'method'`` (name of the
        mode that was used).
    """
    return _match_template(page_img, template, method)


class TestTemplateMatching(unittest.TestCase):
    """Test cases for template matching improvements.

    Each test renders the first page of the sample PDFs under ``PDF_DIR`` and
    compares ``match_template_old`` with ``match_template_new`` against the
    template loaded once in ``setUpClass``.
    """

    @classmethod
    def setUpClass(cls):
        """Load the template once for all tests; skip the class if it is missing."""
        cls.template = imread_unicode(CMA_LOGO_PATH, cv2.IMREAD_COLOR)
        if cls.template is None:
            raise unittest.SkipTest(f"Could not load template from {CMA_LOGO_PATH}")
        logger.info(f"Loaded template: {cls.template.shape}")

    @staticmethod
    def _log_summary(title, results, show_improvement=False):
        """Log a per-PDF recap of the collected confidences.

        Args:
            title: Heading printed between the two separator lines.
            results: Mapping of PDF name to a dict with ``'expected_cma'``,
                ``'old_confidence'`` and ``'new_confidence'``.
            show_improvement: When true, also log new minus old confidence.
        """
        logger.info("\n" + "=" * 60)
        logger.info(title)
        logger.info("=" * 60)
        for pdf_name, data in results.items():
            logger.info(f"{pdf_name}:")
            logger.info(f"  Expected CMA: {data['expected_cma']}")
            logger.info(f"  Old: {data['old_confidence']:.3f}")
            logger.info(f"  New: {data['new_confidence']:.3f}")
            if show_improvement:
                logger.info(f"  Improvement: {data['new_confidence'] - data['old_confidence']:+.3f}")

    def test_specific_failures(self):
        """Test known failure cases (confidence 0.32-0.39)."""
        results = {}

        for pdf_name, expected_cma in TEST_CASES.items():
            pdf_path = PDF_DIR / pdf_name

            with self.subTest(pdf=pdf_name):
                # Skipping inside the subTest marks only this PDF as skipped;
                # the remaining cases and the summary still run.
                if not pdf_path.exists():
                    self.skipTest(f"PDF not found: {pdf_path}")

                img = extract_pdf_page(pdf_path)
                self.assertIsNotNone(img, f"Failed to extract page from {pdf_name}")

                # Test old method
                result_old = match_template_old(img, self.template)
                self.assertIsNotNone(result_old, f"Old method returned None for {pdf_name}")

                # Test new method
                result_new = match_template_new(img, self.template)
                self.assertIsNotNone(result_new, f"New method returned None for {pdf_name}")

                # Log results
                logger.info(f"{pdf_name}:")
                logger.info(f"  Old ({result_old['method']}): {result_old['max_val']:.3f}")
                logger.info(f"  New ({result_new['method']}): {result_new['max_val']:.3f}")

                # Store results
                results[pdf_name] = {
                    'expected_cma': expected_cma,
                    'old_confidence': result_old['max_val'],
                    'new_confidence': result_new['max_val'],
                }

                # Verify new method doesn't decrease confidence significantly
                # Allow small decrease (0.02) but overall should improve
                self.assertGreaterEqual(
                    result_new['max_val'],
                    result_old['max_val'] - 0.02,
                    f"{pdf_name}: New method should not significantly decrease confidence"
                )

        # Print summary
        self._log_summary("FAILURE CASES SUMMARY", results, show_improvement=True)

    def test_success_cases(self):
        """Test known success cases (both methods must exceed 0.30 confidence)."""
        results = {}

        for pdf_name, expected_cma in SUCCESS_CASES.items():
            pdf_path = PDF_DIR / pdf_name

            with self.subTest(pdf=pdf_name):
                # Skip only this PDF when it is missing, not the whole test.
                if not pdf_path.exists():
                    self.skipTest(f"PDF not found: {pdf_path}")

                img = extract_pdf_page(pdf_path)
                self.assertIsNotNone(img, f"Failed to extract page from {pdf_name}")

                # Test both methods
                result_old = match_template_old(img, self.template)
                result_new = match_template_new(img, self.template)

                self.assertIsNotNone(result_old)
                self.assertIsNotNone(result_new)

                # Log results
                logger.info(f"{pdf_name}:")
                logger.info(f"  Old: {result_old['max_val']:.3f}")
                logger.info(f"  New: {result_new['max_val']:.3f}")

                results[pdf_name] = {
                    'expected_cma': expected_cma,
                    'old_confidence': result_old['max_val'],
                    'new_confidence': result_new['max_val'],
                }

                # Both methods should find the template with high confidence
                self.assertGreater(
                    result_old['max_val'],
                    0.30,
                    f"{pdf_name}: Old method should find template with confidence > 0.30"
                )
                self.assertGreater(
                    result_new['max_val'],
                    0.30,
                    f"{pdf_name}: New method should find template with confidence > 0.30"
                )

        # Print summary
        self._log_summary("SUCCESS CASES SUMMARY", results)

    def test_threshold_comparison(self):
        """Log how many PDFs the new method detects at several thresholds.

        This test is exploratory: it makes no assertions and is skipped when
        no sample PDF could be rendered.
        """
        # Test various thresholds
        thresholds = [0.25, 0.30, 0.35, 0.40]

        # Rendering and matching do not depend on the threshold, so do them
        # once per PDF. None marks a page for which matching gave no result;
        # it still counts towards the total but is never "detected".
        confidences = []
        for pdf_name in [*TEST_CASES, *SUCCESS_CASES]:
            pdf_path = PDF_DIR / pdf_name
            if not pdf_path.exists():
                continue

            img = extract_pdf_page(pdf_path)
            if img is None:
                continue

            result_new = match_template_new(img, self.template)
            confidences.append(result_new['max_val'] if result_new else None)

        total = len(confidences)
        if total == 0:
            # Without this guard the percentage below divides by zero.
            self.skipTest(f"No PDFs available under {PDF_DIR}")

        for threshold in thresholds:
            detected = sum(
                1 for conf in confidences
                if conf is not None and conf >= threshold
            )
            logger.info(f"Threshold {threshold:.2f}: {detected}/{total} detected ({detected/total*100:.1f}%)")


if __name__ == '__main__':
    # Run tests with verbose output
    # Only executes when this file is run as a script, not when it is
    # imported; unittest.main discovers the TestCase classes in this module.
    unittest.main(verbosity=2)
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`"""`
			`Unit tests for CMA template matching improvements.`

			`This module validates incremental improvements to the template matching algorithm`
			`against known failure cases.`
			`"""`
			`import unittest`
			`import cv2`
			`import numpy as np`
			`import logging`
			`from pathlib import Path`

			`# Configure logging`
			`logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')`
			`logger = logging.getLogger(__name__)`

			`# Constants`
			`CMA_LOGO_PATH = Path("template/CMA_Logo.png")`
			`PDF_DIR = Path("src/test/resources/data/pdfs")`
			`RESULTS_FILE = Path("src/test/resources/data/results.json")`

			`# Test cases with expected CMA codes`
			`TEST_CASES = {`
			`"WTS2025-21283.pdf": "220020349627",`
			`"YDQ23_001838.pdf": "210020349096",`
			`"YDQ23_001850.pdf": "210020349096",`
			`"YDQ25_001875.pdf": "240020349096",`
			`"YDQ25_002294.pdf": "240020349096",`
			`}`

			`# Success cases (should match with high confidence)`
			`SUCCESS_CASES = {`
			`"1.pdf": "181122170342",`
			`"YDQ25_001845.pdf": "240020349096",`
			`}`


			`def imread_unicode(path, flags=cv2.IMREAD_COLOR):`
			`"""cv2.imread replacement that supports paths with non-ASCII characters."""`
			`try:`
			`data = np.fromfile(str(path), dtype=np.uint8)`
			`img = cv2.imdecode(data, flags)`
			`return img`
			`except Exception as e:`
			`logger.error(f"Failed to read image {path}: {e}")`
			`return None`


			`def extract_pdf_page(pdf_path, page_num=0):`
			`"""Extract a page from PDF as image."""`
			`import fitz`
			`try:`
			`doc = fitz.open(str(pdf_path))`
			`if page_num >= doc.page_count:`
			`doc.close()`
			`return None`
			`page = doc[page_num]`

			`# Render at 300 DPI for better quality`
			`mat = fitz.Matrix(300 / 72, 300 / 72)`
			`pix = page.get_pixmap(matrix=mat)`
			`img_data = pix.tobytes("png")`
			`img_array = np.frombuffer(img_data, dtype=np.uint8)`
			`img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)`

			`doc.close()`
			`return img`
			`except Exception as e:`
			`logger.error(f"Failed to extract page from {pdf_path}: {e}")`
			`return None`


			`def match_template_old(page_img, template, method=cv2.TM_CCOEFF_NORMED):`
			`"""Original matching method: TM_CCOEFF_NORMED"""`
			`if len(page_img.shape) == 3:`
			`page_gray = cv2.cvtColor(page_img, cv2.COLOR_BGR2GRAY)`
			`else:`
			`page_gray = page_img`

			`if len(template.shape) == 3:`
			`template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)`
			`else:`
			`template_gray = template`

			`result = cv2.matchTemplate(page_gray, template_gray, method=method)`
			`if result is None:`
			`return None`

			`_, max_val, _, max_loc = cv2.minMaxLoc(result)`
			`match_center = (`
			`max_loc[0] + template_gray.shape[1] // 2,`
			`max_loc[1] + template_gray.shape[0] // 2`
			`)`

			`return {`
			`'max_val': float(max_val),`
			`'match_center': match_center,`
			`'match_loc': max_loc,`
			`'method': 'TM_CCOEFF_NORMED'`
			`}`


			`def match_template_new(page_img, template, method=cv2.TM_CCORR_NORMED):`
			`"""Improved matching method: TM_CCORR_NORMED"""`
			`if len(page_img.shape) == 3:`
			`page_gray = cv2.cvtColor(page_img, cv2.COLOR_BGR2GRAY)`
			`else:`
			`page_gray = page_img`

			`if len(template.shape) == 3:`
			`template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)`
			`else:`
			`template_gray = template`

			`result = cv2.matchTemplate(page_gray, template_gray, method=method)`
			`if result is None:`
			`return None`

			`_, max_val, _, max_loc = cv2.minMaxLoc(result)`
			`match_center = (`
			`max_loc[0] + template_gray.shape[1] // 2,`
			`max_loc[1] + template_gray.shape[0] // 2`
			`)`

			`return {`
			`'max_val': float(max_val),`
			`'match_center': match_center,`
			`'match_loc': max_loc,`
			`'method': 'TM_CCORR_NORMED'`
			`}`


			`class TestTemplateMatching(unittest.TestCase):`
			`"""Test cases for template matching improvements."""`

			`@classmethod`
			`def setUpClass(cls):`
			`"""Load template once for all tests."""`
			`cls.template = imread_unicode(CMA_LOGO_PATH, cv2.IMREAD_COLOR)`
			`if cls.template is None:`
			`raise unittest.SkipTest(f"Could not load template from {CMA_LOGO_PATH}")`
			`logger.info(f"Loaded template: {cls.template.shape}")`

			`def test_specific_failures(self):`
			`"""Test known failure cases (confidence 0.32-0.39)."""`
			`results = {}`

			`for pdf_name, expected_cma in TEST_CASES.items():`
			`pdf_path = PDF_DIR / pdf_name`
			`if not pdf_path.exists():`
			`self.skipTest(f"PDF not found: {pdf_path}")`

			`with self.subTest(pdf=pdf_name):`
			`img = extract_pdf_page(pdf_path)`
			`self.assertIsNotNone(img, f"Failed to extract page from {pdf_name}")`

			`# Test old method`
			`result_old = match_template_old(img, self.template)`
			`self.assertIsNotNone(result_old, f"Old method returned None for {pdf_name}")`

			`# Test new method`
			`result_new = match_template_new(img, self.template)`
			`self.assertIsNotNone(result_new, f"New method returned None for {pdf_name}")`

			`# Log results`
			`logger.info(f"{pdf_name}:")`
			`logger.info(f" Old ({result_old['method']}): {result_old['max_val']:.3f}")`
			`logger.info(f" New ({result_new['method']}): {result_new['max_val']:.3f}")`

			`# Store results`
			`results[pdf_name] = {`
			`'expected_cma': expected_cma,`
			`'old_confidence': result_old['max_val'],`
			`'new_confidence': result_new['max_val'],`
			`}`

			`# Verify new method doesn't decrease confidence significantly`
			`# Allow small decrease (0.02) but overall should improve`
			`self.assertGreaterEqual(`
			`result_new['max_val'],`
			`result_old['max_val'] - 0.02,`
			`f"{pdf_name}: New method should not significantly decrease confidence"`
			`)`

			`# Print summary`
			`logger.info("\n" + "=" * 60)`
			`logger.info("FAILURE CASES SUMMARY")`
			`logger.info("=" * 60)`
			`for pdf_name, data in results.items():`
			`logger.info(f"{pdf_name}:")`
			`logger.info(f" Expected CMA: {data['expected_cma']}")`
			`logger.info(f" Old: {data['old_confidence']:.3f}")`
			`logger.info(f" New: {data['new_confidence']:.3f}")`
			`logger.info(f" Improvement: {data['new_confidence'] - data['old_confidence']:+.3f}")`

			`def test_success_cases(self):`
			`"""Test known success cases (should match with high confidence)."""`
			`results = {}`

			`for pdf_name, expected_cma in SUCCESS_CASES.items():`
			`pdf_path = PDF_DIR / pdf_name`
			`if not pdf_path.exists():`
			`self.skipTest(f"PDF not found: {pdf_path}")`

			`with self.subTest(pdf=pdf_name):`
			`img = extract_pdf_page(pdf_path)`
			`self.assertIsNotNone(img, f"Failed to extract page from {pdf_name}")`

			`# Test both methods`
			`result_old = match_template_old(img, self.template)`
			`result_new = match_template_new(img, self.template)`

			`self.assertIsNotNone(result_old)`
			`self.assertIsNotNone(result_new)`

			`# Log results`
			`logger.info(f"{pdf_name}:")`
			`logger.info(f" Old: {result_old['max_val']:.3f}")`
			`logger.info(f" New: {result_new['max_val']:.3f}")`

			`results[pdf_name] = {`
			`'expected_cma': expected_cma,`
			`'old_confidence': result_old['max_val'],`
			`'new_confidence': result_new['max_val'],`
			`}`

			`# Both methods should find the template with high confidence`
			`self.assertGreater(`
			`result_old['max_val'],`
			`0.30,`
			`f"{pdf_name}: Old method should find template with confidence > 0.30"`
			`)`
			`self.assertGreater(`
			`result_new['max_val'],`
			`0.30,`
			`f"{pdf_name}: New method should find template with confidence > 0.30"`
			`)`

			`# Print summary`
			`logger.info("\n" + "=" * 60)`
			`logger.info("SUCCESS CASES SUMMARY")`
			`logger.info("=" * 60)`
			`for pdf_name, data in results.items():`
			`logger.info(f"{pdf_name}:")`
			`logger.info(f" Expected CMA: {data['expected_cma']}")`
			`logger.info(f" Old: {data['old_confidence']:.3f}")`
			`logger.info(f" New: {data['new_confidence']:.3f}")`

			`def test_threshold_comparison(self):`
			`"""Test how changing threshold affects match detection."""`
			`# Test various thresholds`
			`thresholds = [0.25, 0.30, 0.35, 0.40]`

			`for threshold in thresholds:`
			`detected = 0`
			`total = 0`

			`for pdf_name in list(TEST_CASES.keys()) + list(SUCCESS_CASES.keys()):`
			`pdf_path = PDF_DIR / pdf_name`
			`if not pdf_path.exists():`
			`continue`

			`img = extract_pdf_page(pdf_path)`
			`if img is None:`
			`continue`

			`total += 1`
			`result_new = match_template_new(img, self.template)`

			`if result_new and result_new['max_val'] >= threshold:`
			`detected += 1`

			`logger.info(f"Threshold {threshold:.2f}: {detected}/{total} detected ({detected/total*100:.1f}%)")`


			`if __name__ == '__main__':`
			`# Run tests with verbose output`
			`unittest.main(verbosity=2)`