# report-detect/archive/temp_scripts/test_roi_fix.py

"""
Test the fixed ROI calculation
"""
import subprocess
import sys

# Clear all Python cache first
# Every __pycache__ directory under the current working directory is removed,
# presumably so the imports further down are compiled from the current sources.
# The cleanup runs in a child interpreter; sys.executable is used instead of a
# bare "python" so the child is the same interpreter (and virtualenv) that is
# running this script, even when "python" is missing from PATH or points
# somewhere else.
print("Clearing Python cache...")
subprocess.run([sys.executable, "-c", """
import os, shutil
for root, dirs, files in os.walk('.'):
    if '__pycache__' in dirs:
        # Prune first so os.walk does not try to descend into the removed directory.
        dirs.remove('__pycache__')
        try:
            shutil.rmtree(os.path.join(root, '__pycache__'))
        except OSError:
            # Best effort: a locked or already-removed cache directory is not fatal.
            pass
"""], capture_output=True)

# Now run the test with fresh Python
import os
# Set before paddleocr is imported below; presumably the library reads this
# flag at import time to skip its model-source check — TODO confirm.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

import fitz
import numpy as np
import cv2
import re
from paddleocr import PaddleOCR

# Fresh import
# importlib.reload() re-executes the module's code, so the names pulled in by
# the from-import below come from the freshly executed module.
import importlib
import cma_extraction_template_primary
importlib.reload(cma_extraction_template_primary)

from cma_extraction_template_primary import locate_template_multi_scale, imread_unicode

# Inputs: the sample report PDF and the logo image used as the match template.
pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
template_path = "template/CMA_Logo.png"

print("=" * 80)
print("TESTING FIXED ROI CALCULATION")
print("=" * 80)

# Extract page
# Render the first page at 300 DPI (PDF user space is 72 units per inch) and
# keep the PNG bytes.  The context manager closes the document even if
# rendering raises, which the explicit close() call did not guarantee.
with fitz.open(pdf_path) as doc:
    page = doc[0]
    mat = fitz.Matrix(300 / 72, 300 / 72)
    pix = page.get_pixmap(matrix=mat)
    img_data = pix.tobytes("png")

# Decode the PNG bytes into an OpenCV colour image array.
img_array = np.frombuffer(img_data, dtype=np.uint8)
page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)

# cv2.imdecode returns None (rather than raising) when the data cannot be
# decoded; fail here with a clear message instead of on `.shape` below.
if page_img is None:
    print(f"ERROR: Could not decode rendered page from: {pdf_path}")
    sys.exit(1)

print(f"\nPage size: {page_img.shape}")
h, w = page_img.shape[:2]

# Load template and match
template = imread_unicode(template_path, cv2.IMREAD_COLOR)
# Guard in case imread_unicode signals an unreadable file by returning None,
# as cv2.imread does (its implementation is not visible here).
if template is None:
    print(f"ERROR: Could not load template image: {template_path}")
    sys.exit(1)

print("\nRunning template matching...")
match_res = locate_template_multi_scale(page_img, template)

# The result is used as a dict: 'success' gates the run, 'reason' explains a
# failure, and 'max_val' is reported as the match confidence.
if not match_res.get('success'):
    print(f"ERROR: Template matching failed: {match_res.get('reason')}")
    sys.exit(1)

print(f"Match succeeded: confidence={match_res['max_val']:.3f}")

# Calculate ROI with NEW formula
x, y = match_res['match_center']
template_h = match_res['template_h']
template_w = match_res['template_w']

print("\nCalculating ROI with NEW formula...")
print(f"  Logo center: ({x}, {y})")
print(f"  Template size: {template_w}x{template_h}")

# NEW ROI calculation: extend down by template_h * 4
# The box starts at the match centre horizontally and half a template height
# above it vertically, spans at most 600 px to the right and four template
# heights below the centre, and is clamped to the page bounds (w, h).
roi_x1 = int(max(0, x))
roi_y1 = int(max(0, y - template_h // 2))
roi_x2 = int(min(w, x + 600))
roi_y2 = int(min(h, y + template_h * 4))  # NEW: extend down by 4x

print("\nNEW ROI coordinates:")
print(f"  ROI: ({roi_x1}, {roi_y1}) -> ({roi_x2}, {roi_y2})")
print(f"  ROI size: {roi_x2 - roi_x1}x{roi_y2 - roi_y1}")

# Same corners expressed as percentages of the page width and height.
rel_x1 = roi_x1 / w * 100
rel_y1 = roi_y1 / h * 100
rel_x2 = roi_x2 / w * 100
rel_y2 = roi_y2 / h * 100
print(f"  Relative: ({rel_x1:.1f}%, {rel_y1:.1f}%) -> ({rel_x2:.1f}%, {rel_y2:.1f}%)")

# Extract ROI
roi_img = page_img[roi_y1:roi_y2, roi_x1:roi_x2]
print(f"\nActual ROI size: {roi_img.shape}")

# An empty crop can be neither written nor OCR'd; stop with a clear message
# instead of failing inside OpenCV or the OCR engine.
if roi_img.size == 0:
    print("ERROR: ROI is empty; check the match position against the page size")
    sys.exit(1)

# Save ROI
os.makedirs("test_debug_new", exist_ok=True)
# cv2.imwrite reports a failed write through its boolean return value, so only
# claim success when it actually returned True.
if cv2.imwrite("test_debug_new/roi_debug.png", roi_img):
    print("ROI saved to: test_debug_new/roi_debug.png")
else:
    print("WARNING: Could not save ROI to: test_debug_new/roi_debug.png")

# Run OCR on ROI
print("\nRunning OCR on NEW ROI...")
ocr = PaddleOCR(lang='ch')
ocr_result = ocr.predict(roi_img)

# Becomes True only when the expected CMA code is recognised; drives the
# process exit status at the end of the script.
test_passed = False

if ocr_result:
    # Only the first result entry is inspected; it is read like a dict with
    # parallel 'rec_texts' / 'rec_scores' lists.
    res = ocr_result[0]
    texts = res.get('rec_texts', [])
    scores = res.get('rec_scores', [])

    print(f"\nOCR found {len(texts)} text lines:")
    found_4400 = False
    found_2100 = False
    for i, (text, score) in enumerate(zip(texts, scores)):
        # Spaces are stripped first so an 11-12 digit run that OCR split with
        # spaces is still matched as one number.
        numbers = re.findall(r'\d{11,12}', text.replace(" ", ""))
        # Show every line that contains a candidate number, plus any other
        # line recognised with a score above 0.5.
        if numbers or score > 0.5:
            print(f"  [{i}] '{text}' (score: {score:.2f})")
            if numbers:
                print(f"      Numbers: {numbers}")
                if "440023010130" in numbers:
                    print("      ^ Found 440023010130 (report number)")
                    found_4400 = True
                if "210020349096" in numbers:
                    print("      ^ Found 210020349096 (CORRECT CMA CODE!)")
                    found_2100 = True

    print("\n" + "=" * 80)
    print("RESULT")
    print("=" * 80)
    if found_2100:
        print("SUCCESS: Found correct CMA code 210020349096!")
        test_passed = True
    elif found_4400:
        print("FAILED: Still finding 440023010130 instead of 210020349096")
    else:
        print("FAILED: No CMA codes found")
else:
    print("ERROR: OCR returned no results")

print("=" * 80)

# Exit non-zero on any failure, as the template-matching failure path already
# does, so a shell or CI job running this script can detect the outcome.
if not test_passed:
    sys.exit(1)
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`"""`
			`Test the fixed ROI calculation`
			`"""`
			`import subprocess`
			`import sys`

			`# Clear all Python cache first`
			`print("Clearing Python cache...")`
			`subprocess.run(["python", "-c", """`
			`import os, shutil`
			`for root, dirs, files in os.walk('.'):`
			`for d in dirs[:200]:`
			`if d == '__pycache__':`
			`try:`
			`shutil.rmtree(os.path.join(root, d))`
			`except:`
			`pass`
			`"""], capture_output=True)`

			`# Now run the test with fresh Python`
			`import os`
			`os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"`

			`import fitz`
			`import numpy as np`
			`import cv2`
			`import re`
			`from paddleocr import PaddleOCR`

			`# Fresh import`
			`import importlib`
			`import cma_extraction_template_primary`
			`importlib.reload(cma_extraction_template_primary)`

			`from cma_extraction_template_primary import locate_template_multi_scale, imread_unicode`

			`pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"`
			`template_path = "template/CMA_Logo.png"`

			`print("=" * 80)`
			`print("TESTING FIXED ROI CALCULATION")`
			`print("=" * 80)`

			`# Extract page`
			`doc = fitz.open(pdf_path)`
			`page = doc[0]`
			`mat = fitz.Matrix(300 / 72, 300 / 72)`
			`pix = page.get_pixmap(matrix=mat)`
			`img_data = pix.tobytes("png")`
			`img_array = np.frombuffer(img_data, dtype=np.uint8)`
			`page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)`
			`doc.close()`

			`print(f"\nPage size: {page_img.shape}")`
			`h, w = page_img.shape[:2]`

			`# Load template and match`
			`template = imread_unicode(template_path, cv2.IMREAD_COLOR)`

			`print("\nRunning template matching...")`
			`match_res = locate_template_multi_scale(page_img, template)`

			`if not match_res.get('success'):`
			`print(f"ERROR: Template matching failed: {match_res.get('reason')}")`
			`sys.exit(1)`

			`print(f"Match succeeded: confidence={match_res['max_val']:.3f}")`

			`# Calculate ROI with NEW formula`
			`x, y = match_res['match_center']`
			`template_h = match_res['template_h']`
			`template_w = match_res['template_w']`

			`print(f"\nCalculating ROI with NEW formula...")`
			`print(f" Logo center: ({x}, {y})")`
			`print(f" Template size: {template_w}x{template_h}")`

			`# NEW ROI calculation: extend down by template_h * 4`
			`roi_x1 = int(max(0, x))`
			`roi_y1 = int(max(0, y - template_h // 2))`
			`roi_x2 = int(min(w, x + min(600, w - x)))`
			`roi_y2 = int(min(h, y + template_h * 4)) # NEW: extend down by 4x`

			`print(f"\nNEW ROI coordinates:")`
			`print(f" ROI: ({roi_x1}, {roi_y1}) -> ({roi_x2}, {roi_y2})")`
			`print(f" ROI size: {roi_x2 - roi_x1}x{roi_y2 - roi_y1}")`

			`rel_x1 = roi_x1 / w * 100`
			`rel_y1 = roi_y1 / h * 100`
			`rel_x2 = roi_x2 / w * 100`
			`rel_y2 = roi_y2 / h * 100`
			`print(f" Relative: ({rel_x1:.1f}%, {rel_y1:.1f}%) -> ({rel_x2:.1f}%, {rel_y2:.1f}%)")`

			`# Extract ROI`
			`roi_img = page_img[roi_y1:roi_y2, roi_x1:roi_x2]`
			`print(f"\nActual ROI size: {roi_img.shape}")`

			`# Save ROI`
			`os.makedirs("test_debug_new", exist_ok=True)`
			`cv2.imwrite("test_debug_new/roi_debug.png", roi_img)`
			`print("ROI saved to: test_debug_new/roi_debug.png")`

			`# Run OCR on ROI`
			`print("\nRunning OCR on NEW ROI...")`
			`ocr = PaddleOCR(lang='ch')`
			`ocr_result = ocr.predict(roi_img)`

			`if ocr_result and len(ocr_result) > 0:`
			`res = ocr_result[0]`
			`texts = res.get('rec_texts', [])`
			`scores = res.get('rec_scores', [])`

			`print(f"\nOCR found {len(texts)} text lines:")`
			`found_4400 = False`
			`found_2100 = False`
			`for i, (text, score) in enumerate(zip(texts, scores)):`
			`numbers = re.findall(r'\d{11,12}', text.replace(" ", ""))`
			`if numbers or score > 0.5:`
			`print(f" [{i}] '{text}' (score: {score:.2f})")`
			`if numbers:`
			`print(f" Numbers: {numbers}")`
			`if "440023010130" in numbers:`
			`print(f" ^ Found 440023010130 (report number)")`
			`found_4400 = True`
			`if "210020349096" in numbers:`
			`print(f" ^ Found 210020349096 (CORRECT CMA CODE!)")`
			`found_2100 = True`

			`print("\n" + "=" * 80)`
			`print("RESULT")`
			`print("=" * 80)`
			`if found_2100:`
			`print("SUCCESS: Found correct CMA code 210020349096!")`
			`elif found_4400:`
			`print("FAILED: Still finding 440023010130 instead of 210020349096")`
			`else:`
			`print("FAILED: No CMA codes found")`
			`else:`
			`print("ERROR: OCR returned no results")`

			`print("=" * 80)`