report-detect/archive/temp_scripts/test_roi_fix.py

142 lines
4.1 KiB
Python
Raw Permalink Normal View History

chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 14:35:06 +08:00
"""
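Manual check of the fixed ROI calculation.

Renders the first page of a sample PDF, locates the CMA logo on it by
template matching, crops the new ROI derived from that match, runs OCR on
the crop and reports whether the expected CMA code was read.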
"""
import subprocess
import sys
# Clear all Python cache first
def clear_pycache(root="."):
    """Delete every ``__pycache__`` directory found under *root*.

    The walk runs in the current interpreter, so it does not depend on a
    ``python`` executable being on PATH. Removal is best-effort: a directory
    that cannot be deleted is reported and skipped instead of aborting.

    Args:
        root: Directory to search recursively.

    Returns:
        The number of ``__pycache__`` directories that were removed.
    """
    # Imported locally so the helper does not depend on import order in the
    # rest of the script.
    import os
    import shutil

    removed = 0
    for current, dirnames, _filenames in os.walk(root):
        if "__pycache__" not in dirnames:
            continue
        # Prune the entry so os.walk does not try to descend into a
        # directory that is about to be deleted.
        dirnames.remove("__pycache__")
        target = os.path.join(current, "__pycache__")
        try:
            shutil.rmtree(target)
        except OSError as exc:
            # Best-effort cleanup: report the problem and keep going.
            print(f"  Could not remove {target}: {exc}")
        else:
            removed += 1
    return removed


print("Clearing Python cache...")
clear_pycache(".")
# Now run the test with fresh Python
# The statements below are order-sensitive: the environment variable is set
# before paddleocr is imported, and the reload follows the module import.
import os
# NOTE(review): presumably this makes PaddleOCR skip its model-source
# connectivity check at start-up -- confirm against the installed version.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
import fitz  # PyMuPDF
import numpy as np
import cv2
import re
from paddleocr import PaddleOCR
# Fresh import: import the extraction module, then reload it immediately so
# its source is executed again before the helpers are bound.
# NOTE(review): in a newly started process the reload right after the first
# import looks redundant -- confirm whether it is still needed.
import importlib
import cma_extraction_template_primary
importlib.reload(cma_extraction_template_primary)
from cma_extraction_template_primary import locate_template_multi_scale, imread_unicode
# Inputs for this run: the sample report PDF and the logo template image.
pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
template_path = "template/CMA_Logo.png"

# Title framed by two 80-character rules.
rule = "=" * 80
print("\n".join((rule, "TESTING FIXED ROI CALCULATION", rule)))
# Extract page: render the first page of the PDF and decode it into an
# OpenCV colour image.
# The context manager closes the document even if rendering raises.
with fitz.open(pdf_path) as doc:
    page = doc[0]
    # 300 / 72 is the zoom factor from the PDF's 72 units-per-inch base to a
    # 300 DPI rendering.
    mat = fitz.Matrix(300 / 72, 300 / 72)
    pix = page.get_pixmap(matrix=mat)
    img_data = pix.tobytes("png")
img_array = np.frombuffer(img_data, dtype=np.uint8)
page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
if page_img is None:
    # cv2.imdecode signals a failed decode by returning None; stop here with
    # a clear message instead of failing on `.shape` below.
    print(f"ERROR: Could not decode rendered page from {pdf_path}")
    sys.exit(1)
print(f"\nPage size: {page_img.shape}")
h, w = page_img.shape[:2]
# Load template and match
template = imread_unicode(template_path, cv2.IMREAD_COLOR)
if template is None:
    # NOTE(review): assumes imread_unicode reports an unreadable file by
    # returning None, as cv2.imread does -- confirm against its definition.
    print(f"ERROR: Could not load template image: {template_path}")
    sys.exit(1)
print("\nRunning template matching...")
match_res = locate_template_multi_scale(page_img, template)
# The result is read as a dict with a 'success' flag; a failed match carries
# its explanation under 'reason'.
if not match_res.get('success'):
    print(f"ERROR: Template matching failed: {match_res.get('reason')}")
    sys.exit(1)
print(f"Match succeeded: confidence={match_res['max_val']:.3f}")
# Derive the ROI from the logo match using the NEW formula.
cx, cy = match_res['match_center']
tpl_h, tpl_w = match_res['template_h'], match_res['template_w']
print("\nCalculating ROI with NEW formula...")
print(f" Logo center: ({cx}, {cy})")
print(f" Template size: {tpl_w}x{tpl_h}")

# Horizontal span: from the logo centre up to 600 px to the right, clamped
# to the page width.
right_reach = min(600, w - cx)
roi_x1 = int(max(0, cx))
roi_x2 = int(min(w, cx + right_reach))
# Vertical span: from half a template height above the centre down to four
# template heights below it (the NEW part), clamped to the page height.
roi_y1 = int(max(0, cy - tpl_h // 2))
roi_y2 = int(min(h, cy + tpl_h * 4))

roi_w, roi_h = roi_x2 - roi_x1, roi_y2 - roi_y1
print("\nNEW ROI coordinates:")
print(f" ROI: ({roi_x1}, {roi_y1}) -> ({roi_x2}, {roi_y2})")
print(f" ROI size: {roi_w}x{roi_h}")

# Express each corner coordinate as a percentage of the page extent.
rel_x1, rel_y1, rel_x2, rel_y2 = (
    coord / extent * 100
    for coord, extent in ((roi_x1, w), (roi_y1, h), (roi_x2, w), (roi_y2, h))
)
print(f" Relative: ({rel_x1:.1f}%, {rel_y1:.1f}%) -> ({rel_x2:.1f}%, {rel_y2:.1f}%)")
# Extract ROI
roi_img = page_img[roi_y1:roi_y2, roi_x1:roi_x2]
print(f"\nActual ROI size: {roi_img.shape}")
if roi_img.size == 0:
    # An empty crop cannot be written or OCR'd; stop with a clear message
    # instead of failing inside cv2.imwrite.
    print("ERROR: ROI is empty; nothing to save or OCR")
    sys.exit(1)
# Save ROI
os.makedirs("test_debug_new", exist_ok=True)
# cv2.imwrite returns False when the image could not be written, so only
# announce the file when the write actually succeeded.
if cv2.imwrite("test_debug_new/roi_debug.png", roi_img):
    print("ROI saved to: test_debug_new/roi_debug.png")
else:
    print("WARNING: Could not write test_debug_new/roi_debug.png")
# OCR the cropped ROI and report which of the two known codes shows up.
print("\nRunning OCR on NEW ROI...")
ocr = PaddleOCR(lang='ch')
ocr_result = ocr.predict(roi_img)

report_number = "440023010130"
cma_code = "210020349096"

has_results = bool(ocr_result) and len(ocr_result) > 0
if not has_results:
    print("ERROR: OCR returned no results")
else:
    # Only the first result entry is inspected.
    first_entry = ocr_result[0]
    texts = first_entry.get('rec_texts', [])
    scores = first_entry.get('rec_scores', [])
    print(f"\nOCR found {len(texts)} text lines:")

    saw_report_number = False
    saw_cma_code = False
    for idx, (line, confidence) in enumerate(zip(texts, scores)):
        # Spaces are removed first so a code split by blanks still matches
        # as a single 11-12 digit run.
        digit_runs = re.findall(r'\d{11,12}', line.replace(" ", ""))
        # Lines with neither a digit run nor a score above 0.5 are not shown.
        if not (digit_runs or confidence > 0.5):
            continue
        print(f" [{idx}] '{line}' (score: {confidence:.2f})")
        if not digit_runs:
            continue
        print(f" Numbers: {digit_runs}")
        if report_number in digit_runs:
            print(" ^ Found 440023010130 (report number)")
            saw_report_number = True
        if cma_code in digit_runs:
            print(" ^ Found 210020349096 (CORRECT CMA CODE!)")
            saw_cma_code = True

    # The CMA code wins over the report number when both were seen.
    if saw_cma_code:
        verdict = "SUCCESS: Found correct CMA code 210020349096!"
    elif saw_report_number:
        verdict = "FAILED: Still finding 440023010130 instead of 210020349096"
    else:
        verdict = "FAILED: No CMA codes found"

    print("\n" + "=" * 80)
    print("RESULT")
    print("=" * 80)
    print(verdict)
print("=" * 80)