""" Test the fixed ROI calculation """ import subprocess import sys # Clear all Python cache first print("Clearing Python cache...") subprocess.run(["python", "-c", """ import os, shutil for root, dirs, files in os.walk('.'): for d in dirs[:200]: if d == '__pycache__': try: shutil.rmtree(os.path.join(root, d)) except: pass """], capture_output=True) # Now run the test with fresh Python import os os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True" import fitz import numpy as np import cv2 import re from paddleocr import PaddleOCR # Fresh import import importlib import cma_extraction_template_primary importlib.reload(cma_extraction_template_primary) from cma_extraction_template_primary import locate_template_multi_scale, imread_unicode pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf" template_path = "template/CMA_Logo.png" print("=" * 80) print("TESTING FIXED ROI CALCULATION") print("=" * 80) # Extract page doc = fitz.open(pdf_path) page = doc[0] mat = fitz.Matrix(300 / 72, 300 / 72) pix = page.get_pixmap(matrix=mat) img_data = pix.tobytes("png") img_array = np.frombuffer(img_data, dtype=np.uint8) page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) doc.close() print(f"\nPage size: {page_img.shape}") h, w = page_img.shape[:2] # Load template and match template = imread_unicode(template_path, cv2.IMREAD_COLOR) print("\nRunning template matching...") match_res = locate_template_multi_scale(page_img, template) if not match_res.get('success'): print(f"ERROR: Template matching failed: {match_res.get('reason')}") sys.exit(1) print(f"Match succeeded: confidence={match_res['max_val']:.3f}") # Calculate ROI with NEW formula x, y = match_res['match_center'] template_h = match_res['template_h'] template_w = match_res['template_w'] print(f"\nCalculating ROI with NEW formula...") print(f" Logo center: ({x}, {y})") print(f" Template size: {template_w}x{template_h}") # NEW ROI calculation: extend down by template_h * 4 roi_x1 = int(max(0, x)) roi_y1 = int(max(0, y - template_h // 2)) roi_x2 = int(min(w, x + min(600, w - x))) roi_y2 = int(min(h, y + template_h * 4)) # NEW: extend down by 4x print(f"\nNEW ROI coordinates:") print(f" ROI: ({roi_x1}, {roi_y1}) -> ({roi_x2}, {roi_y2})") print(f" ROI size: {roi_x2 - roi_x1}x{roi_y2 - roi_y1}") rel_x1 = roi_x1 / w * 100 rel_y1 = roi_y1 / h * 100 rel_x2 = roi_x2 / w * 100 rel_y2 = roi_y2 / h * 100 print(f" Relative: ({rel_x1:.1f}%, {rel_y1:.1f}%) -> ({rel_x2:.1f}%, {rel_y2:.1f}%)") # Extract ROI roi_img = page_img[roi_y1:roi_y2, roi_x1:roi_x2] print(f"\nActual ROI size: {roi_img.shape}") # Save ROI os.makedirs("test_debug_new", exist_ok=True) cv2.imwrite("test_debug_new/roi_debug.png", roi_img) print("ROI saved to: test_debug_new/roi_debug.png") # Run OCR on ROI print("\nRunning OCR on NEW ROI...") ocr = PaddleOCR(lang='ch') ocr_result = ocr.predict(roi_img) if ocr_result and len(ocr_result) > 0: res = ocr_result[0] texts = res.get('rec_texts', []) scores = res.get('rec_scores', []) print(f"\nOCR found {len(texts)} text lines:") found_4400 = False found_2100 = False for i, (text, score) in enumerate(zip(texts, scores)): numbers = re.findall(r'\d{11,12}', text.replace(" ", "")) if numbers or score > 0.5: print(f" [{i}] '{text}' (score: {score:.2f})") if numbers: print(f" Numbers: {numbers}") if "440023010130" in numbers: print(f" ^ Found 440023010130 (report number)") found_4400 = True if "210020349096" in numbers: print(f" ^ Found 210020349096 (CORRECT CMA CODE!)") found_2100 = True print("\n" + "=" * 80) print("RESULT") print("=" * 80) if found_2100: print("SUCCESS: Found correct CMA code 210020349096!") elif found_4400: print("FAILED: Still finding 440023010130 instead of 210020349096") else: print("FAILED: No CMA codes found") else: print("ERROR: OCR returned no results") print("=" * 80)