# Listing metadata: 142 lines, 4.1 KiB, Python
"""
Test the fixed ROI calculation.

Renders the first page of a sample PDF, locates the CMA logo on it by
template matching, crops a region of interest (ROI) positioned relative to
the logo, runs PaddleOCR on the crop and reports which of two known codes
was recognised.

Run as a script. The exit status is 0 only when the expected CMA code is
found in the OCR output; every failure path exits with status 1.
"""
import importlib
import os
import re
import shutil
import sys

PDF_PATH = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
TEMPLATE_PATH = "template/CMA_Logo.png"
DEBUG_DIR = "test_debug_new"

# Codes this test looks for in the OCR output.
REPORT_NUMBER = "440023010130"  # the value the old ROI picked up by mistake
CMA_CODE = "210020349096"       # the value the new ROI is expected to capture

# Runs of 11-12 digits; compiled once instead of on every OCR line.
CODE_PATTERN = re.compile(r'\d{11,12}')

RULE = "=" * 80


def clear_pycache(root="."):
    """Delete every ``__pycache__`` directory below *root* (best effort).

    Args:
        root: Directory to walk. A missing directory is simply a no-op.

    Returns:
        The number of ``__pycache__`` directories that were removed.
    """
    removed = 0
    for current, dirs, _files in os.walk(root):
        if "__pycache__" not in dirs:
            continue
        # Prune in place so os.walk does not try to descend into the
        # directory that is about to be deleted.
        dirs.remove("__pycache__")
        try:
            shutil.rmtree(os.path.join(current, "__pycache__"))
        except OSError:
            # Stale caches are a nuisance, not a reason to abort the test.
            continue
        removed += 1
    return removed


def compute_roi(x, y, template_h, page_w, page_h, max_width=600, down_factor=4):
    """Compute the OCR region of interest relative to the matched logo.

    The ROI starts at the logo centre horizontally and half a template
    height above it vertically. It extends at most *max_width* pixels to the
    right and ``template_h * down_factor`` pixels below the centre, clamped
    to the page bounds.

    Args:
        x, y: Logo centre in page pixel coordinates.
        template_h: Height of the matched template in pixels.
        page_w, page_h: Page image width and height in pixels.
        max_width: Maximum ROI width in pixels.
        down_factor: How many template heights to extend below the centre.

    Returns:
        ``(x1, y1, x2, y2)`` as integers, suitable for array slicing.
    """
    x1 = int(max(0, x))
    y1 = int(max(0, y - template_h // 2))
    x2 = int(min(page_w, x + max_width))
    y2 = int(min(page_h, y + template_h * down_factor))
    return x1, y1, x2, y2


def extract_codes(text):
    """Return all 11-12 digit runs in *text*, ignoring embedded spaces."""
    return CODE_PATTERN.findall(text.replace(" ", ""))


def report_ocr_lines(texts, scores):
    """Print the relevant OCR lines and detect the two known codes.

    A line is printed when it contains a candidate code or when its
    recognition score exceeds 0.5.

    Args:
        texts: Recognised text lines.
        scores: Recognition scores, parallel to *texts*.

    Returns:
        ``(found_report_number, found_cma_code)`` as booleans.
    """
    found_report = False
    found_cma = False
    for i, (text, score) in enumerate(zip(texts, scores)):
        numbers = extract_codes(text)
        if not numbers and not score > 0.5:
            continue
        print(f" [{i}] '{text}' (score: {score:.2f})")
        if not numbers:
            continue
        print(f" Numbers: {numbers}")
        if REPORT_NUMBER in numbers:
            print(f" ^ Found {REPORT_NUMBER} (report number)")
            found_report = True
        if CMA_CODE in numbers:
            print(f" ^ Found {CMA_CODE} (CORRECT CMA CODE!)")
            found_cma = True
    return found_report, found_cma


def render_first_page(pdf_path, dpi=300):
    """Render page 0 of *pdf_path* at *dpi* and return it as a BGR image.

    Returns:
        The decoded image array, or ``None`` if OpenCV could not decode the
        rendered PNG.
    """
    import cv2
    import fitz
    import numpy as np

    doc = fitz.open(pdf_path)
    try:
        zoom = dpi / 72  # scale from the 72-per-inch PDF unit to the target dpi
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_bytes = pix.tobytes("png")
    finally:
        # Close the document even when rendering raises.
        doc.close()
    return cv2.imdecode(np.frombuffer(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)


def run_ocr(roi_img):
    """Run PaddleOCR (Chinese model) on *roi_img* and return its raw result."""
    from paddleocr import PaddleOCR

    ocr = PaddleOCR(lang='ch')
    return ocr.predict(roi_img)


def main() -> int:
    """Run the ROI test end to end and return the process exit status."""
    print("Clearing Python cache...")
    clear_pycache(".")

    # Set before any PaddleOCR import, as the original script did.
    # NOTE(review): presumably read by paddleocr at import/initialisation time.
    os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

    import cv2

    # Fresh import of the module under test.
    import cma_extraction_template_primary
    importlib.reload(cma_extraction_template_primary)
    from cma_extraction_template_primary import (
        imread_unicode,
        locate_template_multi_scale,
    )

    print(RULE)
    print("TESTING FIXED ROI CALCULATION")
    print(RULE)

    # Extract page
    page_img = render_first_page(PDF_PATH)
    if page_img is None:
        print(f"ERROR: Could not decode rendered page from {PDF_PATH}")
        return 1

    print(f"\nPage size: {page_img.shape}")
    h, w = page_img.shape[:2]

    # Load template and match
    template = imread_unicode(TEMPLATE_PATH, cv2.IMREAD_COLOR)
    # NOTE(review): assumes imread_unicode returns None on failure, like
    # cv2.imread does; confirm against its implementation.
    if template is None:
        print(f"ERROR: Could not load template image: {TEMPLATE_PATH}")
        return 1

    print("\nRunning template matching...")
    match_res = locate_template_multi_scale(page_img, template)
    if not match_res.get('success'):
        print(f"ERROR: Template matching failed: {match_res.get('reason')}")
        return 1

    print(f"Match succeeded: confidence={match_res['max_val']:.3f}")

    # Calculate ROI with NEW formula
    x, y = match_res['match_center']
    template_h = match_res['template_h']
    template_w = match_res['template_w']

    print("\nCalculating ROI with NEW formula...")
    print(f" Logo center: ({x}, {y})")
    print(f" Template size: {template_w}x{template_h}")

    # NEW ROI calculation: extend down by template_h * 4
    roi_x1, roi_y1, roi_x2, roi_y2 = compute_roi(x, y, template_h, w, h)

    print("\nNEW ROI coordinates:")
    print(f" ROI: ({roi_x1}, {roi_y1}) -> ({roi_x2}, {roi_y2})")
    print(f" ROI size: {roi_x2 - roi_x1}x{roi_y2 - roi_y1}")

    rel_x1 = roi_x1 / w * 100
    rel_y1 = roi_y1 / h * 100
    rel_x2 = roi_x2 / w * 100
    rel_y2 = roi_y2 / h * 100
    print(f" Relative: ({rel_x1:.1f}%, {rel_y1:.1f}%) -> ({rel_x2:.1f}%, {rel_y2:.1f}%)")

    # Extract ROI
    roi_img = page_img[roi_y1:roi_y2, roi_x1:roi_x2]
    print(f"\nActual ROI size: {roi_img.shape}")
    if roi_img.size == 0:
        # An empty crop cannot be written or recognised; fail clearly here.
        print("ERROR: ROI is empty")
        return 1

    # Save ROI
    os.makedirs(DEBUG_DIR, exist_ok=True)
    roi_path = f"{DEBUG_DIR}/roi_debug.png"
    if cv2.imwrite(roi_path, roi_img):
        print(f"ROI saved to: {roi_path}")
    else:
        print(f"WARNING: could not write {roi_path}")

    # Run OCR on ROI
    print("\nRunning OCR on NEW ROI...")
    ocr_result = run_ocr(roi_img)

    if not ocr_result:
        print("ERROR: OCR returned no results")
        print(RULE)
        return 1

    res = ocr_result[0]
    texts = res.get('rec_texts', [])
    scores = res.get('rec_scores', [])

    print(f"\nOCR found {len(texts)} text lines:")
    found_report, found_cma = report_ocr_lines(texts, scores)

    print("\n" + RULE)
    print("RESULT")
    print(RULE)
    if found_cma:
        print(f"SUCCESS: Found correct CMA code {CMA_CODE}!")
    elif found_report:
        print(f"FAILED: Still finding {REPORT_NUMBER} instead of {CMA_CODE}")
    else:
        print("FAILED: No CMA codes found")

    print(RULE)
    return 0 if found_cma else 1


if __name__ == "__main__":
    sys.exit(main())