"""Search for the CMA code position on the first page of a PDF.

The script renders page 1 of ``pdf_path`` at 300 DPI, checks whether the
embedded PDF text layer contains the expected CMA code, then runs full-page
OCR and reports every 11-12 digit number it finds together with its pixel
position and its position relative to the page size.
"""
import os
import re

import fitz
import numpy as np
import cv2

# Set the flag before paddleocr is imported so it is already in the
# environment however early the library reads it.
# NOTE(review): presumably the flag is read at import time - confirm.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

from paddleocr import PaddleOCR  # noqa: E402  (must follow the env setup)

pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"

CMA_CODE = "210020349096"
REPORT_NUMBER = "440023010130"
LOGO_CENTER_X, LOGO_CENTER_Y = 1427, 885  # logo centre from an earlier run
RENDER_DPI = 300
# Candidate codes are runs of 11-12 digits (spaces are stripped beforehand).
DIGIT_RUN_RE = re.compile(r'\d{11,12}')


def _box_center(box):
    """Return the integer ``(x, y)`` centre of one OCR box.

    ``box`` may be a flat ``[x_min, y_min, x_max, y_max]`` rectangle or a
    sequence of ``(x, y)`` corner points; both layouts are handled so the
    script works whichever one the OCR result provides.
    """
    flat = np.asarray(box).reshape(-1)
    if flat.size == 4:
        # Convert to Python ints first so small integer dtypes cannot overflow.
        x_min, y_min, x_max, y_max = (int(v) for v in flat)
        return (x_min + x_max) // 2, (y_min + y_max) // 2
    corners = flat.reshape(-1, 2)
    xs = [int(v) for v in corners[:, 0]]
    ys = [int(v) for v in corners[:, 1]]
    return sum(xs) // len(xs), sum(ys) // len(ys)


print("=" * 80)
print(f"SEARCHING FOR CMA CODE {CMA_CODE}")
print("=" * 80)

# Extract the page image and, best effort, the embedded text layer.
with fitz.open(pdf_path) as doc:
    page = doc[0]
    scale = RENDER_DPI / 72  # PDF user space is 72 units per inch
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
    img_array = np.frombuffer(pix.tobytes("png"), dtype=np.uint8)
    page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)

    # Text extraction is only informational, so a failure must not abort
    # the OCR search below.
    try:
        has_cma_in_text = CMA_CODE in page.get_text()
    except Exception as exc:
        print(f"\nCould not read PDF text layer: {exc}")
        has_cma_in_text = False

if page_img is None:
    raise RuntimeError(f"Could not decode rendered page image from {pdf_path}")

print(f"\nPage size: {page_img.shape}")
print(f"\nPDF text contains '{CMA_CODE}': {has_cma_in_text}")

# Try to find the CMA code with full-page OCR.
print("\nRunning full-page OCR...")
ocr = PaddleOCR(lang='ch')
ocr_result = ocr.predict(page_img)

# Initialised up front so the final check works even when OCR returns nothing.
found = False

if ocr_result:
    res = ocr_result[0]
    texts = res.get('rec_texts', [])
    boxes = res.get('rec_boxes', [])
    scores = res.get('rec_scores', [])

    print(f"\nOCR found {len(texts)} text lines")

    page_h, page_w = page_img.shape[:2]

    for i, (line_text, box, score) in enumerate(zip(texts, boxes, scores)):
        # Find 11-12 digit numbers, ignoring spaces inside the OCR line.
        numbers = DIGIT_RUN_RE.findall(line_text.replace(" ", ""))
        if not numbers:
            continue

        x_center, y_center = _box_center(box)
        rel_x = x_center / page_w * 100
        rel_y = y_center / page_h * 100

        print(f"\nLine {i}: '{line_text}'")
        print(f" Numbers: {numbers}")
        print(f" Position: ({x_center}, {y_center}) -> ({rel_x:.1f}%, {rel_y:.1f}%)")
        print(f" Score: {score:.2f}")

        if CMA_CODE in numbers:
            print(" ^ THIS IS THE CORRECT CMA CODE!")
            found = True

            # Report where the code sits relative to the logo centre.
            dx = x_center - LOGO_CENTER_X
            dy = y_center - LOGO_CENTER_Y
            print(f"\n Logo center was at: ({LOGO_CENTER_X}, {LOGO_CENTER_Y}) -> (57.5%, 25.2%)")
            print(f" CMA code is at: ({x_center}, {y_center}) -> ({rel_x:.1f}%, {rel_y:.1f}%)")
            # ':+d' always prints the sign, so negative offsets read "X-12"
            # rather than "X+-12".
            print(f" Difference: X{dx:+d}, Y{dy:+d}")

        if REPORT_NUMBER in numbers:
            print(f" ^ This is {REPORT_NUMBER} (report number)")

if not found:
    print(f"\n⚠️ WARNING: CMA code {CMA_CODE} NOT FOUND in OCR results!")
    print(" This means either:")
    print(" 1. The CMA code is in an image that OCR can't read")
    print(" 2. The CMA code is handwritten")
    print(" 3. The PDF doesn't contain this CMA code")

print("\n" + "=" * 80)