"""Simple script to find the CMA code position.

Renders the first page of a PDF to an image, runs PaddleOCR on it and prints
the index of the recognised text line that contains the expected code,
together with a few neighbouring lines.  When the code is not found, every
line containing an 11-12 digit run is listed instead to help diagnose OCR
misreads.
"""
import os
import re

import cv2
import fitz
import numpy as np

# Kept ahead of the paddleocr import, exactly as in the original ordering.
# NOTE(review): presumably paddleocr reads this at import time -- confirm.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
from paddleocr import PaddleOCR  # noqa: E402

PDF_PATH = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
TARGET_CODE = "210020349096"
# The page is scaled by RENDER_DPI / 72 (PDF user space is 72 units per inch).
RENDER_DPI = 300
# Number of OCR lines printed before and after a hit.
CONTEXT_LINES = 2
DIGIT_RUN_PATTERN = re.compile(r'\d{11,12}')


def _render_first_page(pdf_path: str, dpi: int = RENDER_DPI) -> np.ndarray:
    """Render page 0 of *pdf_path* and return it as an OpenCV colour image.

    Raises:
        RuntimeError: if OpenCV cannot decode the rendered PNG bytes.
    """
    zoom = dpi / 72
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_bytes = pix.tobytes("png")

    page_img = cv2.imdecode(
        np.frombuffer(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR
    )
    if page_img is None:
        # imdecode signals failure with None instead of raising.
        raise RuntimeError(f"Could not decode rendered page of {pdf_path}")
    return page_img


def _report_code_position(texts, code: str = TARGET_CODE) -> None:
    """Print where *code* occurs in *texts*, or digit-run candidates if absent.

    Only the first matching line is reported.
    """
    for idx, text in enumerate(texts):
        if code not in text:
            continue
        print(f"Line {idx}: {text}")
        print(f"Index: {idx}")
        # Print nearby lines
        print("Nearby lines:")
        first = max(0, idx - CONTEXT_LINES)
        last = min(len(texts), idx + CONTEXT_LINES + 1)
        for j in range(first, last):
            print(f" [{j}] {texts[j]}")
        return

    print("NOT FOUND in texts")
    print("All lines with 11-12 digits:")
    for idx, text in enumerate(texts):
        nums = DIGIT_RUN_PATTERN.findall(text)
        if nums:
            print(f" [{idx}] {text}: {nums}")


def main() -> None:
    """Render the page, run OCR and report the position of the code."""
    page_img = _render_first_page(PDF_PATH)
    h, w = page_img.shape[:2]
    print(f"Page: {w}x{h}")

    ocr = PaddleOCR(lang='ch')
    ocr_result = ocr.predict(page_img)

    # Nothing is printed when OCR returns no result, as before.
    if ocr_result:
        texts = ocr_result[0].get('rec_texts', [])
        _report_code_position(texts)


if __name__ == "__main__":
    main()