""" Find all 11-12 digit numbers on the page """ import fitz import numpy as np import cv2 from paddleocr import PaddleOCR import os import re os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True" pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf" print("=" * 80) print("FINDING ALL 11-12 DIGIT NUMBERS") print("=" * 80) # Extract page doc = fitz.open(pdf_path) page = doc[0] mat = fitz.Matrix(300 / 72, 300 / 72) pix = page.get_pixmap(matrix=mat) img_data = pix.tobytes("png") img_array = np.frombuffer(img_data, dtype=np.uint8) page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) doc.close() print(f"\nPage size: {page_img.shape}") # Run OCR print("\nRunning full-page OCR...") ocr = PaddleOCR(lang='ch') ocr_result = ocr.predict(page_img) if ocr_result and len(ocr_result) > 0: res = ocr_result[0] texts = res.get('rec_texts', []) scores = res.get('rec_scores', []) print(f"\nOCR found {len(texts)} text lines") # Find all 11-12 digit numbers all_numbers = {} for i, (text, score) in enumerate(zip(texts, scores)): numbers = re.findall(r'\d{11,12}', text.replace(" ", "")) for num in numbers: if num not in all_numbers: all_numbers[num] = [] all_numbers[num].append((i, text, score)) print(f"\nFound {len(all_numbers)} unique 11-12 digit numbers:") for num in sorted(all_numbers.keys()): occurrences = all_numbers[num] print(f"\n {num}:") for idx, text, score in occurrences: print(f" [{idx}] '{text}' (score: {score:.2f})") if num == "210020349096": print(f" ^ THIS IS THE CORRECT CMA CODE! ✓") elif num == "440023010130": print(f" ^ This is 440023010130 (report number)") print("\n" + "=" * 80) print("SUMMARY") print("=" * 80) if "210020349096" in all_numbers: print("✓ CMA code 210020349096 FOUND in OCR results!") elif "440023010130" in all_numbers: print("✗ Only 440023010130 found (report number), NOT the CMA code!") else: print("✗ Neither 210020349096 nor 440023010130 found") print(" Possible reasons:") print(" 1. CMA code is in a different format") print(" 2. CMA code is in an image/font that OCR can't recognize") print(" 3. This PDF doesn't contain 210020349096")