report-detect/archive/tools/find_numbers.py

"""
Find all 11-12 digit numbers on the page
"""
import fitz
import numpy as np
import cv2
from paddleocr import PaddleOCR
import os
import re

# NOTE(review): this variable is set after `paddleocr` has already been
# imported above; confirm the library reads it at use time rather than at
# import time, otherwise this assignment has no effect.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

# PDF inspected by this diagnostic script (path as given, resolved against the
# current working directory).
pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"

print("=" * 80)
print("FINDING ALL 11-12 DIGIT NUMBERS")
print("=" * 80)

# Extract page: rasterise the first page with a 300/72 zoom on both axes and
# serialise it to PNG bytes. The document is opened as a context manager so it
# is closed even if rendering raises.
with fitz.open(pdf_path) as doc:
    page = doc[0]
    mat = fitz.Matrix(300 / 72, 300 / 72)
    pix = page.get_pixmap(matrix=mat)
    img_data = pix.tobytes("png")

# Decode the PNG bytes into an OpenCV colour image for the OCR engine.
img_array = np.frombuffer(img_data, dtype=np.uint8)
page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)

print(f"\nPage size: {page_img.shape}")

# Run OCR over the whole rendered page.
print("\nRunning full-page OCR...")
ocr = PaddleOCR(lang='ch')
ocr_result = ocr.predict(page_img)

# The two numbers this diagnostic is looking for, named once so the per-number
# annotations and the summary below cannot drift apart.
CMA_CODE = "210020349096"
REPORT_NUMBER = "440023010130"

# Matches a run of 11 or 12 consecutive digits.
NUMBER_PATTERN = re.compile(r'\d{11,12}')

# Maps each matched digit string to the list of (line index, original text,
# score) tuples where it was seen. Initialised before the OCR check so the
# summary section still works when OCR returns nothing (previously that case
# raised NameError).
all_numbers = {}

if ocr_result and len(ocr_result) > 0:
    res = ocr_result[0]
    texts = res.get('rec_texts', [])
    scores = res.get('rec_scores', [])

    print(f"\nOCR found {len(texts)} text lines")

    # Find all 11-12 digit numbers. Spaces are stripped first so digits that
    # OCR split with spaces are still matched as one run.
    for i, (text, score) in enumerate(zip(texts, scores)):
        for num in NUMBER_PATTERN.findall(text.replace(" ", "")):
            all_numbers.setdefault(num, []).append((i, text, score))

    print(f"\nFound {len(all_numbers)} unique 11-12 digit numbers:")
    for num in sorted(all_numbers):
        print(f"\n  {num}:")
        for idx, text, score in all_numbers[num]:
            print(f"    [{idx}] '{text}' (score: {score:.2f})")

        if num == CMA_CODE:
            print("    ^ THIS IS THE CORRECT CMA CODE! ✓")
        elif num == REPORT_NUMBER:
            print(f"    ^ This is {REPORT_NUMBER} (report number)")
else:
    print("\nOCR returned no results")

print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
if CMA_CODE in all_numbers:
    print(f"✓ CMA code {CMA_CODE} FOUND in OCR results!")
elif REPORT_NUMBER in all_numbers:
    print(f"✗ Only {REPORT_NUMBER} found (report number), NOT the CMA code!")
else:
    print(f"✗ Neither {CMA_CODE} nor {REPORT_NUMBER} found")
    print("  Possible reasons:")
    print("  1. CMA code is in a different format")
    print("  2. CMA code is in an image/font that OCR can't recognize")
    print(f"  3. This PDF doesn't contain {CMA_CODE}")