"""
Find all 11-12 digit numbers on the first page of a PDF using full-page OCR.
"""
import os
import re
from collections import defaultdict

import cv2
import fitz
import numpy as np
from paddleocr import PaddleOCR

# Set before any PaddleOCR model is constructed (same position as the
# original script: right after the imports).
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"

# The two values this diagnostic looks for; the labels follow the messages
# the script prints ("CMA code" and "report number").
CMA_CODE = "210020349096"
REPORT_NUMBER = "440023010130"

# Runs of 11 or 12 consecutive digits, compiled once for reuse.
DIGIT_RUN = re.compile(r'\d{11,12}')


def render_first_page(path, dpi=300):
    """Render the first page of the PDF at *path* into an OpenCV colour image.

    Args:
        path: Location of the PDF file to open.
        dpi: Render resolution; the zoom factor is ``dpi / 72``.

    Returns:
        The decoded page image as returned by ``cv2.imdecode``.

    Raises:
        RuntimeError: If the rendered PNG bytes cannot be decoded.
    """
    # The context manager closes the document even if rendering raises.
    with fitz.open(path) as doc:
        zoom = dpi / 72
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_bytes = pix.tobytes("png")

    image = cv2.imdecode(np.frombuffer(png_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        # imdecode signals failure by returning None rather than raising.
        raise RuntimeError(f"Could not decode rendered page of {path}")
    return image


def collect_numbers(texts, scores):
    """Group every 11-12 digit run found in the OCR lines by its digits.

    ASCII spaces are removed from each line before matching, so digit groups
    separated by spaces are matched as one run.

    Args:
        texts: Recognised text lines.
        scores: Confidence values, paired positionally with *texts*.

    Returns:
        A dict mapping each digit string to a list of
        ``(line_index, original_text, score)`` tuples, in encounter order.
    """
    found = defaultdict(list)
    for idx, (text, score) in enumerate(zip(texts, scores)):
        for num in DIGIT_RUN.findall(text.replace(" ", "")):
            found[num].append((idx, text, score))
    return dict(found)


def summary_lines(found):
    """Return the summary lines to print for the numbers in *found*.

    Args:
        found: Mapping (or any container) of digit strings that were detected.

    Returns:
        A list of lines: a single success line when the CMA code is present,
        a single warning when only the report number is present, otherwise an
        explanation listing possible reasons.
    """
    if CMA_CODE in found:
        return [f"✓ CMA code {CMA_CODE} FOUND in OCR results!"]
    if REPORT_NUMBER in found:
        return [f"✗ Only {REPORT_NUMBER} found (report number), NOT the CMA code!"]
    return [
        f"✗ Neither {CMA_CODE} nor {REPORT_NUMBER} found",
        " Possible reasons:",
        " 1. CMA code is in a different format",
        " 2. CMA code is in an image/font that OCR can't recognize",
        f" 3. This PDF doesn't contain {CMA_CODE}",
    ]


def main():
    """Render the page, OCR it, and report every 11-12 digit number found."""
    print("=" * 80)
    print("FINDING ALL 11-12 DIGIT NUMBERS")
    print("=" * 80)

    # Extract page
    page_img = render_first_page(pdf_path)
    print(f"\nPage size: {page_img.shape}")

    # Run OCR
    print("\nRunning full-page OCR...")
    ocr = PaddleOCR(lang='ch')
    ocr_result = ocr.predict(page_img)

    # An empty OCR result is treated as "no text lines" so the summary below
    # always runs against a defined mapping.
    # NOTE(review): assumes the first result exposes 'rec_texts' and
    # 'rec_scores' through a dict-like .get(), as the original code did.
    first = ocr_result[0] if ocr_result else {}
    texts = first.get('rec_texts', [])
    scores = first.get('rec_scores', [])
    print(f"\nOCR found {len(texts)} text lines")

    # Find all 11-12 digit numbers
    all_numbers = collect_numbers(texts, scores)
    print(f"\nFound {len(all_numbers)} unique 11-12 digit numbers:")
    for num in sorted(all_numbers):
        print(f"\n {num}:")
        for idx, text, score in all_numbers[num]:
            print(f" [{idx}] '{text}' (score: {score:.2f})")

        if num == CMA_CODE:
            print(" ^ THIS IS THE CORRECT CMA CODE! ✓")
        elif num == REPORT_NUMBER:
            print(f" ^ This is {REPORT_NUMBER} (report number)")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    for line in summary_lines(all_numbers):
        print(line)


if __name__ == "__main__":
    main()