"""Simple script to find the CMA code position in the OCR text of a PDF's first page."""

import os
import re
import sys

import cv2
import fitz
import numpy as np

# Kept ahead of the paddleocr import, as in the original ordering
# (presumably the flag is read when paddleocr is imported -- confirm).
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

from paddleocr import PaddleOCR  # noqa: E402


# Defaults reproduce the values that were hard-coded in the original script.
DEFAULT_PDF_PATH = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
DEFAULT_CMA_CODE = "210020349096"
# Resolution used to rasterise the page before OCR.
RENDER_DPI = 300
# Number of OCR lines shown before and after a hit.
CONTEXT_LINES = 2
# Digit runs listed as candidates when the expected code is not found.
DIGIT_RUN_RE = re.compile(r'\d{11,12}')


def render_first_page(pdf_path, dpi=RENDER_DPI):
    """Render page 0 of *pdf_path* and return it as an OpenCV image array.

    The page is rasterised by PyMuPDF, serialised to PNG and decoded with
    ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    Raises:
        RuntimeError: if OpenCV cannot decode the rendered page.
    """
    # The original script scaled by 300 / 72; dpi / 72 is the same zoom factor.
    zoom = dpi / 72
    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_bytes = pix.tobytes("png")

    png_buffer = np.frombuffer(png_bytes, dtype=np.uint8)
    page_img = cv2.imdecode(png_buffer, cv2.IMREAD_COLOR)
    if page_img is None:
        # Fail here with a clear message instead of an AttributeError on .shape.
        raise RuntimeError(f"Could not decode rendered page of {pdf_path}")
    return page_img


def find_code_index(texts, code):
    """Return the index of the first entry of *texts* containing *code*, or None."""
    return next((i for i, text in enumerate(texts) if code in text), None)


def nearby_lines(texts, index, radius=CONTEXT_LINES):
    """Return ``(index, text)`` pairs within *radius* of *index*, clamped to *texts*."""
    start = max(0, index - radius)
    stop = min(len(texts), index + radius + 1)
    return [(j, texts[j]) for j in range(start, stop)]


def digit_run_candidates(texts):
    """Return ``(index, text, runs)`` for each entry containing an 11-12 digit run."""
    candidates = []
    for i, text in enumerate(texts):
        runs = DIGIT_RUN_RE.findall(text)
        if runs:
            candidates.append((i, text, runs))
    return candidates


def report_code_position(texts, code):
    """Print where *code* occurs in *texts* and return True if it was found.

    On a hit, the matching line, its index and the surrounding lines are
    printed. On a miss, every line containing an 11-12 digit run is printed
    so a misread code can be spotted.
    """
    index = find_code_index(texts, code)
    if index is None:
        print("NOT FOUND in texts")
        print("All lines with 11-12 digits:")
        for i, text, runs in digit_run_candidates(texts):
            print(f" [{i}] {text}: {runs}")
        return False

    print(f"Line {index}: {texts[index]}")
    print(f"Index: {index}")
    print("Nearby lines:")
    for j, line in nearby_lines(texts, index):
        print(f" [{j}] {line}")
    return True


def main(argv=None):
    """Render the PDF's first page, OCR it and report the code position.

    Args:
        argv: optional ``[pdf_path, code]`` overrides; defaults to
            ``sys.argv[1:]``. Missing entries fall back to the values the
            original script hard-coded.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    pdf_path = args[0] if len(args) > 0 else DEFAULT_PDF_PATH
    code = args[1] if len(args) > 1 else DEFAULT_CMA_CODE

    page_img = render_first_page(pdf_path)
    h, w = page_img.shape[:2]
    print(f"Page: {w}x{h}")

    ocr = PaddleOCR(lang='ch')
    ocr_result = ocr.predict(page_img)
    if not ocr_result:
        # The original exited silently here; say so explicitly instead.
        print("OCR returned no result")
        return

    # Only the first result is inspected, since a single image was passed in.
    texts = ocr_result[0].get('rec_texts', [])
    report_code_position(texts, code)


if __name__ == "__main__":
    main()