93 lines
2.8 KiB
Python
93 lines
2.8 KiB
Python
|
|
"""
|
||
|
|
Search for CMA code position on the page
|
||
|
|
"""
|
||
|
|
import os
import re

import cv2
import fitz
import numpy as np
from paddleocr import PaddleOCR
|
||
|
|
|
||
|
|
# NOTE(review): this flag is set after `paddleocr` has already been imported
# at the top of the file; if the library reads it at import time, setting it
# here has no effect -- confirm and move it above the import if needed.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"

# Codes this diagnostic looks for in the OCR output.
CMA_CODE = "210020349096"
REPORT_NUMBER = "440023010130"

# Resolution used to rasterise the page; PDF user space is 72 units per inch.
RENDER_DPI = 300

# Pixel position of the logo center, hard-coded from an earlier measurement
# (the percentages in the printed message come from that same measurement).
LOGO_CENTER_X = 1427
LOGO_CENTER_Y = 885

# Runs of 11 or 12 consecutive digits are treated as candidate codes.
CODE_PATTERN = re.compile(r'\d{11,12}')


def find_code_candidates(text):
    """Return all 11-12 digit runs found in ``text`` after removing spaces."""
    return CODE_PATTERN.findall(text.replace(" ", ""))


def box_center(box):
    """Return the integer ``(x, y)`` center of one OCR box.

    Two layouts are accepted, because PaddleOCR results expose both:

    * a flat rectangle ``[x_min, y_min, x_max, y_max]`` (the documented
      layout of ``rec_boxes`` in PaddleOCR 3.x), and
    * a polygon given as a sequence of ``(x, y)`` points.

    Raises:
        ValueError: if ``box`` is empty or has an odd number of coordinates.
    """
    # Truncate to integers first (as ``int()`` would) and widen so that
    # summing small integer dtypes cannot overflow.
    coords = np.asarray(box).astype(np.int64).reshape(-1)
    if coords.size == 0 or coords.size % 2:
        raise ValueError(f"Cannot interpret OCR box: {box!r}")
    # A flat rectangle becomes its two opposite corners, whose mean is the
    # center; a polygon stays a list of its vertices.
    points = coords.reshape(-1, 2)
    count = len(points)
    return int(points[:, 0].sum()) // count, int(points[:, 1].sum()) // count


def render_first_page(path, dpi=RENDER_DPI):
    """Rasterise page 0 of the PDF at ``path`` and read its text layer.

    Returns:
        ``(page_img, text)`` where ``page_img`` is the image decoded by
        OpenCV and ``text`` is the result of ``page.get_text()``, or
        ``None`` if text extraction failed.

    Raises:
        RuntimeError: if the rendered PNG cannot be decoded by OpenCV.
    """
    zoom = dpi / 72
    doc = fitz.open(path)
    try:
        page = doc[0]
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        png_bytes = np.frombuffer(pix.tobytes("png"), dtype=np.uint8)
        page_img = cv2.imdecode(png_bytes, cv2.IMREAD_COLOR)

        # Text extraction is best-effort: report the failure instead of
        # hiding it, but keep going so the OCR pass can still run.
        try:
            text = page.get_text()
        except Exception as exc:
            print(f"\nCould not extract PDF text: {exc}")
            text = None
    finally:
        # Always release the document, even if rendering raised.
        doc.close()

    if page_img is None:
        raise RuntimeError(f"Could not decode rendered page of {path}")
    return page_img, text


def report_ocr_matches(texts, boxes, scores, img_shape):
    """Print every OCR line that contains an 11-12 digit number.

    Args:
        texts: recognised strings, one per OCR line.
        boxes: one box per line, in any layout ``box_center`` accepts.
        scores: recognition confidence per line.
        img_shape: shape of the page image; only ``(height, width)`` is used.

    Returns:
        ``True`` if ``CMA_CODE`` was among the numbers found, else ``False``.
    """
    height, width = img_shape[:2]
    found = False

    for i, (text, box, score) in enumerate(zip(texts, boxes, scores)):
        numbers = find_code_candidates(text)
        if not numbers:
            continue

        x_center, y_center = box_center(box)
        rel_x = x_center / width * 100
        rel_y = y_center / height * 100

        print(f"\nLine {i}: '{text}'")
        print(f" Numbers: {numbers}")
        print(f" Position: ({x_center}, {y_center}) -> ({rel_x:.1f}%, {rel_y:.1f}%)")
        print(f" Score: {score:.2f}")

        if CMA_CODE in numbers:
            print(" ^ THIS IS THE CORRECT CMA CODE!")
            found = True

            # Report where the code sits relative to the logo.
            print(f"\n Logo center was at: ({LOGO_CENTER_X}, {LOGO_CENTER_Y}) -> (57.5%, 25.2%)")
            print(f" CMA code is at: ({x_center}, {y_center}) -> ({rel_x:.1f}%, {rel_y:.1f}%)")
            # ``:+d`` prints an explicit sign, so negative offsets no longer
            # come out as "X+-12".
            print(
                f" Difference: X{x_center - LOGO_CENTER_X:+d}, "
                f"Y{y_center - LOGO_CENTER_Y:+d}"
            )

        if REPORT_NUMBER in numbers:
            print(f" ^ This is {REPORT_NUMBER} (report number)")

    return found


def main():
    """Render the PDF's first page, OCR it, and report where the CMA code is."""
    print("=" * 80)
    print(f"SEARCHING FOR CMA CODE {CMA_CODE}")
    print("=" * 80)

    page_img, text = render_first_page(pdf_path)
    has_cma_in_text = text is not None and CMA_CODE in text

    print(f"\nPage size: {page_img.shape}")
    print(f"\nPDF text contains '{CMA_CODE}': {has_cma_in_text}")

    # Try to find the CMA code with full-page OCR.
    print("\nRunning full-page OCR...")
    ocr = PaddleOCR(lang='ch')
    ocr_result = ocr.predict(page_img)

    # Initialised up front so the "not found" warning below also covers the
    # case where OCR returned nothing at all.
    found = False
    if ocr_result:
        res = ocr_result[0]
        texts = res.get('rec_texts', [])
        boxes = res.get('rec_boxes', [])
        scores = res.get('rec_scores', [])

        print(f"\nOCR found {len(texts)} text lines")
        found = report_ocr_matches(texts, boxes, scores, page_img.shape)

    if not found:
        print(f"\n⚠️ WARNING: CMA code {CMA_CODE} NOT FOUND in OCR results!")
        print(" This means either:")
        print(" 1. The CMA code is in an image that OCR can't read")
        print(" 2. The CMA code is handwritten")
        print(" 3. The PDF doesn't contain this CMA code")

    print("\n" + "=" * 80)


if __name__ == "__main__":
    main()
|