# Listing metadata (from the source viewer): 129 lines, 4.2 KiB, Python
"""
Debug CMA extraction - handle new PaddleOCR format.

Runs PaddleOCR on the top-right region of a page image and then on the
whole page, lists every 11-12 digit number found in the recognized text,
and reports whether the expected CMA code is among them.
"""
import os
import re

import numpy as np  # kept from the original script; not used directly here

# Default inputs of this debug run.
IMAGE_PATH = 'debug_images/YDQ25_002294_page1.png'
EXPECTED_CMA = '240020349096'

# A CMA candidate is a run of 11 or 12 consecutive digits.
CMA_PATTERN = re.compile(r'\d{11,12}')

# Characters removed from OCR text before searching for digit runs.
_SEPARATORS = str.maketrans('', '', ' -:')


def extract_rec_texts(result):
    """Return the recognized text strings from a PaddleOCR result.

    Three layouts are handled:

    * a dict with a ``rec_texts`` entry;
    * a non-empty list whose first element is such a dict;
    * a non-empty list whose first element is a list of lines, where each
      line has at least two items and the second is either a
      ``(text, ...)`` list/tuple or a plain value that is stringified.

    Anything else (``None``, an empty list, ``[None]``, ...) yields an
    empty list. Only the first element of a list result is inspected.
    """
    if isinstance(result, dict):
        return list(result.get('rec_texts', []))
    if not isinstance(result, list) or not result:
        return []

    first = result[0]
    if isinstance(first, dict):
        return list(first.get('rec_texts', []))
    if not isinstance(first, list):
        return []

    texts = []
    for line in first:
        if len(line) >= 2:
            rec = line[1]
            texts.append(rec[0] if isinstance(rec, (list, tuple)) else str(rec))
    return texts


def find_cma_candidates(texts):
    """Return every 11-12 digit number found in ``texts``.

    Spaces, hyphens and colons are stripped from each text before the
    search, so a code printed as ``2400-2034 9096`` is still matched.
    Each hit is a dict with the matched ``number`` and the original,
    unstripped ``text`` it came from.

    NOTE: stripping separators can merge neighbouring digit groups into
    one longer run, in which case the regex splits that run greedily from
    the left and an individual code inside it may not appear as a match.
    """
    candidates = []
    for text in texts:
        cleaned = text.translate(_SEPARATORS)
        candidates.extend(
            {'number': number, 'text': text}
            for number in CMA_PATTERN.findall(cleaned)
        )
    return candidates


def main(image_path=IMAGE_PATH, expected=EXPECTED_CMA):
    """Run the top-right and full-page OCR checks and print a report.

    Args:
        image_path: Path of the page image to analyse.
        expected: CMA code that is expected to be present on the page.

    Raises:
        FileNotFoundError: If the image cannot be read by OpenCV.
    """
    # The original script sets this before importing paddleocr, so the same
    # ordering is kept here (presumably the flag is read at import time).
    os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

    # Heavy third-party imports are deferred so that the pure helpers above
    # can be imported without OpenCV / PaddleOCR being installed.
    import cv2
    from paddleocr import PaddleOCR

    print('Initializing PaddleOCR...')
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f'Could not read image: {image_path}')
    h, w = img.shape[:2]
    print(f'Image size: {w}x{h}')

    # Top 40% of the rows, right 60% of the columns.
    top_right = img[0:int(h*0.4), int(w*0.4):w]
    print(f'Top-right area: {top_right.shape[1]}x{top_right.shape[0]}')

    print('\nRunning OCR on top-right area...')
    result = ocr.ocr(top_right)
    print(f'OCR result type: {type(result)}')
    if isinstance(result, dict):
        print('OCR returned dict format (new API)')
    elif isinstance(result, list) and result:
        print('OCR returned list format (old API)')

    # List the recognized lines for every result layout, not only for the
    # bare-dict one, so the debug output is always available.
    rec_texts = extract_rec_texts(result)
    print(f'Found {len(rec_texts)} text lines')
    for i, text in enumerate(rec_texts, 1):
        print(f'{i}. {text}')

    all_numbers = find_cma_candidates(rec_texts)
    print(f'\nFound {len(all_numbers)} 11-12 digit numbers in top-right:')
    for i, num_info in enumerate(all_numbers, 1):
        print(f'{i}. {num_info["number"]} - Text: "{num_info["text"]}"')

    found = any(n['number'] == expected for n in all_numbers)
    print(f'\nExpected CMA {expected}: {"FOUND" if found else "NOT FOUND"}')

    # Full page OCR
    print('\n' + '='*80)
    print('Running full page OCR...')
    full_rec_texts = extract_rec_texts(ocr.ocr(img))
    print(f'Found {len(full_rec_texts)} text lines on full page')

    all_numbers_full = find_cma_candidates(full_rec_texts)
    print(f'\nFound {len(all_numbers_full)} 11-12 digit numbers on full page:')
    print('First 20:')
    for i, num_info in enumerate(all_numbers_full[:20], 1):
        text_preview = num_info["text"][:80]
        print(f'{i}. {num_info["number"]} - Text: "{text_preview}"')

    found_full = any(n['number'] == expected for n in all_numbers_full)
    print(f'\nExpected CMA {expected} on full page: {"FOUND" if found_full else "NOT FOUND"}')

    # Conclusion
    print('\n' + '='*80)
    print('ANALYSIS COMPLETE')
    print('='*80)
    if found_full:
        print(f'SUCCESS: Expected CMA {expected} was found')
    else:
        print(f'FAILURE: Expected CMA {expected} was NOT found')
        print('\nPossible reasons:')
        print('1. CMA code is on a different page (not page 1)')
        print('2. CMA code is in a graphic/image that OCR cannot read')
        print('3. The CMA code format is different (not 11-12 digits)')
        print('4. The expected CMA code in results.json is incorrect')
        print('\nRecommendation: Check other pages of the PDF or verify the expected CMA code')


if __name__ == '__main__':
    main()