"""Debug CMA extraction issues for specific PDFs.

Runs PaddleOCR on the top-right region of a rendered PDF page (where the CMA
logo usually sits), lists the long digit runs it recognises, and checks
whether the expected CMA code is among them. If it is not, the whole page is
OCR'd as a fallback and a short diagnosis is printed.
"""
import os
import re

import numpy as np  # noqa: F401  (kept from the original script; currently unused)

# Set before paddleocr is imported, preserving the original ordering
# (presumably the flag is read at import time -- confirm against paddleocr).
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

IMAGE_PATH = 'debug_images/YDQ25_002294_page1.png'
TOP_RIGHT_PATH = 'debug_images/YDQ25_002294_top_right.png'

# The code we expect to find. NOTE: this value has 12 digits, so an exact
# comparison against fixed 11-digit matches could never succeed.
EXPECTED_CMA = '240020349096'

# Candidates are digit runs of at least this many digits.
MIN_DIGITS = 11
DIGIT_RUN_PATTERN = re.compile(r'\d{%d,}' % MIN_DIGITS)

# Separators that OCR commonly inserts inside a printed code.
SEPARATORS = str.maketrans('', '', ' -:')

# How many recognised lines to echo for the cropped region.
PREVIEW_LINES = 20


def find_candidate_numbers(text):
    """Return every run of ``MIN_DIGITS`` or more digits found in *text*.

    Spaces, dashes and colons are stripped first so that a code printed as
    ``2400-2034-9096`` is seen as one run.
    """
    return DIGIT_RUN_PATTERN.findall(text.translate(SEPARATORS))


def extract_texts(ocr_result):
    """Return the recognised text strings of the first page of *ocr_result*.

    Handles the legacy shape ``[[ [box, (text, score)], ... ]]``. A dict-like
    page exposing ``'rec_texts'`` is also accepted -- newer PaddleOCR releases
    appear to return that shape (TODO confirm against the installed version).
    Anything else, including ``None`` or an empty result, yields ``[]``.
    Malformed lines are reported and skipped rather than silently dropped.
    """
    if not ocr_result or not isinstance(ocr_result, (list, tuple)):
        return []

    page = ocr_result[0]
    if page is None:
        return []
    if isinstance(page, dict):
        return [str(text) for text in (page.get('rec_texts') or [])]
    if not isinstance(page, (list, tuple)):
        return []

    texts = []
    for index, line in enumerate(page):
        try:
            if len(line) < 2:
                continue
            recognised = line[1]
            if isinstance(recognised, (list, tuple)):
                text = recognised[0]
            else:
                text = str(recognised)
        except (TypeError, IndexError) as exc:
            print(f'Error processing line {index}: {exc}')
            continue
        texts.append(str(text))
    return texts


def collect_numbers(texts):
    """Return ``{'number', 'text'}`` records for every candidate in *texts*."""
    return [
        {'number': number, 'text': text}
        for text in texts
        for number in find_candidate_numbers(text)
    ]


def contains_expected(numbers, expected):
    """Return True if *expected* occurs inside any collected digit run.

    Containment (not equality) is used because stripping separators can merge
    the code with neighbouring digits such as a date.
    """
    return any(expected in entry['number'] for entry in numbers)


def preview(text, limit=60):
    """Return *text* cut to *limit* characters, with ``...`` only if cut."""
    return text if len(text) <= limit else text[:limit] + '...'


def main():
    """Run the top-right OCR check, falling back to the full page."""
    # Heavy third-party imports are deferred so the helpers above can be
    # imported without the OCR stack; the environment flag is already set.
    import cv2
    from paddleocr import PaddleOCR

    print('Initializing PaddleOCR...')
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    img = cv2.imread(IMAGE_PATH)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise SystemExit(f'Could not read image: {IMAGE_PATH}')
    h, w = img.shape[:2]
    print(f'Image size: {w}x{h}')

    # Top 40% of the height, right 60% of the width.
    top_right = img[0:int(h * 0.4), int(w * 0.4):w]
    cv2.imwrite(TOP_RIGHT_PATH, top_right)
    print(f'Top-right area saved: {top_right.shape[1]}x{top_right.shape[0]}')

    print('\nRunning OCR on top-right area...')
    result = ocr.ocr(top_right)
    print(f'OCR result type: {type(result)}')
    if result:
        print(f'OCR result length: {len(result)}')
        print(f'OCR result[0] type: {type(result[0])}')
        print(f'OCR result[0]: {result[0]}')

    if result is None:
        print('OCR returned None')
    elif result and result[0] is None:
        print('OCR result[0] is None')

    texts = extract_texts(result)
    print(f'Found {len(texts)} text lines')
    # Only the printout is capped; every line is searched for numbers.
    for i, text in enumerate(texts[:PREVIEW_LINES], 1):
        print(f'{i}. {text}')

    all_numbers = collect_numbers(texts)
    print(f'\nFound {len(all_numbers)} candidate numbers '
          f'({MIN_DIGITS}+ digits) in top-right:')
    for i, num_info in enumerate(all_numbers, 1):
        print(f'{i}. {num_info["number"]} - Text: "{num_info["text"]}"')

    found = contains_expected(all_numbers, EXPECTED_CMA)
    print(f'\nExpected CMA {EXPECTED_CMA}: {"FOUND" if found else "NOT FOUND"}')
    if found:
        return

    print('\nRunning full page OCR...')
    all_numbers_full = collect_numbers(extract_texts(ocr.ocr(img)))
    print(f'Found {len(all_numbers_full)} candidate numbers '
          f'({MIN_DIGITS}+ digits) on full page')
    print('\nFirst 15 numbers:')
    for i, num_info in enumerate(all_numbers_full[:15], 1):
        print(f'{i}. {num_info["number"]} - Text: "{preview(num_info["text"])}"')

    found_full = contains_expected(all_numbers_full, EXPECTED_CMA)
    print(f'\nExpected CMA {EXPECTED_CMA} on full page: '
          f'{"FOUND" if found_full else "NOT FOUND"}')

    if not found_full:
        print('\nCONCLUSION:')
        print(f'The expected CMA code {EXPECTED_CMA} is NOT present in the OCR output.')
        print('Possible reasons:')
        print('1. CMA code is not on the first page')
        print('2. CMA code is in an image/graphic format that OCR cannot read')
        print('3. CMA code is handwritten or in a special font')
        print('4. The expected CMA code in results.json is incorrect')


if __name__ == '__main__':
    main()