""" Debug CMA extraction - handle new PaddleOCR format. """ import os import cv2 import numpy as np import re # Set environment variables os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True' from paddleocr import PaddleOCR # Initialize OCR print('Initializing PaddleOCR...') ocr = PaddleOCR(use_angle_cls=True, lang='ch') # Read image img = cv2.imread('debug_images/YDQ25_002294_page1.png') h, w = img.shape[:2] print(f'Image size: {w}x{h}') # Extract top-right area top_right = img[0:int(h*0.4), int(w*0.4):w] print(f'Top-right area: {top_right.shape[1]}x{top_right.shape[0]}') # OCR on top-right print('\nRunning OCR on top-right area...') result = ocr.ocr(top_right) print(f'OCR result type: {type(result)}') # Handle new PaddleOCR format (dict with rec_texts) rec_texts = [] rec_scores = [] if isinstance(result, dict): print('OCR returned dict format (new API)') rec_texts = result.get('rec_texts', []) rec_scores = result.get('rec_scores', []) print(f'Found {len(rec_texts)} text lines') for i, text in enumerate(rec_texts): print(f'{i+1}. {text}') elif isinstance(result, list) and len(result) > 0: print('OCR returned list format (old API)') if isinstance(result[0], dict): rec_texts = result[0].get('rec_texts', []) rec_scores = result[0].get('rec_scores', []) elif isinstance(result[0], list): for line in result[0]: if len(line) >= 2: text = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1]) rec_texts.append(text) # Find 11-12 digit numbers cma_pattern = re.compile(r'\d{11,12}') all_numbers = [] for i, text in enumerate(rec_texts): cleaned = text.replace(' ', '').replace('-', '').replace(':', '') matches = cma_pattern.findall(cleaned) for match in matches: all_numbers.append({ 'number': match, 'text': text }) print(f'\nFound {len(all_numbers)} 11-digit numbers in top-right:') for i, num_info in enumerate(all_numbers, 1): print(f'{i}. {num_info["number"]} - Text: "{num_info["text"]}"') expected = '240020349096' found = any(n['number'] == expected for n in all_numbers) print(f'\nExpected CMA {expected}: {"FOUND" if found else "NOT FOUND"}') # Full page OCR print('\n' + '='*80) print('Running full page OCR...') full_result = ocr.ocr(img) full_rec_texts = [] if isinstance(full_result, dict): full_rec_texts = full_result.get('rec_texts', []) elif isinstance(full_result, list) and len(full_result) > 0: if isinstance(full_result[0], dict): full_rec_texts = full_result[0].get('rec_texts', []) elif isinstance(full_result[0], list): for line in full_result[0]: if len(line) >= 2: text = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1]) full_rec_texts.append(text) print(f'Found {len(full_rec_texts)} text lines on full page') # Find all 11-digit numbers all_numbers_full = [] for text in full_rec_texts: cleaned = text.replace(' ', '').replace('-', '').replace(':', '') matches = cma_pattern.findall(cleaned) for match in matches: all_numbers_full.append({ 'number': match, 'text': text }) print(f'\nFound {len(all_numbers_full)} 11-digit numbers on full page:') print('First 20:') for i, num_info in enumerate(all_numbers_full[:20], 1): text_preview = num_info["text"][:80] print(f'{i}. {num_info["number"]} - Text: "{text_preview}"') found_full = any(n['number'] == expected for n in all_numbers_full) print(f'\nExpected CMA {expected} on full page: {"FOUND" if found_full else "NOT FOUND"}') # Conclusion print('\n' + '='*80) print('ANALYSIS COMPLETE') print('='*80) if found_full: print(f'SUCCESS: Expected CMA {expected} was found') else: print(f'FAILURE: Expected CMA {expected} was NOT found') print('\nPossible reasons:') print('1. CMA code is on a different page (not page 1)') print('2. CMA code is in a graphic/image that OCR cannot read') print('3. The CMA code format is different (not 11 digits)') print('4. The expected CMA code in results.json is incorrect') print('\nRecommendation: Check other pages of the PDF or verify the expected CMA code')