report-detect/archive/temp_scripts/analyze_ydq_v2.py

"""
Debug CMA extraction - handle new PaddleOCR format.
"""
import os
import cv2
import numpy as np
import re

# Set environment variables
# NOTE(review): this flag is assigned before the `paddleocr` import below,
# presumably because the library reads it at import time - confirm before
# reordering these lines.
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

from paddleocr import PaddleOCR

# Page image that is analysed and the CMA code expected to appear on it.
IMAGE_PATH = 'debug_images/YDQ25_002294_page1.png'
EXPECTED_CMA = '240020349096'

# A candidate CMA code is a run of 11 or 12 consecutive digits.
CMA_PATTERN = re.compile(r'\d{11,12}')

# Separators removed before matching so that a code printed as
# "2400 2034-9096" is still seen as one digit run.
_STRIP_SEPARATORS = str.maketrans('', '', ' -:')


def extract_rec_texts(ocr_result):
    """Return the recognized text lines from a PaddleOCR result.

    Three result shapes are understood:

    * a dict with a ``rec_texts`` entry;
    * a list whose first element is such a dict;
    * a list whose first element is a list of ``[box, (text, score)]``
      entries (a bare value in place of the tuple is converted with ``str``).

    Any other shape, including an empty list or a ``None`` first element,
    yields an empty list.
    """
    if isinstance(ocr_result, dict):
        return list(ocr_result.get('rec_texts', []))
    if not isinstance(ocr_result, list) or not ocr_result:
        return []

    first_page = ocr_result[0]
    if isinstance(first_page, dict):
        return list(first_page.get('rec_texts', []))
    if not isinstance(first_page, list):
        return []

    texts = []
    for line in first_page:
        if len(line) < 2:
            continue
        recognized = line[1]
        if isinstance(recognized, (list, tuple)):
            texts.append(recognized[0])
        else:
            texts.append(str(recognized))
    return texts


def find_cma_candidates(texts):
    """Return every 11-12 digit run found in ``texts``.

    Spaces, hyphens and colons are stripped from each line before matching.
    Each hit is a dict with the matched ``number`` and the original,
    unstripped ``text`` it came from.
    """
    candidates = []
    for text in texts:
        compact = text.translate(_STRIP_SEPARATORS)
        candidates.extend(
            {'number': number, 'text': text}
            for number in CMA_PATTERN.findall(compact)
        )
    return candidates


def main():
    """OCR the top-right region and the full page, then report on the CMA code."""
    # Initialize OCR
    print('Initializing PaddleOCR...')
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # Read image; cv2.imread signals an unreadable file by returning None.
    img = cv2.imread(IMAGE_PATH)
    if img is None:
        raise SystemExit(f'Could not read image: {IMAGE_PATH}')
    h, w = img.shape[:2]
    print(f'Image size: {w}x{h}')

    # Extract top-right area: top 40% of the rows, right 60% of the columns.
    top_right = img[0:int(h * 0.4), int(w * 0.4):w]
    print(f'Top-right area: {top_right.shape[1]}x{top_right.shape[0]}')

    # OCR on top-right
    print('\nRunning OCR on top-right area...')
    result = ocr.ocr(top_right)
    print(f'OCR result type: {type(result)}')
    if isinstance(result, dict):
        print('OCR returned dict format (new API)')
    elif isinstance(result, list) and result:
        print('OCR returned list format (old API)')

    # List the recognized lines whatever the result shape was.
    rec_texts = extract_rec_texts(result)
    print(f'Found {len(rec_texts)} text lines')
    for index, text in enumerate(rec_texts, 1):
        print(f'{index}. {text}')

    # Find 11-12 digit numbers
    top_right_hits = find_cma_candidates(rec_texts)
    print(f'\nFound {len(top_right_hits)} 11-12 digit numbers in top-right:')
    for index, hit in enumerate(top_right_hits, 1):
        print(f'{index}. {hit["number"]} - Text: "{hit["text"]}"')

    found = any(hit['number'] == EXPECTED_CMA for hit in top_right_hits)
    print(f'\nExpected CMA {EXPECTED_CMA}: {"FOUND" if found else "NOT FOUND"}')

    # Full page OCR
    print('\n' + '=' * 80)
    print('Running full page OCR...')
    full_rec_texts = extract_rec_texts(ocr.ocr(img))
    print(f'Found {len(full_rec_texts)} text lines on full page')

    # Find all 11-12 digit numbers
    full_page_hits = find_cma_candidates(full_rec_texts)
    print(f'\nFound {len(full_page_hits)} 11-12 digit numbers on full page:')
    print('First 20:')
    for index, hit in enumerate(full_page_hits[:20], 1):
        text_preview = hit['text'][:80]
        print(f'{index}. {hit["number"]} - Text: "{text_preview}"')

    found_full = any(hit['number'] == EXPECTED_CMA for hit in full_page_hits)
    print(
        f'\nExpected CMA {EXPECTED_CMA} on full page: '
        f'{"FOUND" if found_full else "NOT FOUND"}'
    )

    # Conclusion
    print('\n' + '=' * 80)
    print('ANALYSIS COMPLETE')
    print('=' * 80)
    if found_full:
        print(f'SUCCESS: Expected CMA {EXPECTED_CMA} was found')
    else:
        print(f'FAILURE: Expected CMA {EXPECTED_CMA} was NOT found')
        print('\nPossible reasons:')
        print('1. CMA code is on a different page (not page 1)')
        print('2. CMA code is in a graphic/image that OCR cannot read')
        print('3. The CMA code format is different (not 11-12 digits)')
        print('4. The expected CMA code in results.json is incorrect')
        print('\nRecommendation: Check other pages of the PDF or verify the expected CMA code')


if __name__ == '__main__':
    main()
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`"""`
			`Debug CMA extraction - handle new PaddleOCR format.`
			`"""`
			`import os`
			`import cv2`
			`import numpy as np`
			`import re`

			`# Set environment variables`
			`os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'`

			`from paddleocr import PaddleOCR`

			`# Initialize OCR`
			`print('Initializing PaddleOCR...')`
			`ocr = PaddleOCR(use_angle_cls=True, lang='ch')`

			`# Read image`
			`img = cv2.imread('debug_images/YDQ25_002294_page1.png')`
			`h, w = img.shape[:2]`
			`print(f'Image size: {w}x{h}')`

			`# Extract top-right area`
			`top_right = img[0:int(h*0.4), int(w*0.4):w]`
			`print(f'Top-right area: {top_right.shape[1]}x{top_right.shape[0]}')`

			`# OCR on top-right`
			`print('\nRunning OCR on top-right area...')`
			`result = ocr.ocr(top_right)`

			`print(f'OCR result type: {type(result)}')`

			`# Handle new PaddleOCR format (dict with rec_texts)`
			`rec_texts = []`
			`rec_scores = []`

			`if isinstance(result, dict):`
			`print('OCR returned dict format (new API)')`
			`rec_texts = result.get('rec_texts', [])`
			`rec_scores = result.get('rec_scores', [])`
			`print(f'Found {len(rec_texts)} text lines')`
			`for i, text in enumerate(rec_texts):`
			`print(f'{i+1}. {text}')`
			`elif isinstance(result, list) and len(result) > 0:`
			`print('OCR returned list format (old API)')`
			`if isinstance(result[0], dict):`
			`rec_texts = result[0].get('rec_texts', [])`
			`rec_scores = result[0].get('rec_scores', [])`
			`elif isinstance(result[0], list):`
			`for line in result[0]:`
			`if len(line) >= 2:`
			`text = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])`
			`rec_texts.append(text)`

			`# Find 11-12 digit numbers`
			`cma_pattern = re.compile(r'\d{11,12}')`
			`all_numbers = []`

			`for i, text in enumerate(rec_texts):`
			`cleaned = text.replace(' ', '').replace('-', '').replace(':', '')`
			`matches = cma_pattern.findall(cleaned)`
			`for match in matches:`
			`all_numbers.append({`
			`'number': match,`
			`'text': text`
			`})`

			`print(f'\nFound {len(all_numbers)} 11-digit numbers in top-right:')`
			`for i, num_info in enumerate(all_numbers, 1):`
			`print(f'{i}. {num_info["number"]} - Text: "{num_info["text"]}"')`

			`expected = '240020349096'`
			`found = any(n['number'] == expected for n in all_numbers)`
			`print(f'\nExpected CMA {expected}: {"FOUND" if found else "NOT FOUND"}')`

			`# Full page OCR`
			`print('\n' + '='*80)`
			`print('Running full page OCR...')`
			`full_result = ocr.ocr(img)`

			`full_rec_texts = []`
			`if isinstance(full_result, dict):`
			`full_rec_texts = full_result.get('rec_texts', [])`
			`elif isinstance(full_result, list) and len(full_result) > 0:`
			`if isinstance(full_result[0], dict):`
			`full_rec_texts = full_result[0].get('rec_texts', [])`
			`elif isinstance(full_result[0], list):`
			`for line in full_result[0]:`
			`if len(line) >= 2:`
			`text = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])`
			`full_rec_texts.append(text)`

			`print(f'Found {len(full_rec_texts)} text lines on full page')`

			`# Find all 11-digit numbers`
			`all_numbers_full = []`
			`for text in full_rec_texts:`
			`cleaned = text.replace(' ', '').replace('-', '').replace(':', '')`
			`matches = cma_pattern.findall(cleaned)`
			`for match in matches:`
			`all_numbers_full.append({`
			`'number': match,`
			`'text': text`
			`})`

			`print(f'\nFound {len(all_numbers_full)} 11-digit numbers on full page:')`
			`print('First 20:')`
			`for i, num_info in enumerate(all_numbers_full[:20], 1):`
			`text_preview = num_info["text"][:80]`
			`print(f'{i}. {num_info["number"]} - Text: "{text_preview}"')`

			`found_full = any(n['number'] == expected for n in all_numbers_full)`
			`print(f'\nExpected CMA {expected} on full page: {"FOUND" if found_full else "NOT FOUND"}')`

			`# Conclusion`
			`print('\n' + '='*80)`
			`print('ANALYSIS COMPLETE')`
			`print('='*80)`
			`if found_full:`
			`print(f'SUCCESS: Expected CMA {expected} was found')`
			`else:`
			`print(f'FAILURE: Expected CMA {expected} was NOT found')`
			`print('\nPossible reasons:')`
			`print('1. CMA code is on a different page (not page 1)')`
			`print('2. CMA code is in a graphic/image that OCR cannot read')`
			`print('3. The CMA code format is different (not 11 digits)')`
			`print('4. The expected CMA code in results.json is incorrect')`
			`print('\nRecommendation: Check other pages of the PDF or verify the expected CMA code')`