report-detect/archive/tools/search_cma_position.py

"""
Search for CMA code position on the page
"""
import fitz
import numpy as np
import cv2
from paddleocr import PaddleOCR
import os

# NOTE(review): this flag is set after `paddleocr` has already been imported
# above; if the library reads it at import time this assignment has no
# effect -- confirm, and move it ahead of the import if so.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"

print("=" * 80)
print("SEARCHING FOR CMA CODE 210020349096")
print("=" * 80)

# Render the first page to an image at 300 DPI (the matrix scales by 300 / 72
# on both axes) and decode it into an OpenCV array for the OCR step below.
# The `with` block guarantees the document is closed even if rendering fails.
with fitz.open(pdf_path) as doc:
    page = doc[0]
    mat = fitz.Matrix(300 / 72, 300 / 72)
    pix = page.get_pixmap(matrix=mat)
    img_data = pix.tobytes("png")
    img_array = np.frombuffer(img_data, dtype=np.uint8)
    page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)

    # Best-effort check of the PDF's embedded text; this has to happen while
    # the document is still open. A failure here is reported but must not
    # abort the OCR search, so it simply counts as "not found".
    try:
        text = page.get_text()
    except Exception as exc:
        print(f"\nCould not read PDF text layer: {exc}")
        has_cma_in_text = False
    else:
        has_cma_in_text = '210020349096' in text

# cv2.imdecode signals failure by returning None rather than raising; fail
# here with a clear message instead of an AttributeError on `.shape` below.
if page_img is None:
    raise RuntimeError(f"Failed to decode rendered page image from {pdf_path}")

print(f"\nPage size: {page_img.shape}")
print(f"\nPDF text contains '210020349096': {has_cma_in_text}")

# Try to find CMA code with full-page OCR
import re

TARGET_CMA_CODE = "210020349096"
REPORT_NUMBER = "440023010130"
# Logo center in page-image pixels, as reported by the message printed below.
LOGO_CENTER_X = 1427
LOGO_CENTER_Y = 885
# Runs of 11-12 digits. Deliberately not anchored on digit boundaries: spaces
# are stripped before matching, so neighbouring numbers may be fused together.
DIGIT_RUN = re.compile(r'\d{11,12}')


def _box_center(box):
    """Return the integer ``(x, y)`` center of one OCR box.

    Accepts either a flat ``[x_min, y_min, x_max, y_max]`` rectangle or a
    sequence of ``(x, y)`` corner points. Coordinates are converted to Python
    ints before arithmetic so fixed-width array dtypes cannot overflow.
    """
    if np.ndim(box) == 1:
        x_min, y_min, x_max, y_max = (int(v) for v in box[:4])
        return (x_min + x_max) // 2, (y_min + y_max) // 2
    xs = [int(p[0]) for p in box]
    ys = [int(p[1]) for p in box]
    return sum(xs) // len(xs), sum(ys) // len(ys)


print("\nRunning full-page OCR...")
ocr = PaddleOCR(lang='ch')
ocr_result = ocr.predict(page_img)

if not ocr_result:
    # Previously this case printed nothing at all, which is unhelpful for a
    # diagnostic script.
    print("\nOCR returned no results for this page")
else:
    res = ocr_result[0]
    texts = res.get('rec_texts', [])
    boxes = res.get('rec_boxes', [])
    scores = res.get('rec_scores', [])

    print(f"\nOCR found {len(texts)} text lines")

    # Page dimensions are loop-invariant; used for the relative positions.
    h, w = page_img.shape[:2]

    found = False
    for i, (text, box, score) in enumerate(zip(texts, boxes, scores)):
        # Find 11-12 digit numbers (OCR may insert spaces inside a number)
        numbers = DIGIT_RUN.findall(text.replace(" ", ""))
        if not numbers:
            continue

        x_center, y_center = _box_center(box)
        rel_x = x_center / w * 100
        rel_y = y_center / h * 100

        print(f"\nLine {i}: '{text}'")
        print(f"  Numbers: {numbers}")
        print(f"  Position: ({x_center}, {y_center}) -> ({rel_x:.1f}%, {rel_y:.1f}%)")
        print(f"  Score: {score:.2f}")

        if TARGET_CMA_CODE in numbers:
            print("  ^ THIS IS THE CORRECT CMA CODE!")
            found = True

            # Report where the code sits relative to the logo. The sign is
            # formatted explicitly so negative offsets print as "X-12"
            # rather than "X+-12".
            dx = x_center - LOGO_CENTER_X
            dy = y_center - LOGO_CENTER_Y
            print("\n  Logo center was at: (1427, 885) -> (57.5%, 25.2%)")
            print(f"  CMA code is at: ({x_center}, {y_center}) -> ({rel_x:.1f}%, {rel_y:.1f}%)")
            print(f"  Difference: X{dx:+d}, Y{dy:+d}")

        if REPORT_NUMBER in numbers:
            print("  ^ This is 440023010130 (report number)")

    if not found:
        print("\n⚠️  WARNING: CMA code 210020349096 NOT FOUND in OCR results!")
        print("    This means either:")
        print("    1. The CMA code is in an image that OCR can't read")
        print("    2. The CMA code is handwritten")
        print("    3. The PDF doesn't contain this CMA code")

print("\n" + "=" * 80)
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`"""`
			`Search for CMA code position on the page`
			`"""`
			`import fitz`
			`import numpy as np`
			`import cv2`
			`from paddleocr import PaddleOCR`
			`import os`

			`os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"`

			`pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"`

			`print("=" * 80)`
			`print("SEARCHING FOR CMA CODE 210020349096")`
			`print("=" * 80)`

			`# Extract page`
			`doc = fitz.open(pdf_path)`
			`page = doc[0]`
			`mat = fitz.Matrix(300 / 72, 300 / 72)`
			`pix = page.get_pixmap(matrix=mat)`
			`img_data = pix.tobytes("png")`
			`img_array = np.frombuffer(img_data, dtype=np.uint8)`
			`page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)`

			`# Try to get text before closing`
			`try:`
			`text = page.get_text()`
			`has_cma_in_text = '210020349096' in text`
			`except:`
			`has_cma_in_text = False`

			`doc.close()`

			`print(f"\nPage size: {page_img.shape}")`
			`print(f"\nPDF text contains '210020349096': {has_cma_in_text}")`

			`# Try to find CMA code with full-page OCR`
			`print("\nRunning full-page OCR...")`
			`ocr = PaddleOCR(lang='ch')`
			`ocr_result = ocr.predict(page_img)`

			`if ocr_result and len(ocr_result) > 0:`
			`res = ocr_result[0]`
			`texts = res.get('rec_texts', [])`
			`boxes = res.get('rec_boxes', [])`
			`scores = res.get('rec_scores', [])`

			`print(f"\nOCR found {len(texts)} text lines")`

			`import re`
			`found = False`
			`for i, (text, box, score) in enumerate(zip(texts, boxes, scores)):`
			`# Find 11-12 digit numbers`
			`numbers = re.findall(r'\d{11,12}', text.replace(" ", ""))`
			`if numbers:`
			`# Calculate box center`
			`x_coords = [int(p[0]) for p in box]`
			`y_coords = [int(p[1]) for p in box]`
			`x_center = sum(x_coords) // 4`
			`y_center = sum(y_coords) // 4`

			`h, w = page_img.shape[:2]`
			`rel_x = x_center / w * 100`
			`rel_y = y_center / h * 100`

			`print(f"\nLine {i}: '{text}'")`
			`print(f" Numbers: {numbers}")`
			`print(f" Position: ({x_center}, {y_center}) -> ({rel_x:.1f}%, {rel_y:.1f}%)")`
			`print(f" Score: {score:.2f}")`

			`if "210020349096" in numbers:`
			`print(f" ^ THIS IS THE CORRECT CMA CODE!")`
			`found = True`

			`# Calculate where it is relative to logo`
			`print(f"\n Logo center was at: (1427, 885) -> (57.5%, 25.2%)")`
			`print(f" CMA code is at: ({x_center}, {y_center}) -> ({rel_x:.1f}%, {rel_y:.1f}%)")`
			`print(f" Difference: X+{x_center-1427}, Y+{y_center-885}")`

			`if "440023010130" in numbers:`
			`print(f" ^ This is 440023010130 (report number)")`

			`if not found:`
			`print("\n⚠️ WARNING: CMA code 210020349096 NOT FOUND in OCR results!")`
			`print(" This means either:")`
			`print(" 1. The CMA code is in an image that OCR can't read")`
			`print(" 2. The CMA code is handwritten")`
			`print(" 3. The PDF doesn't contain this CMA code")`

			`print("\n" + "=" * 80)`