report-detect/archive/temp_scripts/simple_find.py

"""
Simple script to find CMA code position
"""
import fitz, numpy as np, cv2, os, re
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
from paddleocr import PaddleOCR

# One-off probe: render the first page of a sample report, OCR it, and report
# where the expected CMA code shows up among the recognized text lines.
pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
# The code this script searches for in the OCR output.
target_code = "210020349096"

# Scale by 300/72 in both directions so the page is rasterized at a higher
# resolution than the PDF's native 72 units per inch.
mat = fitz.Matrix(300 / 72, 300 / 72)

# The context manager closes the document even if rendering raises.
with fitz.open(pdf_path) as doc:
    page = doc[0]
    pix = page.get_pixmap(matrix=mat)
    img_data = pix.tobytes("png")

# Decode the PNG bytes into an image array for the OCR engine.
img_array = np.frombuffer(img_data, dtype=np.uint8)
page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
if page_img is None:
    # imdecode signals failure by returning None; fail with a clear message
    # instead of an AttributeError on `.shape` below.
    raise RuntimeError(f"Could not decode rendered page image from {pdf_path}")

h, w = page_img.shape[:2]
print(f"Page: {w}x{h}")

ocr = PaddleOCR(lang='ch')
ocr_result = ocr.predict(page_img)

if ocr_result:
    # Only the first result is inspected (a single page image was submitted).
    res = ocr_result[0]
    texts = res.get('rec_texts', [])

    for i, text in enumerate(texts):
        if target_code in text:
            print(f"Line {i}: {text}")
            print(f"Index: {i}")

            # Show up to two recognized lines on either side for context.
            print("Nearby lines:")
            first = max(0, i - 2)
            for j, nearby in enumerate(texts[first:i + 3], start=first):
                print(f"  [{j}] {nearby}")
            break
    else:
        # for/else: reached only when no line contained the target code.
        # List every line holding an 11-12 digit run as likely candidates
        # (e.g. the code with a misread or dropped digit).
        print("NOT FOUND in texts")
        print("All lines with 11-12 digits:")
        for i, text in enumerate(texts):
            nums = re.findall(r'\d{11,12}', text)
            if nums:
                print(f"  [{i}] {text}: {nums}")
else:
    # Previously this case produced no output at all.
    print("OCR returned no result")
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`"""`
			`Simple script to find CMA code position`
			`"""`
			`import fitz, numpy as np, cv2, os, re`
			`os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"`
			`from paddleocr import PaddleOCR`

			`pdf_path = "src/test/resources/data/pdfs/YDQ23_001838.pdf"`
			`doc = fitz.open(pdf_path)`
			`page = doc[0]`
			`mat = fitz.Matrix(300 / 72, 300 / 72)`
			`pix = page.get_pixmap(matrix=mat)`
			`img_data = pix.tobytes("png")`
			`img_array = np.frombuffer(img_data, dtype=np.uint8)`
			`page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)`
			`doc.close()`

			`h, w = page_img.shape[:2]`
			`print(f"Page: {w}x{h}")`

			`ocr = PaddleOCR(lang='ch')`
			`ocr_result = ocr.predict(page_img)`

			`if ocr_result and len(ocr_result) > 0:`
			`res = ocr_result[0]`
			`texts = res.get('rec_texts', [])`

			`for i, text in enumerate(texts):`
			`if "210020349096" in text:`
			`print(f"Line {i}: {text}")`
			`print(f"Index: {i}")`

			`# Print nearby lines`
			`print(f"Nearby lines:")`
			`for j in range(max(0, i-2), min(len(texts), i+3)):`
			`print(f" [{j}] {texts[j]}")`
			`break`
			`else:`
			`print("NOT FOUND in texts")`
			`print("All lines with 11-12 digits:")`
			`for i, text in enumerate(texts):`
			`nums = re.findall(r'\d{11,12}', text)`
			`if nums:`
			`print(f" [{i}] {text}: {nums}")`