report-detect/archive/temp_scripts/test_single_pdf.py

"""
Quick test to verify the new fallback mechanism works.
"""
import sys
import os
import fitz
import numpy as np
import cv2
from pathlib import Path

# Set before the project/OCR imports further down run.
# NOTE(review): presumably read by the OCR stack at import time - confirm.
os.environ.update(DISABLE_MODEL_SOURCE_CHECK="True")

# Evict any cached copies of the project modules so the imports that follow
# load them afresh (useful when re-running inside a live interpreter).
for _stale_module in ("test_accuracy_batch_full", "cma_extraction_template_primary"):
    sys.modules.pop(_stale_module, None)

from test_accuracy_batch_full import process_cma_template_extraction, extract_pdf_page
from paddleocr import PaddleOCR

# Test with one of the failing PDFs.
# The path is relative, so it resolves against the current working directory.
pdf_name = "财政部关于请协助提供相关材料的函_pages4-9.pdf"
pdf_path = Path("src/test/resources/data/pdfs") / pdf_name

print(f"Testing: {pdf_name}")
print("=" * 80)

# Render the first page to PNG bytes. The matrix scales by 300/72 on both
# axes (72 units per inch in PDF space -> 300 DPI raster). The context
# manager closes the document even if the page lookup or rendering raises.
with fitz.open(str(pdf_path)) as doc:
    page = doc[0]
    mat = fitz.Matrix(300 / 72, 300 / 72)
    pix = page.get_pixmap(matrix=mat)
    img_data = pix.tobytes("png")

# Decode the PNG bytes into an OpenCV colour image array.
img_array = np.frombuffer(img_data, dtype=np.uint8)
page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
if page_img is None:
    # cv2.imdecode signals failure by returning None rather than raising;
    # fail here with a clear message instead of an AttributeError below.
    raise RuntimeError(f"Could not decode rendered page image for {pdf_path}")

print(f"Image size: {page_img.shape}")

# Create the OCR engine (lang='ch') that the extraction step is given.
print("\nInitializing PaddleOCR...")
ocr = PaddleOCR(lang='ch')

# Run the template-matching CMA extraction on the rendered page image.
# NOTE(review): output_dir is presumably where the helper writes its
# artifacts - confirm against process_cma_template_extraction.
print("\nRunning template matching extraction...")
result = process_cma_template_extraction(page_img, ocr, output_dir="test_output")

# Report the outcome between 80-character rules. 'success' is read with a
# plain index (a missing key raises KeyError), while 'code' and 'confidence'
# fall back to defaults.
banner = "=" * 80
print("\n" + banner)
print("RESULT")
print(banner)
succeeded = result['success']
print(f"Success: {succeeded}")
cma_code = result.get('code', 'N/A')
print(f"CMA Code: {cma_code}")
confidence = result.get('confidence', 0)
print(f"Confidence: {confidence:.2f}")
print(banner)
chore(project): conservative cleanup - archive temp scripts and old docs

Major cleanup to improve project organization and maintainability.

Changes:
- Moved 34 temp/debug/test scripts to archive/temp_scripts/
- Moved 9 auxiliary tools to archive/tools/
- Moved 3 CRT test scripts to archive/crt_tests/
- Moved 4 OCR test scripts to archive/ocr_tests/
- Moved 14 old documentation files to archive/docs/
- Deleted 4 useless files (duplicates, temp files)

Root directory:
- Before: 67 files (cluttered)
- After: 10 core files (clean and organized)

Core files retained:
- test_accuracy_batch_full.py (main script)
- cma_extraction_template_primary.py (CMA extraction)
- cma_extraction_final.py (backup CMA extraction)
- CLAUDE.md (project guide)
- TEST_ACCURACY_BATCH_README.md (usage guide)
- TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs)
- CLEANUP_PLAN.md (cleanup plan)
- CLEANUP_SUMMARY.md (this file)
- IMPLEMENTATION_SUMMARY.md (implementation summary)
- requirements.txt (dependencies)

Archive structure:
archive/
├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.)
├── tools/ (9 files: find_, show_, visualize_, etc.)
├── crt_tests/ (3 files: CRT extraction tests)
├── ocr_tests/ (4 files: OCR timeout tests)
└── docs/ (14 files: old reports and guides)

Benefits:
✓ Cleaner root directory - easier navigation
✓ Better organization - clear separation of concerns
✓ Preserved history - all files archived, not deleted
✓ Improved maintainability - easier to find active files
✓ Better git history - removed 198 deleted files from tracking

No functional changes - all core functionality preserved.

Related:
- TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis
- CLEANUP_PLAN.md - detailed cleanup plan

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 14:35:06 +08:00			`"""`
			`Quick test to verify the new fallback mechanism works.`
			`"""`
			`import sys`
			`import os`
			`import fitz`
			`import numpy as np`
			`import cv2`
			`from pathlib import Path`

			`os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"`

			`# Force reimport to get latest changes`
			`if 'test_accuracy_batch_full' in sys.modules:`
			`del sys.modules['test_accuracy_batch_full']`
			`if 'cma_extraction_template_primary' in sys.modules:`
			`del sys.modules['cma_extraction_template_primary']`

			`from test_accuracy_batch_full import process_cma_template_extraction, extract_pdf_page`
			`from paddleocr import PaddleOCR`

			`# Test with one of the failing PDFs`
			`pdf_name = "财政部关于请协助提供相关材料的函_pages4-9.pdf"`
			`pdf_path = Path("src/test/resources/data/pdfs") / pdf_name`

			`print(f"Testing: {pdf_name}")`
			`print("=" * 80)`

			`# Extract page`
			`doc = fitz.open(str(pdf_path))`
			`page = doc[0]`
			`mat = fitz.Matrix(300 / 72, 300 / 72)`
			`pix = page.get_pixmap(matrix=mat)`
			`img_data = pix.tobytes("png")`
			`img_array = np.frombuffer(img_data, dtype=np.uint8)`
			`page_img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)`
			`doc.close()`

			`print(f"Image size: {page_img.shape}")`

			`# Initialize OCR`
			`print("\nInitializing PaddleOCR...")`
			`ocr = PaddleOCR(lang='ch')`

			`# Run template matching extraction`
			`print("\nRunning template matching extraction...")`
			`result = process_cma_template_extraction(page_img, ocr, output_dir="test_output")`

			`print("\n" + "=" * 80)`
			`print("RESULT")`
			`print("=" * 80)`
			`print(f"Success: {result['success']}")`
			`print(f"CMA Code: {result.get('code', 'N/A')}")`
			`print(f"Confidence: {result.get('confidence', 0):.2f}")`
			`print("=" * 80)`