report-detect/archive/tools/extract_pdf_pages.py

"""
Extract and save first page of PDF for visual inspection.
"""
import os
import sys
import cv2
import numpy as np
import fitz  # PyMuPDF

# Directory holding the sample PDFs, relative to the current working directory.
pdf_dir = "src/test/resources/data/pdfs"
# (input PDF file name, output image file name) pairs to render.
test_files = [
    ("YDQ25_002294.pdf", "YDQ25_002294_page1.png"),
    ("财政部关于请协助提供相关材料的函_pages10-15.pdf", "财政部_pages10-15_page1.png"),
    ("财政部关于请协助提供相关材料的函_pages4-9.pdf", "财政部_pages4-9_page1.png")
]

# Rendered pages are written here; the directory is created on demand.
output_dir = "debug_images"
os.makedirs(output_dir, exist_ok=True)

# Number of pages actually written, used for the closing summary.
saved_count = 0

for pdf_name, output_name in test_files:
    pdf_path = os.path.join(pdf_dir, pdf_name)
    print(f"Processing: {pdf_name}")

    try:
        doc = fitz.open(pdf_path)
        try:
            # Render the first page at 2x zoom on both axes.
            page = doc[0]
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

            # Convert to BGR, the channel order OpenCV expects when encoding.
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            elif pix.n == 3:
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            elif pix.n == 1:
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            else:
                # Fail loudly instead of writing an image with an unknown layout.
                raise ValueError(f"unsupported channel count: {pix.n}")
        finally:
            # Release the document even when rendering or conversion fails.
            doc.close()

        output_path = os.path.join(output_dir, output_name)

        # cv2.imwrite returns False instead of raising on failure and cannot
        # open non-ASCII paths on Windows, so encode in memory and let Python
        # do the file I/O.
        extension = os.path.splitext(output_name)[1] or ".png"
        encoded_ok, encoded = cv2.imencode(extension, img)
        if not encoded_ok:
            raise OSError(f"could not encode image as {extension}")
        with open(output_path, "wb") as out_file:
            out_file.write(encoded.tobytes())

        saved_count += 1
        print(f"  Saved: {output_path}")
        print(f"  Size: {img.shape[1]}x{img.shape[0]}")

    except Exception as e:
        # Best-effort debug tool: report the failure and continue with the next file.
        print(f"  ERROR: {e}")

# Only claim that everything was saved when that is actually true.
if saved_count == len(test_files):
    print(f"\nAll images saved to: {output_dir}/")
else:
    print(f"\nSaved {saved_count} of {len(test_files)} images to: {output_dir}/")
print("Please manually inspect these images to see if CMA logo is present.")
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`"""`
			`Extract and save first page of PDF for visual inspection.`
			`"""`
			`import os`
			`import sys`
			`import cv2`
			`import numpy as np`
			`import fitz # PyMuPDF`

			`pdf_dir = "src/test/resources/data/pdfs"`
			`test_files = [`
			`("YDQ25_002294.pdf", "YDQ25_002294_page1.png"),`
			`("财政部关于请协助提供相关材料的函_pages10-15.pdf", "财政部_pages10-15_page1.png"),`
			`("财政部关于请协助提供相关材料的函_pages4-9.pdf", "财政部_pages4-9_page1.png")`
			`]`

			`output_dir = "debug_images"`
			`os.makedirs(output_dir, exist_ok=True)`

			`for pdf_name, output_name in test_files:`
			`pdf_path = os.path.join(pdf_dir, pdf_name)`
			`print(f"Processing: {pdf_name}")`

			`try:`
			`doc = fitz.open(pdf_path)`
			`page = doc[0]`
			`pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))`
			`img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)`

			`# Convert to BGR`
			`if pix.n == 4:`
			`img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)`
			`elif pix.n == 3:`
			`img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)`
			`elif pix.n == 1:`
			`img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)`

			`doc.close()`

			`output_path = os.path.join(output_dir, output_name)`
			`cv2.imwrite(output_path, img)`
			`print(f" Saved: {output_path}")`
			`print(f" Size: {img.shape[1]}x{img.shape[0]}")`

			`except Exception as e:`
			`print(f" ERROR: {e}")`

			`print(f"\nAll images saved to: {output_dir}/")`
			`print("Please manually inspect these images to see if CMA logo is present.")`