50 lines
1.5 KiB
Python
50 lines
1.5 KiB
Python
"""
Extract and save the first page of each test PDF as a PNG for visual inspection.
"""
|
|
import os
|
|
import sys
|
|
import cv2
|
|
import numpy as np
|
|
import fitz # PyMuPDF
|
|
|
|
# Directory holding the sample PDFs, relative to the working directory.
pdf_dir = "src/test/resources/data/pdfs"

# (input PDF file name, output PNG file name) pairs to render.
test_files = [
    ("YDQ25_002294.pdf", "YDQ25_002294_page1.png"),
    ("财政部关于请协助提供相关材料的函_pages10-15.pdf", "财政部_pages10-15_page1.png"),
    ("财政部关于请协助提供相关材料的函_pages4-9.pdf", "财政部_pages4-9_page1.png"),
]

# Rendered pages are written here; the directory is created if missing.
output_dir = "debug_images"
os.makedirs(output_dir, exist_ok=True)

# Pixmap channel count -> OpenCV conversion code producing a BGR image.
_TO_BGR = {
    4: cv2.COLOR_RGBA2BGR,
    3: cv2.COLOR_RGB2BGR,
    1: cv2.COLOR_GRAY2BGR,
}

for pdf_name, output_name in test_files:
    pdf_path = os.path.join(pdf_dir, pdf_name)
    print(f"Processing: {pdf_name}")

    # Best-effort per file: a failure is reported and the loop moves on.
    try:
        # The context manager closes the document even when rendering fails.
        with fitz.open(pdf_path) as doc:
            page = doc[0]
            # Matrix(2, 2) renders the page at twice the default scale.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                pix.h, pix.w, pix.n
            )

            # Convert to BGR, the channel order OpenCV expects when encoding.
            if pix.n not in _TO_BGR:
                raise ValueError(f"unsupported pixmap channel count: {pix.n}")
            img = cv2.cvtColor(img, _TO_BGR[pix.n])

        output_path = os.path.join(output_dir, output_name)

        # cv2.imwrite signals failure only through its return value and cannot
        # open non-ASCII paths on Windows, so encode in memory, check the
        # status flag, and let NumPy write the bytes to disk.
        extension = os.path.splitext(output_name)[1] or ".png"
        encoded_ok, encoded = cv2.imencode(extension, img)
        if not encoded_ok:
            raise RuntimeError(f"could not encode image for {output_path}")
        encoded.tofile(output_path)

        print(f" Saved: {output_path}")
        print(f" Size: {img.shape[1]}x{img.shape[0]}")

    except Exception as e:
        print(f" ERROR: {e}")

print(f"\nAll images saved to: {output_dir}/")
print("Please manually inspect these images to see if CMA logo is present.")
|