# report-detect/archive/temp_scripts/test_crt_direct.py
#
# 41 lines
# 1.2 KiB
# Python

"""
Directly test the CRT extraction function.
"""
from test_accuracy_batch_full import extract_institution_from_crt
import sys
# Stdout wrapper that emits UTF-8 bytes, to avoid console encoding issues.
class UTF8Stdout:
    """Minimal text-stream stand-in that always writes UTF-8 bytes.

    The target byte stream is captured once, when the instance is created.
    Looking up ``sys.stdout.buffer`` on every call would resolve to this very
    wrapper (which has no ``buffer`` attribute) as soon as an instance is
    installed as ``sys.stdout``.
    """

    def __init__(self, buffer=None):
        """Bind the wrapper to a binary stream.

        Args:
            buffer: Binary stream providing ``write`` and ``flush``. Defaults
                to the ``buffer`` of whatever ``sys.stdout`` is at
                construction time.
        """
        self._buffer = sys.stdout.buffer if buffer is None else buffer

    def write(self, text):
        """Write ``text`` to the bound byte stream.

        Args:
            text: A ``str`` (encoded as UTF-8, with unencodable code points
                such as lone surrogates replaced by ``?``) or a bytes-like
                object (passed through unchanged).

        Returns:
            The length of ``text`` as given by ``len``.
        """
        count = len(text)
        if isinstance(text, str):
            data = text.encode('utf-8', errors='replace')
        else:
            data = text
        self._buffer.write(data)
        return count

    def flush(self):
        """Flush the bound byte stream."""
        self._buffer.flush()
def _report_extraction(path):
    """Run the CRT extraction on one PDF and print a short summary of it.

    Args:
        path: Path of the PDF handed to ``extract_institution_from_crt``.

    Returns:
        Whatever ``extract_institution_from_crt`` returned for ``path``.
    """
    extracted = extract_institution_from_crt(path)
    print(f"\nResult for {path}:")
    print(f" Type: {type(extracted)}")
    print(f" Length: {len(extracted)}")
    print(f" Content: {extracted}")
    return extracted


# The output below contains Chinese text. Keep the console's own encoding but
# substitute characters it cannot represent instead of letting print() raise
# UnicodeEncodeError. Streams without reconfigure() are left untouched.
_reconfigure = getattr(sys.stdout, "reconfigure", None)
if _reconfigure is not None:
    _reconfigure(errors="replace")

print("Testing CRT extraction...")
pdf_path = "src/test/resources/data/pdfs/YDQ25_002294.pdf"
result = _report_extraction(pdf_path)
# Also test YDQ23_001838.pdf
pdf_path2 = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
result2 = _report_extraction(pdf_path2)
# Check if expected institution is in results
expected = "广东产品质量监督检验研究院"
print(f"\nExpected institution: {expected}")
print(f" Found in PDF1: {expected in result}")
print(f" Found in PDF2: {expected in result2}")