"""Directly test the CRT extraction function.

Runs ``extract_institution_from_crt`` on sample PDF files and prints the
type, length and content of each result.
"""

import sys

from test_accuracy_batch_full import extract_institution_from_crt


# Redirect stdout to avoid encoding issues
|
|
class UTF8Stdout:
|
|
def write(self, text):
|
|
if isinstance(text, str):
|
|
text = text.encode('utf-8', errors='replace').decode('utf-8')
|
|
sys.stdout.buffer.write(text.encode('utf-8', errors='replace'))
|
|
|
|
def flush(self):
|
|
sys.stdout.buffer.flush()
|
|
|
|
# This script prints Chinese text. Make stdout tolerant of consoles or pipes
# whose encoding cannot represent it, so a print never raises
# UnicodeEncodeError. ``reconfigure`` exists on text streams from Python 3.7;
# the guard keeps the script working when stdout has been replaced by an
# object without it.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")


def _report(path):
    """Run the extractor on *path*, print a summary, and return its result."""
    extracted = extract_institution_from_crt(path)
    print(f"\nResult for {path}:")
    print(f" Type: {type(extracted)}")
    # The extractor's failure value is not visible from this script; report a
    # None result instead of letting len() raise TypeError.
    length = len(extracted) if extracted is not None else "n/a"
    print(f" Length: {length}")
    print(f" Content: {extracted}")
    return extracted


def _contains(extracted, needle):
    """Return True when *needle* occurs in *extracted*; False for None."""
    return extracted is not None and needle in extracted


print("Testing CRT extraction...")

pdf_path = "src/test/resources/data/pdfs/YDQ25_002294.pdf"
result = _report(pdf_path)

# Also test YDQ23_001838.pdf
pdf_path2 = "src/test/resources/data/pdfs/YDQ23_001838.pdf"
result2 = _report(pdf_path2)

# Check if expected institution is in results
expected = "广东产品质量监督检验研究院"
print(f"\nExpected institution: {expected}")
print(f" Found in PDF1: {_contains(result, expected)}")
print(f" Found in PDF2: {_contains(result2, expected)}")