report-detect/archive/temp_scripts/test_smart_logic.py

103 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Test the improved CMA code extraction logic using mock (simulated) OCR data.
"""
import re
import logging
# Plain message-only log lines so the selection trace reads like console output.
logging.basicConfig(format='%(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Mock OCR output, based on the results of an earlier successful run.
# Each entry pairs one recognized text line with its confidence score.
_YDQ23_001838_LINES = [
    ("广东产品质量监督检验研究院", 0.95),
    ("210020349096", 1.00),  # the correct CMA code
    ("CNASL0153", 0.92),
    ("440023010130", 0.99),  # report number (distractor)
    ("TESTING", 0.98),
]

mock_ocr_results = {
    "YDQ23_001838.pdf": {
        "texts": [line for line, _ in _YDQ23_001838_LINES],
        "scores": [score for _, score in _YDQ23_001838_LINES],
    },
}
def extract_cma_smart(ocr_texts, ocr_scores, pdf_name):
    """
    Pick the most plausible CMA code from the OCR output of one document.

    Every run of 11-12 digits found in an OCR line (after removing the
    spaces from that line) is a candidate. Selection order:

    1. If any candidate starts with "2", return the one with the highest
       confidence among those.
    2. Otherwise return the candidate with the highest confidence overall.

    On equal confidence the candidate that appears first wins.

    Args:
        ocr_texts: Recognized text lines.
        ocr_scores: Confidence per line, paired with ``ocr_texts`` by
            position (pairing stops at the shorter of the two).
        pdf_name: Document name, used only in log messages.

    Returns:
        A dict with the keys ``'success'``, ``'code'``, ``'confidence'`` and
        ``'method'``. When no candidate exists, ``success`` is False,
        ``code`` is None, ``confidence`` is 0.0 and ``method`` is
        ``'no_candidates'``; every result therefore has the same shape.
    """
    # No digit-boundary anchors: a longer digit run still contributes its
    # leading 12 digits as a candidate.
    pattern = re.compile(r'\d{11,12}')

    logger.info(f"\nProcessing {pdf_name}...")
    logger.info(f"OCR texts: {len(ocr_texts)} lines")

    # Collect every 11-12 digit number together with where it came from.
    candidates = []
    for i, (text, score) in enumerate(zip(ocr_texts, ocr_scores)):
        # Spaces are dropped first so digits split apart by OCR still match.
        for num in pattern.findall(text.replace(" ", "")):
            candidates.append({
                'code': num,
                'confidence': float(score),
                'text': text,
                'line': i,
            })

    if not candidates:
        logger.warning("No 11-12 digit numbers found")
        # Keep the same keys as the success results so callers can read
        # 'confidence' without special-casing failures.
        return {
            'success': False,
            'code': None,
            'confidence': 0.0,
            'method': 'no_candidates',
        }

    logger.info(f"Found {len(candidates)} candidates:")
    for c in candidates:
        logger.info(f"  - {c['code']} (conf: {c['confidence']:.2f}, from line {c['line']})")

    # Candidates starting with "2" take priority over everything else.
    starting_with_2 = [c for c in candidates if c['code'].startswith('2')]
    if starting_with_2:
        pool, reason, method = starting_with_2, "starts with '2'", 'template_matching_smart'
    else:
        pool, reason, method = candidates, "highest confidence", 'fullpage_ocr'

    # max() returns the first maximal item, i.e. the earliest line wins ties.
    best = max(pool, key=lambda c: c['confidence'])
    logger.info(f"✓ Selected ({reason}): {best['code']} (confidence: {best['confidence']:.2f})")
    return {
        'success': True,
        'code': best['code'],
        'confidence': best['confidence'],
        'method': method,
    }
# Smoke test: run the extractor on the mock document and compare the result
# with the known-correct CMA code.
banner = "=" * 80
print(banner)
print("TESTING IMPROVED CMA EXTRACTION LOGIC")
print(banner)

pdf_name = "YDQ23_001838.pdf"
data = mock_ocr_results[pdf_name]
result = extract_cma_smart(data["texts"], data["scores"], pdf_name)

print("\n" + banner)
print("RESULT")
print(banner)
print(f"Success: {result['success']}")
print(f"CMA Code: {result['code']}")
print(f"Method: {result['method']}")
# A failed extraction may not carry a confidence value; report it as "n/a"
# instead of crashing before the verdict below is printed.
confidence = result.get('confidence')
if confidence is None:
    print("Confidence: n/a")
else:
    print(f"Confidence: {confidence:.2f}")

expected = "210020349096"
if result['code'] == expected:
    print(f"\n✓✓✓ CORRECT! Expected: {expected}, Got: {result['code']}")
    print("The improved logic correctly prioritizes '2'-prefixed CMA codes!")
else:
    print(f"\n✗✗✗ WRONG! Expected: {expected}, Got: {result['code']}")
print(banner)