# File listing metadata: 103 lines, 3.3 KiB, Python
"""Test the improved CMA code extraction logic (using mock data)."""
|
||
import re
|
||
import logging
|
||
|
||
# Print bare messages (no level or timestamp prefix) so the log output reads
# like plain script output; INFO is the lowest level shown.
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)
|
||
|
||
# Mock OCR results (based on the output of an earlier successful run).
# Maps a PDF file name to its recognized text lines ("texts") and a list of
# scores ("scores") with one entry per text line.
mock_ocr_results = {
    "YDQ23_001838.pdf": {
        "texts": [
            "广东产品质量监督检验研究院",
            "210020349096",  # the correct CMA code
            "CNASL0153",
            "440023010130",  # report number (distractor)
            "TESTING",
        ],
        "scores": [0.95, 1.00, 0.92, 0.99, 0.98],
    },
}
|
||
|
||
def extract_cma_smart(ocr_texts, ocr_scores, pdf_name):
    """Pick the most plausible CMA code from the OCR lines of one document.

    Every run of 11-12 digits found in a line (after ASCII spaces are
    removed from it) becomes a candidate that carries the line's score as
    its confidence. Candidates whose code starts with "2" are preferred;
    if there are none, all candidates compete. Within the chosen group the
    highest confidence wins, and ties go to the candidate found first.

    Args:
        ocr_texts: Sequence of recognized text lines (strings).
        ocr_scores: Scores paired with ``ocr_texts`` by position; each is
            converted with ``float()``.
        pdf_name: Name of the document, used only in log messages.

    Returns:
        A dict with the keys ``success``, ``code``, ``confidence`` and
        ``method``. ``method`` is ``'template_matching_smart'`` when a
        "2"-prefixed candidate was selected, ``'fullpage_ocr'`` when the
        highest-confidence fallback was used, and ``'no_candidates'`` when
        nothing matched (then ``success`` is False, ``code`` is None and
        ``confidence`` is 0.0).
    """
    # NOTE(review): the pattern has no digit boundaries, so a digit run
    # longer than 12 contributes its leading 12 digits as a candidate.
    pattern = re.compile(r'\d{11,12}')

    logger.info("\nProcessing %s...", pdf_name)
    logger.info("OCR texts: %d lines", len(ocr_texts))

    # Collect every 11-12 digit number together with where it came from.
    candidates = []
    paired = 0
    for line_no, (text, score) in enumerate(zip(ocr_texts, ocr_scores)):
        paired += 1
        candidates.extend(
            {
                'code': num,
                'confidence': float(score),
                'text': text,
                'line': line_no,
            }
            for num in pattern.findall(text.replace(" ", ""))
        )

    if paired < len(ocr_texts):
        # zip() stops at the shorter input; report the lines it dropped
        # instead of losing them silently.
        logger.warning(
            "Only %d of %d OCR lines have a score; the rest were skipped",
            paired, len(ocr_texts),
        )

    if not candidates:
        logger.warning("No 11-12 digit numbers found")
        # Same keys as the success results so callers can read
        # result['confidence'] without a KeyError.
        return {
            'success': False,
            'code': None,
            'confidence': 0.0,
            'method': 'no_candidates',
        }

    logger.info("Found %d candidates:", len(candidates))
    for cand in candidates:
        logger.info(
            " - %s (conf: %.2f, from line %d)",
            cand['code'], cand['confidence'], cand['line'],
        )

    # Prefer codes starting with "2"; otherwise fall back to all candidates.
    preferred = [c for c in candidates if c['code'].startswith('2')]
    if preferred:
        pool = preferred
        reason = "starts with '2'"
        method = 'template_matching_smart'
    else:
        pool = candidates
        reason = "highest confidence"
        method = 'fullpage_ocr'

    # max() returns the first maximal element, so equal confidences resolve
    # to the earliest candidate.
    best = max(pool, key=lambda c: c['confidence'])
    logger.info(
        "✓ Selected (%s): %s (confidence: %.2f)",
        reason, best['code'], best['confidence'],
    )
    return {
        'success': True,
        'code': best['code'],
        'confidence': best['confidence'],
        'method': method,
    }
|
||
|
||
# Smoke test: run the extractor on the mocked document and compare the
# selected code with the known-correct CMA code.
separator = "=" * 80

print(separator)
print("TESTING IMPROVED CMA EXTRACTION LOGIC")
print(separator)

data = mock_ocr_results["YDQ23_001838.pdf"]
result = extract_cma_smart(data["texts"], data["scores"], "YDQ23_001838.pdf")

print("\n" + separator)
print("RESULT")
print(separator)
print(f"Success: {result['success']}")
print(f"CMA Code: {result['code']}")
print(f"Method: {result['method']}")

# A failure result may carry no usable confidence; do not crash the report.
confidence = result.get('confidence')
if confidence is not None:
    print(f"Confidence: {confidence:.2f}")
else:
    print("Confidence: n/a")

expected = "210020349096"
if result['code'] == expected:
    print(f"\n✓✓✓ CORRECT! Expected: {expected}, Got: {result['code']}")
    print("The improved logic correctly prioritizes '2'-prefixed CMA codes!")
else:
    print(f"\n✗✗✗ WRONG! Expected: {expected}, Got: {result['code']}")

print(separator)
|