103 lines
3.3 KiB
Python
103 lines
3.3 KiB
Python
|
|
"""
Test the improved CMA extraction logic (using mock data).
"""
|
|||
|
|
import re
|
|||
|
|
import logging
|
|||
|
|
|
|||
|
|
logging.basicConfig(level=logging.INFO, format='%(message)s')
|
|||
|
|
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
# Mock OCR results (based on the output of an earlier successful run).
# "texts" and "scores" are parallel lists: scores[i] is the OCR confidence
# reported for texts[i].
mock_ocr_results = {
    "YDQ23_001838.pdf": {
        "texts": [
            "广东产品质量监督检验研究院",
            "210020349096",  # the correct CMA code
            "CNASL0153",
            "440023010130",  # report number (distractor)
            "TESTING",
        ],
        "scores": [0.95, 1.00, 0.92, 0.99, 0.98],
    },
}
|
|||
|
|
|
|||
|
|
def extract_cma_smart(ocr_texts, ocr_scores, pdf_name):
    """Pick the most plausible CMA code from OCR output lines.

    Every run of 11-12 digits found in a line (after removing spaces from
    that line) becomes a candidate and inherits the confidence of the line
    it came from. Selection rules:

    1. If any candidate starts with "2", return the highest-confidence one
       among those.
    2. Otherwise return the highest-confidence candidate overall.

    Ties on confidence are resolved in favour of the candidate found first.

    Args:
        ocr_texts: Recognized text lines (strings).
        ocr_scores: Confidence values paired positionally with ``ocr_texts``;
            pairing stops at the end of the shorter sequence.
        pdf_name: Name of the source document; used only in log messages.

    Returns:
        A dict that always contains ``success``, ``code``, ``confidence``
        and ``method``. ``method`` is ``'template_matching_smart'`` when a
        "2"-prefixed candidate was chosen, ``'fullpage_ocr'`` when the
        fallback rule was used, and ``'no_candidates'`` (with
        ``code=None`` and ``confidence=0.0``) when nothing matched.
    """
    # NOTE(review): the pattern has no digit boundaries, so a longer digit
    # run contributes its leading 11-12 digits (and possibly a second match
    # from the remainder). Confirm whether such runs should be rejected.
    pattern = re.compile(r'\d{11,12}')

    logger.info("\nProcessing %s...", pdf_name)
    logger.info("OCR texts: %d lines", len(ocr_texts))

    # Collect every 11-12 digit run together with where it came from.
    candidates = []
    for line_no, (text, score) in enumerate(zip(ocr_texts, ocr_scores)):
        # OCR may split one number with spaces, so strip them before matching.
        for code in pattern.findall(text.replace(" ", "")):
            candidates.append({
                'code': code,
                'confidence': float(score),
                'text': text,
                'line': line_no,
            })

    if not candidates:
        logger.warning("No 11-12 digit numbers found")
        # Keep the same keys as the success results so callers can read
        # 'confidence' without first checking 'success'.
        return {
            'success': False,
            'code': None,
            'confidence': 0.0,
            'method': 'no_candidates',
        }

    logger.info("Found %d candidates:", len(candidates))
    for cand in candidates:
        logger.info(
            " - %s (conf: %.2f, from line %d)",
            cand['code'], cand['confidence'], cand['line'],
        )

    # Prefer candidates that start with "2"; fall back to the full pool.
    preferred = [cand for cand in candidates if cand['code'].startswith('2')]
    if preferred:
        pool, label, method = preferred, "starts with '2'", 'template_matching_smart'
    else:
        pool, label, method = candidates, "highest confidence", 'fullpage_ocr'

    # max() returns the first maximal element, i.e. the earliest candidate
    # wins a confidence tie (same outcome as a stable descending sort).
    best = max(pool, key=lambda cand: cand['confidence'])
    logger.info(
        "✓ Selected (%s): %s (confidence: %.2f)",
        label, best['code'], best['confidence'],
    )
    return {
        'success': True,
        'code': best['code'],
        'confidence': best['confidence'],
        'method': method,
    }
|
|||
|
|
|
|||
|
|
# Test run: feed the mock OCR data through the extractor and compare the
# result with the known-correct CMA code for that document.
print("="*80)
print("TESTING IMPROVED CMA EXTRACTION LOGIC")
print("="*80)

data = mock_ocr_results["YDQ23_001838.pdf"]
result = extract_cma_smart(data["texts"], data["scores"], "YDQ23_001838.pdf")

print("\n" + "="*80)
print("RESULT")
print("="*80)
print(f"Success: {result['success']}")
print(f"CMA Code: {result['code']}")
print(f"Method: {result['method']}")
# A failed extraction may carry no usable confidence value, so read it
# defensively instead of indexing and formatting unconditionally.
confidence = result.get('confidence')
if confidence is None:
    print("Confidence: N/A")
else:
    print(f"Confidence: {confidence:.2f}")

expected = "210020349096"
if result['code'] == expected:
    print(f"\n✓✓✓ CORRECT! Expected: {expected}, Got: {result['code']}")
    print("The improved logic correctly prioritizes '2'-prefixed CMA codes!")
else:
    print(f"\n✗✗✗ WRONG! Expected: {expected}, Got: {result['code']}")

print("="*80)
|