""" 测试改进的CMA提取逻辑(使用模拟数据) """ import re import logging logging.basicConfig(level=logging.INFO, format='%(message)s') logger = logging.getLogger(__name__) # 模拟OCR结果(基于之前成功运行的结果) mock_ocr_results = { "YDQ23_001838.pdf": { "texts": [ "广东产品质量监督检验研究院", "210020349096", # 正确的CMA码 "CNASL0153", "440023010130", # 报告编号(干扰项) "TESTING" ], "scores": [0.95, 1.00, 0.92, 0.99, 0.98] } } def extract_cma_smart(ocr_texts, ocr_scores, pdf_name): """ 改进的CMA码提取逻辑: 1. 优先选择以"2"开头的12位数字 2. 如果没有,选择置信度最高的 """ pattern = re.compile(r'\d{11,12}') logger.info(f"\nProcessing {pdf_name}...") logger.info(f"OCR texts: {len(ocr_texts)} lines") # 查找所有11-12位数字 candidates = [] for i, (text, score) in enumerate(zip(ocr_texts, ocr_scores)): matches = pattern.findall(text.replace(" ", "")) for num in matches: candidates.append({ 'code': num, 'confidence': float(score), 'text': text, 'line': i }) if not candidates: logger.warning("No 11-12 digit numbers found") return {'success': False, 'code': None, 'method': 'no_candidates'} logger.info(f"Found {len(candidates)} candidates:") for c in candidates: logger.info(f" - {c['code']} (conf: {c['confidence']:.2f}, from line {c['line']})") # 优先选择以"2"开头的 candidates_starting_with_2 = [c for c in candidates if c['code'].startswith('2')] if candidates_starting_with_2: candidates_starting_with_2.sort(key=lambda x: x['confidence'], reverse=True) best = candidates_starting_with_2[0] logger.info(f"✓ Selected (starts with '2'): {best['code']} (confidence: {best['confidence']:.2f})") return { 'success': True, 'code': best['code'], 'confidence': best['confidence'], 'method': 'template_matching_smart' } else: candidates.sort(key=lambda x: x['confidence'], reverse=True) best = candidates[0] logger.info(f"✓ Selected (highest confidence): {best['code']} (confidence: {best['confidence']:.2f})") return { 'success': True, 'code': best['code'], 'confidence': best['confidence'], 'method': 'fullpage_ocr' } # 测试 print("="*80) print("TESTING IMPROVED CMA EXTRACTION LOGIC") print("="*80) 
# Run the extraction on the mock document and report the outcome.
data = mock_ocr_results["YDQ23_001838.pdf"]
result = extract_cma_smart(data["texts"], data["scores"], "YDQ23_001838.pdf")

print("\n" + "="*80)
print("RESULT")
print("="*80)
print(f"Success: {result['success']}")
print(f"CMA Code: {result['code']}")
print(f"Method: {result['method']}")

# A failed extraction returns a dict without a 'confidence' key, so look it
# up defensively instead of raising KeyError while printing the report.
confidence = result.get('confidence')
if confidence is None:
    print("Confidence: n/a")
else:
    print(f"Confidence: {confidence:.2f}")

# The mock page contains a distractor (the report number); the expected
# answer is the "2"-prefixed CMA code.
expected = "210020349096"
if result['code'] == expected:
    print(f"\n✓✓✓ CORRECT! Expected: {expected}, Got: {result['code']}")
    print("The improved logic correctly prioritizes '2'-prefixed CMA codes!")
else:
    print(f"\n✗✗✗ WRONG! Expected: {expected}, Got: {result['code']}")

print("="*80)