report-detect/archive/temp_scripts/test_smart_logic.py

103 lines
3.3 KiB
Python
Raw Permalink Normal View History

chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 14:35:06 +08:00
"""
测试改进的CMA提取逻辑使用模拟数据
"""
import re
import logging
# Configure root logging to print the bare message text (no level/name prefix)
# with the threshold set to INFO.
logging.basicConfig(
    format='%(message)s',
    level=logging.INFO,
)

# Module-scoped logger shared by the code in this script.
logger = logging.getLogger(__name__)
# Mock OCR output, based on the result of an earlier successful run.
# Each row pairs one recognised text line with its OCR confidence score.
_ydq23_001838_rows = [
    ("广东产品质量监督检验研究院", 0.95),
    ("210020349096", 1.00),  # the correct CMA code
    ("CNASL0153", 0.92),
    ("440023010130", 0.99),  # report number (distractor)
    ("TESTING", 0.98),
]

# Keyed by PDF file name; "texts" and "scores" are parallel lists.
mock_ocr_results = {
    "YDQ23_001838.pdf": {
        "texts": [row_text for row_text, _ in _ydq23_001838_rows],
        "scores": [row_score for _, row_score in _ydq23_001838_rows],
    }
}
def extract_cma_smart(ocr_texts, ocr_scores, pdf_name):
    """Pick the most plausible CMA code from per-line OCR output.

    Every run of 11-12 digits found in the OCR lines (after ASCII spaces are
    removed from the line) becomes a candidate. Selection rules, in order:

    1. If any candidate starts with "2", return the one among them with the
       highest OCR confidence. Both 11- and 12-digit candidates qualify.
    2. Otherwise return the highest-confidence candidate overall.

    On equal confidence the candidate that appears first wins.

    Args:
        ocr_texts: Recognised text, one string per OCR line.
        ocr_scores: OCR confidence for each entry of ``ocr_texts``; each
            value is passed through ``float()``. Lines are paired with
            ``zip``, so surplus entries on either side are ignored.
        pdf_name: Name of the source document; used only in log output.

    Returns:
        A dict that always has the keys ``success``, ``code``,
        ``confidence`` and ``method``:

        * match starting with "2": ``method == 'template_matching_smart'``
        * fallback match: ``method == 'fullpage_ocr'``
        * nothing found: ``success`` is False, ``code`` is None,
          ``confidence`` is 0.0 and ``method == 'no_candidates'``
    """
    # NOTE(review): the pattern has no digit boundaries, so a longer digit
    # run is sliced into 11-12 digit chunks that each become a candidate --
    # confirm this is intended.
    pattern = re.compile(r'\d{11,12}')

    logger.info("\nProcessing %s...", pdf_name)
    logger.info("OCR texts: %d lines", len(ocr_texts))

    # Collect every 11-12 digit run together with where it came from.
    candidates = []
    for i, (text, score) in enumerate(zip(ocr_texts, ocr_scores)):
        # Spaces are stripped first so digits split by OCR spacing still match.
        for num in pattern.findall(text.replace(" ", "")):
            candidates.append({
                'code': num,
                'confidence': float(score),
                'text': text,
                'line': i,
            })

    if not candidates:
        logger.warning("No 11-12 digit numbers found")
        # Keep the same keys as the success case so callers can read
        # result['confidence'] without a KeyError.
        return {
            'success': False,
            'code': None,
            'confidence': 0.0,
            'method': 'no_candidates',
        }

    logger.info("Found %d candidates:", len(candidates))
    for c in candidates:
        logger.info(" - %s (conf: %.2f, from line %d)",
                    c['code'], c['confidence'], c['line'])

    # Prefer candidates starting with "2"; otherwise consider all of them.
    preferred = [c for c in candidates if c['code'].startswith('2')]
    if preferred:
        pool, reason, method = preferred, "starts with '2'", 'template_matching_smart'
    else:
        pool, reason, method = candidates, "highest confidence", 'fullpage_ocr'

    # max() returns the first maximal element, matching the previous
    # stable descending sort followed by taking element 0.
    best = max(pool, key=lambda c: c['confidence'])
    logger.info("✓ Selected (%s): %s (confidence: %.2f)",
                reason, best['code'], best['confidence'])
    return {
        'success': True,
        'code': best['code'],
        'confidence': best['confidence'],
        'method': method,
    }
# Smoke test: run the extractor on the mocked OCR output and compare the
# selected code with the known-correct CMA code for that document.
print("="*80)
print("TESTING IMPROVED CMA EXTRACTION LOGIC")
print("="*80)

data = mock_ocr_results["YDQ23_001838.pdf"]
result = extract_cma_smart(data["texts"], data["scores"], "YDQ23_001838.pdf")

print("\n" + "="*80)
print("RESULT")
print("="*80)
print(f"Success: {result['success']}")
print(f"CMA Code: {result['code']}")
print(f"Method: {result['method']}")

# A failed extraction may carry no confidence value; report that instead of
# raising KeyError/TypeError while formatting.
confidence = result.get('confidence')
if confidence is None:
    print("Confidence: N/A")
else:
    print(f"Confidence: {confidence:.2f}")

# The mock data also contains a 12-digit report number as a distractor, so a
# match here shows the "2"-prefix rule is what picked the CMA code.
expected = "210020349096"
if result['code'] == expected:
    print(f"\n✓✓✓ CORRECT! Expected: {expected}, Got: {result['code']}")
    print("The improved logic correctly prioritizes '2'-prefixed CMA codes!")
else:
    print(f"\n✗✗✗ WRONG! Expected: {expected}, Got: {result['code']}")
print("="*80)