report-detect/archive/temp_scripts/test_smart_logic.py

"""
测试改进的CMA提取逻辑（使用模拟数据）
"""
import re
import logging

logging.basicConfig(format='%(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Mock OCR output (taken from an earlier successful run).
# "texts" and "scores" are parallel lists: scores[i] belongs to texts[i].
mock_ocr_results = {
    "YDQ23_001838.pdf": dict(
        texts=[
            "广东产品质量监督检验研究院",
            "210020349096",  # the correct CMA code
            "CNASL0153",
            "440023010130",  # report number (distractor)
            "TESTING",
        ],
        scores=[0.95, 1.00, 0.92, 0.99, 0.98],
    ),
}

def extract_cma_smart(ocr_texts, ocr_scores, pdf_name):
    """
    Pick a CMA code out of OCR text lines.

    Every run of 11-12 digits found in a line (after spaces are removed)
    becomes a candidate. Selection rule:

    1. If any candidate starts with "2", return the one among them with
       the highest confidence.
    2. Otherwise return the candidate with the highest confidence overall.

    When confidences tie, the candidate found first wins.

    Args:
        ocr_texts: Sequence of recognised text lines.
        ocr_scores: Confidence values paired with ``ocr_texts`` by position;
            surplus items in the longer sequence are ignored (``zip``).
        pdf_name: Label used only in the log output.

    Returns:
        A dict that always has the keys ``success``, ``code``,
        ``confidence`` and ``method``:

        * a "2"-prefixed candidate was chosen: ``success`` True,
          ``method`` ``'template_matching_smart'``
        * fallback to highest confidence: ``success`` True,
          ``method`` ``'fullpage_ocr'``
        * nothing found: ``success`` False, ``code`` None,
          ``confidence`` 0.0, ``method`` ``'no_candidates'``
    """
    # NOTE: the pattern has no digit boundaries, so a longer digit run
    # contributes its leading 12 digits as a candidate.
    pattern = re.compile(r'\d{11,12}')

    logger.info(f"\nProcessing {pdf_name}...")
    logger.info(f"OCR texts: {len(ocr_texts)} lines")

    # Collect every 11-12 digit number together with where it came from.
    candidates = []
    for i, (text, score) in enumerate(zip(ocr_texts, ocr_scores)):
        # OCR may split one number with spaces; join the pieces first.
        for num in pattern.findall(text.replace(" ", "")):
            candidates.append({
                'code': num,
                'confidence': float(score),
                'text': text,
                'line': i
            })

    if not candidates:
        logger.warning("No 11-12 digit numbers found")
        # Same keys as the success results, so callers can read
        # result['confidence'] without a KeyError on failure.
        return {
            'success': False,
            'code': None,
            'confidence': 0.0,
            'method': 'no_candidates'
        }

    logger.info(f"Found {len(candidates)} candidates:")
    for c in candidates:
        logger.info(f"  - {c['code']} (conf: {c['confidence']:.2f}, from line {c['line']})")

    # Candidates starting with "2" take priority over everything else.
    candidates_starting_with_2 = [c for c in candidates if c['code'].startswith('2')]

    if candidates_starting_with_2:
        pool = candidates_starting_with_2
        reason = "starts with '2'"
        method = 'template_matching_smart'
    else:
        pool = candidates
        reason = "highest confidence"
        method = 'fullpage_ocr'

    # max() returns the first maximal item, i.e. the earliest candidate
    # among equal confidences (same pick as a stable descending sort).
    best = max(pool, key=lambda x: x['confidence'])
    logger.info(f"✓ Selected ({reason}): {best['code']} (confidence: {best['confidence']:.2f})")
    return {
        'success': True,
        'code': best['code'],
        'confidence': best['confidence'],
        'method': method
    }

# Run the extractor on the mocked OCR output and report the outcome.
print("=" * 80, "TESTING IMPROVED CMA EXTRACTION LOGIC", "=" * 80, sep="\n")

data = mock_ocr_results["YDQ23_001838.pdf"]
result = extract_cma_smart(data["texts"], data["scores"], "YDQ23_001838.pdf")

# Blank line, then the RESULT banner.
print("", "=" * 80, "RESULT", "=" * 80, sep="\n")
print(f"Success: {result['success']}")
print(f"CMA Code: {result['code']}")
print(f"Method: {result['method']}")
print(f"Confidence: {result['confidence']:.2f}")

# The mock data contains exactly one valid CMA code; compare against it.
expected = "210020349096"
if result['code'] != expected:
    print(f"\n✗✗✗ WRONG! Expected: {expected}, Got: {result['code']}")
else:
    print(f"\n✓✓✓ CORRECT! Expected: {expected}, Got: {result['code']}")
    print("The improved logic correctly prioritizes '2'-prefixed CMA codes!")

print("=" * 80)
-												chore(project): conservative cleanup - archive temp scripts and old docs

Major cleanup to improve project organization and maintainability.

Changes:
- Moved 34 temp/debug/test scripts to archive/temp_scripts/
- Moved 9 auxiliary tools to archive/tools/
- Moved 3 CRT test scripts to archive/crt_tests/
- Moved 4 OCR test scripts to archive/ocr_tests/
- Moved 14 old documentation files to archive/docs/
- Deleted 4 useless files (duplicates, temp files)

Root directory:
- Before: 67 files (cluttered)
- After: 10 core files (clean and organized)

Core files retained:
- test_accuracy_batch_full.py (main script)
- cma_extraction_template_primary.py (CMA extraction)
- cma_extraction_final.py (backup CMA extraction)
- CLAUDE.md (project guide)
- TEST_ACCURACY_BATCH_README.md (usage guide)
- TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs)
- CLEANUP_PLAN.md (cleanup plan)
- CLEANUP_SUMMARY.md (this file)
- IMPLEMENTATION_SUMMARY.md (implementation summary)
- requirements.txt (dependencies)

Archive structure:
archive/
├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.)
├── tools/ (9 files: find_, show_, visualize_, etc.)
├── crt_tests/ (3 files: CRT extraction tests)
├── ocr_tests/ (4 files: OCR timeout tests)
└── docs/ (14 files: old reports and guides)

Benefits:
✓ Cleaner root directory - easier navigation
✓ Better organization - clear separation of concerns
✓ Preserved history - scripts and docs were archived rather than deleted (only the 4 redundant files noted above were removed)
✓ Improved maintainability - easier to find active files
✓ Better git history - removed 198 deleted files from tracking

No functional changes - all core functionality preserved.

Related:
- TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis
- CLEANUP_PLAN.md - detailed cleanup plan

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-03-03 14:35:06 +08:00
+								"""
 								测试改进的CMA提取逻辑（使用模拟数据）
 								"""
 								import re
 								import logging
 								logging.basicConfig(level=logging.INFO, format='%(message)s')
 								logger = logging.getLogger(__name__)
 								# 模拟OCR结果（基于之前成功运行的结果）
 								mock_ocr_results = {
 								    "YDQ23_001838.pdf": {
 								        "texts": [
 								            "广东产品质量监督检验研究院",
 								            "210020349096",  # 正确的CMA码
 								            "CNASL0153",
 								            "440023010130",  # 报告编号（干扰项）
 								            "TESTING"
 								        ],
 								        "scores": [0.95, 1.00, 0.92, 0.99, 0.98]
 								    }
 								}
 								def extract_cma_smart(ocr_texts, ocr_scores, pdf_name):
 								    """
 								    改进的CMA码提取逻辑：
 								    1. 优先选择以"2"开头的12位数字
 								    2. 如果没有，选择置信度最高的
 								    """
 								    pattern = re.compile(r'\d{11,12}')
 								    logger.info(f"\nProcessing {pdf_name}...")
 								    logger.info(f"OCR texts: {len(ocr_texts)} lines")
 								    # 查找所有11-12位数字
 								    candidates = []
 								    for i, (text, score) in enumerate(zip(ocr_texts, ocr_scores)):
 								        matches = pattern.findall(text.replace(" ", ""))
 								        for num in matches:
 								            candidates.append({
 								                'code': num,
 								                'confidence': float(score),
 								                'text': text,
 								                'line': i
 								            })
 								    if not candidates:
 								        logger.warning("No 11-12 digit numbers found")
 								        return {'success': False, 'code': None, 'method': 'no_candidates'}
 								    logger.info(f"Found {len(candidates)} candidates:")
 								    for c in candidates:
 								        logger.info(f"  - {c['code']} (conf: {c['confidence']:.2f}, from line {c['line']})")
 								    # 优先选择以"2"开头的
 								    candidates_starting_with_2 = [c for c in candidates if c['code'].startswith('2')]
 								    if candidates_starting_with_2:
 								        candidates_starting_with_2.sort(key=lambda x: x['confidence'], reverse=True)
 								        best = candidates_starting_with_2[0]
 								        logger.info(f"✓ Selected (starts with '2'): {best['code']} (confidence: {best['confidence']:.2f})")
 								        return {
 								            'success': True,
 								            'code': best['code'],
 								            'confidence': best['confidence'],
 								            'method': 'template_matching_smart'
 								        }
 								    else:
 								        candidates.sort(key=lambda x: x['confidence'], reverse=True)
 								        best = candidates[0]
 								        logger.info(f"✓ Selected (highest confidence): {best['code']} (confidence: {best['confidence']:.2f})")
 								        return {
 								            'success': True,
 								            'code': best['code'],
 								            'confidence': best['confidence'],
 								            'method': 'fullpage_ocr'
 								        }
 								# 测试
 								print("="*80)
 								print("TESTING IMPROVED CMA EXTRACTION LOGIC")
 								print("="*80)
 								data = mock_ocr_results["YDQ23_001838.pdf"]
 								result = extract_cma_smart(data["texts"], data["scores"], "YDQ23_001838.pdf")
 								print("\n" + "="*80)
 								print("RESULT")
 								print("="*80)
 								print(f"Success: {result['success']}")
 								print(f"CMA Code: {result['code']}")
 								print(f"Method: {result['method']}")
 								print(f"Confidence: {result['confidence']:.2f}")
 								expected = "210020349096"
 								if result['code'] == expected:
 								    print(f"\n✓✓✓ CORRECT! Expected: {expected}, Got: {result['code']}")
 								    print("The improved logic correctly prioritizes '2'-prefixed CMA codes!")
 								else:
 								    print(f"\n✗✗✗ WRONG! Expected: {expected}, Got: {result['code']}")
 								print("="*80)