report-detect/archive/temp_scripts/run_single_test.py

"""
Run single test with detailed debug output for YDQ23_001838.pdf
"""
import sys
import os

# Clear ALL cache
print("=" * 80)
print("CLEARING CACHE")
print("=" * 80)
import shutil
import subprocess


def remove_pycache_dirs(start="."):
    """Delete every ``__pycache__`` directory found under *start*.

    Pure-Python fallback used when the external ``find`` command cannot be
    run or reports a failure. A directory that cannot be removed is reported
    and skipped, so one locked folder does not abort the whole sweep.

    Args:
        start: Directory whose tree is searched.

    Returns:
        List of the ``__pycache__`` paths that were actually removed.
    """
    removed = []
    for root, dirs, _files in os.walk(start):
        if "__pycache__" not in dirs:
            continue
        # Prune in place so os.walk does not try to descend into the
        # directory that is about to be deleted.
        dirs.remove("__pycache__")
        target = os.path.join(root, "__pycache__")
        try:
            shutil.rmtree(target)
        except OSError as exc:
            print(f"  Could not remove {target}: {exc}")
        else:
            print(f"  Removed: {target}")
            removed.append(target)
    return removed


# Clear Python cache
try:
    # check=True turns a non-zero exit status into CalledProcessError, so a
    # `find` that runs but fails also triggers the fallback below instead of
    # being reported as a successful clear.
    result = subprocess.run(
        ['find', '.', '-name', '__pycache__', '-type', 'd', '-exec', 'rm', '-rf', '{}', '+'],
        capture_output=True, shell=False, check=True)
except (OSError, subprocess.SubprocessError):
    # OSError: the `find` executable is missing or cannot be started.
    # SubprocessError: it started but exited with an error.
    print("Using alternative cache clear...")
    remove_pycache_dirs(".")
else:
    print(f"Cache cleared (exit code: {result.returncode})")

# Clear module cache
# Evict already-imported project and OCR modules from sys.modules so the
# imports further down execute the module code again instead of reusing the
# cached module objects.
_STALE_PREFIXES = ('cma_extraction', 'test_accuracy', 'paddleocr')
# Snapshot the keys first: sys.modules must not change size while iterated.
modules_to_clear = [name for name in list(sys.modules) if name.startswith(_STALE_PREFIXES)]
for module in modules_to_clear:
    del sys.modules[module]
# The list holds only the evicted names, so the reported count is the number
# of modules actually removed (not the size of sys.modules).
print(f"Cleared {len(modules_to_clear)} modules from memory")

# Section banner for the import phase.
print("\n" + "=" * 80)
print("IMPORTING MODULES")
print("=" * 80)

# Set environment
# Must be assigned before the paddleocr import below.
# NOTE(review): presumably read by PaddleOCR (or its backend) to skip a
# model-source check — confirm the variable name against the installed version.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

# Import fresh
# These imports are deliberately placed after the sys.modules purge above so
# the project and OCR modules are loaded anew rather than taken from the cache.
from test_accuracy_batch_full import process_single_pdf
from pathlib import Path
import json
from paddleocr import PaddleOCR

print("Modules imported successfully\n")

# Test configuration
pdf_name = "YDQ23_001838.pdf"
pdf_dir = Path("src/test/resources/data/pdfs")
# Debug artefacts go into a per-PDF folder; the folder name is the PDF file
# name itself, ".pdf" suffix included.
output_dir = Path("test_reports_debug", pdf_name)
output_dir.mkdir(parents=True, exist_ok=True)

# Load expected results
results_file = Path("src/test/resources/data/results.json")
expected_results = json.loads(results_file.read_text(encoding='utf-8'))

# A PDF without an entry in the results file yields None for both values.
pdf_expectations = expected_results.get(pdf_name, {})
expected_cma = pdf_expectations.get('cma')
expected_inst = pdf_expectations.get('institution')

# Echo the configuration so the debug log is self-describing.
config_rule = "=" * 80
for config_line in (
    config_rule,
    "TEST CONFIGURATION",
    config_rule,
    f"PDF: {pdf_name}",
    f"Expected CMA: {expected_cma}",
    f"Expected Institution: {expected_inst}",
    f"Output: {output_dir}",
    "",
):
    print(config_line)

# Initialize OCR
print("Initializing PaddleOCR...")
ocr_engine = PaddleOCR(lang='ch')
print("OCR initialized\n")

# Run test
run_rule = "=" * 80
print(run_rule)
print("RUNNING TEST")
print(run_rule)

# Gather the keyword arguments for the single-PDF run in one place; no VL
# pipeline object is supplied for this run.
run_arguments = {
    "pdf_name": pdf_name,
    "expected_cma": expected_cma,
    "expected_inst": expected_inst,
    "pdf_dir": pdf_dir,
    "output_dir": output_dir,
    "ocr_engine": ocr_engine,
    "ocr_model": "ppocr_v5",
    "vl_pipeline": None,
}
result = process_single_pdf(**run_arguments)

# Display results
results_rule = "=" * 80
print("\n" + results_rule)
print("TEST RESULTS")
print(results_rule)

# CMA: expected value, extracted value, then the comparison verdict.
print(f"Expected CMA: {expected_cma}")
print(f"Extracted CMA: {result['extracted'].get('cma', 'N/A')}")
cma_comparison = result['comparison']['cma']
print(f"CMA Match: {cma_comparison.get('match_type', 'UNKNOWN')}")
print(f"CMA Similarity: {cma_comparison.get('similarity', 0):.1f}%")
print()

# Institution: same layout as the CMA section.
print(f"Expected Institution: {expected_inst}")
print(f"Extracted Institution: {result['extracted'].get('institution', 'N/A')}")
institution_comparison = result['comparison']['institution']
print(f"Institution Match: {institution_comparison.get('match_type', 'UNKNOWN')}")
print(f"Institution Similarity: {institution_comparison.get('similarity', 0):.1f}%")
print()

# Check result
# Only an exact CMA match counts as success; the process exit status carries
# the verdict (0 = match, 1 = mismatch).
extracted_cma = result['extracted'].get('cma')
cma_matches = extracted_cma == expected_cma
if not cma_matches:
    print("✗ CMA EXTRACTION FAILED")
    print(f"\nExtracted: {extracted_cma}")
    print(f"Expected: {expected_cma}")
    print("\nCheck debug output in:", output_dir)
    sys.exit(1)

print("✓ CMA EXTRACTION SUCCESSFUL")
sys.exit(0)
chore(project): conservative cleanup - archive temp scripts and old docs

Major cleanup to improve project organization and maintainability.

Changes:
- Moved 34 temp/debug/test scripts to archive/temp_scripts/
- Moved 9 auxiliary tools to archive/tools/
- Moved 3 CRT test scripts to archive/crt_tests/
- Moved 4 OCR test scripts to archive/ocr_tests/
- Moved 14 old documentation files to archive/docs/
- Deleted 4 useless files (duplicates, temp files)

Root directory:
- Before: 67 files (cluttered)
- After: 10 core files (clean and organized)

Core files retained:
- test_accuracy_batch_full.py (main script)
- cma_extraction_template_primary.py (CMA extraction)
- cma_extraction_final.py (backup CMA extraction)
- CLAUDE.md (project guide)
- TEST_ACCURACY_BATCH_README.md (usage guide)
- TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs)
- CLEANUP_PLAN.md (cleanup plan)
- CLEANUP_SUMMARY.md (this file)
- IMPLEMENTATION_SUMMARY.md (implementation summary)
- requirements.txt (dependencies)

Archive structure:
archive/
├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.)
├── tools/ (9 files: find_, show_, visualize_, etc.)
├── crt_tests/ (3 files: CRT extraction tests)
├── ocr_tests/ (4 files: OCR timeout tests)
└── docs/ (14 files: old reports and guides)

Benefits:
✓ Cleaner root directory - easier navigation
✓ Better organization - clear separation of concerns
✓ Preserved history - all files archived, not deleted
✓ Improved maintainability - easier to find active files
✓ Better git history - removed 198 deleted files from tracking

No functional changes - all core functionality preserved.

Related:
- TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis
- CLEANUP_PLAN.md - detailed cleanup plan

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 14:35:06 +08:00
			`"""`
			`Run single test with detailed debug output for YDQ23_001838.pdf`
			`"""`
			`import sys`
			`import os`

			`# Clear ALL cache`
			`print("=" * 80)`
			`print("CLEARING CACHE")`
			`print("=" * 80)`
			`import shutil`
			`import subprocess`

			`# Clear Python cache`
			`try:`
			`result = subprocess.run(['find', '.', '-name', '__pycache__', '-type', 'd', '-exec', 'rm', '-rf', '{}', '+'],`
			`capture_output=True, shell=False)`
			`print(f"Cache cleared (exit code: {result.returncode})")`
			`except:`
			`print("Using alternative cache clear...")`
			`for root, dirs, files in os.walk("."):`
			`for d in dirs[:100]: # Limit to avoid timeout`
			`if d == "__pycache__":`
			`try:`
			`shutil.rmtree(os.path.join(root, d))`
			`print(f" Removed: {os.path.join(root, d)}")`
			`except:`
			`pass`

			`# Clear module cache`
			`modules_to_clear = list(sys.modules.keys())`
			`for module in modules_to_clear:`
			`if module.startswith('cma_extraction') or module.startswith('test_accuracy') or module.startswith('paddleocr'):`
			`del sys.modules[module]`
			`print(f"Cleared {len(modules_to_clear)} modules from memory")`

			`print("\n" + "=" * 80)`
			`print("IMPORTING MODULES")`
			`print("=" * 80)`

			`# Set environment`
			`os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"`

			`# Import fresh`
			`from test_accuracy_batch_full import process_single_pdf`
			`from pathlib import Path`
			`import json`
			`from paddleocr import PaddleOCR`

			`print("Modules imported successfully\n")`

			`# Test configuration`
			`pdf_name = "YDQ23_001838.pdf"`
			`pdf_dir = Path("src/test/resources/data/pdfs")`
			`output_dir = Path("test_reports_debug") / pdf_name`
			`output_dir.mkdir(parents=True, exist_ok=True)`

			`# Load expected results`
			`results_file = Path("src/test/resources/data/results.json")`
			`with open(results_file, 'r', encoding='utf-8') as f:`
			`expected_results = json.load(f)`

			`expected_cma = expected_results.get(pdf_name, {}).get('cma')`
			`expected_inst = expected_results.get(pdf_name, {}).get('institution')`

			`print("=" * 80)`
			`print("TEST CONFIGURATION")`
			`print("=" * 80)`
			`print(f"PDF: {pdf_name}")`
			`print(f"Expected CMA: {expected_cma}")`
			`print(f"Expected Institution: {expected_inst}")`
			`print(f"Output: {output_dir}")`
			`print()`

			`# Initialize OCR`
			`print("Initializing PaddleOCR...")`
			`ocr_engine = PaddleOCR(lang='ch')`
			`print("OCR initialized\n")`

			`# Run test`
			`print("=" * 80)`
			`print("RUNNING TEST")`
			`print("=" * 80)`

			`result = process_single_pdf(`
			`pdf_name=pdf_name,`
			`expected_cma=expected_cma,`
			`expected_inst=expected_inst,`
			`pdf_dir=pdf_dir,`
			`output_dir=output_dir,`
			`ocr_engine=ocr_engine,`
			`ocr_model="ppocr_v5",`
			`vl_pipeline=None`
			`)`

			`# Display results`
			`print("\n" + "=" * 80)`
			`print("TEST RESULTS")`
			`print("=" * 80)`
			`print(f"Expected CMA: {expected_cma}")`
			`print(f"Extracted CMA: {result['extracted'].get('cma', 'N/A')}")`
			`print(f"CMA Match: {result['comparison']['cma'].get('match_type', 'UNKNOWN')}")`
			`print(f"CMA Similarity: {result['comparison']['cma'].get('similarity', 0):.1f}%")`
			`print()`
			`print(f"Expected Institution: {expected_inst}")`
			`print(f"Extracted Institution: {result['extracted'].get('institution', 'N/A')}")`
			`print(f"Institution Match: {result['comparison']['institution'].get('match_type', 'UNKNOWN')}")`
			`print(f"Institution Similarity: {result['comparison']['institution'].get('similarity', 0):.1f}%")`
			`print()`

			`# Check result`
			`if result['extracted'].get('cma') == expected_cma:`
			`print("✓ CMA EXTRACTION SUCCESSFUL")`
			`sys.exit(0)`
			`else:`
			`print("✗ CMA EXTRACTION FAILED")`
			`print(f"\nExtracted: {result['extracted'].get('cma')}")`
			`print(f"Expected: {expected_cma}")`
			`print("\nCheck debug output in:", output_dir)`
			`sys.exit(1)`