report-detect/scripts/verify_ocr_logic.py

50 lines
1.3 KiB
Python
Raw Normal View History

2026-02-05 13:57:22 +08:00
import cv2
import os
import sys
from paddleocr import TextRecognition
import logging
import difflib
# Raise the 'ppocr' logger threshold to ERROR so lower-severity messages
# from that logger do not clutter the per-file output printed below.
logging.getLogger('ppocr').setLevel(logging.ERROR)
# Reference string that recognized text is scored against via similarity().
# (Chinese text; it appears to be a company name.)
TARGET_TEXT = "威凯检测技术有限公司"
def similarity(s1, s2):
    """Return a similarity ratio between two strings.

    The score is ``difflib.SequenceMatcher(...).ratio()``, i.e. ``2 * M / T``
    where ``M`` is the number of matched characters and ``T`` is the combined
    length of both strings. 1.0 means identical, 0.0 means nothing in common.

    Args:
        s1: First string (may be empty or ``None``).
        s2: Second string (may be empty or ``None``).

    Returns:
        A float in ``[0.0, 1.0]``. If either argument is empty or ``None`` the
        result is 0.0 without invoking the matcher.
    """
    if not s1 or not s2:
        return 0.0
    # autojunk=False: with the default heuristic, once the second sequence has
    # 200+ items every character occurring in more than ~1% of it is dropped
    # from the match index, which can drive the ratio of two nearly identical
    # long strings down to 0.0.
    return difflib.SequenceMatcher(None, s1, s2, autojunk=False).ratio()
def test_many(debug_dir, limit=100):
    """Run text recognition over PNG files in ``debug_dir`` and print results.

    For each selected image the recognized text pieces (``rec_text`` entries
    of the prediction result) are concatenated and scored against
    ``TARGET_TEXT`` with ``similarity()``. One line is printed per file that
    produced non-empty text, followed by a summary line.

    Args:
        debug_dir: Directory containing the ``.png`` images to test.
        limit: Maximum number of files to process (default 100, matching the
            previous hard-coded cap). Pass ``None`` to process every file.

    Raises:
        OSError: If ``debug_dir`` cannot be listed (e.g. it does not exist).
    """
    print(f"Testing OCR on many files in: {debug_dir}")

    # List the directory before loading the model so a bad path fails fast.
    # os.listdir() returns entries in arbitrary order, so sort them to make
    # the "first N" subset reproducible between runs and machines.
    files = sorted(f for f in os.listdir(debug_dir) if f.endswith(".png"))
    files = files[:limit]

    # NOTE(review): presumably tells PaddleOCR to skip its model-source
    # connectivity check -- confirm against the installed paddleocr version.
    os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
    ocr_model = TextRecognition(model_name="PP-OCRv4_server_rec")

    results = []
    for f in files:
        img = cv2.imread(os.path.join(debug_dir, f))
        if img is None:
            # The image could not be loaded; skip it.
            continue

        res = ocr_model.predict(img)
        # Concatenate every recognized piece; an empty/None result yields "".
        text = "".join(
            item['rec_text'] for item in (res or ()) if 'rec_text' in item
        )
        if not text:
            continue

        sim = similarity(text, TARGET_TEXT)
        results.append((f, text, sim))
        print(f"File: {f} | Text: {text} | Sim: {sim:.4f}")

    if not results:
        print("No text found in any of the tested files.")
    else:
        print(f"Found text in {len(results)} files.")
if __name__ == "__main__":
    # The candidate directory may be given as the first command-line
    # argument; without one, fall back to the original default path
    # (relative to the current working directory).
    test_many(
        sys.argv[1] if len(sys.argv) > 1
        else "brute_results_v8/debug_candidates"
    )