"""Run text recognition over a directory of PNG crops and report similarity.

Each recognised string is compared against ``TARGET_TEXT`` with
``difflib.SequenceMatcher`` and the per-file result is printed to stdout.
"""

import difflib
import logging
import os
import sys

import cv2
from paddleocr import TextRecognition

# Only let ERROR-and-above records from the 'ppocr' logger through.
logging.getLogger('ppocr').setLevel(logging.ERROR)

# Reference string every recognised text is scored against.
TARGET_TEXT = "威凯检测技术有限公司"


def similarity(s1, s2):
    """Return the ``SequenceMatcher`` ratio of two strings.

    Args:
        s1: First string (may be empty or ``None``).
        s2: Second string (may be empty or ``None``).

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when either argument is falsy.
    """
    if not s1 or not s2:
        return 0.0
    return difflib.SequenceMatcher(None, s1, s2).ratio()


def test_many(debug_dir, limit=100):
    """Recognise text in PNG files under *debug_dir* and print the results.

    Args:
        debug_dir: Directory that is scanned (non-recursively) for files
            whose name ends with ``.png``.
        limit: Maximum number of files to process, taken from the
            name-sorted listing. ``None`` processes every file.

    Raises:
        OSError: Propagated from ``os.listdir`` when *debug_dir* cannot be
            listed (e.g. it does not exist).
    """
    print(f"Testing OCR on many files in: {debug_dir}")

    # NOTE(review): presumably bypasses a model-source check in paddleocr;
    # confirm whether the library reads this at import time, in which case
    # it would have to be set before the import rather than here.
    os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
    ocr_model = TextRecognition(model_name="PP-OCRv4_server_rec")

    # os.listdir() yields entries in arbitrary order; sort so that the
    # "first N" subset is the same on every run and every platform.
    files = sorted(f for f in os.listdir(debug_dir) if f.endswith(".png"))
    files = files[:limit]

    results = []
    for name in files:
        path = os.path.join(debug_dir, name)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returned no image for this path; skip the file.
            continue

        res = ocr_model.predict(img)
        # Concatenate the 'rec_text' field of every result that has one.
        text = "".join(
            item['rec_text'] for item in (res or ()) if 'rec_text' in item
        )

        sim = similarity(text, TARGET_TEXT)
        if text:
            results.append((name, text, sim))
            print(f"File: {name} | Text: {text} | Sim: {sim:.4f}")

    if not results:
        print("No text found in any of the tested files.")
    else:
        print(f"Found text in {len(results)} files.")


if __name__ == "__main__":
    # An optional first CLI argument overrides the default directory.
    default_dir = "brute_results_v8/debug_candidates"
    test_many(sys.argv[1] if len(sys.argv) > 1 else default_dir)