50 lines
1.3 KiB
Python
50 lines
1.3 KiB
Python
|
|
import cv2
|
|
import os
|
|
import sys
|
|
from paddleocr import TextRecognition
|
|
import logging
|
|
import difflib
|
|
|
|
# Raise the 'ppocr' logger threshold to ERROR so lower-severity messages from
# that logger do not clutter the per-file output printed by this script.
logging.getLogger('ppocr').setLevel(logging.ERROR)
|
|
|
|
# Reference string that each recognized text is scored against with
# similarity() in test_many().
TARGET_TEXT = "威凯检测技术有限公司"
|
|
|
|
def similarity(s1, s2):
    """Return the difflib similarity ratio of two strings.

    The result is ``difflib.SequenceMatcher(None, s1, s2).ratio()``, a float
    between 0.0 and 1.0. When either argument is falsy (empty string or
    ``None``) the comparison is skipped and 0.0 is returned.
    """
    if s1 and s2:
        matcher = difflib.SequenceMatcher(None, s1, s2)
        return matcher.ratio()
    # At least one side has nothing to compare.
    return 0.0
|
|
|
|
def test_many(debug_dir, limit=100):
    """Run text recognition over the PNG files in a directory and report hits.

    Each ``.png`` file in *debug_dir* (at most *limit* of them, taken in
    sorted filename order) is loaded with OpenCV and passed to a PaddleOCR
    ``TextRecognition`` model. For every file that yields non-empty text, the
    text is scored against ``TARGET_TEXT`` with ``similarity()`` and one line
    is printed. A summary line is printed at the end.

    Args:
        debug_dir: Directory containing the candidate images.
        limit: Maximum number of files to test. Defaults to 100; ``None``
            tests every matching file.

    Returns:
        A list of ``(filename, text, similarity)`` tuples, one per file in
        which text was recognized.

    Raises:
        OSError: If *debug_dir* cannot be listed.
    """
    print(f"Testing OCR on many files in: {debug_dir}")

    # NOTE(review): this is set after paddleocr has already been imported at
    # the top of the file; if the library reads the variable at import time
    # it has no effect here -- confirm.
    os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
    ocr_model = TextRecognition(model_name="PP-OCRv4_server_rec")

    # os.listdir() returns names in arbitrary order; sort so the subset
    # selected by `limit` is the same on every run.
    files = sorted(f for f in os.listdir(debug_dir) if f.endswith(".png"))
    files = files[:limit]

    results = []
    for f in files:
        img = cv2.imread(os.path.join(debug_dir, f))
        if img is None:
            # OpenCV could not decode this file; skip it.
            continue

        res = ocr_model.predict(img)
        # Concatenate the 'rec_text' field of every returned item.
        text = "".join(
            item['rec_text'] for item in (res or []) if 'rec_text' in item
        )
        if not text:
            continue

        sim = similarity(text, TARGET_TEXT)
        results.append((f, text, sim))
        print(f"File: {f} | Text: {text} | Sim: {sim:.4f}")

    if not results:
        print("No text found in any of the tested files.")
    else:
        print(f"Found text in {len(results)} files.")

    return results
|
|
|
|
# Script entry point: run the batch OCR check on the default candidates folder.
if __name__ == "__main__":
    test_many(debug_dir="brute_results_v8/debug_candidates")
|