"""Run PaddleOCR over unwarped image strips and rank them against a target text.

Each ``.png`` in the input directory is recognized twice (as-is and with
white padding added above and below); the recognized line most similar to the
target string is kept and all non-zero matches are written to a JSON report.
"""
import difflib
import json
import os

import cv2
from paddleocr import PaddleOCR


def similarity(s1, s2):
    """Return the ``difflib.SequenceMatcher`` ratio between *s1* and *s2*."""
    return difflib.SequenceMatcher(None, s1, s2).ratio()


def _iter_recognized_texts(ocr_res):
    """Yield every recognized text string found in a PaddleOCR result.

    Two result layouts are handled, mirroring what the script has always
    accepted:

    * list pages whose entries look like ``[box, (text, score)]``;
    * mapping-style pages exposing a ``'rec_texts'`` list.

    Pages that are ``None`` are skipped. NOTE(review): list-style PaddleOCR
    results appear to use ``None`` for an image with no detections; confirm
    against the installed PaddleOCR version.
    """
    for page in ocr_res:
        if page is None:
            continue
        if isinstance(page, list):
            for line in page:
                if not (isinstance(line, list) and len(line) > 1):
                    continue
                rec = line[1]
                # Only accept entries shaped like (text, score, ...).
                if isinstance(rec, (list, tuple)) and rec and isinstance(rec[0], str):
                    yield rec[0]
        elif 'rec_texts' in page:
            yield from page['rec_texts']


def _best_match(ocr_res, target):
    """Return ``(text, similarity)`` of the recognized line closest to *target*.

    ASCII spaces are stripped from each recognized line before comparison.
    Returns ``("", 0.0)`` when the result is empty or nothing scores above 0;
    on ties the first line encountered wins.
    """
    if not ocr_res:
        return "", 0.0

    best_text, best_sim = "", 0.0
    for text in _iter_recognized_texts(ocr_res):
        cleaned = text.replace(" ", "")
        sim = similarity(target, cleaned)
        if sim > best_sim:
            best_text, best_sim = cleaned, sim
    return best_text, best_sim


def test_unwarps():
    """OCR every PNG in ``manual_unwarp_v2`` and save the ranked matches.

    Prints one line per file with a non-zero similarity and writes the list,
    sorted by descending similarity, to ``unwarp_ocr_results.json``. Prints an
    error and returns early when the input directory is missing.
    """
    target = "威凯检测技术有限公司"
    input_dir = "manual_unwarp_v2"

    # isdir (not exists) so a stray file with that name cannot reach listdir.
    if not os.path.isdir(input_dir):
        print(f"Error: {input_dir} not found")
        return

    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # Sorted so the processing order (and tie order in the report) is stable.
    files = sorted(f for f in os.listdir(input_dir) if f.endswith(".png"))
    results = []

    for fname in files:
        fpath = os.path.join(input_dir, fname)
        img = cv2.imread(fpath)
        if img is None:
            # Unreadable or non-image file: skip it.
            continue

        # Try OCR on the original strip.
        res = ocr.ocr(img)

        # Also try padding the strip to make it taller;
        # PaddleOCR sometimes struggles with very thin images.
        padded = cv2.copyMakeBorder(
            img, 20, 20, 0, 0, cv2.BORDER_CONSTANT, value=[255, 255, 255]
        )
        res_padded = ocr.ocr(padded)

        text1, sim1 = _best_match(res, target)
        text2, sim2 = _best_match(res_padded, target)

        # Prefer the unpadded result when both variants score the same.
        best_text = text1 if sim1 >= sim2 else text2
        best_sim = max(sim1, sim2)

        if best_sim > 0:
            print(f"File: {fname} | Sim: {best_sim:.4f} | Text: {best_text}")
            results.append({
                "file": fname,
                "sim": best_sim,
                "text": best_text
            })

    results.sort(key=lambda x: x['sim'], reverse=True)

    with open("unwarp_ocr_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    test_unwarps()