80 lines
2.6 KiB
Python
80 lines
2.6 KiB
Python
|
|
|
||
|
|
import cv2
|
||
|
|
import os
|
||
|
|
import json
|
||
|
|
import difflib
|
||
|
|
from paddleocr import PaddleOCR
|
||
|
|
|
||
|
|
def similarity(s1, s2):
    """Return the ``difflib`` similarity ratio of *s1* and *s2*.

    The value is a float in ``[0.0, 1.0]``; ``1.0`` means the two sequences
    are identical and ``0.0`` means they share no matching elements.
    """
    matcher = difflib.SequenceMatcher(a=s1, b=s2)
    return matcher.ratio()
|
||
|
|
|
||
|
|
def _best_text_from_res(ocr_res, target):
    """Pick the recognized text in *ocr_res* that is most similar to *target*.

    Two page layouts are handled:

    * a mapping-like page exposing ``page['rec_texts']`` (a sequence of strings);
    * a list page whose entries are lists with the text at ``line[1][0]``.

    Pages that are ``None`` are skipped instead of crashing the membership
    test (presumably how some PaddleOCR versions report "no text detected" --
    TODO confirm against the installed version).

    Args:
        ocr_res: Value returned by ``ocr.ocr(...)``; may be empty or ``None``.
        target: Reference string every candidate is compared against.

    Returns:
        ``(text, sim)`` for the best candidate, with spaces stripped from the
        text; ``("", 0.0)`` when no candidate scores above zero.
    """
    best_text, best_sim = "", 0.0
    if not ocr_res:
        return best_text, best_sim

    for page in ocr_res:
        if page is None:
            # Nothing recognized on this page; `'rec_texts' in None` would raise.
            continue
        if 'rec_texts' in page:
            texts = page['rec_texts']
        elif isinstance(page, list):
            texts = [
                line[1][0]
                for line in page
                if isinstance(line, list) and len(line) > 1
            ]
        else:
            continue

        for text in texts:
            clean_text = text.replace(" ", "")
            sim = similarity(target, clean_text)
            if sim > best_sim:
                best_sim = sim
                best_text = clean_text
    return best_text, best_sim


def test_unwarps():
    """OCR every PNG in ``manual_unwarp_v2`` and rank it against the target text.

    Each image is recognized twice: as loaded, and with a 20-pixel white
    border added above and below. The candidate text closest to the target
    string (by ``similarity``) is kept per image. Images with a non-zero best
    similarity are printed and collected; the collection is sorted by
    similarity (highest first) and written to ``unwarp_ocr_results.json``.

    Returns:
        None. If the input directory does not exist, an error is printed and
        the function returns before any OCR model is created or file written.
    """
    target = "威凯检测技术有限公司"
    input_dir = "manual_unwarp_v2"
    if not os.path.exists(input_dir):
        print(f"Error: {input_dir} not found")
        return

    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # Sorted so processing order, and the order of equal-similarity entries
    # in the output, does not depend on the filesystem's listing order.
    files = sorted(f for f in os.listdir(input_dir) if f.endswith(".png"))
    results = []

    for fname in files:
        fpath = os.path.join(input_dir, fname)
        img = cv2.imread(fpath)
        if img is None:
            # cv2.imread signals an unreadable image with None; skip it.
            continue

        # Try OCR on the original strip.
        res = ocr.ocr(img)

        # Also try padding the strip to make it taller:
        # PaddleOCR sometimes struggles with very thin images.
        padded = cv2.copyMakeBorder(
            img, 20, 20, 0, 0, cv2.BORDER_CONSTANT, value=[255, 255, 255]
        )
        res_padded = ocr.ocr(padded)

        text1, sim1 = _best_text_from_res(res, target)
        text2, sim2 = _best_text_from_res(res_padded, target)

        # On a tie the unpadded result wins.
        best_text = text1 if sim1 >= sim2 else text2
        best_sim = max(sim1, sim2)

        if best_sim > 0:
            print(f"File: {fname} | Sim: {best_sim:.4f} | Text: {best_text}")
            results.append({
                "file": fname,
                "sim": best_sim,
                "text": best_text
            })

    results.sort(key=lambda x: x['sim'], reverse=True)
    with open("unwarp_ocr_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
|
||
|
|
|
||
|
|
# Run the OCR comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_unwarps()
|