report-detect/scripts/seal_ocr.py

74 lines
2.3 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
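Seal text recognition using the PaddleOCR SealRecognition pipeline.
This pipeline is used because it handles the curved/arc text found in seals.

Usage: python seal_ocr.py <image_path> [output_path]
Prints one JSON object to stdout; progress messages go to stderr.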
"""
import sys
import json
def _iter_field(obj, name):
    """Return the iterable stored under *name* on *obj*, or ``()`` if absent.

    Pipeline results may expose their fields as attributes or, when the
    result is dict-like, as keys; the attribute is tried first and the key
    second. A missing or ``None`` field yields an empty tuple so callers can
    iterate unconditionally.
    """
    value = getattr(obj, name, None)
    if value is None and isinstance(obj, dict):
        value = obj.get(name)
    return () if value is None else value


def _texts_from(entry):
    """Collect the recognized strings stored directly on one result entry.

    Looks at ``rec_texts`` (empty entries are skipped) and at ``ocr_result``,
    whose items may be dicts carrying a ``'text'`` key or sequences whose
    second element is the text.
    """
    texts = [str(text) for text in _iter_field(entry, "rec_texts") if text]
    for item in _iter_field(entry, "ocr_result"):
        if isinstance(item, dict) and "text" in item:
            texts.append(str(item["text"]))
        elif isinstance(item, (list, tuple)) and len(item) > 1:
            texts.append(str(item[1]))
    return texts


def main():
    """Run seal OCR on the image named on the command line.

    ``sys.argv[1]`` is the image path; the optional ``sys.argv[2]`` is the
    output directory (default ``./seal_output/``), which is created if
    needed and receives the pipeline's saved image/JSON results.

    Exactly one JSON object is printed to stdout:

    * success: ``{"success": true, "output_path", "texts", "combined_text"}``
    * failure: ``{"error", "traceback"}`` (or just ``{"error"}`` for a usage
      error), followed by exit status 1.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Usage: python seal_ocr.py <image_path> [output_path]"}))
        sys.exit(1)

    image_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else "./seal_output/"

    try:
        # Imported inside the try so that an ImportError is reported through
        # the same JSON error path as any other failure.
        from paddleocr import SealRecognition
        import os

        os.makedirs(output_path, exist_ok=True)

        pipeline = SealRecognition(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
        )

        # Progress goes to stderr; stdout is reserved for the JSON result.
        print(f"Processing: {image_path}", file=sys.stderr)

        all_texts = []
        for res in pipeline.predict(image_path):
            res.save_to_img(output_path)
            res.save_to_json(output_path)

            # Texts stored directly on the result object.
            all_texts.extend(_texts_from(res))
            # Dict-style results keep one entry per detected seal under
            # "seal_res_list", each with its own "rec_texts"
            # (per the PaddleOCR seal pipeline result layout).
            for seal_entry in _iter_field(res, "seal_res_list"):
                all_texts.extend(_texts_from(seal_entry))

        result_dict = {
            "success": True,
            "output_path": output_path,
            "texts": all_texts,
            "combined_text": " ".join(all_texts),
        }
        print(json.dumps(result_dict, ensure_ascii=False))
    except Exception as e:
        # Top-level boundary: report every failure as JSON for the caller.
        import traceback
        print(json.dumps({"error": str(e), "traceback": traceback.format_exc()}))
        sys.exit(1)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()