report-detect/scripts/seal_text_extract.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Seal Text Extraction Pipeline
1. Use LayoutDetection to find seal regions
2. Crop seal region from image
3. Use SealTextDetection to find text areas in seal
4. Use TextRecognition to extract text from detected areas
"""
import sys
import json
import os
from PIL import Image
import numpy as np

def _collect_seal_boxes(layout_output):
    """Return the coordinate and score of every layout box labelled "seal".

    Args:
        layout_output: Iterable of layout results. Each result is indexed
            with ``"boxes"`` and each box with ``"label"``, ``"coordinate"``
            and ``"score"``.

    Returns:
        A list of ``{"coordinate": ..., "score": ...}`` dicts in detection
        order; empty when no box is labelled ``"seal"``.
    """
    seal_boxes = []
    for res in layout_output:
        for box in res["boxes"]:
            if box["label"] == "seal":
                seal_boxes.append({
                    "coordinate": box["coordinate"],
                    "score": box["score"],
                })
    return seal_boxes


def _clamped_bbox(poly, width, height):
    """Return the integer bounding box of *poly* clipped to an image size.

    Args:
        poly: NumPy array of ``(x, y)`` points.
        width: Width of the image the polygon refers to.
        height: Height of the image the polygon refers to.

    Returns:
        ``(left, top, right, bottom)``, or ``None`` when the polygon has no
        points or its clipped bounding box has no area.
    """
    if poly.ndim != 2 or poly.shape[0] == 0 or poly.shape[1] < 2:
        return None

    min_x, min_y = poly[:, :2].min(axis=0)
    max_x, max_y = poly[:, :2].max(axis=0)

    left = max(0, int(min_x))
    top = max(0, int(min_y))
    right = min(width, int(max_x))
    bottom = min(height, int(max_y))

    if right <= left or bottom <= top:
        return None
    return left, top, right, bottom


def _recognize_seal_texts(seal_idx, seal_crop, origin, seal_det_model,
                          text_rec_model, draw, tmp_dir):
    """Detect and recognize the text regions of one cropped seal.

    Args:
        seal_idx: Index of the seal, used in progress messages and temp
            file names.
        seal_crop: PIL image holding the (padded) seal region.
        origin: ``(x, y)`` of the crop's top-left corner in the full image,
            used to translate polygons back for drawing.
        seal_det_model: Model whose ``predict`` yields results indexed with
            ``"dt_polys"``.
        text_rec_model: Model whose ``predict`` yields results exposing
            ``rec_text`` / ``rec_score`` through ``.get``.
        draw: ``ImageDraw`` handle of the visualization canvas.
        tmp_dir: Directory in which intermediate crops are written.

    Returns:
        A list of ``{"text", "score", "poly"}`` dicts; ``poly`` is expressed
        in seal-crop coordinates.
    """
    offset_x, offset_y = origin

    seal_crop_path = os.path.join(tmp_dir, f"temp_seal_{seal_idx}.png")
    seal_crop.save(seal_crop_path)

    print(f"Step 2: Detecting text in seal {seal_idx}...", file=sys.stderr)

    # Step 3: Detect text areas in seal
    seal_det_output = seal_det_model.predict(seal_crop_path, batch_size=1)

    seal_texts = []
    for det_res in seal_det_output:
        dt_polys = det_res["dt_polys"]
        print(f"Found {len(dt_polys)} text region(s) in seal", file=sys.stderr)

        # Step 4: For each detected text region, crop and recognize
        for poly_idx, raw_poly in enumerate(dt_polys):
            poly = np.asarray(raw_poly)

            # An empty or zero-area region yields nothing to recognize, so
            # skip it instead of handing an empty crop to the recognizer.
            bbox = _clamped_bbox(poly, seal_crop.width, seal_crop.height)
            if bbox is None:
                continue

            text_crop_path = os.path.join(
                tmp_dir, f"temp_text_{seal_idx}_{poly_idx}.png"
            )
            seal_crop.crop(bbox).save(text_crop_path)

            rec_output = text_rec_model.predict(text_crop_path, batch_size=1)

            # Polygon translated back into full-image coordinates.
            abs_poly = [
                (int(p[0] + offset_x), int(p[1] + offset_y)) for p in poly
            ]

            for rec_res in rec_output:
                text = rec_res.get("rec_text", "")
                score = rec_res.get("rec_score", 0)

                # Convert numpy types to Python native types
                if hasattr(score, "item"):
                    score = score.item()

                seal_texts.append({
                    "text": str(text),
                    "score": float(score),
                    "poly": [[float(p[0]), float(p[1])] for p in poly],
                })

                draw.polygon(abs_poly, outline=(255, 0, 255), width=2)

    return seal_texts


def main():
    """Run the seal text extraction pipeline from the command line.

    Reads ``<image_path>`` and an optional ``[output_path]`` (default
    ``seal_text_output.png``) from ``sys.argv``. Progress messages go to
    stderr; exactly one JSON document is printed to stdout:

    * success: ``{"success": True, "output_path", "seals", "combined_text"}``
    * failure: ``{"success": False, "error", ...}`` (a ``"traceback"`` key is
      added for unexpected exceptions)

    Exits with status 1 on a usage error or an unexpected exception and with
    status 0 when the image simply contains no seal.
    """
    if len(sys.argv) < 2:
        print(json.dumps({
            "success": False,
            "error": "Usage: python seal_text_extract.py <image_path> [output_path]",
        }))
        sys.exit(1)

    image_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else "seal_text_output.png"

    try:
        # Imported lazily so that a missing dependency is reported through
        # the JSON error channel below instead of an import-time crash.
        import tempfile

        from paddleocr import LayoutDetection, SealTextDetection, TextRecognition
        from PIL import ImageDraw

        # Step 1: Detect layout and find seal regions
        print("Step 1: Detecting layout...", file=sys.stderr)
        layout_model = LayoutDetection(model_name="PP-DocLayout_plus-L")
        layout_output = layout_model.predict(image_path, batch_size=1, layout_nms=True)

        seal_boxes = _collect_seal_boxes(layout_output)
        if not seal_boxes:
            print(json.dumps({"success": False, "error": "No seal detected in image"}))
            sys.exit(0)

        print(f"Found {len(seal_boxes)} seal(s)", file=sys.stderr)

        # Step 2: Initialize seal text detection and text recognition
        seal_det_model = SealTextDetection(model_name="PP-OCRv4_server_seal_det")
        text_rec_model = TextRecognition(model_name="PP-OCRv4_server_rec")

        # Keep the pristine pixels for cropping and draw on a separate RGB
        # copy: annotations of one seal must not leak into the crop of the
        # next, and RGB outline colours cannot be drawn on a grayscale image.
        with Image.open(image_path) as opened:
            source_img = opened.copy()
        canvas = source_img.convert("RGB")
        draw = ImageDraw.Draw(canvas)

        padding = 10
        all_results = []

        # Intermediate crops live in a private directory that is removed even
        # when a model call raises, and cannot clash with files in the CWD.
        with tempfile.TemporaryDirectory(prefix="seal_text_") as tmp_dir:
            for idx, seal in enumerate(seal_boxes):
                x1, y1, x2, y2 = seal["coordinate"]

                # Crop seal region with padding, clamped to the image bounds
                crop_x1 = max(0, int(x1) - padding)
                crop_y1 = max(0, int(y1) - padding)
                crop_x2 = min(source_img.width, int(x2) + padding)
                crop_y2 = min(source_img.height, int(y2) + padding)
                seal_crop = source_img.crop((crop_x1, crop_y1, crop_x2, crop_y2))

                seal_texts = _recognize_seal_texts(
                    idx, seal_crop, (crop_x1, crop_y1),
                    seal_det_model, text_rec_model, draw, tmp_dir,
                )

                all_results.append({
                    "seal_box": [float(c) for c in seal["coordinate"]],
                    "seal_score": float(seal["score"]),
                    "texts": seal_texts,
                })

                # Draw seal box
                draw.rectangle(
                    [float(x1), float(y1), float(x2), float(y2)],
                    outline=(255, 0, 255),
                    width=3,
                )

        # Save visualization
        canvas.save(output_path)

        # Combine all extracted texts
        combined_texts = [
            text_item["text"]
            for result in all_results
            for text_item in result["texts"]
        ]

        print(json.dumps({
            "success": True,
            "output_path": output_path,
            "seals": all_results,
            "combined_text": " ".join(combined_texts),
        }, ensure_ascii=False))

    except Exception as e:
        # Top-level boundary: report any failure as JSON so callers that
        # parse stdout always receive a document with a "success" flag.
        import traceback
        print(json.dumps({
            "success": False,
            "error": str(e),
            "traceback": traceback.format_exc(),
        }))
        sys.exit(1)

# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()