# report-detect/v_verify_logic.py

import os
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"

import cv2
import numpy as np
import math
import fitz  # PyMuPDF
import paddlex as px  # Using Paddlex for Layout
from paddleocr import SealTextDetection

# Tool: Standard Polar Unwarp
def polar_unwarp(img, center, radius, start_theta, angular_extent):
    """Flatten a circular arc of ``img`` into a rectangular strip.

    Columns of the strip sweep the angle from ``start_theta`` across
    ``angular_extent`` radians; rows step inward from ``radius`` one pixel
    at a time for ``int(radius * 0.6)`` rows. Every output pixel is read
    from ``img`` with ``cv2.getRectSubPix`` using a 1x1 patch; sample
    positions outside the image are painted white.

    Args:
        img: Source image, indexed as ``img[row, col]``.
            NOTE(review): the 3-channel output buffer suggests a colour
            image is expected -- confirm against callers.
        center: ``(x, y)`` of the circle centre in pixel coordinates.
        radius: Outer radius in pixels (first strip row).
        start_theta: Angle in radians mapped to the first strip column.
        angular_extent: Angle in radians covered by the strip width.

    Returns:
        A ``uint8`` array of shape ``(strip_h, strip_w, 3)``, or ``None``
        when the extent is not positive or the strip would be empty.
    """
    if angular_extent <= 0:
        return None

    strip_w = int(angular_extent * radius)
    strip_h = int(radius * 0.6)
    if strip_w <= 0 or strip_h <= 0:
        return None

    img_h, img_w = img.shape[0], img.shape[1]
    cx, cy = center[0], center[1]
    white = [255, 255, 255]
    strip = np.zeros((strip_h, strip_w, 3), dtype=np.uint8)

    # The sampling direction depends only on the column, so compute each
    # (cos, sin) pair once instead of once per output pixel.
    directions = []
    for col in range(strip_w):
        theta = start_theta + angular_extent * (col / strip_w)
        directions.append((math.cos(theta), math.sin(theta)))

    for row in range(strip_h):
        r = radius - row
        for col, (cos_t, sin_t) in enumerate(directions):
            src_x = cx + r * cos_t
            src_y = cy + r * sin_t
            inside = 0 <= src_x < img_w - 1 and 0 <= src_y < img_h - 1
            if not inside:
                strip[row, col] = white
                continue
            patch = cv2.getRectSubPix(img, (1, 1), (float(src_x), float(src_y)))
            strip[row, col] = patch[0, 0]
    return strip

def calculate_precise_arc(polygons, center):
    """Choose the best text arc around ``center`` from detected polygons.

    The work happens in three stages:

    1. For each polygon, the angle of every vertex around ``center`` is
       computed with ``atan2``. The sorted angles are rotated so they start
       just after their widest gap (angles before the gap are shifted by
       one full turn), then split into runs wherever neighbouring angles
       are more than 15 degrees apart.
    2. All runs are ordered by start angle, and runs whose gap to the
       current one is below 45 degrees are merged.
    3. Each merged arc is scored as ``extent * weight``, where ``weight`` is
       a Gaussian falloff of the angular distance between the arc midpoint
       and ``-pi/2``. The first arc with the highest score wins.

    Args:
        polygons: Iterable of polygons, each an iterable of ``(x, y)``-like
            points.
        center: ``(x, y)`` reference point the angles are measured around.

    Returns:
        ``(start_angle, extent)`` in radians for the winning arc, or
        ``(0.0, 0.0)`` when no polygon contributes any point.
    """
    full_turn = 2*math.pi
    split_gap = math.radians(15)
    join_gap = math.radians(45)

    # Stage 1: per-polygon contiguous angular runs as (start, end) pairs.
    runs = []
    for poly in polygons:
        angles = sorted(
            math.atan2(pt[1] - center[1], pt[0] - center[0]) for pt in poly
        )
        if not angles:
            continue
        last = len(angles) - 1

        # Find the widest gap between neighbours, including the wrap-around
        # gap from the last angle back to the first.
        widest, cut = 0, -1
        for k, angle in enumerate(angles):
            if k == last:
                gap = angles[0] + full_turn - angle
            else:
                gap = angles[k + 1] - angle
            if gap > widest:
                widest, cut = gap, k

        # Rotate the sequence so it begins right after the widest gap.
        if cut == last:
            unwrapped = angles
        else:
            unwrapped = angles[cut + 1:] + [a + full_turn for a in angles[:cut + 1]]

        run_start = previous = unwrapped[0]
        for angle in unwrapped[1:]:
            if angle - previous > split_gap:
                runs.append((run_start, previous))
                run_start = angle
            previous = angle
        runs.append((run_start, previous))

    if not runs:
        return 0.0, 0.0

    # Stage 2: merge runs that sit close together once ordered by start.
    ordered = sorted(runs, key=lambda run: run[0])
    merged = []
    lo, hi = ordered[0]
    for next_lo, next_hi in ordered[1:]:
        if next_lo - hi < join_gap:
            hi = max(hi, next_hi)
        else:
            merged.append((lo, hi))
            lo, hi = next_lo, next_hi
    merged.append((lo, hi))

    # Stage 3: prefer long arcs whose midpoint lies near -pi/2.
    def arc_score(span):
        st, en = span
        extent = en - st
        mid = (st + en) / 2
        off_top = abs(((mid + math.pi/2 + math.pi) % (2*math.pi)) - math.pi)
        weight = math.exp(-0.5 * (off_top / (math.pi/2))**2)
        return extent * weight

    best_start, best_end = max(merged, key=arc_score)
    return best_start, best_end - best_start

def extract_pdf_page(pdf_path, page_num=0):
    """Render one page of a PDF into a BGR image array.

    The page is rasterised with a 2x scaling matrix. A 4-component pixmap
    is first reduced to RGB, and the result is converted from RGB to BGR
    channel order.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Zero-based index of the page to render.

    Returns:
        The rendered page as an array produced by ``cv2.cvtColor``
        (BGR channel order).
    """
    # Open the document as a context manager so the underlying file is
    # closed even when loading or rendering the page raises.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        if pix.n == 4:
            img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
        # cvtColor allocates a new array, so nothing returned here still
        # points into the pixmap buffer once the document is closed.
        return cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

def run_layout_detection(image_path):
    """Run the PP-DocLayout-L layout model on an image and list its regions.

    A model is created with ``px.create_model("PP-DocLayout-L")`` and asked
    to predict on ``image_path`` with a batch size of 1. Every entry found
    under a result's ``'boxes'`` key is printed and collected.

    Args:
        image_path: Path of the image passed to the model's ``predict``.

    Returns:
        A list of dicts with keys ``'label'`` (the box's ``label_name``,
        falling back to ``label`` and then ``'unknown'``), ``'score'``
        (``0.0`` when absent) and ``'box'`` (the box's ``coordinate``
        value, ``None`` when absent).
    """
    print("Initializing Paddlex PP-DocLayout-L...")
    layout_model = px.create_model("PP-DocLayout-L")
    predictions = layout_model.predict(image_path, batch_size=1)

    regions = []
    for page_result in predictions:
        # Each result is read like a dict; a missing 'boxes' key simply
        # contributes no regions.
        for detection in page_result.get('boxes', []):
            region = {
                'label': detection.get('label_name', detection.get('label', 'unknown')),
                'score': detection.get('score', 0.0),
                'box': detection.get('coordinate'),
            }
            print(f"Detected: {region['label']} (Score: {region['score']:.2f}) at {region['box']}")
            regions.append(region)
    return regions

def _collect_seal_polygons(det_output, seal_index):
    """Gather all text polygons from the seal detector's results."""
    polygons = []
    for res in det_output:
        polys = res.get('dt_polys') if isinstance(res, dict) else None
        # 'dt_polys' may arrive as a list or as a numpy array; a plain
        # truth test on a multi-element array raises ValueError, so check
        # for None and emptiness explicitly.
        if polys is None or len(polys) == 0:
            continue
        polygons.extend(polys)
        print(f"  Found {len(polys)} text polygons in seal #{seal_index}")
    return polygons


def _draw_ray(canvas, center, radius, theta, color):
    """Draw a 2px line from ``center`` to the point at ``radius``/``theta``."""
    tip_x = center[0] + radius * math.cos(theta)
    tip_y = center[1] + radius * math.sin(theta)
    cv2.line(canvas, (int(center[0]), int(center[1])), (int(tip_x), int(tip_y)), color, 2)


def _render_report_html(pdf_path, region_count, seal_results):
    """Build the HTML report text for one processed PDF."""
    # Local import: only this helper needs it.
    from html import escape

    # Show the file that was actually processed; escape it because the
    # name is caller-supplied text placed into markup.
    pdf_name = escape(os.path.basename(pdf_path))

    seal_sections = "".join([f'''
        <div style="margin-bottom: 40px; border-bottom: 2px solid #eee; padding-bottom: 20px;">
            <h3>Seal Area #{s['index']}</h3>
            <div style="display: flex; gap: 20px;">
                <div style="background:white; padding:10px; border-radius:4px; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">
                    <p style="margin-top:0;">Detection Overlay</p>
                    <img src="{s['marked']}" style="max-height: 350px;">
                </div>
                <div style="flex-grow:1; background:white; padding:10px; border-radius:4px; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">
                    <p style="margin-top:0;">Unwarped Organization Name</p>
                    {f'<img src="{s["unwarp"]}" style="max-width: 100%; border: 1px solid #ddd;">' if s['unwarp'] else '<p style="color:red;">No text arc found in this crop.</p>'}
                </div>
            </div>
        </div>
        ''' for s in seal_results]) if seal_results else '<p>No seals detected for unwarping.</p>'

    # The charset declaration matches the UTF-8 encoding the file is
    # written with, so non-ASCII file names display correctly.
    return f"""
    <html><head><meta charset="utf-8"></head><body style="font-family: sans-serif; padding: 20px; background: #fdfdfd;">
    <h1>Integrated Workflow: Paddlex Layout Analysis + OCR</h1>
    <div style="background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.05); margin-bottom: 40px;">
        <h3>1. Document Layout Detection (Paddlex PP-DocLayout-L)</h3>
        <p>File: {pdf_name} | Detected Regions: {region_count}</p>
        <img src="doc_layout_viz.png" style="max-width: 100%; border: 1px solid #999;">
    </div>
    <div>
        <h2>2. Refined Seal Extraction & Unwarping</h2>
        {seal_sections}
    </div>
    </body></html>
    """


def process_full_workflow(pdf_path, output_dir="report_viz"):
    """Run layout detection and seal unwarping on a PDF and write a report.

    Steps performed:

    1. Render the first page of ``pdf_path`` (``extract_pdf_page`` default)
       and save it as ``doc_page.png``.
    2. Run ``run_layout_detection`` on that image. Regions scoring above
       0.2 are drawn into ``doc_layout_viz.png`` (red for label ``'seal'``,
       green otherwise).
    3. For every seal region, crop it with 40px padding
       (``seal_crop_{i}.png``), detect text polygons with
       ``SealTextDetection``, pick an arc with ``calculate_precise_arc``,
       and, when an arc exists, write the flattened strip from
       ``polar_unwarp`` as ``seal_unwarp_{i}.png``. The crop with polygons
       and arc boundary lines drawn on it is saved as
       ``seal_marked_{i}.png``.
    4. Write ``index.html`` referencing the images above.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Directory receiving all generated files; created when
            missing.

    Returns:
        None. All results are written into ``output_dir``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    print(f"Rendering PDF {pdf_path} Page 1...")
    page_img = extract_pdf_page(pdf_path)
    doc_path = os.path.join(output_dir, "doc_page.png")
    cv2.imwrite(doc_path, page_img)

    print("Running Layout Detection via Paddlex...")
    all_regions = run_layout_detection(doc_path)

    page_viz = page_img.copy()
    seal_boxes = []
    for reg in all_regions:
        box = reg.get('box')
        label = reg.get('label')
        score = reg.get('score', 0.0)

        # Seals are matched by label name.
        is_seal = (label == 'seal')

        # Low threshold for debugging. Regions without coordinates cannot
        # be drawn or cropped, so they are skipped as well.
        if box is None or not (score > 0.2):
            continue

        x1, y1, x2, y2 = [int(v) for v in box]
        color = (0, 0, 255) if is_seal else (0, 255, 0)
        cv2.rectangle(page_viz, (x1, y1), (x2, y2), color, 2)
        cv2.putText(page_viz, f"{label} {score:.2f}", (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 1)

        if is_seal:
            seal_boxes.append(box)

    cv2.imwrite(os.path.join(output_dir, "doc_layout_viz.png"), page_viz)

    seal_results = []
    print(f"Processing {len(seal_boxes)} detected seals...")
    det_model = SealTextDetection(model_name="PP-OCRv4_server_seal_det")

    for i, box in enumerate(seal_boxes):
        x1, y1, x2, y2 = [int(v) for v in box]
        pad = 40
        # Clamp the padded crop window to the page bounds.
        y1_p, y2_p = max(0, y1-pad), min(page_img.shape[0], y2+pad)
        x1_p, x2_p = max(0, x1-pad), min(page_img.shape[1], x2+pad)
        seal_crop = page_img[y1_p:y2_p, x1_p:x2_p]
        crop_path = os.path.join(output_dir, f"seal_crop_{i}.png")
        cv2.imwrite(crop_path, seal_crop)

        print(f"Refining Seal #{i}...")
        output = det_model.predict(crop_path, batch_size=1)
        all_polygons = _collect_seal_polygons(output, i)

        # The seal is taken to be centred in the crop, with a radius
        # slightly inside the shorter crop side.
        ch, cw = seal_crop.shape[:2]
        center = [cw // 2, ch // 2]
        radius = min(cw, ch) // 2 - 10

        start_theta, extent = calculate_precise_arc(all_polygons, center)
        marked = seal_crop.copy()
        for p in all_polygons:
            cv2.polylines(marked, [np.array(p, dtype=np.int32)], True, (0, 255, 0), 2)

        unwarp_name = f"seal_unwarp_{i}.png"
        unwarp_path = os.path.join(output_dir, unwarp_name)
        unwarp = None
        if extent > 0:
            unwarp = polar_unwarp(seal_crop, center, radius, start_theta, extent)
            if unwarp is not None:
                cv2.imwrite(unwarp_path, unwarp)
                # Mark where the unwarped arc begins and ends.
                _draw_ray(marked, center, radius, start_theta, (255, 0, 0))
                _draw_ray(marked, center, radius, start_theta + extent, (0, 0, 255))

        marked_name = f"seal_marked_{i}.png"
        cv2.imwrite(os.path.join(output_dir, marked_name), marked)
        seal_results.append({
            'index': i,
            'crop': f"seal_crop_{i}.png",
            'marked': marked_name,
            'unwarp': unwarp_name if unwarp is not None else None,
        })

    report_html = _render_report_html(pdf_path, len(all_regions), seal_results)
    with open(os.path.join(output_dir, "index.html"), "w", encoding="utf-8") as f:
        f.write(report_html)
    print(f"Workflow Complete: {output_dir}/index.html")

if __name__ == "__main__":
    import sys

    # Sample document used when no PDF path is given on the command line.
    default_pdf_path = r"src/test/resources/data/pdfs/关于中检测试技术（广东）集团有限公司检验检测资质的调查取证函（局长件）_pages11-14.pdf"
    # An optional first argument overrides the sample document.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
    process_full_workflow(pdf_path)