fs-lawrisk/tools/v_semantic_dupes.py

52 lines
1.7 KiB
Python
Raw Permalink Normal View History

from lawrisk.utils.env_loader import load_env
load_env()
import os
import pg8000.dbapi as pg
def get_conn():
host = os.getenv("LIC_PG_HOST", "172.24.240.1")
port = int(os.getenv("LIC_PG_PORT", os.getenv("PG_PORT", "5432")))
user = os.getenv("LIC_PG_USER", os.getenv("PG_USER", "postgres"))
password = os.getenv("LIC_PG_PASSWORD", "")
database = os.getenv("LIC_PG_DATABASE", "licensing_risks")
return pg.connect(host=host, port=port, user=user, password=password, database=database)
def find_semantic_duplicates():
conn = get_conn()
cur = conn.cursor()
# We look for (region, permit, content, serial) that appear more than once,
# even if they have different risk_ids.
sql = """
SELECT
rpr.region_id,
rpr.permit_id,
rk.risk_content,
rpr.serial_number,
COUNT(*),
ARRAY_AGG(rk.id::text) as risk_ids,
ARRAY_AGG(rpr.risk_id::text) as link_ids
FROM region_permit_risks rpr
JOIN risks rk ON rk.id = rpr.risk_id
GROUP BY rpr.region_id, rpr.permit_id, rk.risk_content, rpr.serial_number
HAVING COUNT(*) > 1
"""
cur.execute(sql)
rows = cur.fetchall()
print(f"Found {len(rows)} semantic duplicate groups.")
for row in rows:
rid, pid, content, serial, count, risk_ids, link_ids = row
print(f"Region: {rid}, Permit: {pid}")
print(f"Content: {repr(content[:50])}...")
print(f"Serial: {serial}, Count: {count}")
print(f"Risk IDs to merge: {risk_ids}")
print("-" * 20)
conn.close()
if __name__ == "__main__":
find_semantic_duplicates()