from lawrisk.utils.env_loader import load_env load_env() import os import pg8000.dbapi as pg def get_conn(): host = os.getenv("LIC_PG_HOST", "172.24.240.1") port = int(os.getenv("LIC_PG_PORT", os.getenv("PG_PORT", "5432"))) user = os.getenv("LIC_PG_USER", os.getenv("PG_USER", "postgres")) password = os.getenv("LIC_PG_PASSWORD", "") database = os.getenv("LIC_PG_DATABASE", "licensing_risks") return pg.connect(host=host, port=port, user=user, password=password, database=database) def find_semantic_duplicates(): conn = get_conn() cur = conn.cursor() # We look for (region, permit, content, serial) that appear more than once, # even if they have different risk_ids. sql = """ SELECT rpr.region_id, rpr.permit_id, rk.risk_content, rpr.serial_number, COUNT(*), ARRAY_AGG(rk.id::text) as risk_ids, ARRAY_AGG(rpr.risk_id::text) as link_ids FROM region_permit_risks rpr JOIN risks rk ON rk.id = rpr.risk_id GROUP BY rpr.region_id, rpr.permit_id, rk.risk_content, rpr.serial_number HAVING COUNT(*) > 1 """ cur.execute(sql) rows = cur.fetchall() print(f"Found {len(rows)} semantic duplicate groups.") for row in rows: rid, pid, content, serial, count, risk_ids, link_ids = row print(f"Region: {rid}, Permit: {pid}") print(f"Content: {repr(content[:50])}...") print(f"Serial: {serial}, Count: {count}") print(f"Risk IDs to merge: {risk_ids}") print("-" * 20) conn.close() if __name__ == "__main__": find_semantic_duplicates()