52 lines
1.7 KiB
Python
52 lines
1.7 KiB
Python
|
|
from lawrisk.utils.env_loader import load_env
|
||
|
|
load_env()
|
||
|
|
|
||
|
|
import os
|
||
|
|
import pg8000.dbapi as pg
|
||
|
|
|
||
|
|
def get_conn():
|
||
|
|
host = os.getenv("LIC_PG_HOST", "172.24.240.1")
|
||
|
|
port = int(os.getenv("LIC_PG_PORT", os.getenv("PG_PORT", "5432")))
|
||
|
|
user = os.getenv("LIC_PG_USER", os.getenv("PG_USER", "postgres"))
|
||
|
|
password = os.getenv("LIC_PG_PASSWORD", "")
|
||
|
|
database = os.getenv("LIC_PG_DATABASE", "licensing_risks")
|
||
|
|
return pg.connect(host=host, port=port, user=user, password=password, database=database)
|
||
|
|
|
||
|
|
def find_semantic_duplicates():
|
||
|
|
conn = get_conn()
|
||
|
|
cur = conn.cursor()
|
||
|
|
|
||
|
|
# We look for (region, permit, content, serial) that appear more than once,
|
||
|
|
# even if they have different risk_ids.
|
||
|
|
sql = """
|
||
|
|
SELECT
|
||
|
|
rpr.region_id,
|
||
|
|
rpr.permit_id,
|
||
|
|
rk.risk_content,
|
||
|
|
rpr.serial_number,
|
||
|
|
COUNT(*),
|
||
|
|
ARRAY_AGG(rk.id::text) as risk_ids,
|
||
|
|
ARRAY_AGG(rpr.risk_id::text) as link_ids
|
||
|
|
FROM region_permit_risks rpr
|
||
|
|
JOIN risks rk ON rk.id = rpr.risk_id
|
||
|
|
GROUP BY rpr.region_id, rpr.permit_id, rk.risk_content, rpr.serial_number
|
||
|
|
HAVING COUNT(*) > 1
|
||
|
|
"""
|
||
|
|
cur.execute(sql)
|
||
|
|
rows = cur.fetchall()
|
||
|
|
|
||
|
|
print(f"Found {len(rows)} semantic duplicate groups.")
|
||
|
|
|
||
|
|
for row in rows:
|
||
|
|
rid, pid, content, serial, count, risk_ids, link_ids = row
|
||
|
|
print(f"Region: {rid}, Permit: {pid}")
|
||
|
|
print(f"Content: {repr(content[:50])}...")
|
||
|
|
print(f"Serial: {serial}, Count: {count}")
|
||
|
|
print(f"Risk IDs to merge: {risk_ids}")
|
||
|
|
print("-" * 20)
|
||
|
|
|
||
|
|
conn.close()
|
||
|
|
|
||
|
|
if __name__ == "__main__":
|
||
|
|
find_semantic_duplicates()
|