fs-lawrisk/tools/v_agg_dupes.py

44 lines
1.4 KiB
Python

from lawrisk.utils.env_loader import load_env
load_env()
import os
import pg8000.dbapi as pg
def get_conn():
host = os.getenv("LIC_PG_HOST", "172.24.240.1")
port = int(os.getenv("LIC_PG_PORT", os.getenv("PG_PORT", "5432")))
user = os.getenv("LIC_PG_USER", os.getenv("PG_USER", "postgres"))
password = os.getenv("LIC_PG_PASSWORD", "")
database = os.getenv("LIC_PG_DATABASE", "licensing_risks")
return pg.connect(host=host, port=port, user=user, password=password, database=database)
def detect_any_duplicates():
conn = get_conn()
cur = conn.cursor()
# Let's find all risks per permit and see if any have same content
cur.execute("""
SELECT
rpr.region_id,
r.name as region_name,
rpr.permit_id,
p.name as permit_name,
rk.risk_content,
COUNT(*)
FROM region_permit_risks rpr
JOIN regions r ON r.id = rpr.region_id
JOIN permits p ON p.id = rpr.permit_id
JOIN risks rk ON rk.id = rpr.risk_id
GROUP BY rpr.region_id, r.name, rpr.permit_id, p.name, rk.risk_content
HAVING COUNT(*) > 1
""")
rows = cur.fetchall()
print(f"Total duplicate content groups for same permit: {len(rows)}")
for row in rows:
print(f" [{row[1]}] {row[3]}: {repr(row[4][:50])} (Count: {row[5]})")
conn.close()
if __name__ == "__main__":
detect_any_duplicates()