fs-lawrisk/tools/detect_duplicates.py

54 lines
2.0 KiB
Python
Raw Permalink Normal View History

from lawrisk.utils.env_loader import load_env
load_env()
from lawrisk.services.licensing_repo import _lic_pg_conn
import json
def inspect_risk_tables():
with _lic_pg_conn() as conn:
cur = conn.cursor()
# Check for duplicate combinations of (region_id, permit_id, risk_id)
# This shouldn't happen if there is a primary key, but let's check.
print("\n--- Check 4: Same (Region, Permit, RiskID) ---")
cur.execute("""
SELECT region_id, permit_id, risk_id, COUNT(*)
FROM region_permit_risks
GROUP BY region_id, permit_id, risk_id
HAVING COUNT(*) > 1
""")
dupes4 = cur.fetchall()
print(f"Duplicates (R, P, RiskID): {len(dupes4)}")
# Maybe the duplication is because of multiple imports?
# Let's look at all risks for a specific permit that might have issues.
print("\n--- Listing first 10 permits and their risk counts ---")
cur.execute("""
SELECT p.name, rpd.region_id, COUNT(rpr.risk_id)
FROM region_permit_details rpd
JOIN permits p ON p.id = rpd.permit_id
LEFT JOIN region_permit_risks rpr ON rpr.permit_id = rpd.permit_id AND rpr.region_id = rpd.region_id
GROUP BY p.name, rpd.region_id
ORDER BY COUNT(rpr.risk_id) DESC
LIMIT 10
""")
for row in cur.fetchall():
print(f"Permit: {row[0]}, Region: {row[1]}, Risks: {row[2]}")
# Let's check if there are risks with identical content in the risks table itself.
print("\n--- Check 5: Identical Content in 'risks' table ---")
cur.execute("""
SELECT risk_content, COUNT(*)
FROM risks
GROUP BY risk_content
HAVING COUNT(*) > 1
LIMIT 10
""")
dupes5 = cur.fetchall()
print(f"Identical content in 'risks' table: {len(dupes5)}")
for d in dupes5:
print(f" Count: {d[1]}, Content: {repr(d[0][:100])}...")
if __name__ == "__main__":
inspect_risk_tables()