fs-lawrisk/tools/report_duplicates.py

71 lines
2.6 KiB
Python
Raw Permalink Normal View History

from lawrisk.utils.env_loader import load_env
load_env()
from lawrisk.services.licensing_repo import _lic_pg_conn
import json
def find_duplicates():
with _lic_pg_conn() as conn:
cur = conn.cursor()
# Find (Region, Permit) pairs that have duplicate risk content
print("Searching for duplicate content within same permit...")
cur.execute("""
SELECT
r.name as region_name,
p.name as permit_name,
rk.risk_content,
COUNT(*) as occurrences,
ARRAY_AGG(rk.id::text) as risk_ids,
ARRAY_AGG(rpr.serial_number) as serials
FROM region_permit_risks rpr
JOIN regions r ON r.id = rpr.region_id
JOIN permits p ON p.id = rpr.permit_id
JOIN risks rk ON rk.id = rpr.risk_id
GROUP BY r.name, p.name, rk.risk_content
HAVING COUNT(*) > 1
""")
dupes_content = cur.fetchall()
print(f"Number of permits with duplicate content: {len(dupes_content)}")
for d in dupes_content[:20]:
print(f"Region: {d[0]}")
print(f"Permit: {d[1]}")
print(f"Content (trimmed): {repr(d[2][:50])}...")
print(f"Count: {d[3]}")
print(f"Risk IDs: {d[4]}")
print(f"Serials: {d[5]}")
print("-" * 40)
# Find (Region, Permit) pairs that have duplicate serial numbers
print("\nSearching for duplicate serial numbers within same permit...")
cur.execute("""
SELECT
r.name as region_name,
p.name as permit_name,
rpr.serial_number,
COUNT(*) as occurrences,
ARRAY_AGG(rk.id::text) as risk_ids,
ARRAY_AGG(rk.risk_content) as contents
FROM region_permit_risks rpr
JOIN regions r ON r.id = rpr.region_id
JOIN permits p ON p.id = rpr.permit_id
JOIN risks rk ON rk.id = rpr.risk_id
WHERE rpr.serial_number IS NOT NULL AND rpr.serial_number != ''
GROUP BY r.name, p.name, rpr.serial_number
HAVING COUNT(*) > 1
""")
dupes_serial = cur.fetchall()
print(f"Number of permits with duplicate serials: {len(dupes_serial)}")
for d in dupes_serial[:20]:
print(f"Region: {d[0]}")
print(f"Permit: {d[1]}")
print(f"Serial: {d[2]}")
print(f"Count: {d[3]}")
# print(f"Contents: {[c[:30] for c in d[5]]}")
print("-" * 40)
if __name__ == "__main__":
find_duplicates()