from lawrisk.utils.env_loader import load_env load_env() from lawrisk.services.licensing_repo import _lic_pg_conn import json def find_duplicates(): with _lic_pg_conn() as conn: cur = conn.cursor() # Find (Region, Permit) pairs that have duplicate risk content print("Searching for duplicate content within same permit...") cur.execute(""" SELECT r.name as region_name, p.name as permit_name, rk.risk_content, COUNT(*) as occurrences, ARRAY_AGG(rk.id::text) as risk_ids, ARRAY_AGG(rpr.serial_number) as serials FROM region_permit_risks rpr JOIN regions r ON r.id = rpr.region_id JOIN permits p ON p.id = rpr.permit_id JOIN risks rk ON rk.id = rpr.risk_id GROUP BY r.name, p.name, rk.risk_content HAVING COUNT(*) > 1 """) dupes_content = cur.fetchall() print(f"Number of permits with duplicate content: {len(dupes_content)}") for d in dupes_content[:20]: print(f"Region: {d[0]}") print(f"Permit: {d[1]}") print(f"Content (trimmed): {repr(d[2][:50])}...") print(f"Count: {d[3]}") print(f"Risk IDs: {d[4]}") print(f"Serials: {d[5]}") print("-" * 40) # Find (Region, Permit) pairs that have duplicate serial numbers print("\nSearching for duplicate serial numbers within same permit...") cur.execute(""" SELECT r.name as region_name, p.name as permit_name, rpr.serial_number, COUNT(*) as occurrences, ARRAY_AGG(rk.id::text) as risk_ids, ARRAY_AGG(rk.risk_content) as contents FROM region_permit_risks rpr JOIN regions r ON r.id = rpr.region_id JOIN permits p ON p.id = rpr.permit_id JOIN risks rk ON rk.id = rpr.risk_id WHERE rpr.serial_number IS NOT NULL AND rpr.serial_number != '' GROUP BY r.name, p.name, rpr.serial_number HAVING COUNT(*) > 1 """) dupes_serial = cur.fetchall() print(f"Number of permits with duplicate serials: {len(dupes_serial)}") for d in dupes_serial[:20]: print(f"Region: {d[0]}") print(f"Permit: {d[1]}") print(f"Serial: {d[2]}") print(f"Count: {d[3]}") # print(f"Contents: {[c[:30] for c in d[5]]}") print("-" * 40) if __name__ == "__main__": find_duplicates()