71 lines
2.6 KiB
Python
71 lines
2.6 KiB
Python
from lawrisk.utils.env_loader import load_env
|
|
load_env()
|
|
|
|
from lawrisk.services.licensing_repo import _lic_pg_conn
|
|
import json
|
|
|
|
def find_duplicates():
|
|
with _lic_pg_conn() as conn:
|
|
cur = conn.cursor()
|
|
|
|
# Find (Region, Permit) pairs that have duplicate risk content
|
|
print("Searching for duplicate content within same permit...")
|
|
cur.execute("""
|
|
SELECT
|
|
r.name as region_name,
|
|
p.name as permit_name,
|
|
rk.risk_content,
|
|
COUNT(*) as occurrences,
|
|
ARRAY_AGG(rk.id::text) as risk_ids,
|
|
ARRAY_AGG(rpr.serial_number) as serials
|
|
FROM region_permit_risks rpr
|
|
JOIN regions r ON r.id = rpr.region_id
|
|
JOIN permits p ON p.id = rpr.permit_id
|
|
JOIN risks rk ON rk.id = rpr.risk_id
|
|
GROUP BY r.name, p.name, rk.risk_content
|
|
HAVING COUNT(*) > 1
|
|
""")
|
|
dupes_content = cur.fetchall()
|
|
|
|
print(f"Number of permits with duplicate content: {len(dupes_content)}")
|
|
for d in dupes_content[:20]:
|
|
print(f"Region: {d[0]}")
|
|
print(f"Permit: {d[1]}")
|
|
print(f"Content (trimmed): {repr(d[2][:50])}...")
|
|
print(f"Count: {d[3]}")
|
|
print(f"Risk IDs: {d[4]}")
|
|
print(f"Serials: {d[5]}")
|
|
print("-" * 40)
|
|
|
|
# Find (Region, Permit) pairs that have duplicate serial numbers
|
|
print("\nSearching for duplicate serial numbers within same permit...")
|
|
cur.execute("""
|
|
SELECT
|
|
r.name as region_name,
|
|
p.name as permit_name,
|
|
rpr.serial_number,
|
|
COUNT(*) as occurrences,
|
|
ARRAY_AGG(rk.id::text) as risk_ids,
|
|
ARRAY_AGG(rk.risk_content) as contents
|
|
FROM region_permit_risks rpr
|
|
JOIN regions r ON r.id = rpr.region_id
|
|
JOIN permits p ON p.id = rpr.permit_id
|
|
JOIN risks rk ON rk.id = rpr.risk_id
|
|
WHERE rpr.serial_number IS NOT NULL AND rpr.serial_number != ''
|
|
GROUP BY r.name, p.name, rpr.serial_number
|
|
HAVING COUNT(*) > 1
|
|
""")
|
|
dupes_serial = cur.fetchall()
|
|
|
|
print(f"Number of permits with duplicate serials: {len(dupes_serial)}")
|
|
for d in dupes_serial[:20]:
|
|
print(f"Region: {d[0]}")
|
|
print(f"Permit: {d[1]}")
|
|
print(f"Serial: {d[2]}")
|
|
print(f"Count: {d[3]}")
|
|
# print(f"Contents: {[c[:30] for c in d[5]]}")
|
|
print("-" * 40)
|
|
|
|
if __name__ == "__main__":
|
|
find_duplicates()
|