"""Report duplicate risks attached to the same (region, permit) pair.

Reads every row of ``region_permit_risks`` (joined with its region, permit and
risk), groups the rows by ``(region_id, permit_id)`` and flags, within each
group, risks that share the same stripped content or the same serial number.
The findings are written to ``risk_duplicates_report.txt`` and a short summary
is printed.
"""
from lawrisk.utils.env_loader import load_env

# The environment is loaded before the repository module is imported.
# NOTE(review): presumably licensing_repo reads its DB settings from the
# environment at import/connect time -- confirm before reordering.
load_env()

from lawrisk.services.licensing_repo import _lic_pg_conn  # noqa: E402
import json  # noqa: E402,F401  (unused in this script; kept as a file-level import)

_REPORT_PATH = "risk_duplicates_report.txt"

# Column order matters: rows are unpacked positionally in
# _group_by_region_permit().  The ORDER BY makes the report (and which risk is
# treated as the "first seen" one) reproducible between runs.
_RISK_LINKS_SQL = """
    SELECT
        r.name AS region_name,
        p.name AS permit_name,
        rk.risk_content,
        rpr.serial_number,
        rpr.risk_id,
        rpr.region_id,
        rpr.permit_id
    FROM region_permit_risks rpr
    JOIN regions r ON r.id = rpr.region_id
    JOIN permits p ON p.id = rpr.permit_id
    JOIN risks rk ON rk.id = rpr.risk_id
    ORDER BY r.name, rpr.region_id, p.name, rpr.permit_id,
             rpr.serial_number, rpr.risk_id
"""


def _fetch_risk_rows():
    """Run the join query and return all rows, releasing the cursor afterwards."""
    with _lic_pg_conn() as conn:
        cur = conn.cursor()
        try:
            cur.execute(_RISK_LINKS_SQL)
            return cur.fetchall()
        finally:
            # The cursor is closed even if the query fails.
            cur.close()


def _group_by_region_permit(rows):
    """Group rows by ``(region_id, permit_id)``.

    Returns a dict mapping that key to
    ``{"region": name, "permit": name, "risks": [...]}`` where each risk is a
    dict with ``content`` (stripped, ``""`` when missing), ``serial`` and
    ``risk_id``.  Names are taken from the first row seen for each key.
    """
    groups = {}
    for (region_name, permit_name, content, serial,
         risk_id, region_id, permit_id) in rows:
        group = groups.setdefault(
            (region_id, permit_id),
            {"region": region_name, "permit": permit_name, "risks": []},
        )
        group["risks"].append({
            "content": content.strip() if content else "",
            "serial": serial,
            "risk_id": risk_id,
        })
    return groups


def _find_duplicates(risks):
    """Return the duplicate messages for the risks of one (region, permit).

    A risk is reported when its content or its serial number was already seen
    on an earlier risk of the same group; the message names both risk ids.
    """
    first_by_content = {}
    first_by_serial = {}
    messages = []
    for risk in risks:
        content = risk["content"]
        serial = risk["serial"]
        risk_id = risk["risk_id"]

        # Content check.  Empty content is compared like any other value, so
        # several risks without content are reported as duplicates of each
        # other (same as the original behaviour).
        if content in first_by_content:
            messages.append(
                f"DUPE_CONTENT: {repr(content[:50])} "
                f"(IDs: {risk_id} vs {first_by_content[content]})"
            )
        else:
            first_by_content[content] = risk_id

        # Serial check.  Only a missing serial (None / empty string) is
        # skipped; a legitimate serial of 0 must still be checked, which a
        # plain truthiness test would silently ignore.
        if serial is None or serial == "":
            continue
        if serial in first_by_serial:
            messages.append(
                f"DUPE_SERIAL: {serial} "
                f"(IDs: {risk_id} vs {first_by_serial[serial]})"
            )
        else:
            first_by_serial[serial] = risk_id
    return messages


def get_duplicates_report():
    """Detect duplicate risks per (region, permit) and write a text report.

    Prints how many (region, permit) groups were checked and how many contain
    duplicates, then writes the detailed findings to
    ``risk_duplicates_report.txt`` (UTF-8).  Returns ``None``.
    """
    # The DB connection is released before any analysis or file output.
    groups = _group_by_region_permit(_fetch_risk_rows())

    report = []
    dupes_count = 0
    for data in groups.values():
        messages = _find_duplicates(data["risks"])
        if not messages:
            continue
        dupes_count += 1
        report.append(f"--- {data['region']} | {data['permit']} ---")
        report.extend(f" {msg}" for msg in messages)

    print(f"Checked {len(groups)} permits.")
    print(f"Found {dupes_count} permits with duplicates.")

    with open(_REPORT_PATH, "w", encoding="utf-8") as f:
        f.write("\n".join(report))
    print("Detailed report written to risk_duplicates_report.txt")


if __name__ == "__main__":
    get_duplicates_report()