# fs-lawrisk/tools/investigate_duplicates.py
# (web-view listing metadata: 55 lines, 1.5 KiB, Python)

import os
import pg8000.dbapi as pg
import collections
# Best-effort environment bootstrap: when the project helper is importable,
# call it before check_risks_detail() reads its settings via os.getenv().
# Presumably load_env() populates os.environ (e.g. from a .env file) —
# TODO confirm against lawrisk.utils.env_loader.
try:
    from lawrisk.utils.env_loader import load_env
    load_env()
except ImportError:
    # The helper could not be imported; continue with the process
    # environment as-is. NOTE(review): because load_env() is called inside
    # the try body, an ImportError raised within it is swallowed here too.
    pass
# Permit inspected when the caller does not name one.
DEFAULT_TARGET_PID = '99824b83-c184-407a-92f7-bea1d221195c'  # Electronic Cigarette


def check_risks_detail(target_pid: str = DEFAULT_TARGET_PID) -> None:
    """Print a duplicate-content report for the risks linked to one permit.

    Connects to PostgreSQL with settings read from the ``LIC_PG_*``
    environment variables (port and user additionally fall back to
    ``PG_PORT`` / ``PG_USER``), selects ``risks.risk_content`` for every
    ``region_permit_risks`` row whose ``permit_id`` matches, and prints:

    * the total number of rows returned,
    * up to three contents that occur more than once (with their counts),
    * the first five distinct contents.

    Args:
        target_pid: ``permit_id`` value to look up. Defaults to
            ``DEFAULT_TARGET_PID`` so existing zero-argument calls behave
            as before.

    The cursor and the connection are closed even when the query raises.
    """
    LIC_DEFAULT_DB = "licensing_risks"
    conn = pg.connect(
        host=os.getenv("LIC_PG_HOST", "172.24.240.1"),
        port=int(os.getenv("LIC_PG_PORT", os.getenv("PG_PORT", "5432"))),
        user=os.getenv("LIC_PG_USER", os.getenv("PG_USER", "postgres")),
        password=os.getenv("LIC_PG_PASSWORD", ""),
        database=os.getenv("LIC_PG_DATABASE", LIC_DEFAULT_DB),
    )
    # try/finally so a failing query cannot leak the cursor or connection.
    try:
        cur = conn.cursor()
        try:
            print(f"--- Risks for Permit {target_pid} ---")
            # The permit id is passed as a bound parameter, never
            # interpolated into the SQL text.
            cur.execute("""
SELECT r.risk_content
FROM region_permit_risks rpr
JOIN risks r ON rpr.risk_id = r.id
WHERE rpr.permit_id = %s
""", (target_pid,))
            risks = [row[0] for row in cur.fetchall()]
        finally:
            cur.close()
    finally:
        conn.close()

    # Everything below works on the fetched list only; the database
    # connection is already released.
    print(f"Total Risks: {len(risks)}")
    counts = collections.Counter(risks)
    dupes = {content: n for content, n in counts.items() if n > 1}
    if dupes:
        print(f"!!! Found {len(dupes)} duplicated contents!")
        # Show only a small sample, in first-seen order.
        for txt, c in list(dupes.items())[:3]:
            print(f" - (Count {c}) '{str(txt)[:50]}...'")
    else:
        print("No exact text duplicates.")

    print("\n--- Showing first 5 unique risks ---")
    # Counter preserves first-seen order, so this lists the first five
    # distinct contents as they were returned by the query.
    for content in list(counts)[:5]:
        print(f" - {str(content)[:60]}...")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    check_risks_detail()