fs-lawrisk/tools/reimport_themes.py

169 lines
6.0 KiB
Python
Raw Normal View History

import os
import pandas as pd
import pg8000.dbapi as pg
from lawrisk.utils.env_loader import load_env
def fix_bindings():
load_env()
conn_params = {
"host": os.getenv("LIC_PG_HOST", "172.24.240.1"),
"port": int(os.getenv("LIC_PG_PORT", os.getenv("PG_PORT", "5432"))),
"user": os.getenv("LIC_PG_USER", os.getenv("PG_USER", "postgres")),
"password": os.getenv("LIC_PG_PASSWORD", ""),
"database": os.getenv("LIC_PG_DATABASE", "licensing_risks"),
}
excel_path = "主题-事项绑定.xlsx"
if not os.path.exists(excel_path):
print(f"Excel file not found: {excel_path}")
return
print("Reading Excel...")
try:
# Try header at row 1 (0-indexed) which is row 2 in Excel
df = pd.read_excel(excel_path, header=1)
except Exception as e:
print(f"Read error: {e}")
return
# Normalize column names
print("Raw Columns:", df.columns.tolist())
df.columns = [str(c).strip() for c in df.columns]
print("Normalized Columns:", df.columns.tolist())
# Fill merged cells (forward fill)
col_theme = '主题'
if col_theme not in df.columns:
# Prefer "主题" only, but "主题事项" is okay if "主题" not found
candidates = [c for c in df.columns if "主题" in c]
if candidates:
# Sort by length, prefer shorter '主题'
candidates.sort(key=len)
col_theme = candidates[0]
print(f"Using Theme Column: '{col_theme}'")
if col_theme in df.columns:
df[col_theme] = df[col_theme].fillna(method='ffill')
else:
print("Theme column missing! Aborting.")
return
col_permit = '审批事项' # Default to what we saw
if col_permit not in df.columns:
# heuristic
candidates = []
for c in df.columns:
if "审批事项" in c: candidates.append(c)
elif "许可事项" in c: candidates.append(c)
elif "事项" in c and c != col_theme: candidates.append(c)
if candidates:
col_permit = candidates[0]
print(f"Using columns: Theme='{col_theme}', Permit='{col_permit}'")
excel_bindings = df[[col_theme, col_permit]].dropna(subset=[col_permit])
# Also drop if theme is null (though we filled it)
excel_bindings = excel_bindings.dropna(subset=[col_theme])
print(f"Found {len(excel_bindings)} bindings in Excel.")
try:
conn = pg.connect(**conn_params)
cur = conn.cursor()
# Get DB Permits (市级)
print("Fetching DB Permits...")
cur.execute("""
SELECT p.id, p.name
FROM region_permit_details rpd
JOIN permits p ON p.id = rpd.permit_id
JOIN regions r ON r.id = rpd.region_id
WHERE r.name = '市级'
""")
db_permits = {row[1].strip(): row[0] for row in cur.fetchall()}
print(f"Found {len(db_permits)} permits in DB for '市级'.")
# Get Themes
cur.execute("SELECT id, name FROM themes")
themes = {row[1].strip(): row[0] for row in cur.fetchall()}
print(f"Found {len(themes)} themes in DB.")
# Check if themes are populated. If only 1 themes, maybe we need to import themes?
# Assuming themes table is populated from previous sessions.
# Get Region ID
cur.execute("SELECT id FROM regions WHERE name = '市级'")
region_id = cur.fetchone()[0]
matches = 0
mismatches = 0
to_insert = []
# Helper for fuzzy matching
def normalize_name(n):
return n.replace("许可", "").replace("", "").replace("", "").replace("", "").replace("(", "").replace(")", "").strip()
for idx, row in excel_bindings.iterrows():
t_name = str(row[col_theme]).strip()
p_name = str(row[col_permit]).strip()
# 1. Resolve Theme
t_id = themes.get(t_name)
if not t_id:
# Try creating theme? Or fuzzy match theme?
pass
# 2. Resolve Permit
p_id = db_permits.get(p_name)
if not p_id:
# Fuzzy
norm_p = normalize_name(p_name)
for db_name, db_pid in db_permits.items():
if normalize_name(db_name) == norm_p:
p_id = db_pid
# print(f"Fuzzy Match: '{p_name}' -> '{db_name}'")
break
# Try containment
if norm_p in normalize_name(db_name) and len(norm_p) > 4:
p_id = db_pid
break
if p_id and t_id:
to_insert.append((region_id, t_id, p_id))
matches += 1
else:
if not p_id:
print(f"Unmatched Excel Permit: '{p_name}'")
# Suggest candidates
candidates = []
for db_name in db_permits.keys():
if p_name in db_name or db_name in p_name:
candidates.append(db_name)
if candidates:
print(f" Possible DB Candidates: {candidates}")
if not t_id:
pass
mismatches += 1
print(f"\nPotential bindings: {matches}, Unmatched: {mismatches}")
if to_insert:
print(f"Inserting {len(to_insert)} bindings...")
for rid, tid, pid in to_insert:
cur.execute("""
INSERT INTO region_theme_permits (region_id, theme_id, permit_id)
VALUES (%s, %s, %s)
ON CONFLICT (region_id, theme_id, permit_id) DO NOTHING
""", (rid, tid, pid))
conn.commit()
print("Bindings inserted/updated.")
conn.close()
except Exception as e:
print(f"Error: {e}")
if __name__ == "__main__":
fix_bindings()