fs-lawrisk/tools/reimport_themes.py

169 lines
6.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import pandas as pd
import pg8000.dbapi as pg
from lawrisk.utils.env_loader import load_env
def fix_bindings():
load_env()
conn_params = {
"host": os.getenv("LIC_PG_HOST", "172.24.240.1"),
"port": int(os.getenv("LIC_PG_PORT", os.getenv("PG_PORT", "5432"))),
"user": os.getenv("LIC_PG_USER", os.getenv("PG_USER", "postgres")),
"password": os.getenv("LIC_PG_PASSWORD", ""),
"database": os.getenv("LIC_PG_DATABASE", "licensing_risks"),
}
excel_path = "主题-事项绑定.xlsx"
if not os.path.exists(excel_path):
print(f"Excel file not found: {excel_path}")
return
print("Reading Excel...")
try:
# Try header at row 1 (0-indexed) which is row 2 in Excel
df = pd.read_excel(excel_path, header=1)
except Exception as e:
print(f"Read error: {e}")
return
# Normalize column names
print("Raw Columns:", df.columns.tolist())
df.columns = [str(c).strip() for c in df.columns]
print("Normalized Columns:", df.columns.tolist())
# Fill merged cells (forward fill)
col_theme = '主题'
if col_theme not in df.columns:
# Prefer "主题" only, but "主题事项" is okay if "主题" not found
candidates = [c for c in df.columns if "主题" in c]
if candidates:
# Sort by length, prefer shorter '主题'
candidates.sort(key=len)
col_theme = candidates[0]
print(f"Using Theme Column: '{col_theme}'")
if col_theme in df.columns:
df[col_theme] = df[col_theme].fillna(method='ffill')
else:
print("Theme column missing! Aborting.")
return
col_permit = '审批事项' # Default to what we saw
if col_permit not in df.columns:
# heuristic
candidates = []
for c in df.columns:
if "审批事项" in c: candidates.append(c)
elif "许可事项" in c: candidates.append(c)
elif "事项" in c and c != col_theme: candidates.append(c)
if candidates:
col_permit = candidates[0]
print(f"Using columns: Theme='{col_theme}', Permit='{col_permit}'")
excel_bindings = df[[col_theme, col_permit]].dropna(subset=[col_permit])
# Also drop if theme is null (though we filled it)
excel_bindings = excel_bindings.dropna(subset=[col_theme])
print(f"Found {len(excel_bindings)} bindings in Excel.")
try:
conn = pg.connect(**conn_params)
cur = conn.cursor()
# Get DB Permits (市级)
print("Fetching DB Permits...")
cur.execute("""
SELECT p.id, p.name
FROM region_permit_details rpd
JOIN permits p ON p.id = rpd.permit_id
JOIN regions r ON r.id = rpd.region_id
WHERE r.name = '市级'
""")
db_permits = {row[1].strip(): row[0] for row in cur.fetchall()}
print(f"Found {len(db_permits)} permits in DB for '市级'.")
# Get Themes
cur.execute("SELECT id, name FROM themes")
themes = {row[1].strip(): row[0] for row in cur.fetchall()}
print(f"Found {len(themes)} themes in DB.")
# Check if themes are populated. If only 1 themes, maybe we need to import themes?
# Assuming themes table is populated from previous sessions.
# Get Region ID
cur.execute("SELECT id FROM regions WHERE name = '市级'")
region_id = cur.fetchone()[0]
matches = 0
mismatches = 0
to_insert = []
# Helper for fuzzy matching
def normalize_name(n):
return n.replace("许可", "").replace("", "").replace("", "").replace("", "").replace("(", "").replace(")", "").strip()
for idx, row in excel_bindings.iterrows():
t_name = str(row[col_theme]).strip()
p_name = str(row[col_permit]).strip()
# 1. Resolve Theme
t_id = themes.get(t_name)
if not t_id:
# Try creating theme? Or fuzzy match theme?
pass
# 2. Resolve Permit
p_id = db_permits.get(p_name)
if not p_id:
# Fuzzy
norm_p = normalize_name(p_name)
for db_name, db_pid in db_permits.items():
if normalize_name(db_name) == norm_p:
p_id = db_pid
# print(f"Fuzzy Match: '{p_name}' -> '{db_name}'")
break
# Try containment
if norm_p in normalize_name(db_name) and len(norm_p) > 4:
p_id = db_pid
break
if p_id and t_id:
to_insert.append((region_id, t_id, p_id))
matches += 1
else:
if not p_id:
print(f"Unmatched Excel Permit: '{p_name}'")
# Suggest candidates
candidates = []
for db_name in db_permits.keys():
if p_name in db_name or db_name in p_name:
candidates.append(db_name)
if candidates:
print(f" Possible DB Candidates: {candidates}")
if not t_id:
pass
mismatches += 1
print(f"\nPotential bindings: {matches}, Unmatched: {mismatches}")
if to_insert:
print(f"Inserting {len(to_insert)} bindings...")
for rid, tid, pid in to_insert:
cur.execute("""
INSERT INTO region_theme_permits (region_id, theme_id, permit_id)
VALUES (%s, %s, %s)
ON CONFLICT (region_id, theme_id, permit_id) DO NOTHING
""", (rid, tid, pid))
conn.commit()
print("Bindings inserted/updated.")
conn.close()
except Exception as e:
print(f"Error: {e}")
if __name__ == "__main__":
fix_bindings()