fs-lawrisk/tools/import_updated_themes.py

197 lines
8.0 KiB
Python
Raw Permalink Normal View History

import os
import pandas as pd
import pg8000.dbapi as pg
from lawrisk.utils.env_loader import load_env
from lawrisk.services import licensing_repo as lic_repo
def bind_v2():
load_env()
# 1. Load Excel Map
excel_path = os.path.join("data", "主题-事项绑定.xlsx")
if not os.path.exists(excel_path):
# Try root as fallback
excel_path = "主题-事项绑定.xlsx"
if not os.path.exists(excel_path):
print(f"Excel file not found!")
return
print(f"Reading Excel: {excel_path}")
df = pd.read_excel(excel_path, header=1)
df.columns = [str(c).strip() for c in df.columns]
# Identify columns
col_theme = '主题'
if col_theme not in df.columns:
for c in df.columns:
if '主题' in c: col_theme = c; break
col_permit = '审批事项'
if col_permit not in df.columns:
for c in df.columns:
if '事项' in c and c != col_theme: col_permit = c; break
if col_theme not in df.columns or col_permit not in df.columns:
print(f"Columns not found in Excel. Found: {df.columns.tolist()}")
return
# Forward fill theme names as they are usually merged in Excel
df[col_theme] = df[col_theme].ffill()
permit_to_themes = {}
for idx, row in df.iterrows():
p = str(row[col_permit]).strip()
t = str(row[col_theme]).strip()
if p and t and p != 'nan' and t != 'nan':
if p not in permit_to_themes:
permit_to_themes[p] = []
if t not in permit_to_themes[p]:
permit_to_themes[p].append(t)
print(f"Loaded {len(permit_to_themes)} unique permit names from Excel mapping.")
# 2. Connect DB
conn = lic_repo._lic_pg_conn()
cur = conn.cursor()
try:
# 3. Get Region ID for '市级' (Main target)
cur.execute("SELECT id FROM regions WHERE name = '市级'")
res = cur.fetchone()
if not res:
print("Region '市级' not found in DB.")
return
region_id = res[0]
# 4. Clear existing bindings for '市级' to ensure fresh re-mapping
print(f"Clearing existing bindings for region '市级'...")
cur.execute("DELETE FROM region_theme_permits WHERE region_id = %s", (region_id,))
print(f"Deleted {cur.rowcount} old bindings.")
# 5. Ensure all themes from Excel exist in DB
cur.execute("SELECT id, name FROM themes")
db_themes = {row[1]: str(row[0]) for row in cur.fetchall()}
all_excel_themes = set()
for t_list in permit_to_themes.values():
for t in t_list:
all_excel_themes.add(t)
for t_name in all_excel_themes:
if t_name not in db_themes:
print(f"Creating missing theme: {t_name}")
new_id = str(pg.uuid.uuid4())
cur.execute("INSERT INTO themes (id, name) VALUES (%s, %s)", (new_id, t_name))
db_themes[t_name] = new_id
# 6. Define mapping overrides for messy names
manual_map = {
"食品经营许可": ["食品经营许可"],
"旅客住宿服务": ["旅馆业特种行业许可证核发"],
"开设旅馆": ["旅馆业特种行业许可证核发"],
"旅馆业特种行业许可": ["旅馆业特种行业许可证核发"],
"药品经营许可(零售)": ["药品经营许可证(零售)"],
"食品生产许可": ["食品生产许可"],
"建设项目环境影响报告表审批": ["建设项目环境影响评价文件审批(广东省厅事项名称) 【国家标准名:“建设项目环境影响评价审批(海洋工程、核与辐射类除外)”】"],
"娱乐场所审批": [
"歌舞娱乐场所从事娱乐场所经营 activity 审批",
"游艺娱乐场所从事娱乐场所经营活动审批,内资娱乐场所变更、延续、补证、注销审批"
],
"第二类医疗器械经营备案": ["第二类医疗器械经营备案"],
"经营高危险性体育项目许可": ["经营高危险性体育项目许可"],
"互联网上网服务营业场所经营单位审批": ["互联网上网服务营业场所信息网络安全审核"],
"营业执照": ["营业执照"],
}
inverted_manual = {}
for excel_n, db_list in manual_map.items():
for db_n in db_list:
inverted_manual.setdefault(db_n, []).append(excel_n)
# 7. Get DB Permits for '市级'
cur.execute("""
SELECT p.id, p.name
FROM region_permit_details rpd
JOIN permits p ON p.id = rpd.permit_id
WHERE rpd.region_id = %s
""", (region_id,))
db_permits = cur.fetchall()
print(f"Scanning {len(db_permits)} DB permits for binding...")
bind_count = 0
def normalize_func(text):
if not text: return ""
# Remove common suffixes and punctuation for better fuzzy matching
return str(text).replace("", "").replace("", "").replace("(", "").replace(")", "").replace("许可", "").replace("", "").replace("审批", "").replace("核发", "").replace("备案", "").replace("业务", "").replace("从事", "").replace("企业", "").replace("设立", "").replace("变更", "").replace(" ", "").strip()
processed = 0
all_bindings = []
for pid_raw, pname in db_permits:
processed += 1
pid = str(pid_raw)
matched_excel_names = []
# A. Manual Map lookup (Case insensitive)
for db_n, excel_names in inverted_manual.items():
if db_n.lower() == pname.lower():
matched_excel_names.extend(excel_names)
# B. Exact Match (Case insensitive)
for en in permit_to_themes.keys():
if en.lower() == pname.lower():
matched_excel_names.append(en)
# C. Fuzzy Match fallback
if not matched_excel_names:
p_norm = normalize_func(pname)
best_match = None
max_score = 0
for excel_p in permit_to_themes.keys():
e_norm = normalize_func(excel_p)
if not e_norm or not p_norm: continue
if e_norm in p_norm or p_norm in e_norm:
# Character intersection score
score = len(set(e_norm) & set(p_norm))
if score > max_score and score >= 2:
max_score = score
best_match = excel_p
if best_match:
matched_excel_names.append(best_match)
# Perform binding for all unique themes associated with matched names
final_themes = set()
for en in matched_excel_names:
if en in permit_to_themes:
for t in permit_to_themes[en]:
final_themes.add(t)
for t_name in final_themes:
tid = db_themes.get(t_name)
if tid:
print(f" -> Binding to Theme: {t_name}")
all_bindings.append((region_id, tid, pid))
else:
print(f"Warning: Theme '{t_name}' not found in DB maps.")
print(f"Inserting {len(all_bindings)} new bindings...")
for b in all_bindings:
cur.execute("""
INSERT INTO region_theme_permits (region_id, theme_id, permit_id)
VALUES (%s, %s, %s)
ON CONFLICT DO NOTHING
""", b)
bind_count += (1 if cur.rowcount > 0 else 0)
conn.commit()
print(f"Update complete. Created {bind_count} bindings for '市级' region.")
except Exception as e:
conn.rollback()
print(f"Error during update: {e}")
finally:
conn.close()
if __name__ == "__main__":
bind_v2()