fs-lawrisk/tools/import_updated_themes.py

197 lines
8.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import pandas as pd
import pg8000.dbapi as pg
from lawrisk.utils.env_loader import load_env
from lawrisk.services import licensing_repo as lic_repo
def bind_v2():
load_env()
# 1. Load Excel Map
excel_path = os.path.join("data", "主题-事项绑定.xlsx")
if not os.path.exists(excel_path):
# Try root as fallback
excel_path = "主题-事项绑定.xlsx"
if not os.path.exists(excel_path):
print(f"Excel file not found!")
return
print(f"Reading Excel: {excel_path}")
df = pd.read_excel(excel_path, header=1)
df.columns = [str(c).strip() for c in df.columns]
# Identify columns
col_theme = '主题'
if col_theme not in df.columns:
for c in df.columns:
if '主题' in c: col_theme = c; break
col_permit = '审批事项'
if col_permit not in df.columns:
for c in df.columns:
if '事项' in c and c != col_theme: col_permit = c; break
if col_theme not in df.columns or col_permit not in df.columns:
print(f"Columns not found in Excel. Found: {df.columns.tolist()}")
return
# Forward fill theme names as they are usually merged in Excel
df[col_theme] = df[col_theme].ffill()
permit_to_themes = {}
for idx, row in df.iterrows():
p = str(row[col_permit]).strip()
t = str(row[col_theme]).strip()
if p and t and p != 'nan' and t != 'nan':
if p not in permit_to_themes:
permit_to_themes[p] = []
if t not in permit_to_themes[p]:
permit_to_themes[p].append(t)
print(f"Loaded {len(permit_to_themes)} unique permit names from Excel mapping.")
# 2. Connect DB
conn = lic_repo._lic_pg_conn()
cur = conn.cursor()
try:
# 3. Get Region ID for '市级' (Main target)
cur.execute("SELECT id FROM regions WHERE name = '市级'")
res = cur.fetchone()
if not res:
print("Region '市级' not found in DB.")
return
region_id = res[0]
# 4. Clear existing bindings for '市级' to ensure fresh re-mapping
print(f"Clearing existing bindings for region '市级'...")
cur.execute("DELETE FROM region_theme_permits WHERE region_id = %s", (region_id,))
print(f"Deleted {cur.rowcount} old bindings.")
# 5. Ensure all themes from Excel exist in DB
cur.execute("SELECT id, name FROM themes")
db_themes = {row[1]: str(row[0]) for row in cur.fetchall()}
all_excel_themes = set()
for t_list in permit_to_themes.values():
for t in t_list:
all_excel_themes.add(t)
for t_name in all_excel_themes:
if t_name not in db_themes:
print(f"Creating missing theme: {t_name}")
new_id = str(pg.uuid.uuid4())
cur.execute("INSERT INTO themes (id, name) VALUES (%s, %s)", (new_id, t_name))
db_themes[t_name] = new_id
# 6. Define mapping overrides for messy names
manual_map = {
"食品经营许可": ["食品经营许可"],
"旅客住宿服务": ["旅馆业特种行业许可证核发"],
"开设旅馆": ["旅馆业特种行业许可证核发"],
"旅馆业特种行业许可": ["旅馆业特种行业许可证核发"],
"药品经营许可(零售)": ["药品经营许可证(零售)"],
"食品生产许可": ["食品生产许可"],
"建设项目环境影响报告表审批": ["建设项目环境影响评价文件审批(广东省厅事项名称) 【国家标准名:“建设项目环境影响评价审批(海洋工程、核与辐射类除外)”】"],
"娱乐场所审批": [
"歌舞娱乐场所从事娱乐场所经营 activity 审批",
"游艺娱乐场所从事娱乐场所经营活动审批,内资娱乐场所变更、延续、补证、注销审批"
],
"第二类医疗器械经营备案": ["第二类医疗器械经营备案"],
"经营高危险性体育项目许可": ["经营高危险性体育项目许可"],
"互联网上网服务营业场所经营单位审批": ["互联网上网服务营业场所信息网络安全审核"],
"营业执照": ["营业执照"],
}
inverted_manual = {}
for excel_n, db_list in manual_map.items():
for db_n in db_list:
inverted_manual.setdefault(db_n, []).append(excel_n)
# 7. Get DB Permits for '市级'
cur.execute("""
SELECT p.id, p.name
FROM region_permit_details rpd
JOIN permits p ON p.id = rpd.permit_id
WHERE rpd.region_id = %s
""", (region_id,))
db_permits = cur.fetchall()
print(f"Scanning {len(db_permits)} DB permits for binding...")
bind_count = 0
def normalize_func(text):
if not text: return ""
# Remove common suffixes and punctuation for better fuzzy matching
return str(text).replace("", "").replace("", "").replace("(", "").replace(")", "").replace("许可", "").replace("", "").replace("审批", "").replace("核发", "").replace("备案", "").replace("业务", "").replace("从事", "").replace("企业", "").replace("设立", "").replace("变更", "").replace(" ", "").strip()
processed = 0
all_bindings = []
for pid_raw, pname in db_permits:
processed += 1
pid = str(pid_raw)
matched_excel_names = []
# A. Manual Map lookup (Case insensitive)
for db_n, excel_names in inverted_manual.items():
if db_n.lower() == pname.lower():
matched_excel_names.extend(excel_names)
# B. Exact Match (Case insensitive)
for en in permit_to_themes.keys():
if en.lower() == pname.lower():
matched_excel_names.append(en)
# C. Fuzzy Match fallback
if not matched_excel_names:
p_norm = normalize_func(pname)
best_match = None
max_score = 0
for excel_p in permit_to_themes.keys():
e_norm = normalize_func(excel_p)
if not e_norm or not p_norm: continue
if e_norm in p_norm or p_norm in e_norm:
# Character intersection score
score = len(set(e_norm) & set(p_norm))
if score > max_score and score >= 2:
max_score = score
best_match = excel_p
if best_match:
matched_excel_names.append(best_match)
# Perform binding for all unique themes associated with matched names
final_themes = set()
for en in matched_excel_names:
if en in permit_to_themes:
for t in permit_to_themes[en]:
final_themes.add(t)
for t_name in final_themes:
tid = db_themes.get(t_name)
if tid:
print(f" -> Binding to Theme: {t_name}")
all_bindings.append((region_id, tid, pid))
else:
print(f"Warning: Theme '{t_name}' not found in DB maps.")
print(f"Inserting {len(all_bindings)} new bindings...")
for b in all_bindings:
cur.execute("""
INSERT INTO region_theme_permits (region_id, theme_id, permit_id)
VALUES (%s, %s, %s)
ON CONFLICT DO NOTHING
""", b)
bind_count += (1 if cur.rowcount > 0 else 0)
conn.commit()
print(f"Update complete. Created {bind_count} bindings for '市级' region.")
except Exception as e:
conn.rollback()
print(f"Error during update: {e}")
finally:
conn.close()
if __name__ == "__main__":
bind_v2()