From ca4e59babaeb3858850152ec1d4ea07a1c79532d Mon Sep 17 00:00:00 2001 From: hanyuqing <1106611654@qq.com> Date: Mon, 22 Dec 2025 09:52:58 +0800 Subject: [PATCH] 22 --- 1111.py | 165 --------------------------------------------------- 1218.py | 172 ----------------------------------------------------- icd_parse.py | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ icd_parse_name.py | 172 +++++++++++++++++++++++++++++++++++++++++++++++++++++ parse_data.py | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++ test1217.py | 173 ------------------------------------------------------ 6 files changed, 510 insertions(+), 510 deletions(-) delete mode 100644 1111.py delete mode 100644 1218.py create mode 100644 icd_parse.py create mode 100644 icd_parse_name.py create mode 100644 parse_data.py delete mode 100644 test1217.py diff --git a/1111.py b/1111.py deleted file mode 100644 index 39f0f05..0000000 --- a/1111.py +++ /dev/null @@ -1,165 +0,0 @@ -import json -import re -import os -from neo4j import GraphDatabase - -# === 配置 === -NEO4J_URI = "bolt://localhost:7687" -NEO4J_USER = "neo4j" -NEO4J_PASSWORD = "12345678" # 👈 请确保密码正确 -RELATIONSHIP_FOLDER = r"D:\temp\669" -BATCH_SIZE = 100 - - -def sanitize_relationship_type(rel_type: str) -> str: - """清理关系类型,确保合法""" - if not isinstance(rel_type, str): - rel_type = str(rel_type) - sanitized = re.sub(r"[^a-zA-Z0-9_]", "", rel_type) - if not sanitized or sanitized[0].isdigit(): - sanitized = "REL_" + sanitized - return sanitized or "RELATED" - - -def extract_start_end(rel: dict): - """兼容多种字段名提取 start/end""" - for s_key, e_key in [("start", "end"), ("source", "target"), ("from", "to")]: - s = rel.get(s_key) - e = rel.get(e_key) - if s is not None and e is not None: - return s, e - return None, None - - -def load_relationships_from_file(filepath): - """从单个 JSON 文件加载 relationships""" - with open(filepath, "r", encoding="utf-8-sig") as f: - data = json.load(f) - - relationships = [] - if isinstance(data, list): - for item in data: - if isinstance(item, dict) and "relationships" in item: - relationships.extend(item["relationships"]) - elif isinstance(item, dict): - relationships.append(item) - elif isinstance(data, dict) and "relationships" in data: - relationships = data["relationships"] - else: - relationships = data if isinstance(data, list) else [] - - return relationships - - -def process_relationships(relationships): - """清洗并验证关系列表""" - valid_rels = [] - for rel in relationships: - start_id, end_id = extract_start_end(rel) - rel_type = rel.get("type", "RELATED") - props = rel.get("properties", {}) or {} - - if start_id is None or end_id is None: - continue - - try: - start_id = int(float(start_id)) - end_id = int(float(end_id)) - except (TypeError, ValueError): - continue - - valid_rels.append({ - "start": start_id, - "end": end_id, - "type": sanitize_relationship_type(rel_type), - "props": props - }) - return valid_rels - - -def import_relationships_in_batches(tx, rels, batch_size): - total = len(rels) - created_total = 0 - - for i in range(0, total, batch_size): - batch = rels[i:i + batch_size] - rel_groups = {} - for rel in batch: - rel_groups.setdefault(rel["type"], []).append({ - "start": rel["start"], - "end": rel["end"], - "props": rel["props"] - }) - - created_this_batch = 0 - for rel_type, group in rel_groups.items(): - cypher = f""" - UNWIND $rels AS r - MATCH (a {{nodeId: r.start}}) - MATCH (b {{nodeId: r.end}}) - WITH a, b, r - WHERE a IS NOT NULL AND b IS NOT NULL - MERGE (a)-[rel:`{rel_type}`]->(b) - SET rel += r.props - RETURN count(rel) AS c - """ - result = tx.run(cypher, rels=group).single() - created_this_batch += result["c"] - - created_total += created_this_batch - print(f" ➤ 本批创建关系: {created_this_batch} 条") - - return created_total - - -def main(): - driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) - - # 获取所有 JSON 文件,并按文件名排序(确保 relations_001.json 先于 002) - json_files = [f for f in os.listdir(RELATIONSHIP_FOLDER) if f.startswith("relations_") and f.endswith(".json")] - json_files.sort() # 按字典序排序,适用于 001, 002... 格式 - - if not json_files: - print("❌ 文件夹中没有找到 relations_*.json 文件") - return - - total_global_created = 0 - total_global_processed = 0 - - print(f"📁 找到 {len(json_files)} 个关系文件,开始逐个导入...\n") - - for idx, filename in enumerate(json_files, 1): - filepath = os.path.join(RELATIONSHIP_FOLDER, filename) - print(f"\n📄 [{idx}/{len(json_files)}] 正在处理: {filename}") - - try: - raw_rels = load_relationships_from_file(filepath) - print(f" ➤ 原始关系数: {len(raw_rels)}") - - valid_rels = process_relationships(raw_rels) - print(f" ➤ 有效关系数: {len(valid_rels)}") - - if not valid_rels: - print(" ⚠️ 跳过:无有效关系") - continue - - with driver.session() as session: - created = session.execute_write(import_relationships_in_batches, valid_rels, BATCH_SIZE) - - total_global_created += created - total_global_processed += len(valid_rels) - print(f" ✅ 文件 {filename} 导入完成,创建 {created} 条关系") - - except Exception as e: - print(f" ❌ 处理 {filename} 时出错: {e}") - continue # 继续处理下一个文件 - - print("\n" + "="*60) - print(f"🎉 全部导入完成!") - print(f"📊 总共处理有效关系: {total_global_processed}") - print(f"✅ 总共成功创建关系: {total_global_created}") - driver.close() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/1218.py b/1218.py deleted file mode 100644 index b495df5..0000000 --- a/1218.py +++ /dev/null @@ -1,172 +0,0 @@ -import pandas as pd -import json -import numpy as np -import re - -# === 配置 === -EXCEL_PATH = r"C:\Users\hanyuqing\Desktop\最新国家医保ICD编码\最新国家医保ICD编码\ICD-10医保版数据.xlsx" -OUTPUT_JSON_PATH = "icd10_tree_preserve_order.json" - -# === 章 code 格式化函数(同前)=== -def extract_chapter_number(text): - if not text: - return None - text = str(text).strip() - match = re.search(r'第([一二三四五六七八九十百\d]+)章', text) - if match: - num_str = match.group(1) - chinese_num_map = { - '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, - '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, - '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, - '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20, - '二十一': 21, '二十二': 22 - } - if num_str in chinese_num_map: - return chinese_num_map[num_str] - elif num_str.isdigit(): - return int(num_str) - if text.isdigit(): - return int(text) - if text.endswith('.') and text[:-1].isdigit(): - return int(text[:-1]) - return None - -def format_chapter_code(original): - num = extract_chapter_number(original) - if num is not None: - return f"第{num}章" - return str(original).strip() if original else "" - -# === 工具函数 === -def get_clean(val): - if val is None or str(val).strip().lower() in ("", "nan", "none"): - return None - return str(val).strip() - -def make_fields(code, name): - code_str = code or "" - label_str = name or "" - title_str = f"{code_str} {label_str}" if code_str and label_str else (code_str or label_str) - return code_str, label_str, title_str - -# === 构建有序树:每个节点的 children 是 list === -class TreeNode: - def __init__(self, code="", label="", title=""): - self.code = code - self.label = label - self.title = title - self.children = [] # 用 list 保持顺序 - self._child_key_set = set() # 用于快速去重:存储 (code, label) - - def add_or_get_child(self, code, label, title): - key = (code, label) - if key in self._child_key_set: - # 已存在,返回已有节点 - for child in self.children: - if child.code == code and child.label == label: - return child - else: - # 不存在,创建新节点 - new_child = TreeNode(code=code, label=label, title=title) - self.children.append(new_child) - self._child_key_set.add(key) - return new_child - return None # 实际不会走到这里 - -# === 构建根节点 === -root = TreeNode() -root.children = [] # 实际章节点挂在这里 - -chapter_map = {} # key: (code, label) -> TreeNode,避免重复创建章 - -# === 读取 Excel === -df = pd.read_excel( - EXCEL_PATH, - header=1, - dtype=str, - engine='openpyxl' -) -df.columns = df.columns.astype(str).str.strip() -df = df.replace({np.nan: None}) - -for idx, row in df.iterrows(): - raw_chapter = get_clean(row.get("章")) - chapter_name = get_clean(row.get("章的名称")) - - section_code = get_clean(row.get("节代码范围")) - section_name = get_clean(row.get("节名称")) - - category_code = get_clean(row.get("类目代码")) - category_name = get_clean(row.get("类目名称")) - - subcategory_code = get_clean(row.get("亚目代码")) - subcategory_name = get_clean(row.get("亚目名称")) - - diagnosis_code = get_clean(row.get("条目(诊断)代码")) - diagnosis_name = get_clean(row.get("条目(诊断)名称")) - - if not raw_chapter and not chapter_name: - continue - - # === 章 === - chapter_code = format_chapter_code(raw_chapter) - chap_label = chapter_name or chapter_code - chap_title = f"{chapter_code} {chapter_name}" if chapter_name else chapter_code - chap_key = (chapter_code, chap_label) - - if chap_key not in chapter_map: - chapter_node = TreeNode(code=chapter_code, label=chap_label, title=chap_title) - root.children.append(chapter_node) - chapter_map[chap_key] = chapter_node - else: - chapter_node = chapter_map[chap_key] - - current = chapter_node - - # === 节 === - if section_code or section_name: - sec_code, sec_label, sec_title = make_fields(section_code, section_name) - current = current.add_or_get_child(sec_code, sec_label, sec_title) - - # === 类目 === - if category_code or category_name: - cat_code, cat_label, cat_title = make_fields(category_code, category_name) - current = current.add_or_get_child(cat_code, cat_label, cat_title) - - # === 亚目 === - if subcategory_code or subcategory_name: - sub_code, sub_label, sub_title = make_fields(subcategory_code, subcategory_name) - current = current.add_or_get_child(sub_code, sub_label, sub_title) - - # === 条目 === - if diagnosis_code or diagnosis_name: - diag_code, diag_label, diag_title = make_fields(diagnosis_code, diagnosis_name) - current.add_or_get_child(diag_code, diag_label, diag_title) - -# === 转为带 id 的标准格式(DFS 顺序遍历)=== -next_id = 1 - -def node_to_dict(node): - global next_id - item = { - "id": next_id, - "code": node.code, - "label": node.label, - "title": node.title - } - next_id += 1 - - if node.children: - item["children"] = [node_to_dict(child) for child in node.children] - - return item - -treeData = [node_to_dict(chap) for chap in root.children] - -# === 保存 JSON === -with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as f: - json.dump(treeData, f, ensure_ascii=False, indent=2) - -print(f"✅ 树形结构已生成,共 {len(treeData)} 个章节点") -print(f"📄 输出文件: {OUTPUT_JSON_PATH}") \ No newline at end of file diff --git a/icd_parse.py b/icd_parse.py new file mode 100644 index 0000000..3aa9a5c --- /dev/null +++ b/icd_parse.py @@ -0,0 +1,173 @@ +import pandas as pd +import json +import numpy as np +import re + +# === 配置 === +EXCEL_PATH = r"C:\Users\hanyuqing\Desktop\最新国家医保ICD编码\最新国家医保ICD编码\ICD-10医保版数据.xlsx" +OUTPUT_JSON_PATH = "icd10_tree_with_level.json" + +# === 章 code 格式化 === +def extract_chapter_number(text): + if not text: + return None + text = str(text).strip() + match = re.search(r'第([一二三四五六七八九十百\d]+)章', text) + if match: + num_str = match.group(1) + chinese_num_map = { + '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, + '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, + '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, + '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20, + '二十一': 21, '二十二': 22 + } + if num_str in chinese_num_map: + return chinese_num_map[num_str] + elif num_str.isdigit(): + return int(num_str) + if text.isdigit(): + return int(text) + if text.endswith('.') and text[:-1].isdigit(): + return int(text[:-1]) + return None + +def format_chapter_code(original): + num = extract_chapter_number(original) + if num is not None: + return f"第{num}章" + return str(original).strip() if original else "" + +def get_clean(val): + if val is None or str(val).strip().lower() in ("", "nan", "none"): + return None + return str(val).strip() + +def make_fields(code, name): + code_str = code or "" + label_str = name or "" + title_str = f"{code_str} {label_str}" if code_str and label_str else (code_str or label_str) + return code_str, label_str, title_str + +# === 带 level 的树节点 === +class TreeNode: + def __init__(self, code="", label="", title="", level=""): + self.code = code + self.label = label + self.title = title + self.level = level # 新增字段 + self.children = [] + self._child_key_set = set() # (code, label) + + def add_or_get_child(self, code, label, title, level): + key = (code, label) + if key in self._child_key_set: + for child in self.children: + if child.code == code and child.label == label: + return child + else: + new_child = TreeNode(code=code, label=label, title=title, level=level) + self.children.append(new_child) + self._child_key_set.add(key) + return new_child + return None + +# === 构建树 === +root = TreeNode() +chapter_map = {} + +df = pd.read_excel( + EXCEL_PATH, + header=1, + dtype=str, + engine='openpyxl' +) +df.columns = df.columns.astype(str).str.strip() +df = df.replace({np.nan: None}) + +for idx, row in df.iterrows(): + raw_chapter = get_clean(row.get("章")) + chapter_name = get_clean(row.get("章的名称")) + + section_code = get_clean(row.get("节代码范围")) + section_name = get_clean(row.get("节名称")) + + category_code = get_clean(row.get("类目代码")) + category_name = get_clean(row.get("类目名称")) + + subcategory_code = get_clean(row.get("亚目代码")) + subcategory_name = get_clean(row.get("亚目名称")) + + diagnosis_code = get_clean(row.get("条目(诊断)代码")) + diagnosis_name = get_clean(row.get("条目(诊断)名称")) + + if not raw_chapter and not chapter_name: + continue + + # === 章 === + chapter_code = format_chapter_code(raw_chapter) + chap_label = chapter_name or chapter_code + chap_title = f"{chapter_code} {chapter_name}" if chapter_name else chapter_code + chap_key = (chapter_code, chap_label) + + if chap_key not in chapter_map: + chapter_node = TreeNode( + code=chapter_code, + label=chap_label, + title=chap_title, + level="chapter" + ) + root.children.append(chapter_node) + chapter_map[chap_key] = chapter_node + else: + chapter_node = chapter_map[chap_key] + + current = chapter_node + + # === 节 === + if section_code or section_name: + sec_code, sec_label, sec_title = make_fields(section_code, section_name) + current = current.add_or_get_child(sec_code, sec_label, sec_title, "section") + + # === 类目 === + if category_code or category_name: + cat_code, cat_label, cat_title = make_fields(category_code, category_name) + current = current.add_or_get_child(cat_code, cat_label, cat_title, "category") + + # === 亚目 === + if subcategory_code or subcategory_name: + sub_code, sub_label, sub_title = make_fields(subcategory_code, subcategory_name) + current = current.add_or_get_child(sub_code, sub_label, sub_title, "subcategory") + + # === 条目 === + if diagnosis_code or diagnosis_name: + diag_code, diag_label, diag_title = make_fields(diagnosis_code, diagnosis_name) + current.add_or_get_child(diag_code, diag_label, diag_title, "diagnosis") + +# === 转为带 id 的 dict === +next_id = 1 + +def node_to_dict(node): + global next_id + item = { + "id": next_id, + "code": node.code, + "label": node.label, + "title": node.title, + "level": node.level # 新增 + } + next_id += 1 + + if node.children: + item["children"] = [node_to_dict(child) for child in node.children] + + return item + +treeData = [node_to_dict(chap) for chap in root.children] + +# === 保存 === +with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as f: + json.dump(treeData, f, ensure_ascii=False, indent=2) + +print(f"✅ 树形结构已生成,共 {len(treeData)} 个章节点") +print(f"📄 输出文件: {OUTPUT_JSON_PATH}") \ No newline at end of file diff --git a/icd_parse_name.py b/icd_parse_name.py new file mode 100644 index 0000000..b495df5 --- /dev/null +++ b/icd_parse_name.py @@ -0,0 +1,172 @@ +import pandas as pd +import json +import numpy as np +import re + +# === 配置 === +EXCEL_PATH = r"C:\Users\hanyuqing\Desktop\最新国家医保ICD编码\最新国家医保ICD编码\ICD-10医保版数据.xlsx" +OUTPUT_JSON_PATH = "icd10_tree_preserve_order.json" + +# === 章 code 格式化函数(同前)=== +def extract_chapter_number(text): + if not text: + return None + text = str(text).strip() + match = re.search(r'第([一二三四五六七八九十百\d]+)章', text) + if match: + num_str = match.group(1) + chinese_num_map = { + '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, + '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, + '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, + '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20, + '二十一': 21, '二十二': 22 + } + if num_str in chinese_num_map: + return chinese_num_map[num_str] + elif num_str.isdigit(): + return int(num_str) + if text.isdigit(): + return int(text) + if text.endswith('.') and text[:-1].isdigit(): + return int(text[:-1]) + return None + +def format_chapter_code(original): + num = extract_chapter_number(original) + if num is not None: + return f"第{num}章" + return str(original).strip() if original else "" + +# === 工具函数 === +def get_clean(val): + if val is None or str(val).strip().lower() in ("", "nan", "none"): + return None + return str(val).strip() + +def make_fields(code, name): + code_str = code or "" + label_str = name or "" + title_str = f"{code_str} {label_str}" if code_str and label_str else (code_str or label_str) + return code_str, label_str, title_str + +# === 构建有序树:每个节点的 children 是 list === +class TreeNode: + def __init__(self, code="", label="", title=""): + self.code = code + self.label = label + self.title = title + self.children = [] # 用 list 保持顺序 + self._child_key_set = set() # 用于快速去重:存储 (code, label) + + def add_or_get_child(self, code, label, title): + key = (code, label) + if key in self._child_key_set: + # 已存在,返回已有节点 + for child in self.children: + if child.code == code and child.label == label: + return child + else: + # 不存在,创建新节点 + new_child = TreeNode(code=code, label=label, title=title) + self.children.append(new_child) + self._child_key_set.add(key) + return new_child + return None # 实际不会走到这里 + +# === 构建根节点 === +root = TreeNode() +root.children = [] # 实际章节点挂在这里 + +chapter_map = {} # key: (code, label) -> TreeNode,避免重复创建章 + +# === 读取 Excel === +df = pd.read_excel( + EXCEL_PATH, + header=1, + dtype=str, + engine='openpyxl' +) +df.columns = df.columns.astype(str).str.strip() +df = df.replace({np.nan: None}) + +for idx, row in df.iterrows(): + raw_chapter = get_clean(row.get("章")) + chapter_name = get_clean(row.get("章的名称")) + + section_code = get_clean(row.get("节代码范围")) + section_name = get_clean(row.get("节名称")) + + category_code = get_clean(row.get("类目代码")) + category_name = get_clean(row.get("类目名称")) + + subcategory_code = get_clean(row.get("亚目代码")) + subcategory_name = get_clean(row.get("亚目名称")) + + diagnosis_code = get_clean(row.get("条目(诊断)代码")) + diagnosis_name = get_clean(row.get("条目(诊断)名称")) + + if not raw_chapter and not chapter_name: + continue + + # === 章 === + chapter_code = format_chapter_code(raw_chapter) + chap_label = chapter_name or chapter_code + chap_title = f"{chapter_code} {chapter_name}" if chapter_name else chapter_code + chap_key = (chapter_code, chap_label) + + if chap_key not in chapter_map: + chapter_node = TreeNode(code=chapter_code, label=chap_label, title=chap_title) + root.children.append(chapter_node) + chapter_map[chap_key] = chapter_node + else: + chapter_node = chapter_map[chap_key] + + current = chapter_node + + # === 节 === + if section_code or section_name: + sec_code, sec_label, sec_title = make_fields(section_code, section_name) + current = current.add_or_get_child(sec_code, sec_label, sec_title) + + # === 类目 === + if category_code or category_name: + cat_code, cat_label, cat_title = make_fields(category_code, category_name) + current = current.add_or_get_child(cat_code, cat_label, cat_title) + + # === 亚目 === + if subcategory_code or subcategory_name: + sub_code, sub_label, sub_title = make_fields(subcategory_code, subcategory_name) + current = current.add_or_get_child(sub_code, sub_label, sub_title) + + # === 条目 === + if diagnosis_code or diagnosis_name: + diag_code, diag_label, diag_title = make_fields(diagnosis_code, diagnosis_name) + current.add_or_get_child(diag_code, diag_label, diag_title) + +# === 转为带 id 的标准格式(DFS 顺序遍历)=== +next_id = 1 + +def node_to_dict(node): + global next_id + item = { + "id": next_id, + "code": node.code, + "label": node.label, + "title": node.title + } + next_id += 1 + + if node.children: + item["children"] = [node_to_dict(child) for child in node.children] + + return item + +treeData = [node_to_dict(chap) for chap in root.children] + +# === 保存 JSON === +with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as f: + json.dump(treeData, f, ensure_ascii=False, indent=2) + +print(f"✅ 树形结构已生成,共 {len(treeData)} 个章节点") +print(f"📄 输出文件: {OUTPUT_JSON_PATH}") \ No newline at end of file diff --git a/parse_data.py b/parse_data.py new file mode 100644 index 0000000..39f0f05 --- /dev/null +++ b/parse_data.py @@ -0,0 +1,165 @@ +import json +import re +import os +from neo4j import GraphDatabase + +# === 配置 === +NEO4J_URI = "bolt://localhost:7687" +NEO4J_USER = "neo4j" +NEO4J_PASSWORD = "12345678" # 👈 请确保密码正确 +RELATIONSHIP_FOLDER = r"D:\temp\669" +BATCH_SIZE = 100 + + +def sanitize_relationship_type(rel_type: str) -> str: + """清理关系类型,确保合法""" + if not isinstance(rel_type, str): + rel_type = str(rel_type) + sanitized = re.sub(r"[^a-zA-Z0-9_]", "", rel_type) + if not sanitized or sanitized[0].isdigit(): + sanitized = "REL_" + sanitized + return sanitized or "RELATED" + + +def extract_start_end(rel: dict): + """兼容多种字段名提取 start/end""" + for s_key, e_key in [("start", "end"), ("source", "target"), ("from", "to")]: + s = rel.get(s_key) + e = rel.get(e_key) + if s is not None and e is not None: + return s, e + return None, None + + +def load_relationships_from_file(filepath): + """从单个 JSON 文件加载 relationships""" + with open(filepath, "r", encoding="utf-8-sig") as f: + data = json.load(f) + + relationships = [] + if isinstance(data, list): + for item in data: + if isinstance(item, dict) and "relationships" in item: + relationships.extend(item["relationships"]) + elif isinstance(item, dict): + relationships.append(item) + elif isinstance(data, dict) and "relationships" in data: + relationships = data["relationships"] + else: + relationships = data if isinstance(data, list) else [] + + return relationships + + +def process_relationships(relationships): + """清洗并验证关系列表""" + valid_rels = [] + for rel in relationships: + start_id, end_id = extract_start_end(rel) + rel_type = rel.get("type", "RELATED") + props = rel.get("properties", {}) or {} + + if start_id is None or end_id is None: + continue + + try: + start_id = int(float(start_id)) + end_id = int(float(end_id)) + except (TypeError, ValueError): + continue + + valid_rels.append({ + "start": start_id, + "end": end_id, + "type": sanitize_relationship_type(rel_type), + "props": props + }) + return valid_rels + + +def import_relationships_in_batches(tx, rels, batch_size): + total = len(rels) + created_total = 0 + + for i in range(0, total, batch_size): + batch = rels[i:i + batch_size] + rel_groups = {} + for rel in batch: + rel_groups.setdefault(rel["type"], []).append({ + "start": rel["start"], + "end": rel["end"], + "props": rel["props"] + }) + + created_this_batch = 0 + for rel_type, group in rel_groups.items(): + cypher = f""" + UNWIND $rels AS r + MATCH (a {{nodeId: r.start}}) + MATCH (b {{nodeId: r.end}}) + WITH a, b, r + WHERE a IS NOT NULL AND b IS NOT NULL + MERGE (a)-[rel:`{rel_type}`]->(b) + SET rel += r.props + RETURN count(rel) AS c + """ + result = tx.run(cypher, rels=group).single() + created_this_batch += result["c"] + + created_total += created_this_batch + print(f" ➤ 本批创建关系: {created_this_batch} 条") + + return created_total + + +def main(): + driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) + + # 获取所有 JSON 文件,并按文件名排序(确保 relations_001.json 先于 002) + json_files = [f for f in os.listdir(RELATIONSHIP_FOLDER) if f.startswith("relations_") and f.endswith(".json")] + json_files.sort() # 按字典序排序,适用于 001, 002... 格式 + + if not json_files: + print("❌ 文件夹中没有找到 relations_*.json 文件") + return + + total_global_created = 0 + total_global_processed = 0 + + print(f"📁 找到 {len(json_files)} 个关系文件,开始逐个导入...\n") + + for idx, filename in enumerate(json_files, 1): + filepath = os.path.join(RELATIONSHIP_FOLDER, filename) + print(f"\n📄 [{idx}/{len(json_files)}] 正在处理: {filename}") + + try: + raw_rels = load_relationships_from_file(filepath) + print(f" ➤ 原始关系数: {len(raw_rels)}") + + valid_rels = process_relationships(raw_rels) + print(f" ➤ 有效关系数: {len(valid_rels)}") + + if not valid_rels: + print(" ⚠️ 跳过:无有效关系") + continue + + with driver.session() as session: + created = session.execute_write(import_relationships_in_batches, valid_rels, BATCH_SIZE) + + total_global_created += created + total_global_processed += len(valid_rels) + print(f" ✅ 文件 {filename} 导入完成,创建 {created} 条关系") + + except Exception as e: + print(f" ❌ 处理 {filename} 时出错: {e}") + continue # 继续处理下一个文件 + + print("\n" + "="*60) + print(f"🎉 全部导入完成!") + print(f"📊 总共处理有效关系: {total_global_processed}") + print(f"✅ 总共成功创建关系: {total_global_created}") + driver.close() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test1217.py b/test1217.py deleted file mode 100644 index 3aa9a5c..0000000 --- a/test1217.py +++ /dev/null @@ -1,173 +0,0 @@ -import pandas as pd -import json -import numpy as np -import re - -# === 配置 === -EXCEL_PATH = r"C:\Users\hanyuqing\Desktop\最新国家医保ICD编码\最新国家医保ICD编码\ICD-10医保版数据.xlsx" -OUTPUT_JSON_PATH = "icd10_tree_with_level.json" - -# === 章 code 格式化 === -def extract_chapter_number(text): - if not text: - return None - text = str(text).strip() - match = re.search(r'第([一二三四五六七八九十百\d]+)章', text) - if match: - num_str = match.group(1) - chinese_num_map = { - '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, - '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, - '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, - '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20, - '二十一': 21, '二十二': 22 - } - if num_str in chinese_num_map: - return chinese_num_map[num_str] - elif num_str.isdigit(): - return int(num_str) - if text.isdigit(): - return int(text) - if text.endswith('.') and text[:-1].isdigit(): - return int(text[:-1]) - return None - -def format_chapter_code(original): - num = extract_chapter_number(original) - if num is not None: - return f"第{num}章" - return str(original).strip() if original else "" - -def get_clean(val): - if val is None or str(val).strip().lower() in ("", "nan", "none"): - return None - return str(val).strip() - -def make_fields(code, name): - code_str = code or "" - label_str = name or "" - title_str = f"{code_str} {label_str}" if code_str and label_str else (code_str or label_str) - return code_str, label_str, title_str - -# === 带 level 的树节点 === -class TreeNode: - def __init__(self, code="", label="", title="", level=""): - self.code = code - self.label = label - self.title = title - self.level = level # 新增字段 - self.children = [] - self._child_key_set = set() # (code, label) - - def add_or_get_child(self, code, label, title, level): - key = (code, label) - if key in self._child_key_set: - for child in self.children: - if child.code == code and child.label == label: - return child - else: - new_child = TreeNode(code=code, label=label, title=title, level=level) - self.children.append(new_child) - self._child_key_set.add(key) - return new_child - return None - -# === 构建树 === -root = TreeNode() -chapter_map = {} - -df = pd.read_excel( - EXCEL_PATH, - header=1, - dtype=str, - engine='openpyxl' -) -df.columns = df.columns.astype(str).str.strip() -df = df.replace({np.nan: None}) - -for idx, row in df.iterrows(): - raw_chapter = get_clean(row.get("章")) - chapter_name = get_clean(row.get("章的名称")) - - section_code = get_clean(row.get("节代码范围")) - section_name = get_clean(row.get("节名称")) - - category_code = get_clean(row.get("类目代码")) - category_name = get_clean(row.get("类目名称")) - - subcategory_code = get_clean(row.get("亚目代码")) - subcategory_name = get_clean(row.get("亚目名称")) - - diagnosis_code = get_clean(row.get("条目(诊断)代码")) - diagnosis_name = get_clean(row.get("条目(诊断)名称")) - - if not raw_chapter and not chapter_name: - continue - - # === 章 === - chapter_code = format_chapter_code(raw_chapter) - chap_label = chapter_name or chapter_code - chap_title = f"{chapter_code} {chapter_name}" if chapter_name else chapter_code - chap_key = (chapter_code, chap_label) - - if chap_key not in chapter_map: - chapter_node = TreeNode( - code=chapter_code, - label=chap_label, - title=chap_title, - level="chapter" - ) - root.children.append(chapter_node) - chapter_map[chap_key] = chapter_node - else: - chapter_node = chapter_map[chap_key] - - current = chapter_node - - # === 节 === - if section_code or section_name: - sec_code, sec_label, sec_title = make_fields(section_code, section_name) - current = current.add_or_get_child(sec_code, sec_label, sec_title, "section") - - # === 类目 === - if category_code or category_name: - cat_code, cat_label, cat_title = make_fields(category_code, category_name) - current = current.add_or_get_child(cat_code, cat_label, cat_title, "category") - - # === 亚目 === - if subcategory_code or subcategory_name: - sub_code, sub_label, sub_title = make_fields(subcategory_code, subcategory_name) - current = current.add_or_get_child(sub_code, sub_label, sub_title, "subcategory") - - # === 条目 === - if diagnosis_code or diagnosis_name: - diag_code, diag_label, diag_title = make_fields(diagnosis_code, diagnosis_name) - current.add_or_get_child(diag_code, diag_label, diag_title, "diagnosis") - -# === 转为带 id 的 dict === -next_id = 1 - -def node_to_dict(node): - global next_id - item = { - "id": next_id, - "code": node.code, - "label": node.label, - "title": node.title, - "level": node.level # 新增 - } - next_id += 1 - - if node.children: - item["children"] = [node_to_dict(child) for child in node.children] - - return item - -treeData = [node_to_dict(chap) for chap in root.children] - -# === 保存 === -with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as f: - json.dump(treeData, f, ensure_ascii=False, indent=2) - -print(f"✅ 树形结构已生成,共 {len(treeData)} 个章节点") -print(f"📄 输出文件: {OUTPUT_JSON_PATH}") \ No newline at end of file