KGPython/python/parse_data.py


								import json

								import re

								import os

								from neo4j import GraphDatabase


								# === Configuration ===
								# The connection settings and the input folder can be overridden with an
								# environment variable of the same name; the literals are the defaults
								# used when the variable is not set.
								NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
								NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
								# Make sure the password is correct; prefer supplying it through the
								# NEO4J_PASSWORD environment variable instead of editing this default.
								NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "12345678")
								# Folder that is scanned for relations_*.json files.
								RELATIONSHIP_FOLDER = os.environ.get("RELATIONSHIP_FOLDER", r"D:\temp\669")
								# Number of relationships handled per batch during import.
								BATCH_SIZE = 100


								def sanitize_relationship_type(rel_type: str) -> str:
								    """Return a relationship type made only of ASCII letters, digits and ``_``.

								    Every other character (including non-ASCII letters) is removed. A
								    result that starts with a digit is prefixed with ``REL_``. ``None`` and
								    values that are empty after cleaning fall back to ``"RELATED"``.

								    Args:
								        rel_type: Raw relationship type; non-string values are converted
								            with ``str()`` before cleaning.

								    Returns:
								        A non-empty sanitized relationship type.
								    """
								    if rel_type is None:
								        # str(None) would otherwise yield the literal type "None".
								        return "RELATED"

								    cleaned = re.sub(r"[^a-zA-Z0-9_]", "", str(rel_type))
								    if not cleaned:
								        # Nothing usable survived cleaning: use the generic type rather
								        # than a bare "REL_" prefix.
								        return "RELATED"
								    if cleaned[0].isdigit():
								        return "REL_" + cleaned
								    return cleaned


								def extract_start_end(rel: dict):
								    """Pull the (start, end) endpoint pair out of a relationship record.

								    The key pairs ``start``/``end``, ``source``/``target`` and ``from``/``to``
								    are tried in that order; the first pair whose two values are both not
								    ``None`` is returned. ``(None, None)`` is returned when no pair is
								    complete.
								    """
								    key_pairs = (("start", "end"), ("source", "target"), ("from", "to"))
								    # Lazy generators: later key pairs are only looked up if earlier ones fail.
								    candidates = ((rel.get(head), rel.get(tail)) for head, tail in key_pairs)
								    complete = (
								        pair for pair in candidates
								        if pair[0] is not None and pair[1] is not None
								    )
								    return next(complete, (None, None))


								def load_relationships_from_file(filepath):
								    """Load the raw relationship records stored in one JSON file.

								    Accepted layouts:
								      * an object with a "relationships" list;
								      * a list whose items are either such wrapper objects (their lists
								        are concatenated) or relationship objects themselves.

								    Anything else, including a "relationships" value that is not a list
								    and list items that are not objects, contributes no records.

								    Args:
								        filepath: Path of the JSON file; a leading UTF-8 BOM is tolerated.

								    Returns:
								        A new list of raw relationship records (possibly empty).

								    Raises:
								        OSError: If the file cannot be opened.
								        ValueError: If the content cannot be decoded or is not valid JSON
								            (``json.JSONDecodeError`` is a ``ValueError``).
								    """
								    with open(filepath, "r", encoding="utf-8-sig") as f:
								        data = json.load(f)

								    if isinstance(data, dict):
								        # A lone object only counts when it is a {"relationships": ...} wrapper.
								        items = [data] if "relationships" in data else []
								    elif isinstance(data, list):
								        items = data
								    else:
								        items = []

								    relationships = []
								    for item in items:
								        if not isinstance(item, dict):
								            continue
								        if "relationships" in item:
								            nested = item["relationships"]
								            # A null or non-list value would crash extend() or hand the
								            # caller something that is not a list of records.
								            if isinstance(nested, list):
								                relationships.extend(nested)
								        else:
								            relationships.append(item)
								    return relationships


								def _coerce_node_id(value):
								    """Convert a raw endpoint value to an ``int`` node id, or ``None`` if it cannot be."""
								    if isinstance(value, int):
								        # Already integral: skip the float round-trip, which loses
								        # precision for integers above 2**53.
								        return int(value)
								    try:
								        if isinstance(value, str):
								            try:
								                return int(value)
								            except ValueError:
								                pass  # e.g. "12.0": fall through to the float path
								        return int(float(value))
								    except (TypeError, ValueError, OverflowError):
								        # TypeError: unsupported type (e.g. None, list).
								        # ValueError: non-numeric text or NaN.
								        # OverflowError: infinite values.
								        return None


								def process_relationships(relationships):
								    """Clean and validate a list of raw relationship records.

								    Entries are skipped when they are not dicts, when no complete endpoint
								    pair is found by ``extract_start_end``, or when an endpoint cannot be
								    converted to an integer. Fractional numeric endpoints are truncated
								    toward zero.

								    Args:
								        relationships: Iterable of raw relationship records.

								    Returns:
								        A list of dicts with the keys ``"start"`` and ``"end"`` (int node
								        ids), ``"type"`` (sanitized relationship type, ``"RELATED"`` when
								        the record has no ``"type"`` key) and ``"props"`` (the record's
								        ``"properties"`` value, or ``{}`` when missing or falsy).
								    """
								    valid_rels = []
								    for rel in relationships:
								        if not isinstance(rel, dict):
								            # A stray string or number must not abort the whole file.
								            continue

								        raw_start, raw_end = extract_start_end(rel)
								        start_id = _coerce_node_id(raw_start)
								        end_id = _coerce_node_id(raw_end)
								        if start_id is None or end_id is None:
								            continue

								        valid_rels.append({
								            "start": start_id,
								            "end": end_id,
								            "type": sanitize_relationship_type(rel.get("type", "RELATED")),
								            "props": rel.get("properties") or {},
								        })
								    return valid_rels


								def import_relationships_in_batches(tx, rels, batch_size):
								    """Merge relationships into the graph in slices of ``batch_size``.

								    Each slice is grouped by relationship type and one UNWIND/MERGE
								    statement is run per type, with the type interpolated into the query
								    text between backticks. Every statement is run on the ``tx`` object
								    passed in. Endpoints are looked up by their ``nodeId`` property.

								    Args:
								        tx: Object exposing ``run(query, rels=...)`` whose result offers
								            ``single()`` returning a mapping with key ``"c"``.
								        rels: Sequence of dicts with ``"start"``, ``"end"``, ``"type"`` and
								            ``"props"`` keys.
								        batch_size: Positive number of relationships per slice.

								    Returns:
								        The sum of the ``count(rel)`` values reported by all statements.
								        MERGE also matches relationships that already exist, so this is
								        the number of merged rows rather than strictly new relationships.

								    Raises:
								        ValueError: If ``batch_size`` is not positive.
								    """
								    if batch_size <= 0:
								        # A negative step would make range() empty and silently import nothing.
								        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

								    created_total = 0
								    for offset in range(0, len(rels), batch_size):
								        groups = {}
								        for rel in rels[offset:offset + batch_size]:
								            groups.setdefault(rel["type"], []).append(
								                {"start": rel["start"], "end": rel["end"], "props": rel["props"]}
								            )

								        created_this_batch = 0
								        for rel_type, group in groups.items():
								            # Double any backtick so the type cannot terminate the quoted
								            # identifier and inject Cypher.
								            quoted_type = str(rel_type).replace("`", "``")
								            cypher = (
								                "UNWIND $rels AS r\n"
								                "MATCH (a {nodeId: r.start})\n"
								                "MATCH (b {nodeId: r.end})\n"
								                "WITH a, b, r\n"
								                "WHERE a IS NOT NULL AND b IS NOT NULL\n"
								                f"MERGE (a)-[rel:`{quoted_type}`]->(b)\n"
								                "SET rel += r.props\n"
								                "RETURN count(rel) AS c"
								            )
								            created_this_batch += tx.run(cypher, rels=group).single()["c"]

								        created_total += created_this_batch
								        print(f"  ➤ 本批创建关系: {created_this_batch} 条")

								    return created_total


								def main():
								    """Import every relations_*.json file in RELATIONSHIP_FOLDER into Neo4j.

								    Files are handled one at a time in name order. For each file the
								    records are loaded, validated and written through one
								    ``session.execute_write`` call. A failure in one file is reported and
								    the remaining files are still processed. Totals are printed at the end.

								    Raises:
								        OSError: If RELATIONSHIP_FOLDER cannot be listed.
								    """
								    # List the files before creating the driver so that an empty or missing
								    # folder never leaves a driver open. Lexicographic order puts
								    # relations_001.json before relations_002.json (zero-padded numbering).
								    json_files = sorted(
								        f for f in os.listdir(RELATIONSHIP_FOLDER)
								        if f.startswith("relations_") and f.endswith(".json")
								    )
								    if not json_files:
								        print("❌ 文件夹中没有找到 relations_*.json 文件")
								        return

								    total_global_created = 0
								    total_global_processed = 0

								    print(f"📁 找到 {len(json_files)} 个关系文件，开始逐个导入...\n")

								    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
								    try:
								        for idx, filename in enumerate(json_files, 1):
								            filepath = os.path.join(RELATIONSHIP_FOLDER, filename)
								            print(f"\n📄 [{idx}/{len(json_files)}] 正在处理: {filename}")

								            try:
								                raw_rels = load_relationships_from_file(filepath)
								                print(f"   ➤ 原始关系数: {len(raw_rels)}")

								                valid_rels = process_relationships(raw_rels)
								                print(f"   ➤ 有效关系数: {len(valid_rels)}")

								                if not valid_rels:
								                    print("   ⚠️  跳过：无有效关系")
								                    continue

								                with driver.session() as session:
								                    created = session.execute_write(
								                        import_relationships_in_batches, valid_rels, BATCH_SIZE
								                    )

								                total_global_created += created
								                total_global_processed += len(valid_rels)
								                print(f"   ✅ 文件 {filename} 导入完成，创建 {created} 条关系")

								            except Exception as e:
								                # Per-file boundary: report the failure and move on to the
								                # next file instead of aborting the whole run.
								                print(f"   ❌ 处理 {filename} 时出错: {e}")
								                continue
								    finally:
								        # Release the driver even if something unexpected escapes the loop.
								        driver.close()

								    print("\n" + "="*60)
								    print("🎉 全部导入完成！")
								    print(f"📊 总共处理有效关系: {total_global_processed}")
								    print(f"✅ 总共成功创建关系: {total_global_created}")


								# Run the import only when this file is executed as a script, not when
								# it is imported as a module.
								if __name__ == "__main__":

								    main()