# KGPython/python/cmekg_aligner.py

# cmekg_aligner.py

from difflib import SequenceMatcher
from typing import List, Dict, Optional, Tuple

class CMeKGAligner:
    """Align free-text terms to entity names stored in a Neo4j graph.

    Candidates are fetched with a case-insensitive, bidirectional substring
    match on the ``name`` property of every node, then ranked in Python by
    string similarity.  The instance owns a Neo4j driver; release it with
    :meth:`close` or by using the aligner as a context manager.
    """

    # Substrings that mark a name as a drug (tablet / capsule / injection).
    _DRUG_MARKERS = ("片", "胶囊", "注射")

    def __init__(self, uri: str, user: str, password: str) -> None:
        """Create the Neo4j driver.

        :param uri: connection URI passed to ``GraphDatabase.driver``
        :param user: user name for basic authentication
        :param password: password for basic authentication
        """
        # Function-scope import: neo4j is only needed once an aligner is built.
        from neo4j import GraphDatabase
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self) -> None:
        """Close the underlying driver, if one was created."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.close()

    def __enter__(self) -> "CMeKGAligner":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def infer_type_by_name(self, name: str) -> str:
        """Guess the entity type from its name.

        :param name: standard entity name
        :return: ``"Drug"`` when the name contains a drug marker, otherwise
            ``"Unknown"``
        """
        if any(marker in name for marker in self._DRUG_MARKERS):
            return "Drug"
        # ... other rules
        return "Unknown"

    @staticmethod
    def _select_best(term: str, candidates: List[str]) -> str:
        """Return the candidate most similar to ``term``.

        The database filter is case-insensitive, so similarity is computed on
        lower-cased strings first; the case-sensitive ratio only breaks ties,
        which keeps an exact-case match ahead of its case variants.  Among
        fully tied candidates the first one wins.
        """
        lowered_term = term.lower()

        def score(name: str) -> Tuple[float, float]:
            return (
                SequenceMatcher(None, lowered_term, name.lower()).ratio(),
                SequenceMatcher(None, term, name).ratio(),
            )

        # dict.fromkeys drops duplicate candidates while keeping their order,
        # so each distinct name is scored once.
        return max(dict.fromkeys(candidates), key=score)

    def find_entities_batch(self, terms: List[str]) -> Dict[str, Optional[Tuple[str, str]]]:
        """Align a batch of terms using a single database query.

        :param terms: raw terms; duplicates are allowed
        :return: ``{raw term: (standard name, type)}``, or ``None`` as the
            value when no candidate was found.  Keys follow first-occurrence
            order of ``terms``.
        """
        if not terms:
            return {}

        # 1. De-duplicate while keeping first-occurrence order.
        unique_terms = list(dict.fromkeys(terms))

        # An empty string is contained in every name and would match the whole
        # graph; non-string values cannot match at all.  Neither is queried.
        queryable = [t for t in unique_terms if isinstance(t, str) and t]

        # 2. Fetch every possible candidate from Neo4j in one round trip.
        candidates_map: Dict[str, List[str]] = {}
        if queryable:
            with self.driver.session() as session:
                result = session.run(
                    """
                UNWIND $terms AS input_name
                MATCH (e)
                WHERE toLower(e.name) CONTAINS toLower(input_name)
                   OR toLower(input_name) CONTAINS toLower(e.name)
                RETURN input_name, e.name AS std_name
                """,
                    terms=queryable
                )
                # Build {input_name: [std_name1, std_name2, ...]}.
                for record in result:
                    std = record["std_name"]
                    # An empty name is a substring of every term; skip it so it
                    # cannot become a candidate for everything.
                    if not isinstance(std, str) or not std:
                        continue
                    candidates_map.setdefault(record["input_name"], []).append(std)

        # 3. For each input term pick the most similar standard name.
        output: Dict[str, Optional[Tuple[str, str]]] = {}
        for term in unique_terms:
            candidates = candidates_map.get(term)
            if not candidates:
                output[term] = None
                continue
            best_std = self._select_best(term, candidates)
            output[term] = (best_std, self.infer_type_by_name(best_std))

        return output
all 3 months ago			`# cmekg_aligner.py`

			`from difflib import SequenceMatcher`
			`from typing import List, Dict, Optional, Tuple`

			`class CMeKGAligner:`
			`def __init__(self, uri, user, password):`
			`from neo4j import GraphDatabase`
			`self.driver = GraphDatabase.driver(uri, auth=(user, password))`

			`def infer_type_by_name(self, name: str) -> str:`
			`# 你的类型推断逻辑（保持不变）`
			`if "片" in name or "胶囊" in name or "注射" in name:`
			`return "Drug"`
			`# ... 其他规则`
			`return "Unknown"`

			`def find_entities_batch(self, terms: List[str]) -> Dict[str, Optional[Tuple[str, str]]]:`
			`"""`
			`批量对齐上万条实体，仅一次数据库查询`
			`:param terms: 原始术语列表（允许重复）`
			`:return: {原始词: (标准名, 类型) 或 None}`
			`"""`
			`if not terms:`
			`return {}`

			`# 1. 去重并保留顺序（可选）`
			`unique_terms = list(dict.fromkeys(terms)) # 保持首次出现顺序`

			`# 2. 一次性从 Neo4j 获取所有可能的候选实体`
			`with self.driver.session() as session:`
			`result = session.run(`
			`"""`
			`UNWIND $terms AS input_name`
			`MATCH (e)`
			`WHERE toLower(e.name) CONTAINS toLower(input_name)`
			`OR toLower(input_name) CONTAINS toLower(e.name)`
			`RETURN input_name, e.name AS std_name`
			`""",`
			`terms=unique_terms`
			`)`
			`# 构建 {input_name: [std_name1, std_name2, ...]}`
			`candidates_map = {}`
			`for record in result:`
			`inp = record["input_name"]`
			`std = record["std_name"]`
			`if inp not in candidates_map:`
			`candidates_map[inp] = []`
			`candidates_map[inp].append(std)`

			`# 3. 对每个输入词，从候选中选最相似的标准名`
			`output = {}`
			`for term in terms: # 遍历原始列表（含重复）`
			`if term in output: # 已处理过（因重复）`
			`continue`

			`candidates = candidates_map.get(term, [])`
			`if not candidates:`
			`output[term] = None`
			`else:`
			`# 选与 term 最相似的标准名`
			`best_std = max(candidates, key=lambda x: SequenceMatcher(None, term, x).ratio())`
			`entity_type = self.infer_type_by_name(best_std)`
			`output[term] = (best_std, entity_type)`

			`return output`