# cmekg_aligner.py
"""Align free-text terms to entity names stored in a Neo4j graph database."""

from difflib import SequenceMatcher
from typing import Dict, List, Optional, Tuple


class CMeKGAligner:
    """Map raw terms onto the most similar ``name`` of a node in Neo4j.

    The aligner owns a Neo4j driver created in ``__init__``.  Call
    :meth:`close` (or use the instance as a context manager) when done so
    the driver is released.
    """

    # Substrings whose presence in a name marks the entity as a drug.
    _DRUG_MARKERS = ("片", "胶囊", "注射")

    # Fetches, for every input term, each node whose name contains the term
    # or is contained in it (case-insensitively).  Nodes with a null or empty
    # name are excluded: an empty name is a substring of every term and would
    # otherwise be returned as a candidate for all of them.
    _CANDIDATE_QUERY = """
    UNWIND $terms AS input_name
    MATCH (e)
    WHERE e.name IS NOT NULL AND e.name <> ''
      AND (toLower(e.name) CONTAINS toLower(input_name)
           OR toLower(input_name) CONTAINS toLower(e.name))
    RETURN input_name, e.name AS std_name
    """

    def __init__(self, uri, user, password):
        """Create the Neo4j driver for *uri* using basic *user*/*password* auth."""
        # Imported here rather than at module level, so importing this module
        # does not itself require the neo4j package.
        from neo4j import GraphDatabase

        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self) -> None:
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def __enter__(self) -> "CMeKGAligner":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def infer_type_by_name(self, name: str) -> str:
        """Return an entity type guessed from substrings of *name*.

        Returns ``"Drug"`` when *name* contains any of the drug markers,
        otherwise ``"Unknown"``.
        """
        if any(marker in name for marker in self._DRUG_MARKERS):
            return "Drug"
        # ... additional rules go here
        return "Unknown"

    def find_entities_batch(
        self, terms: List[str]
    ) -> Dict[str, Optional[Tuple[str, str]]]:
        """Align many terms using a single database query.

        :param terms: raw terms; duplicates are allowed.
        :return: ``{raw term: (standard name, type)}``, or ``None`` as the
            value when no candidate was found.  Blank (empty or
            whitespace-only) and non-string terms always map to ``None``.
        """
        if not terms:
            return {}

        # De-duplicate while keeping first-occurrence order.
        unique_terms = list(dict.fromkeys(terms))

        # A blank term must never reach the query: every name CONTAINS the
        # empty string, so it would "match" every node in the graph.
        query_terms = [
            term
            for term in unique_terms
            if isinstance(term, str) and term.strip()
        ]

        # {input term: {candidate name: None}} -- a dict is used as an
        # insertion-ordered set so repeated names are scored only once.
        candidates_map: Dict[str, Dict[str, None]] = {}
        if query_terms:
            with self.driver.session() as session:
                result = session.run(self._CANDIDATE_QUERY, terms=query_terms)
                # Read the records while the session is still open.
                for record in result:
                    candidates_map.setdefault(record["input_name"], {})[
                        record["std_name"]
                    ] = None

        output: Dict[str, Optional[Tuple[str, str]]] = {}
        for term in unique_terms:
            candidates = candidates_map.get(term)
            if not candidates:
                output[term] = None
                continue
            # Keep the candidate most similar to the raw term; on ties the
            # first candidate returned by the database wins.
            best_std = max(
                candidates,
                key=lambda name: SequenceMatcher(None, term, name).ratio(),
            )
            output[term] = (best_std, self.infer_type_by_name(best_std))
        return output