# kg_builder_nolinenode.py
"""
V4 图谱构建器 - 简化版 (w/o LineNode)
消融实验:移除 LineNode 演化链,用于证明 LineNode 的贡献
简化设计:
- 实体只保留最新状态(current_state, updated_at)
- 使用 Entity -[:MENTIONED_IN]-> Note 直接关系
"""
import json
from pathlib import Path
from typing import List, Dict
from knowledge_graph_nolinenode import KnowledgeGraphNoLineNode
from entity_extractor import ContextAwareExtractor, SummaryGenerator, EntitySummaryMerger
from config import DATA_PATHS, STM_CONFIG, LPM_CONFIG
class KGBuilderNoLineNode:
    """Simplified knowledge-graph builder (ablation variant without LineNode).

    Build pipeline:
      1. Create Note nodes, then the temporal edges between them.
      2. Walk the notes and extract entities/relations with context.
      3. Create or update each Entity, keeping only its latest state.
      4. Create the Entity -[:MENTIONED_IN]-> Note relationship.
      5. Periodically refresh the global summary.
    """

    def __init__(self, kg: "KnowledgeGraphNoLineNode"):
        """Wire the builder to a graph backend and the extraction helpers.

        Args:
            kg: Graph store that every node/relationship write goes through.
        """
        self.kg = kg
        self.extractor = ContextAwareExtractor()
        self.summary_generator = SummaryGenerator()
        self.entity_summary_merger = EntitySummaryMerger()
        # Number of preceding notes handed to the extractor as history.
        self.history_window = STM_CONFIG["history_window"]
        # Refresh the global summary every this many processed notes.
        self.summary_interval = LPM_CONFIG["summary_update_interval"]
        # Opened lazily by _log(); None whenever no log file is open.
        self._log_file = None

    def _log(self, msg: str) -> None:
        """Append one line to the build log, opening the file on first use.

        The log lives next to this module as ``extraction_log.txt`` and is
        opened in write mode, so every build that reopens it starts fresh.
        """
        if self._log_file is None:
            log_path = Path(__file__).parent / "extraction_log.txt"
            self._log_file = open(log_path, "w", encoding="utf-8")
        self._log_file.write(msg + "\n")
        # Flush per line so the log is usable even if the build dies midway.
        self._log_file.flush()

    def _close_log(self) -> None:
        """Close the build log (if open) and reset the handle.

        Resetting to None lets a later build on the same instance reopen
        the file instead of writing to a closed handle.
        """
        if self._log_file is not None:
            self._log_file.close()
            self._log_file = None

    @staticmethod
    def _note_seq(note_id, index: int) -> int:
        """Derive a note's sequence number from its id.

        Ids shaped like ``"N<int>"`` yield that integer; anything else
        (no ``N`` prefix, non-numeric suffix, non-string id) falls back to
        ``index`` rather than raising.
        """
        if isinstance(note_id, str) and note_id.startswith("N"):
            try:
                return int(note_id[1:])
            except ValueError:
                return index
        return index

    @staticmethod
    def _project_entities(entities, limit) -> List[dict]:
        """Reduce the first ``limit`` entities to name/type/summary dicts."""
        return [
            {
                "name": entity.get("name", ""),
                "type": entity.get("type", ""),
                "summary": entity.get("summary", ""),
            }
            for entity in entities[:limit]
        ]

    def build_from_conversations(self, conversations: List[dict]) -> dict:
        """Build the graph from a list of conversation dicts.

        Args:
            conversations: Conversation records, one Note is created per item.

        Returns:
            Whatever ``self.kg.get_stats()`` reports after the build.
        """
        print(f"[Builder] Building graph from {len(conversations)} conversations...")
        try:
            self._log("=" * 60)
            self._log(f"Building graph from {len(conversations)} conversations")
            self._log("=" * 60)

            # Phase 1: create the Note nodes.
            print("[Builder] Phase 1: Creating Note nodes...")
            for conv in conversations:
                self.kg.create_note(conv)

            # Phase 2: create the temporal edges.
            print("[Builder] Phase 2: Creating temporal edges...")
            self.kg.create_temporal_edges()

            # Phase 3: context-aware extraction of entities and relations.
            print("[Builder] Phase 3: Context-aware entity extraction...")
            self._log("\nPhase 3: Entity Extraction\n")
            self._extract_with_context(conversations)

            # Phase 4: produce the final global summary.
            print("[Builder] Phase 4: Generating global summary...")
            self._update_global_summary()

            stats = self.kg.get_stats()
            print(f"[Builder] Done. Stats: {stats}")
            self._log(f"\n{'=' * 60}")
            self._log(f"Build Complete. Stats: {stats}")
            return stats
        finally:
            # Always release the log handle, including when a phase raises.
            self._close_log()

    def _extract_with_context(self, conversations: List[dict]) -> None:
        """Run extraction over every note, feeding it the current graph context."""
        total = len(conversations)
        for i, conv in enumerate(conversations):
            note_id = conv.get("id", f"N{i+1}")
            seq = self._note_seq(note_id, i)

            # Progress display.
            print(f"[Builder] Extracting {i + 1}/{total}...")

            # Assemble the context passed to the extractor.
            global_summary = self.kg.get_global_summary()
            existing_entities = self._get_entities_for_prompt()
            history = self.kg.get_recent_notes(before_seq=seq, limit=self.history_window)

            result = self.extractor.extract(
                note=conv,
                global_summary=global_summary,
                existing_entities=existing_entities,
                history=history,
            )

            # Report how much was extracted, on stdout and in the log.
            entity_count = len(result.entity_updates)
            relation_count = len(result.relation_updates)
            msg = f" -> Extracted {entity_count} entities, {relation_count} relations"
            print(msg)
            self._log(f"[{i+1}/{total}] {note_id}: {msg}")

            # Entity details.
            if entity_count > 0:
                entities_summary = ", ".join(
                    f"{eu.entity_name}({eu.op})" for eu in result.entity_updates
                )
                entities_msg = f" Entities: {entities_summary}"
                print(entities_msg)
                self._log(f" {entities_summary}")

            # Relation details (log only).
            if relation_count > 0:
                relations_summary = ", ".join(
                    f"{ru.source}-{ru.relation_type}->{ru.target}"
                    for ru in result.relation_updates
                )
                self._log(f" Relations: {relations_summary}")

            # Apply the (simplified) CRUD operations to the graph.
            self._apply_extraction_result(result)

            # Periodic global-summary refresh; a falsy interval disables it
            # instead of raising ZeroDivisionError.
            if self.summary_interval and (i + 1) % self.summary_interval == 0:
                self._update_global_summary()

    def _get_entities_for_prompt(self) -> List[dict]:
        """Return the entity list (name/type/summary) used in the prompt.

        The list is capped at ``LPM_CONFIG["max_entities_in_prompt"]`` items.
        """
        return self._project_entities(
            self.kg.get_all_entities(),
            LPM_CONFIG["max_entities_in_prompt"],
        )

    def _apply_extraction_result(self, result) -> None:
        """Apply one extraction result to the graph (simplified variant).

        Differences from the LineNode-based builder:
          - no LineNode is created;
          - an Entity keeps only its latest state (current_state, updated_at);
          - link_entity_to_note() creates the MENTIONED_IN relationship.

        Relation updates in ``result`` are intentionally not processed here.
        """
        note_id = result.note_id

        for eu in result.entity_updates:
            if eu.op == "NOOP":
                continue

            # Create or update the entity, overwriting its latest state.
            self.kg.create_or_update_entity(
                name=eu.entity_name,
                display_name=eu.display_name,
                entity_type=eu.entity_type,
                heat=1,
                current_state=eu.state_change,  # latest state
                updated_at=note_id,  # the note id marks the last update
            )

            # Entity -[:MENTIONED_IN]-> Note.
            self.kg.link_entity_to_note(eu.entity_name, note_id)

            # Only CREATE/UPDATE carrying new text can change the summary.
            if eu.op not in ("CREATE", "UPDATE"):
                continue
            new_info = eu.state_change or eu.context
            if not new_info:
                continue

            current = self.kg.get_entity(eu.entity_name)
            if not current:
                continue

            # A stored summary of None is treated like an empty summary.
            old_summary = current.get("summary") or ""
            if new_info in old_summary:
                continue

            # Merge the new information into the summary via the LLM merger.
            merged_summary = self.entity_summary_merger.merge(
                entity_name=eu.entity_name,
                entity_type=eu.entity_type,
                old_summary=old_summary,
                new_info=new_info,
            )
            self.kg.update_entity_summary(eu.entity_name, merged_summary)

    def _update_global_summary(self, max_entities: int = 30) -> None:
        """Regenerate the global summary from the first entities in the graph.

        Args:
            max_entities: How many entities (from the front of
                ``get_all_entities()``) feed the summary generator.

        Does nothing when the graph has no entities.
        """
        entities = self.kg.get_all_entities()
        if not entities:
            return

        entities_for_summary = self._project_entities(entities, max_entities)
        new_summary = self.summary_generator.generate(entities_for_summary)
        self.kg.set_global_summary(new_summary)

    def build_from_benchmark_file(self, filepath: str) -> dict:
        """Build the graph from a benchmark JSON file.

        The file's top-level ``"conversations"`` list (empty if the key is
        absent) is passed to build_from_conversations().
        """
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        conversations = data.get("conversations", [])
        return self.build_from_conversations(conversations)
# Alias that keeps the same class name as the baseline builder.
KnowledgeGraphBuilder = KGBuilderNoLineNode
def build_graph(benchmark_file: str, clear_existing: bool = True) -> dict:
    """Convenience helper: build the graph from a benchmark file.

    Args:
        benchmark_file: Path of the benchmark JSON file to load.
        clear_existing: When true, call ``kg.clear_all()`` before building.

    Returns:
        The stats dict returned by the builder.
    """
    kg = KnowledgeGraphNoLineNode()
    try:
        if clear_existing:
            print("[Builder] Clearing existing data...")
            kg.clear_all()

        builder = KGBuilderNoLineNode(kg)
        return builder.build_from_benchmark_file(benchmark_file)
    finally:
        # Close the graph handle even if clearing or building raises.
        kg.close()
if __name__ == "__main__":
    # The benchmark path is resolved relative to this script's directory.
    benchmark_file = Path(__file__).parent / DATA_PATHS["benchmark_file"]

    if not benchmark_file.exists():
        print(f"File not found: {benchmark_file}")
    else:
        stats = build_graph(str(benchmark_file))
        print(f"Final stats: {stats}")