mine_rucweb/ss.py at main · 508liang/mine_rucweb · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
import json
import os
import pickle
from collections import defaultdict
from typing import List, Tuple
import jieba
import numpy as np
from sklearn.preprocessing import StandardScaler

class SearchEngine:
    def __init__(self, db_file='ruc_crawl.db', pages_dir='crawled_pages'):
        self.db_file = db_file
        self.pages_dir = pages_dir
        self.inverted_index = defaultdict(list)
        self.doc_info = {}
        self.doc_freq = defaultdict(int)
        self.total_docs = 0
        self.pagerank_scores = {}
        self.url_to_docid = {}
        self.docid_to_url = {}
        self.avg_doc_length = 0
        self.html_content = {}
        self.stopwords = self.load_stopwords()
        self.ltr_model = None
        self.scaler = None
        self.idf_cache = {}
        self.url_to_id = {}  # URL到ID的映射
        self.doc_titles = {}  # 文档标题映射

    def load_stopwords(self) -> set:
        stopwords = set(['的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个',
                          '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好',
                          '自己', '这', '那', '里', '个', '为', '与', '及', '等', '之', '中', '对', '把'])
        return stopwords

    def normalize_url(self, url):
        from urllib.parse import urlparse
        if not url:
            return ''
        parsed = urlparse(url.strip())
        scheme = parsed.scheme.lower()
        netloc = parsed.netloc.lower()
        path = parsed.path.rstrip('/') if parsed.path != '/' else '/'
        query = parsed.query
        normalized = f"{scheme}://{netloc}{path}"
        if query:
            normalized += f"?{query}"
        return normalized

    def get_url_hash(self, url):
        import hashlib
        return hashlib.md5(url.encode('utf-8')).hexdigest()

    def load_index(self, filename='search_index.pkl'):
        print("加载索引...")
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        self.inverted_index = defaultdict(list, data['inverted_index'])
        self.doc_info = data['doc_info']
        self.doc_freq = defaultdict(int, data['doc_freq'])
        self.total_docs = data['total_docs']
        self.url_to_docid = data['url_to_docid']
        self.docid_to_url = data['docid_to_url']
        self.avg_doc_length = data.get('avg_doc_length', 0)
        self.idf_cache = data.get('idf_cache', {})

        # 构建url_to_id和doc_titles
        self.url_to_id = {}
        self.doc_titles = {}
        for doc_id, info in self.doc_info.items():
            url = info.get('url', '')
            title = info.get('title', '')
            if url:
                self.url_to_id[url] = {'id': doc_id}
                self.doc_titles[doc_id] = title

        if not self.idf_cache:
            self._precompute_idf()
        print(f"索引加载完成：{self.total_docs} 个文档")

    def load_pagerank(self, pagerank_file='pagerank_results.json'):
        print("加载PageRank分数...")
        if os.path.exists(pagerank_file):
            with open(pagerank_file, 'r', encoding='utf-8') as f:
                self.pagerank_scores = json.load(f)
            print(f"加载了 {len(self.pagerank_scores)} 个PageRank分数")
        else:
            print("PageRank文件不存在，PageRank特征将为0")

    def _precompute_idf(self):
        for term, df in self.doc_freq.items():
            self.idf_cache[term] = np.log((self.total_docs - df + 0.5) / (df + 0.5) + 1)

    def tokenize(self, text: str) -> List[str]:
        if not text:
            return []
        tokens = jieba.cut(text.lower())
        tokens = [t.strip() for t in tokens if t.strip() and t not in self.stopwords and len(t.strip()) > 1]
        return tokens

    def compute_bm25_score_fast(self, query_terms: List[str], doc_id: int, doc_term_dict: dict, k1=1.5, b=0.75) -> float:
        if doc_id not in self.doc_info:
            return 0.0

        score = 0.0
        doc_length = self.doc_info[doc_id]['length']

        for term in query_terms:
            if term not in doc_term_dict:
                continue

            tf = doc_term_dict[term]
            idf = self.idf_cache.get(term, 0)

            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * doc_length / self.avg_doc_length)
            score += idf * (numerator / denominator)

        return score

    def get_candidates_fast(self, query_terms: List[str], initial_top_k=100) -> List[Tuple[int, dict]]:
        doc_term_freq = defaultdict(lambda: defaultdict(int))

        for term in query_terms:
            if term in self.inverted_index:
                for doc_id, tf in self.inverted_index[term]:
                    doc_term_freq[doc_id][term] = tf

        if not doc_term_freq:
            return []

        scored_docs = []
        for doc_id, term_dict in doc_term_freq.items():
            score = self.compute_bm25_score_fast(query_terms, doc_id, term_dict)
            scored_docs.append((score, doc_id, term_dict))

        top_candidates = sorted(scored_docs, key=lambda x: x[0], reverse=True)[:initial_top_k]
        return [(doc_id, term_dict) for _, doc_id, term_dict in top_candidates]

    def extract_features_batch(self, query_terms: List[str], candidates: List[Tuple[int, dict]]) -> np.ndarray:
        features_list = []
        query_terms_set = set(query_terms)

        for doc_id, term_dict in candidates:
            bm25 = self.compute_bm25_score_fast(query_terms, doc_id, term_dict)

            url = self.doc_info[doc_id]['url']
            pagerank = self.pagerank_scores.get(url, 0.0)

            matched_terms = len(query_terms_set & set(term_dict.keys()))
            query_coverage = matched_terms / len(query_terms) if query_terms else 0

            doc_length = np.log(self.doc_info[doc_id]['length'] + 1)

            title = self.doc_info[doc_id]['title'].lower()
            title_terms = set(self.tokenize(title))
            title_match = len(query_terms_set & title_terms)

            features_list.append([bm25, pagerank, query_coverage, doc_length, title_match])

        return np.array(features_list)

    def get_html_content(self, doc_id: int) -> str:
        if doc_id in self.html_content:
            return self.html_content[doc_id]

        if doc_id not in self.doc_info:
            return ""

        url = self.doc_info[doc_id]['url']
        url_hash = self.get_url_hash(url)
        html_file = os.path.join(self.pages_dir, f"{url_hash}.html")

        if os.path.exists(html_file):
            try:
                with open(html_file, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                    self.html_content[doc_id] = content
                    return content
            except Exception as e:
                print(f"读取HTML文件失败 {html_file}: {e}")
                return ""

        return ""

    def search(self, query: str, top_k=10, use_ltr=True) -> List[Tuple[str, float, str, str]]:
        query_terms = self.tokenize(query)
        if not query_terms:
            return []

        candidates = self.get_candidates_fast(query_terms, initial_top_k=70)

        if not candidates:
            return []

        if use_ltr and self.ltr_model is not None:
            features = self.extract_features_batch(query_terms, candidates)
            features_scaled = self.scaler.transform(features)
            scores = self.ltr_model.predict(features_scaled)

            scored_docs = []
            for idx, (doc_id, _) in enumerate(candidates):
                html_content = self.get_html_content(doc_id)
                scored_docs.append((
                    self.doc_info[doc_id]['url'],
                    scores[idx],
                    self.doc_info[doc_id]['title'],
                    html_content
                ))
        else:
            scored_docs = []
            for doc_id, term_dict in candidates:
                bm25 = self.compute_bm25_score_fast(query_terms, doc_id, term_dict)
                url = self.doc_info[doc_id]['url']
                html_content = self.get_html_content(doc_id)
                scored_docs.append((url, bm25, self.doc_info[doc_id]['title'], html_content))

        scored_docs.sort(key=lambda x: x[1], reverse=True)
        return scored_docs[:top_k]

    def search_with_filters(self, query: str, top_k=10, use_ltr=True,
                           sort_by='relevance', min_pagerank=0.0,
                           diversity_factor=0.0) -> List[Tuple[str, float, str, str]]:
        """
        高级搜索功能，支持多种过滤和排序选项

        Args:
            query: 查询字符串
            top_k: 返回结果数量
            use_ltr: 是否使用LTR模型
            sort_by: 排序方式 ('relevance', 'pagerank', 'length')
            min_pagerank: 最小PageRank分数过滤
            diversity_factor: 结果多样性因子 (0-1)，0表示不考虑多样性
        """
        query_terms = self.tokenize(query)
        if not query_terms:
            return []

        # 获取候选文档
        candidates = self.get_candidates_fast(query_terms, initial_top_k=100)
        if not candidates:
            return []

        # 计算初始得分
        if use_ltr and self.ltr_model is not None:
            features = self.extract_features_batch(query_terms, candidates)
            features_scaled = self.scaler.transform(features)
            initial_scores = self.ltr_model.predict(features_scaled)
        else:
            initial_scores = []
            for doc_id, term_dict in candidates:
                bm25 = self.compute_bm25_score_fast(query_terms, doc_id, term_dict)
                initial_scores.append(bm25)

        # 构建文档列表
        scored_docs = []
        for idx, (doc_id, _) in enumerate(candidates):
            url = self.doc_info[doc_id]['url']
            pagerank = self.pagerank_scores.get(url, 0.0)

            # 应用PageRank过滤
            if pagerank < min_pagerank:
                continue

            title = self.doc_info[doc_id]['title']
            html_content = self.get_html_content(doc_id)
            doc_length = self.doc_info[doc_id]['length']

            scored_docs.append({
                'url': url,
                'score': initial_scores[idx],
                'pagerank': pagerank,
                'title': title,
                'html_content': html_content,
                'length': doc_length
            })

        # 根据排序方式调整得分
        if sort_by == 'pagerank':
            for doc in scored_docs:
                doc['final_score'] = doc['pagerank']
        elif sort_by == 'length':
            for doc in scored_docs:
                doc['final_score'] = doc['length']
        else:  # relevance
            for doc in scored_docs:
                doc['final_score'] = doc['score']

        # 排序
        scored_docs.sort(key=lambda x: x['final_score'], reverse=True)

        # 应用多样性优化 (MMR算法)
        if diversity_factor > 0 and len(scored_docs) > top_k:
            scored_docs = self._apply_mmr(scored_docs, query_terms, top_k, diversity_factor)
        else:
            scored_docs = scored_docs[:top_k]

        # 格式化返回结果
        results = []
        for doc in scored_docs:
            results.append((
                doc['url'],
                doc['final_score'],
                doc['title'],
                doc['html_content']
            ))

        return results

    def _apply_mmr(self, docs, query_terms, top_k, lambda_param=0.5):
        """
        应用MMR (Maximal Marginal Relevance) 算法优化结果多样性

        Args:
            docs: 文档列表
            query_terms: 查询词列表
            top_k: 返回数量
            lambda_param: 相关性与多样性的权重平衡参数
        """
        if not docs:
            return []

        selected = []
        remaining = docs.copy()
        query_terms_set = set(query_terms)

        # 选择初始文档（相关性最高的）
        selected.append(remaining.pop(0))

        # 迭代选择剩余文档
        while len(selected) < top_k and remaining:
            max_mmr = -float('inf')
            max_idx = -1

            for idx, doc in enumerate(remaining):
                # 计算与查询的相关性
                relevance = doc['final_score']

                # 计算与已选文档的最大相似度
                max_similarity = 0
                doc_terms = set(self.tokenize(doc['title']))

                for selected_doc in selected:
                    selected_terms = set(self.tokenize(selected_doc['title']))
                    # 使用Jaccard相似度
                    if len(doc_terms | selected_terms) > 0:
                        similarity = len(doc_terms & selected_terms) / len(doc_terms | selected_terms)
                        max_similarity = max(max_similarity, similarity)

                # 计算MMR得分
                mmr_score = lambda_param * relevance - (1 - lambda_param) * max_similarity

                if mmr_score > max_mmr:
                    max_mmr = mmr_score
                    max_idx = idx

            if max_idx >= 0:
                selected.append(remaining.pop(max_idx))
            else:
                break

        return selected

def main():
    engine = SearchEngine()

    # 加载索引
    engine.load_index('search_index.pkl')

    # 加载PageRank
    engine.load_pagerank('pagerank_results.json')

    # 加载LTR模型
    if os.path.exists('ltr_model.pkl'):
        print("加载LTR模型...")
        with open('ltr_model.pkl', 'rb') as f:
            model_data = pickle.load(f)
            engine.ltr_model = model_data['model']
            engine.scaler = model_data['scaler']
        print("LTR模型加载完成\n")
    else:
        print("未找到LTR模型文件(ltr_model.pkl)，将使用BM25排序\n")

    print("=" * 60)
    print("搜索引擎已启动 (使用LTR模型排序)")
    print("=" * 60)

    while True:
        query = input("\n请输入查询内容（输入'退出'或'exit'结束）：").strip()
        if query.lower() in ['退出', 'exit', 'quit']:
            print("感谢使用，再见！")
            break

        if not query:
            print("查询内容不能为空，请重新输入")
            continue

        print(f"\n正在搜索: {query}")
        results = engine.search(query, top_k=10, use_ltr=True)

        if results:
            print(f"\n找到 {len(results)} 个结果：")
            print("-" * 60)
            for idx, (url, score, title, html_content) in enumerate(results, start=1):
                print(f"\n结果 {idx}:")
                print(f"  标题: {title}")
                print(f"  URL: {url}")
                print(f"  得分: {score:.4f}")
        else:
            print("\n没有找到相关结果，请尝试其他查询词")

if __name__ == '__main__':
    main()