-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrecommendation_system.py
More file actions
445 lines (364 loc) · 16.8 KB
/
recommendation_system.py
File metadata and controls
445 lines (364 loc) · 16.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
"""
推荐系统和搜索统计模块
提供今日推荐、热门话题、搜索趋势分析等功能
"""
import json
import os
import time
from collections import defaultdict, Counter
from datetime import datetime, timedelta
from typing import List, Dict, Tuple
import numpy as np
from bs4 import BeautifulSoup
import jieba
class RecommendationSystem:
    """Recommendation system: daily picks, trending topics and related searches.

    Built on top of an already-constructed search engine object. Assumed
    engine interface (confirm against the engine implementation): ``doc_info``,
    ``docid_to_url``, ``pagerank_scores``, ``doc_freq``, ``idf_cache``,
    ``inverted_index``, ``doc_titles``, ``avg_doc_length``, ``total_docs``,
    ``tokenize``, ``get_url_hash`` and ``pages_dir``.
    """

    def __init__(self, search_engine):
        self.engine = search_engine
        self.cache_file = 'recommendations_cache.json'
        self.cache_lifetime = 3600  # cached results are valid for one hour
        self.recommendations_cache = {}
        # Extended stopword list (Chinese + English + punctuation).
        self.stopwords = self._load_stopwords()
        self.load_cache()

    def _load_stopwords(self) -> set:
        """Build the combined Chinese/English/punctuation stopword set."""
        # Common Chinese function words, pronouns and connectives.
        chinese_stopwords = {'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个',
                             '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好',
                             '自己', '这', '那', '里', '个', '为', '与', '及', '等', '之', '中', '对', '把',
                             '而', '或', '以', '可以', '这个', '那个', '什么', '怎么', '为什么', '如何',
                             '但是', '因为', '所以', '如果', '虽然', '已经', '还是', '可是', '只是',
                             '它', '他', '她', '我们', '你们', '他们', '它们', '这些', '那些',
                             '能', '能够', '应该', '必须', '可能', '将', '将要', '已', '曾', '曾经'}
        # Common English articles, prepositions, conjunctions and pronouns.
        english_stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                             'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                             'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                             'would', 'should', 'could', 'may', 'might', 'must', 'can', 'this',
                             'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they',
                             'what', 'which', 'who', 'when', 'where', 'why', 'how', 'all', 'each',
                             'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
                             'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
                             'about', 'after', 'before', 'through', 'during', 'into', 'over',
                             'under', 'above', 'below', 'between', 'among', 'up', 'down', 'out'}
        # Full-width and ASCII punctuation.
        special_chars = {'、', '。', ',', '!', '?', ';', ':', '“', '”', '‘', '’', '(', ')',
                         '【', '】', '《', '》', '…', '—', '·', '~', '`', '!', '@', '#', '$',
                         '%', '^', '&', '*', '(', ')', '-', '_', '=', '+', '[', ']', '{', '}',
                         '|', '\\', '/', '<', '>', ',', '.', '?', ';', ':', '\'', '"'}
        return chinese_stopwords | english_stopwords | special_chars

    def load_cache(self):
        """Load the recommendation cache from disk; a missing or corrupt
        cache file just resets the cache to empty."""
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r', encoding='utf-8') as f:
                    self.recommendations_cache = json.load(f)
            except (OSError, json.JSONDecodeError, UnicodeDecodeError):
                # Unreadable or corrupt cache -- start fresh rather than crash.
                self.recommendations_cache = {}

    def save_cache(self):
        """Persist the recommendation cache to disk (best effort)."""
        try:
            with open(self.cache_file, 'w', encoding='utf-8') as f:
                json.dump(self.recommendations_cache, f, ensure_ascii=False, indent=2)
        except OSError as e:
            print(f"保存推荐缓存失败: {e}")

    def get_daily_recommendations(self, top_k=10) -> List[Dict]:
        """Return up to ``top_k`` high-quality documents for a "daily picks" list.

        Quality combines PageRank (70%) with a length score (30%) that favours
        documents close to the corpus average length. Results are de-duplicated
        by domain and cached for ``cache_lifetime`` seconds.
        """
        cache_key = 'daily_recommendations'
        cached = self.recommendations_cache.get(cache_key)
        if cached and time.time() - cached['timestamp'] < self.cache_lifetime:
            return cached['data']
        print("生成今日推荐...")
        # Hoisted out of the selection loop (was imported once per iteration).
        from urllib.parse import urlparse
        recommendations = []
        # Score every document: PageRank blended with a length-proximity score.
        doc_scores = []
        for doc_id, info in self.engine.doc_info.items():
            url = self.engine.docid_to_url.get(doc_id)
            if not url:
                continue
            pagerank = self.engine.pagerank_scores.get(url, 0)
            # Favour documents of roughly average length with high PageRank.
            length_score = 1.0 / (1.0 + abs(info['length'] - self.engine.avg_doc_length) / self.engine.avg_doc_length)
            quality_score = pagerank * 0.7 + length_score * 0.3
            doc_scores.append((doc_id, url, quality_score, pagerank))
        doc_scores.sort(key=lambda x: x[2], reverse=True)
        # Scan a 3x oversample so domain de-duplication can still fill top_k.
        selected_domains = set()
        for doc_id, url, quality_score, pagerank in doc_scores[:top_k * 3]:
            if len(recommendations) >= top_k:
                break
            domain = urlparse(url).netloc
            if domain in selected_domains:
                continue  # at most one recommendation per domain
            selected_domains.add(domain)
            title = self.engine.doc_titles.get(doc_id, '')
            # Summary and keywords are extracted from the cached HTML.
            html_content = self._get_doc_html(doc_id, url)
            summary = self._extract_summary(html_content, max_length=150)
            keywords = self._extract_keywords(html_content, top_n=5)
            recommendations.append({
                'url': url,
                'title': title if title else url,
                'summary': summary,
                'keywords': keywords,
                'pagerank': float(pagerank),
                'quality_score': float(quality_score)
            })
        self.recommendations_cache[cache_key] = {
            'timestamp': time.time(),
            'data': recommendations
        }
        self.save_cache()
        return recommendations

    def get_trending_topics(self, top_k=10) -> List[Dict]:
        """Return up to ``top_k`` trending topic terms.

        Importance is total term frequency * IDF * document frequency; terms
        that are substrings of an already-selected topic are skipped for
        diversity. Results are cached for ``cache_lifetime`` seconds.
        """
        cache_key = 'trending_topics'
        cached = self.recommendations_cache.get(cache_key)
        if cached and time.time() - cached['timestamp'] < self.cache_lifetime:
            return cached['data']
        print("分析热门话题...")
        term_stats = []
        for term, doc_freq in self.engine.doc_freq.items():
            # Skip single characters, stopwords and pure numbers.
            if len(term) < 2 or term.lower() in self.stopwords or term.isdigit():
                continue
            idf = self.engine.idf_cache.get(term, 0)
            # The inverted index stores postings either as (doc_id, tf) tuples
            # or as dicts with a 'tf' key; sum term frequency over both forms.
            total_tf = 0
            for item in self.engine.inverted_index.get(term, []):
                if isinstance(item, tuple) and len(item) >= 2:
                    total_tf += item[1]
                elif isinstance(item, dict):
                    total_tf += item.get('tf', 0)
            importance = total_tf * idf * doc_freq
            coverage = doc_freq / self.engine.total_docs
            term_stats.append({
                'term': term,
                'doc_count': doc_freq,
                'coverage': coverage,
                'importance': importance
            })
        term_stats.sort(key=lambda x: x['importance'], reverse=True)
        # Pick topics, skipping terms too similar to an already-chosen one.
        topics = []
        selected_terms = set()
        for stat in term_stats:
            if len(topics) >= top_k:
                break
            term = stat['term']
            if any(selected in term or term in selected for selected in selected_terms):
                continue
            selected_terms.add(term)
            topics.append({
                'topic': term,
                'doc_count': stat['doc_count'],
                'relevance_score': float(stat['importance'])
            })
        self.recommendations_cache[cache_key] = {
            'timestamp': time.time(),
            'data': topics
        }
        self.save_cache()
        return topics

    def get_related_searches(self, query: str, top_k=5) -> List[str]:
        """Suggest up to ``top_k`` related queries for ``query``.

        Suggestions are the original query extended with terms that co-occur
        most often in documents containing the query terms.
        """
        query_terms = self.engine.tokenize(query)
        if not query_terms:
            return []
        # Count terms co-occurring with the query terms across documents.
        cooccurrence = Counter()
        for term in query_terms:
            postings = self.engine.inverted_index.get(term)
            if not postings:
                continue
            for doc_id, _tf in postings:
                if doc_id not in self.engine.doc_info:
                    continue
                for other_term in self.engine.doc_info[doc_id].get('terms', []):
                    if other_term not in query_terms and len(other_term) > 1:
                        cooccurrence[other_term] += 1
        related = []
        for term, _count in cooccurrence.most_common(top_k * 2):
            related.append(query + ' ' + term)
            if len(related) >= top_k:
                break
        return related

    def _get_doc_html(self, doc_id: int, url: str) -> str:
        """Return the cached HTML for a document, or '' if unavailable."""
        # Prefer the engine's in-memory HTML cache when it exists.
        if hasattr(self.engine, 'html_content') and doc_id in self.engine.html_content:
            return self.engine.html_content[doc_id]
        # Fall back to the on-disk page store keyed by URL hash.
        url_hash = self.engine.get_url_hash(url)
        html_file = os.path.join(self.engine.pages_dir, f"{url_hash}.html")
        if os.path.exists(html_file):
            try:
                with open(html_file, 'r', encoding='utf-8') as f:
                    return f.read()
            except (OSError, UnicodeDecodeError):
                pass  # unreadable page file -- treat as missing
        return ""

    def _extract_summary(self, html_content: str, max_length=150) -> str:
        """Extract a plain-text summary of at most ``max_length`` characters
        from HTML; returns '' if the HTML cannot be parsed."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            # Drop script and style contents so they don't pollute the text.
            for script in soup(["script", "style"]):
                script.decompose()
            text = ' '.join(soup.get_text().split())
            if len(text) > max_length:
                text = text[:max_length] + '...'
            return text
        except Exception:
            return ""

    def _extract_keywords(self, html_content: str, top_n=5) -> List[str]:
        """Extract the ``top_n`` most frequent non-stopword terms from HTML;
        returns [] if the HTML cannot be parsed."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            for script in soup(["script", "style"]):
                script.decompose()
            words = jieba.cut(soup.get_text().lower())
            word_count = Counter()
            for word in words:
                # Fixed: use the extended stopword list built in __init__
                # (self.stopwords) rather than self.engine.stopwords, which
                # this class never populates and the engine may not define.
                if len(word) > 1 and word not in self.stopwords:
                    word_count[word] += 1
            return [word for word, _count in word_count.most_common(top_n)]
        except Exception:
            return []
class SearchAnalytics:
    """Search analytics: records search events and summarises user behaviour."""

    def __init__(self, log_file='search_analytics.json'):
        self.log_file = log_file
        self.search_logs = []  # list of dicts: query/result_count/search_time/timestamp/date
        self.load_logs()

    def load_logs(self):
        """Load the search log from disk; a missing or corrupt log file
        just resets the log to empty."""
        if os.path.exists(self.log_file):
            try:
                with open(self.log_file, 'r', encoding='utf-8') as f:
                    self.search_logs = json.load(f)
            except (OSError, json.JSONDecodeError, UnicodeDecodeError):
                # Unreadable or corrupt log -- start fresh rather than crash.
                self.search_logs = []

    def save_logs(self):
        """Persist the search log to disk (best effort), keeping only the
        most recent 1000 entries."""
        try:
            if len(self.search_logs) > 1000:
                self.search_logs = self.search_logs[-1000:]
            with open(self.log_file, 'w', encoding='utf-8') as f:
                json.dump(self.search_logs, f, ensure_ascii=False, indent=2)
        except OSError as e:
            print(f"保存搜索日志失败: {e}")

    def log_search(self, query: str, result_count: int, search_time: float):
        """Record one search event and persist the log immediately."""
        self.search_logs.append({
            'query': query,
            'result_count': result_count,
            'search_time': search_time,
            'timestamp': time.time(),
            'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        })
        self.save_logs()

    def get_popular_queries(self, days=7, top_k=10) -> List[Dict]:
        """Return the ``top_k`` most frequent queries of the last ``days`` days
        as ``{'query', 'count'}`` dicts."""
        cutoff_time = time.time() - days * 86400
        query_count = Counter(
            log['query'] for log in self.search_logs
            if log['timestamp'] >= cutoff_time
        )
        return [
            {'query': query, 'count': count}
            for query, count in query_count.most_common(top_k)
        ]

    def get_search_trends(self, days=7) -> Dict:
        """Return per-day and overall search statistics for the last ``days``
        days: daily counts with average result counts and times, plus totals."""
        cutoff_time = time.time() - days * 86400
        # Single pass over the window (originally the log was filtered three
        # separate times: once per day-bucket, once for totals, once for times).
        recent = [log for log in self.search_logs if log['timestamp'] >= cutoff_time]
        daily_stats = defaultdict(lambda: {
            'count': 0,
            'avg_results': 0,
            'avg_time': 0
        })
        for log in recent:
            date = datetime.fromtimestamp(log['timestamp']).strftime('%Y-%m-%d')
            daily_stats[date]['count'] += 1
            # Accumulate sums first; converted to averages below.
            daily_stats[date]['avg_results'] += log['result_count']
            daily_stats[date]['avg_time'] += log['search_time']
        for stats in daily_stats.values():
            if stats['count'] > 0:
                stats['avg_results'] = stats['avg_results'] / stats['count']
                stats['avg_time'] = stats['avg_time'] / stats['count']
        total_searches = len(recent)
        avg_search_time = np.mean([log['search_time'] for log in recent]) if total_searches > 0 else 0
        return {
            'daily_stats': dict(daily_stats),
            'total_searches': total_searches,
            'avg_search_time': float(avg_search_time),
            'period_days': days
        }

    def get_statistics(self) -> Dict:
        """Return overall totals and averages over the entire log."""
        if not self.search_logs:
            # Empty log: report zeros rather than letting np.mean see [].
            return {
                'total_searches': 0,
                'unique_queries': 0,
                'avg_results': 0,
                'avg_search_time': 0
            }
        return {
            'total_searches': len(self.search_logs),
            'unique_queries': len(set(log['query'] for log in self.search_logs)),
            'avg_results': float(np.mean([log['result_count'] for log in self.search_logs])),
            'avg_search_time': float(np.mean([log['search_time'] for log in self.search_logs]))
        }