Skip to content

Commit 963c5b8

Browse files
committed
Fix: add paper role to auto tagger
1 parent f2f15b0 commit 963c5b8

1 file changed

Lines changed: 121 additions & 34 deletions

File tree

scripts/auto_tagger.py

Lines changed: 121 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -8,88 +8,175 @@
# Chat-completions endpoint that the tagging request is POSTed to.
API_URL = "https://api.deepseek.com/chat/completions"

# Directory, relative to the working directory, that is scanned for ``.md`` posts.
POSTS_DIR = "_posts"

11+
1112
def get_existing_tags():
    """Collect every tag already used in the front matter of existing posts.

    Each ``.md`` file in ``POSTS_DIR`` is split on its first two ``---``
    markers, the text between them is parsed as YAML, and the entries of its
    ``tags`` list are gathered.

    Returns:
        list[str]: The unique tags as stripped strings, sorted. ``[]`` when
        ``POSTS_DIR`` does not exist.
    """
    if not os.path.exists(POSTS_DIR):
        return []

    all_tags = set()
    for filename in os.listdir(POSTS_DIR):
        if not filename.endswith(".md"):
            continue

        filepath = os.path.join(POSTS_DIR, filename)
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()

            parts = content.split("---", 2)
            if len(parts) < 3:
                continue
            front_matter = yaml.safe_load(parts[1])
        except (OSError, ValueError, yaml.YAMLError) as e:
            # Best effort: an unreadable, undecodable (UnicodeDecodeError is a
            # ValueError) or malformed post must not stop the whole scan.
            print(f"Skipping {filename} while collecting tags: {e}")
            continue

        if not isinstance(front_matter, dict):
            continue
        tags = front_matter.get("tags")
        if not isinstance(tags, list):
            continue

        for t in tags:
            # YAML can yield ints, dates or nested nodes. The result is sorted
            # here and joined into a prompt by the caller, so only scalar
            # values are kept, and always as text.
            if t is None or isinstance(t, (list, dict)):
                continue
            tag = str(t).strip()
            if tag:
                all_tags.add(tag)

    return sorted(all_tags)
2839

40+
2941
# Per-category (persona, extraction guidance) pairs used to build the prompt.
# Keeping both in one table stops a category from gaining a persona without
# guidance, or the other way round.
_CATEGORY_PROFILES = {
    "study": (
        "资深技术专家",
        "这是一篇学习笔记。请侧重提取核心技术栈、框架、编程语言等专业词汇。",
    ),
    "anime": (
        "资深二次元漫评人",
        "这是一篇动漫相关博文。请务必提取作品名称作为首个标签,并提取核心角色、制作公司、类型题材(如机战、日常)等。",
    ),
    "music": (
        "专业乐评人",
        "这是一篇音乐鉴赏。请提取歌手/社团/作者名、曲风、专辑名等。",
    ),
    "paint": (
        "美术鉴赏家",
        "这是一篇绘画分享。请提取其中的人物名字、画师名、艺术风格、绘制工具等。",
    ),
    "game": (
        "骨灰级游戏玩家",
        "这是一篇游戏记录。请务必提取游戏名称作为首个标签,并提取游戏类型、核心机制或开发商等。",
    ),
    "snap": (
        "专业摄影师",
        "这是一篇摄影作品。请提取相机型号、镜头焦段、拍摄地点、摄影风格等。",
    ),
    "asmr": (
        "ASMR深度体验者",
        "这是一篇助眠相关内容。请提取音声作者名、触发音类型(如耳语、心跳、底噪)、设备等。",
    ),
    "emo": (
        "情感作家",
        "这是一篇心情随笔。请提取抽象的情感意象或核心感悟名词。",
    ),
    "paper": (
        "资深学术研究员",
        "这是一篇学术论文阅读笔记或研究总结。请务必提取论文的研究领域、核心算法、关键模型(如Transformer、CNN)、数据集名称等学术专业词汇。",
    ),
}

# Persona and (empty) guidance used for categories missing from the table.
_DEFAULT_PROFILE = ("专业博客编辑", "")


def _clean_tags(parsed):
    """Return the usable tag strings found in a decoded model reply.

    Accepts either ``{"tags": [...]}`` or a bare JSON array. Non-string
    entries, blank strings and duplicates are dropped; order is preserved.
    Any other shape yields ``[]``.
    """
    if isinstance(parsed, dict):
        parsed = parsed.get("tags")
    if not isinstance(parsed, list):
        return []

    cleaned = []
    for item in parsed:
        if not isinstance(item, str):
            continue
        tag = item.strip()
        if tag and tag not in cleaned:
            cleaned.append(tag)
    return cleaned


def get_tags_from_ai(title, content, category, existing_tags):
    """Ask the chat-completions API to generate tags for one post.

    Args:
        title: Post title, interpolated into the prompt.
        content: Post body; only the first 3000 characters are sent.
        category: Lower-cased category name that selects the persona and the
            extraction guidance. Unknown values fall back to a generic editor
            persona with no guidance.
        existing_tags: Tags already in use, offered to the model for reuse.

    Returns:
        list[str]: The generated tags. ``[]`` when ``API_KEY`` is not set,
        when the request fails, or when the reply has no usable tag list.
    """
    if not API_KEY:
        print("未检测到 DEEPSEEK_API_KEY,跳过 AI 打标签。")
        return []

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
    }

    role, guidance = _CATEGORY_PROFILES.get(category, _DEFAULT_PROFILE)

    # Tags read from YAML are not guaranteed to be strings; str.join would
    # raise TypeError on e.g. an int tag before the request is even sent.
    tag_library = ', '.join(str(t) for t in existing_tags)
    excerpt = (content or "")[:3000]

    prompt = f"""角色:{role}
任务:请仔细阅读以下文章内容,为其生成 3-5 个最核心、最准确的标签。
分类指导:{guidance}
严格要求:
1. 尽可能复用现有标签库中的标签:{tag_library}
2. 标签必须是具体的专有名词(如人名、技术名、作品名、算法名),绝对禁止使用宽泛的形容词(如“好看的”、“好听的”)或长句!
3. 必须以 JSON 对象格式返回,键名固定为 "tags",值为字符串数组。

文章标题:{title}
内容摘要:{excerpt}"""

    payload = {
        "model": "deepseek-chat",
        "messages": [
            {
                "role": "system",
                "content": 'You are a precise and professional tag extractor. You must reply ONLY with a valid JSON object containing a "tags" string array. Do not include markdown formatting like ```json.',
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        "response_format": {"type": "json_object"},
        "temperature": 0.1,
    }

    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        response.raise_for_status()

        data = response.json()
        res = data['choices'][0]['message']['content']
        parsed = json.loads(res)
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
        # Covers transport/HTTP errors, an unexpected response envelope, and
        # content that is missing or not valid JSON (JSONDecodeError is a
        # ValueError). Tagging is best effort, so report and carry on.
        print(f"Error calling AI: {e}")
        return []

    # The reply shape is checked before any dict method is used, so a bare
    # JSON array is accepted instead of raising AttributeError on `.get`.
    return _clean_tags(parsed)
129+
61130

62131
def _primary_category(front_matter):
    """Return the lower-cased first category of a post, or ``""`` if none."""
    categories = front_matter.get("categories")
    if isinstance(categories, str):
        # A scalar `categories: study` loads as a plain string, and indexing
        # it would yield only its first character. Splitting on whitespace
        # presumes Jekyll-style space-separated categories -- TODO confirm.
        categories = categories.split()
    if not isinstance(categories, list) or not categories:
        return ""
    return str(categories[0]).lower()


def process_posts():
    """Add AI-generated tags to every post in ``POSTS_DIR`` that has none.

    ``BLOG_TEMPLATE.md`` and files without a ``---``-delimited front matter
    are skipped. A post whose front matter already holds a non-empty ``tags``
    value is left untouched. Otherwise the tags returned by
    ``get_tags_from_ai`` are stored under ``tags`` and the file is rewritten
    with the re-serialised front matter followed by the original body.
    Files that cannot be read or parsed are skipped, not fatal.
    """
    if not os.path.exists(POSTS_DIR):
        print(f"文章目录 {POSTS_DIR} 不存在。")
        return

    existing_tags = get_existing_tags()

    for filename in os.listdir(POSTS_DIR):
        if not filename.endswith(".md") or filename == "BLOG_TEMPLATE.md":
            continue

        filepath = os.path.join(POSTS_DIR, filename)

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                full_content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # One unreadable file must not abort tagging of the remaining posts.
            print(f"Skipping {filename}: {e}")
            continue

        parts = full_content.split("---", 2)
        if len(parts) < 3:
            continue

        try:
            front_matter = yaml.safe_load(parts[1])
        except (yaml.YAMLError, ValueError):
            continue
        post_body = parts[2]

        if not front_matter:
            front_matter = {}
        elif not isinstance(front_matter, dict):
            # A front matter that parses to a list or scalar cannot carry tags.
            continue

        if front_matter.get("tags"):
            continue

        title = front_matter.get("title") or ""
        category = _primary_category(front_matter)

        print(f"Intelligently tagging: {title}...")
        new_tags = get_tags_from_ai(title, post_body.strip(), category, existing_tags)
        if not new_tags:
            continue

        front_matter["tags"] = new_tags
        yaml_content = yaml.dump(front_matter, allow_unicode=True, sort_keys=False).strip()

        # `post_body` normally still begins with the line break that followed
        # the closing `---`; adding another one would insert a blank line.
        separator = "" if post_body.startswith(("\n", "\r\n")) else "\n"
        new_content = f"---\n{yaml_content}\n---{separator}{post_body}"

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(new_content)
        print(f"Updated tags for: {filename}")

        # Offer freshly coined tags to the posts processed later in this run,
        # so the same concept is not tagged under two different names.
        for tag in new_tags:
            if tag not in existing_tags:
                existing_tags.append(tag)
93179

180+
94181
# Run the tagging pass only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process_posts()

0 commit comments

Comments
 (0)