-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch_cwe.py
More file actions
87 lines (68 loc) · 2.97 KB
/
fetch_cwe.py
File metadata and controls
87 lines (68 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import requests
from bs4 import BeautifulSoup
# Output directory (kept alongside the existing knowledge-base data).
OUTPUT_DIR = "knowledge_base"
# exist_ok=True makes the creation idempotent and avoids the race between a
# separate existence check and the makedirs call.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Key CWE entries to download (chosen to match the project's requirements).
TARGET_CWES = [
    "89",   # SQL Injection (PDF test case 1)
    "79",   # Cross-site Scripting / XSS (PDF test case 2)
    "22",   # Path Traversal (a common access-control failure)
    "78",   # OS Command Injection
    "400",  # Uncontrolled Resource Consumption (complexity / resource exhaustion)
]

# Root URL of the MITRE CWE definition pages.
BASE_URL = "https://cwe.mitre.org/data/definitions"
def _build_cwe_markdown(html, cwe_id, url):
    """Render the HTML of one CWE definition page as a Markdown document."""
    soup = BeautifulSoup(html, 'html.parser')

    # 1. Title: the first <h2> on the page, falling back to "CWE-<id>".
    #    A " " separator keeps text from nested tags from being glued together.
    title_tag = soup.find('h2')
    title = title_tag.get_text(" ", strip=True) if title_tag else f"CWE-{cwe_id}"

    # 2. Description section.
    desc_div = soup.find('div', id='Description')
    description = (
        desc_div.get_text(" ", strip=True) if desc_div else "No description found."
    )

    # 3. Demonstrative examples -- the part the RAG pipeline needs most.
    #    MITRE pages usually place them in a div with id="Demonstrative_Examples".
    examples_div = soup.find('div', id='Demonstrative_Examples')
    code_blocks = (
        examples_div.find_all(['div', 'pre'], class_=['ExampleCode', 'code'])
        if examples_div
        else []
    )
    # strip=False preserves the whitespace/layout of each code sample; every
    # sample is wrapped in its own fenced block.
    examples_content = "".join(
        f"\n```\n{block.get_text(strip=False)}\n```\n" for block in code_blocks
    )
    if not examples_content:
        examples_content = "此 CWE 頁面未提供簡單的程式碼範例,請參考描述。"

    # 4. Assemble the Markdown document (trailing "" yields the final newline).
    return "\n".join([
        f"# {title}",
        "## Description",
        description,
        "## Vulnerable Code Examples (關鍵學習資料)",
        examples_content,
        "## URL",
        url,
        "",
    ])


def fetch_cwe_content(cwe_id, timeout=30):
    """Download one CWE definition page and convert it to Markdown.

    Args:
        cwe_id: CWE identifier without the "CWE-" prefix, e.g. "89".
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the script indefinitely.

    Returns:
        The Markdown text (title, description, code examples, source URL),
        or ``None`` if the download or the conversion failed; the failure is
        printed rather than raised.
    """
    url = f"{BASE_URL}/{cwe_id}.html"
    print(f"📥 正在下載 CWE-{cwe_id} 資料: {url} ...")
    # Send a browser-like User-Agent to avoid being blocked.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return _build_cwe_markdown(response.text, cwe_id, url)
    except Exception as e:
        # Deliberate best-effort boundary: any failure is reported and turned
        # into None so the caller can decide how to proceed.
        print(f"❌ 下載 CWE-{cwe_id} 失敗: {e}")
        return None
def main():
    """Download every CWE in TARGET_CWES and save each one as a Markdown file.

    Each successfully fetched page is written to
    ``OUTPUT_DIR/CWE_<id>.md`` (UTF-8). CWEs whose fetch returned nothing are
    skipped and listed in the final summary instead of being reported as
    downloaded.
    """
    # Make sure the target directory exists before writing into it.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    failed = []
    for cwe_id in TARGET_CWES:
        content = fetch_cwe_content(cwe_id)
        if not content:
            failed.append(cwe_id)
            continue
        filename = os.path.join(OUTPUT_DIR, f"CWE_{cwe_id}.md")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"✅ 已儲存: {filename}")

    if failed:
        # Do not claim that everything was downloaded when some fetches failed.
        print(f"\n⚠️ 以下 CWE 下載失敗: {', '.join(failed)}。其餘資料已儲存,請記得重新執行 create_vector_db.py")
    else:
        print("\n🎉 所有 CWE 資料下載完成!請記得重新執行 create_vector_db.py")


if __name__ == "__main__":
    main()