-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathextract.py
More file actions
55 lines (40 loc) · 1.61 KB
/
extract.py
File metadata and controls
55 lines (40 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import numpy as np
from konlpy.tag import Okt
import re
import pprint
import os
def get_stopwords():
    """Load the stop-word list shipped next to this module.

    Reads ``stopwords.txt`` (one word per line, UTF-8) from the directory
    containing this file.

    Returns:
        list[str]: stop words with surrounding whitespace stripped.
    """
    path = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(path, "stopwords.txt")
    # `with` guarantees the handle is closed (the original leaked it), and
    # iterating the file object replaces the manual readline/break loop:
    # readline() returns "" only at EOF, so the two forms are equivalent.
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
def okt_tokenizer(text):
    """Tokenize Korean *text* into content nouns.

    Strips every character except spaces, Korean jamo/syllables and ASCII
    letters, extracts nouns with the Okt analyzer, then drops
    single-character tokens and stop words.

    Args:
        text (str): raw document text.

    Returns:
        list[str]: noun tokens of length >= 2 that are not stop words.
    """
    # Okt() is expensive to construct and the stop-word file never changes
    # at runtime, yet the original rebuilt both on every call — and
    # TfidfVectorizer calls this once per document. Memoize them on the
    # function object instead.
    if not hasattr(okt_tokenizer, "_okt"):
        okt_tokenizer._okt = Okt()
        # set() gives O(1) membership tests vs. O(n) scans over the list.
        okt_tokenizer._stopwords = set(get_stopwords())
    text = re.sub(r'[^ ㄱ-ㅣ가-힣A-Za-z]', '', text)  # 특수기호 제거
    return [token for token in okt_tokenizer._okt.nouns(text)
            if len(token) > 1 and token not in okt_tokenizer._stopwords]
def extract_keywords(text):
    """Score keywords of the FIRST document in *text* by TF-IDF.

    Fits a :class:`TfidfVectorizer` (with the Korean noun tokenizer) over
    the whole corpus, then reads row 0 of the resulting matrix only.

    Args:
        text (Iterable[str]): corpus of documents.

    Returns:
        list[dict]: ``{"keyword": token, "value": weight * 10 rounded to
        one decimal}`` for tokens whose TF-IDF weight is >= 0.1, sorted by
        weight descending. The full sorted list is also pretty-printed.
    """
    vectorizer = TfidfVectorizer(tokenizer=okt_tokenizer)
    # fit_transform both learns the vocabulary and builds the matrix; the
    # original called fit() and then fit_transform(), running the slow
    # Okt tokenization over the entire corpus twice.
    matrix = vectorizer.fit_transform(text)
    # vocabulary_ is already the {token: column-id} mapping, so there is
    # no need to rebuild it from get_feature_names() — which was removed
    # in scikit-learn 1.2, while vocabulary_ works on every version.
    # Iterating tokens in sorted order matches get_feature_names()'s
    # alphabetical order, so tie-ordering after the sort is unchanged.
    vocab = vectorizer.vocabulary_
    result = {token: matrix[0, vocab[token]] for token in sorted(vocab)}
    # 내림차순 (중요도 high) 기준 정렬
    result = sorted(result.items(), key=lambda item: item[1], reverse=True)
    pprint.pprint(result)
    return [{"keyword": token, "value": round(value * 10, 1)}
            for token, value in result if value >= 0.1]