-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathtools.py
More file actions
69 lines (62 loc) · 2.47 KB
/
tools.py
File metadata and controls
69 lines (62 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import jieba.posseg as pseg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def get_stopped_words():
    """Load the stop-word list from stoppedwords.txt.

    Returns:
        list[str]: one stripped word per line of the file, plus a single
        space appended so whitespace tokens are filtered out too.
    """
    # `with` guarantees the handle is closed (the original leaked it);
    # explicit utf-8 so the Chinese word list decodes on every platform.
    with open("stoppedwords.txt", "r", encoding="utf-8") as f:
        words = [line.strip() for line in f]
    words.append(" ")
    return words
def extract_words(sentenceList):
    """Segment each sentence with jieba POS tagging and filter the tokens.

    A token is kept only if its POS tag appears in remainPos.txt AND the
    token itself is not in the stop-word list.

    Args:
        sentenceList: iterable of raw text strings.

    Returns:
        list[str]: one space-joined string of kept tokens per input
        sentence (the whitespace-separated form CountVectorizer expects).
    """
    # Sets give O(1) membership tests; the original scanned lists per token.
    stopped_words = set(get_stopped_words())
    # `with` closes the file; the original leaked the handle.
    with open("remainPos.txt", "r", encoding="utf-8") as f:
        remain_pos = {line.rstrip() for line in f}
    wordlist = []
    for txt in sentenceList:
        kept = [
            word
            for word, pos in pseg.cut(txt)
            if pos in remain_pos and word not in stopped_words
        ]
        wordlist.append(" ".join(kept))
    return wordlist
def calculate_tfidf(wordlist, sentenceList):
    """Score candidate keywords by tf-idf, weighted by word length.

    Titles written between Chinese book-title marks 《…》 are pre-seeded
    with a fixed score of 2.0 that later tf-idf accumulation never
    overwrites.

    Args:
        wordlist: list of space-joined segmented sentences
            (the output of extract_words).
        sentenceList: the original raw sentences, scanned for 《title》.

    Returns:
        dict[str, float]: word -> score (tf-idf * len(word), summed over
        sentences; exactly 2.0 for bracketed titles).
    """
    tfidfDict = {}
    for txt in sentenceList:
        for i, ch in enumerate(txt):
            if ch == "《":
                for j in range(i + 1, len(txt)):
                    if txt[j] == "》":
                        tfidfDict[txt[i + 1:j]] = 2.0
                        # Bug fix: stop at the FIRST closing bracket; the
                        # original kept scanning and also recorded spans
                        # ending at every later 》 in the sentence.
                        break
    vectorizer = CountVectorizer()
    word_frequence = vectorizer.fit_transform(wordlist)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement but fall back for old installs.
    try:
        words = vectorizer.get_feature_names_out()
    except AttributeError:
        words = vectorizer.get_feature_names()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(word_frequence)
    weight = tfidf.toarray()
    for row in weight:
        for word, value in zip(words, row):
            if value == 0:
                continue
            if word not in tfidfDict:
                # Length-weighted tf-idf favours longer (more specific) terms.
                tfidfDict[word] = value * len(word)
            elif tfidfDict[word] != 2.0:
                # Accumulate across sentences, but never touch seeded titles.
                tfidfDict[word] += value * len(word)
    return tfidfDict
def list_to_file(filename, list):
    """Write each item of *list* to *filename*, one item per line.

    Args:
        filename: path of the output file (truncated if it exists).
        list: iterable of strings to write.

    NOTE(review): the parameter name shadows the builtin ``list``; kept
    unchanged so existing keyword-argument callers keep working.
    """
    # Explicit utf-8 avoids UnicodeEncodeError for the Chinese tokens this
    # project writes on platforms whose default codec cannot encode them.
    with open(filename, "w", encoding="utf-8") as f:
        for word in list:
            # write(), not writelines(): each item is a single string.
            f.write(word + "\n")
def LDA_modify_weight(wordDict):
    """Double the weight of every word that is an LDA topic word.

    Topic words are read from LDA_topic_word.txt, one per line.

    Args:
        wordDict: mapping word -> numeric weight; modified in place.

    Returns:
        dict: the same mapping, returned for call chaining.
    """
    # `with` closes the file (the original leaked the handle); a set gives
    # O(1) membership tests instead of scanning a list per dict key.
    with open("LDA_topic_word.txt", "r", encoding="utf-8") as f:
        topic_words = {line.rstrip() for line in f}
    # Only values are reassigned, so iterating the dict directly is safe.
    for word in wordDict:
        if word in topic_words:
            wordDict[word] = wordDict[word] * 2
    return wordDict