-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathtrainLDA.py
More file actions
31 lines (29 loc) · 898 Bytes
/
trainLDA.py
File metadata and controls
31 lines (29 loc) · 898 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import tools
import jieba
import re
from gensim import corpora
from gensim.models import LdaModel
# Stop-word collection returned by the project's `tools` helper; tokens found
# in it are dropped below.
stopped_words = tools.get_stopped_words()

# One list of kept tokens per line of corpus.txt, in file order.
trainset = []

# Segment every line of corpus.txt with jieba, filter out stop words, keep the
# tokens in memory and also write them space-separated (one line per input
# line) to corpus_word.txt.
# The `with` statement closes both files even if segmentation or the write
# raises part-way through.
# NOTE(review): both files are opened with the platform default encoding, as
# before -- confirm the corpus encoding and pass `encoding=` explicitly if the
# script must behave the same on every machine.
with open('corpus.txt', 'r') as fr, open('corpus_word.txt', 'w') as fw:
    for line in fr:
        # Trailing whitespace (including the newline) is removed before
        # segmentation; cut_all=False is passed through to jieba unchanged.
        tokens = jieba.cut(line.rstrip(), cut_all=False)
        words = [word for word in tokens if word not in stopped_words]
        trainset.append(words)
        # write(), not writelines(): the argument is one string, and
        # writelines() would iterate it character by character.
        fw.write(" ".join(words) + "\n")
# Build the gensim Dictionary from the tokenized documents in `trainset`.
dictionary = corpora.Dictionary(trainset)

# Run every tokenized document through the dictionary's doc2bow conversion.
corpus = []
for document in trainset:
    corpus.append(dictionary.doc2bow(document))

# Fit an LDA model with four topics on the converted corpus.
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=4,
)
# Model persistence is currently disabled:
#lda.save('*.model')

# Request three topics; each returned entry is later unpacked as a
# (topic id, formatted string) pair.
result = lda.print_topics(3)
# Collect the distinct double-quoted words that appear in the formatted topic
# strings of `result`, in order of first appearance, then hand the list to
# tools.list_to_file together with the output path.
# The pattern is compiled once instead of being looked up for every topic.
quoted_word = re.compile(r"\"(.+?)\"")

finalwords = []
# Companion set for O(1) duplicate checks; the list alone preserves order.
seen_words = set()
# Each entry is unpacked as (topic id, formatted string); the id is unused.
# The loop variable no longer shadows the builtin `tuple`.
for _, topic_string in result:
    for word in quoted_word.findall(topic_string):
        if word not in seen_words:
            seen_words.add(word)
            finalwords.append(word)

tools.list_to_file("./LDA_topic_word.txt", finalwords)