You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
The following version of the code finds like-minded user communities by deriving higher-level topics during the LDA phase before performing clustering. This is the second of two methods that were attempted.
[nltk_data] Downloading package stopwords to
[nltk_data] /home/ragulan550/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/ragulan550/nltk_data...
[nltk_data] Package words is already up-to-date!
Load Preprocessed User Tweet Data
#loads csv from stored locationdf=pd.read_csv('../csvfiles/tweetsOnUserOnConcepts.csv', lineterminator='\n', low_memory=False)
df.head()
# Gets all concept text for each tweet, plus the posting user's id
# (row order of both lists matches the dataframe).
tweetConcept = df.ConceptText.values.tolist()
userIds = df.userid.values.tolist()

# Tokenise each tweet's concept text into a list of words.
# NOTE: split(" ") (not split()) is kept deliberately — it preserves empty
# tokens from consecutive spaces, matching the original behaviour.
tempData = [sent.split(" ") for sent in tweetConcept]
data_final = tempData
LDA Analysis
# Setting up the corpus for LDA: the Dictionary maps every token to an
# integer id, and doc2bow turns each tweet into (token_id, count) pairs.
id2word1=corpora.Dictionary(data_final)
texts=data_final
corpus1= [id2word1.doc2bow(text) for text in texts]
Run LDA
Multicore allows for multiple cores to be working on LDA simultaneously
Check Number of workers
Check Number of topics set
#uncomment line below to try lda with different valuestopicNum=47#lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus1,id2word=id2word1,num_topics=topicNum, passes=10, workers=7)#lda_model.save('../LdaSaves/topics47mar10p9/lda.model_mar10_t47')#preloading an saved lda run to save time as lda takes long time to runlda_model1=models.LdaModel.load('../LDASaves/ldamar25/lda.model_mar25_t47')
pprint(lda_model1.print_topics())
# Creating a dictionary where each key is a user id and the value is a list
# of topic numbers (as strings) representing that user's tweets.
dictConcept = {uid: [] for uid in userIds}

# Getting the top 3 topics for each user tweet and appending them to the
# tweet author's list.
for i, row in enumerate(lda_model1[corpus1]):
    top3 = sorted(row, key=lambda pair: pair[1], reverse=True)[:3]
    # userIds[i] is the author of tweet i (same row order as df); this avoids
    # the original's slow per-row int(df.iloc[[i]].userid) lookup.
    # Assumes df.userid is already integer-typed — TODO confirm.
    dictConcept[userIds[i]].extend(str(topic) for topic, _prob in top3)
Running 2nd Round of LDA To find Higher Level Topics
import ast  # only needed by the commented-out reload path below

# Using a previously saved list to save time:
# topicsPerTweets=[]
# with open('topicsPerTweets.txt', 'r') as f:
#     for line in f:
#         topicsPerTweets.append(ast.literal_eval(line))

# Each "document" for the 2nd LDA round is one user's list of topic numbers.
topicsPerTweets=list(dictConcept.values())
print(topicsPerTweets[:2])
id2word2=corpora.Dictionary(topicsPerTweets)
corpus2= [id2word2.doc2bow(text) for text in topicsPerTweets]

# Saving the list so the block above does not need to be rerun, since it takes a while:
# with open('topicsPerTweets.txt', 'w') as f:
#     for item in topicsPerTweets:
#         f.write("%s\n" % item)
[['8', '23', '41'], ['33', '0', '1']]
topicNum2 = 10
today = datetime.datetime.now()

# Second-round LDA over the per-user topic lists, to find higher-level topics.
# Adjust num_topics / workers here to experiment with other settings.
lda_model2 = gensim.models.ldamulticore.LdaMulticore(
    corpus=corpus2, id2word=id2word2, num_topics=topicNum2, passes=10, workers=3)

# BUG FIX: the original used strftime("%M%d") — %M is *minute of hour*, so
# the filename changed from minute to minute; %m (month) was intended.
modelPath = '../LDASaves/HigherOrderModels/LDA' + today.strftime("%m%d") + str(topicNum2)
lda_model2.save(modelPath)
lda_model2 = models.LdaModel.load(modelPath)

doc_lda2 = lda_model2[corpus2]
pprint(lda_model2.print_topics())

# Perplexity: a measure of how good the model is — lower is better.
print('\nPerplexity: ', lda_model2.log_perplexity(corpus2))
coherence_model_lda = CoherenceModel(model=lda_model2, texts=topicsPerTweets,
                                     dictionary=id2word2, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
def format_topics_sentences(ldamodel=lda_model2, corpus=corpus2, texts=topicsPerTweets):
    """Build a per-document table of dominant topic, contribution and keywords.

    For each document in `corpus`, finds the topic with the highest
    probability under `ldamodel` and records (Dominant_Topic,
    Perc_Contribution, Topic_Keywords), then appends the original text
    as a final column.

    Note: defaults reference the module-level model/corpus/texts; they are
    evaluated once at definition time.
    """
    # BUG FIX: the original used DataFrame.append inside the loop, which is
    # O(n^2) and was removed in pandas 2.0. Collect plain rows instead and
    # build the frame once.
    records = []
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: x[1], reverse=True)
        if not row:
            continue  # document with no topic assignment
        # Dominant topic = highest-probability entry after the sort above.
        topic_num, prop_topic = row[0]
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, prop in wp)
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add the original text to the end of the output.
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
df_topic_sents_keywords=format_topics_sentences(ldamodel=lda_model2, corpus=corpus2, texts=topicsPerTweets)
# Formatdf_dominant_topic=df_topic_sents_keywords.reset_index()
df_dominant_topic.columns= ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Showdf_dominant_topic.head(10)
# Number of Documents for Each Topictopic_counts=df_topic_sents_keywords['Dominant_Topic'].value_counts()
# Percentage of Documents for Each Topictopic_contribution=round(topic_counts/topic_counts.sum(), 4)
# Topic Number and Keywordstopic_num_keywords=df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
# Concatenate Column wisedf_dominant_topics=pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
# Change Column namesdf_dominant_topics.columns= ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
# Showdf_dominant_topics[0:10]
Creating User Vectors of length K where K is number of topics
UserVectors = []

# PERF FIX: lda_model2[corpus2] is a lazy TransformedCorpus — each subscript
# re-runs inference. Materialise the per-document topic distributions once.
docTopicDists = lda_model2[corpus2]

# For each user, shows the percent contribution for one sample topic.
print(docTopicDists[1][1])

# One vector per user: index = topic id, value = that topic's contribution.
for row in docTopicDists:
    temp = [0] * topicNum2
    for val in row:
        # val is a tuple in form (topicNum, percentContributionOfTopicToUser)
        temp[val[0]] = val[1]
    UserVectors.append(temp)

print("Shows a sample userVector")
print(UserVectors[1])
# Column lists from the gold-standard dataframe.
# NOTE(review): dfGoldStandard is defined in an earlier cell — presumably
# loaded from GoldStandard.csv; confirm its schema there.
newsUserId = dfGoldStandard.userid.values.tolist()
newsUrl = dfGoldStandard.url.values.tolist()
newsId = dfGoldStandard.NewsId.values.tolist()

# Dictionary mapping each news article id to every user who posted it.
# setdefault replaces the original membership-test + range(len()) pattern.
newsId2UserId = {}
for news, user in zip(newsId, newsUserId):
    newsId2UserId.setdefault(news, []).append(user)
# loading another dataframe with goldstandard but only keeping unique newsidsdfUniqueNewsId=pd.read_csv('../csvfiles/GoldStandard.csv', lineterminator='\n', low_memory=False)
dfUniqueNewsId.drop_duplicates(subset='NewsId', inplace=True)
newsArticles=dfUniqueNewsId.NewsConceptText.values.tolist()
Getting 1st Round LDA topics for news articles
# Storing the words of each news article in a list of token lists.
newsArticlesForCorpus = [x.split(' ') for x in newsArticles]

# Kept in case it is referenced further down the file, but deliberately NOT
# used to build the inference corpus (see bug fix below).
newsId2word = corpora.Dictionary(newsArticlesForCorpus)

# BUG FIX: bag-of-words vectors fed to lda_model1 must be built with the
# dictionary the model was trained on (id2word1 — see the commented training
# call earlier, which passes id2word=id2word1). A freshly built Dictionary
# assigns different integer ids to the same tokens, silently corrupting the
# inferred topic distributions.
NewsArticlesCorpus = [id2word1.doc2bow(text) for text in newsArticlesForCorpus]

# Percent contribution of each first-round topic to each news article.
TopicDistributionOnNewsArticles = lda_model1[NewsArticlesCorpus]
# Finding the top 3 first-round topics for each news article.
topicsPerNewsArticleHighLevel = []
for dist in TopicDistributionOnNewsArticles:
    bestThree = sorted(dist, key=lambda pair: pair[1], reverse=True)[:3]
    topicsPerNewsArticleHighLevel.append([str(topicId) for topicId, _weight in bestThree])
print(topicsPerNewsArticleHighLevel[:5])
# Setting up the corpus for the 2nd LDA run on news articles.
# Kept in case it is referenced further down the file, but deliberately NOT
# used to build the inference corpus (see bug fix below).
newsId2word2 = corpora.Dictionary(topicsPerNewsArticleHighLevel)

# BUG FIX: lda_model2 was trained with id2word2, so inference vectors must be
# built with that same dictionary — a fresh Dictionary assigns different ids
# to the topic-number tokens and corrupts the inferred distributions.
NewsArticlesCorpus2 = [id2word2.doc2bow(text) for text in topicsPerNewsArticleHighLevel]
TopicDistributionOnNewsArticles = lda_model2[NewsArticlesCorpus2]
Creating Article Vectors of length K where K is number of topics
# Build one vector per news article: index = second-round topic id,
# value = that topic's percent contribution to the article.
ArticleVector = []
for docTopics in TopicDistributionOnNewsArticles:
    vec = [0] * topicNum2
    for topicId, weight in docTopics:
        # each entry is (topicNum, percentContributionOfTopicToArticle)
        vec[topicId] = weight
    ArticleVector.append(vec)
print("Displaying sample article vector")
print(ArticleVector[1])
# Different cluster sizes to try the analysis for.
numClusters = [5, 10, 15, 20, 25, 30]
today = datetime.datetime.now()

# PERF FIX: hoisted out of the loop — the user matrix is identical for every
# cluster size, so there is no need to rebuild it each iteration.
userVectorsFit = np.array(UserVectors)

# Performing k-means on the user vectors to cluster users into communities,
# saving the results for the different cluster sizes.
for k in numClusters:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(userVectorsFit)
    # BUG FIX: the original strftime("%M%d") used %M (minute of hour), so a
    # later load in a different minute could never find the file; %m (month)
    # was intended.
    kMeansfilename = 'LDAM2-kMeans' + today.strftime("%m%d") + 'CSize' + str(k)
    # with-statement closes the file handle the original leaked
    with open("../kmeansFiles/" + kMeansfilename, 'wb') as f:
        pickle.dump(kmeans, f)

# Change this number to one of [5, 10, 15, 20, 25, 30] to preload a different file.
chosenNumberOfCluster = 30
# Loading the existing k-means model for the chosen cluster size.
kMeansfilename = 'LDAM2-kMeans' + today.strftime("%m%d") + 'CSize' + str(chosenNumberOfCluster)
print('Chosen File: \'' + kMeansfilename + '\'')
with open("../kmeansFiles/" + kMeansfilename, 'rb') as f:
    loadedKmeansModel = pickle.load(f)
Chosen File: 'LDAM2-kMeans1404CSize30'
Number of Users in each Cluster
# Tally how many users k-means assigned to each community.
userClusters = [0] * chosenNumberOfCluster
for label in loadedKmeansModel.labels_:
    userClusters[label] += 1
print(userClusters)
from scipy import spatial

# For every cluster, rank all news articles by cosine similarity between the
# article's topic vector and the cluster's average topic vector.
rankArticlesToCluster=[]
for x in range(chosenNumberOfCluster):
    rankArticlesToCluster.append([])
for x in range (len(ArticleVector)):
    # NOTE(review): averageDistributionPerCluster is not defined in this
    # chunk — presumably computed elsewhere from the k-means cluster members;
    # confirm it holds one average topic vector per cluster, in cluster order.
    for index,value in enumerate(averageDistributionPerCluster):
        # finds cosine similarity between article vector and average vector of
        # the cluster; stores (articleIndex, similarity) per cluster list
        rankArticlesToCluster[index].append(tuple((x,1-spatial.distance.cosine(ArticleVector[x], value))))
# Sorting the ranked lists: first ascending by similarity, then each list is
# reversed so the best-matching article comes first.
import operator

sortedRankArticlesToCluster = [
    sorted(ranked, key=lambda item: item[1]) for ranked in rankArticlesToCluster
]
ascendingRankedArticlesToCluster = [
    list(reversed(ranked)) for ranked in sortedRankArticlesToCluster
]