Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 50 additions & 12 deletions gsdmm/mgp.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from numpy.random import multinomial
from numpy import log, exp
from numpy import argmax
import json
import pickle

class MovieGroupProcess:
def __init__(self, K=8, alpha=0.1, beta=0.1, n_iters=30):
Expand Down Expand Up @@ -42,8 +42,43 @@ def __init__(self, K=8, alpha=0.1, beta=0.1, n_iters=30):
self.cluster_word_count = [0 for _ in range(K)]
self.cluster_word_distribution = [{} for i in range(K)]

@staticmethod
def from_data(K, alpha, beta, D, vocab_size, cluster_doc_count, cluster_word_count, cluster_word_distribution):
def save(self, f):
    '''
    Dump MovieGroupProcess to Pickle file.

    The model state is written as a plain dict whose keys match the
    keyword arguments of ``_from_data``, so that ``load`` can restore it
    with ``self._from_data(**payload)``.

    :param f: str
        Target File Path
    :raises IOError: if the target file cannot be opened for writing
    :return: None
    '''
    payload = {
        "K": self.K,
        "alpha": self.alpha,
        "beta": self.beta,
        "D": self.number_docs,
        # _from_data takes vocab_size as a required argument; without this
        # key, load() fails with a TypeError when it unpacks the payload.
        "vocab_size": self.vocab_size,
        "cluster_doc_count": self.cluster_doc_count,
        "cluster_word_count": self.cluster_word_count,
        "cluster_word_distribution": self.cluster_word_distribution,
        "n_iters": self.n_iters,
    }
    # Only the open() call is guarded, so that errors raised while pickling
    # are not mislabelled as a path problem.
    try:
        fstream = open(f, mode='wb')
    except IOError as error:
        raise IOError("Cannot create file, check path. ", error) from error
    # The context manager closes the file even if pickle.dump raises.
    with fstream:
        pickle.dump(payload, fstream)

def load(self, f):
    '''
    Load Pickled MovieGroupProcess.

    Unpickles the object stored in the file and passes it as keyword
    arguments to ``self._from_data``.

    NOTE: unpickling can execute arbitrary code embedded in the file.
    Only load files that come from a trusted source.

    :param f: str
        Target File Path
    :raises IOError: if the file cannot be opened for reading
    :return self: the value returned by ``_from_data``
    '''
    # Only the open() call is guarded; unpickling errors propagate unchanged.
    try:
        fstream = open(f, mode='rb')
    except IOError as error:
        raise IOError(error) from error
    # The context manager closes the file even if pickle.load raises.
    with fstream:
        payload = pickle.load(file=fstream)
    return self._from_data(**payload)

def _from_data(self, K, alpha, beta, D, vocab_size, cluster_doc_count,
cluster_word_count, cluster_word_distribution, n_iters):
'''
Reconstitute a MovieGroupProcess from previously fit data
:param K:
Expand All @@ -56,13 +91,16 @@ def from_data(K, alpha, beta, D, vocab_size, cluster_doc_count, cluster_word_cou
:param cluster_word_distribution:
:return:
'''
mgp = MovieGroupProcess(K, alpha, beta, n_iters=30)
mgp.number_docs = D
mgp.vocab_size = vocab_size
mgp.cluster_doc_count = cluster_doc_count
mgp.cluster_word_count = cluster_word_count
mgp.cluster_word_distribution = cluster_word_distribution
return mgp
self.n_iters = n_iters
self.K = K
self.alpha = alpha
self.beta = beta
self.number_docs = D
self.vocab_size = vocab_size
self.cluster_doc_count = cluster_doc_count
self.cluster_word_count = cluster_word_count
self.cluster_word_distribution = cluster_word_distribution
return self

@staticmethod
def _sample(p):
Expand Down Expand Up @@ -185,7 +223,7 @@ def score(self, doc):
lD2 = 0
for word in doc:
lN2 += log(n_z_w[label].get(word, 0) + beta)
for j in range(1, doc_size +1):
for j in range(1, doc_size + 1):
lD2 += log(n_z[label] + V * beta + j - 1)
p[label] = exp(lN1 - lD1 + lN2 - lD2)

Expand All @@ -201,4 +239,4 @@ def choose_best_label(self, doc):
:return:
'''
p = self.score(doc)
return argmax(p),max(p)
return argmax(p), max(p)