-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathformatdata.py
More file actions
129 lines (109 loc) · 3.54 KB
/
formatdata.py
File metadata and controls
129 lines (109 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
Make binary data from the raw rating data
"""
import numpy as N
def txtdata2bin(dataloc):
    """
    Convert the raw text rating files in *dataloc* to binary array files.

    The working directory is changed to *dataloc* while the text files are
    parsed, then moved one level up (``'../'``) before anything is written,
    so the output lands relative to the parent of *dataloc*.  The working
    directory is NOT restored to its initial value afterwards.

    Two copies of the data are saved through ``savearrays``: first in the
    order produced by ``makearrays`` (``index='movie'``), then re-sorted by
    user id via ``reindexmoviearrays`` (``index='user'``).
    """
    from os import chdir  # used to change the working directory

    # Parse from inside the data directory so file names need no prefix.
    chdir(dataloc)
    parsed = makearrays('')
    chdir('../')
    ratings, movies, users, stamps, originalusers = parsed

    print('Saving movie-indexed data...')
    savearrays('', ratings, movies, users, stamps, originalusers, index='movie')

    print('Reindexing data for users...')
    ratings, movies, users, stamps = reindexmoviearrays(ratings, movies, users, stamps)

    print('Saving user-indexed data...')
    savearrays('', ratings, movies, users, stamps, originalusers, index='user')
def makearrays(dataloc, nummovies=17770, numratings=100480507):
    """
    Parse the per-movie text files into flat, parallel NumPy arrays.

    Files named ``mv_0000001.txt`` .. ``mv_<nummovies>.txt`` are read.  The
    first line of each file holds the movie id followed by a colon; every
    following non-blank line is ``userid,stars,YYYY-MM-DD``.

    Parameters
    ----------
    dataloc : str
        Prefix prepended verbatim to each file name, so it must be empty or
        end with a path separator.
    nummovies : int, optional
        Number of movie files to read.
    numratings : int, optional
        Capacity preallocated for the output arrays.  It must be at least
        the total number of rating lines; reading more than this raises
        ``IndexError``.  Unused capacity is trimmed from the results.

    Returns
    -------
    movieratings : int8 array
        Star value of each rating.
    movieids : int16 array
        Movie id of each rating, shifted to start at zero.
    userids : int32 array
        User of each rating, as a position into ``allusers``.
    dates : int32 array
        ``time.mktime`` value of midnight on the rating date (local time).
    allusers : int32 array
        Sorted unique original user ids; ``allusers[userids[k]]`` is the
        original id of rating ``k``.
    """
    from time import mktime

    movieratings = N.zeros(numratings, dtype=N.int8)
    movieids = N.zeros(numratings, dtype=N.int16)
    userids = N.zeros(numratings, dtype=N.int32)
    dates = N.zeros(numratings, dtype=N.int32)

    counter = 0
    for i in range(nummovies):
        if i % 100 == 0:
            print('Extracting movie %d' % i)
        with open(dataloc + 'mv_%07d.txt' % (i + 1), 'rt') as f:
            # Header looks like "123:"; strip whitespace rather than slicing a
            # fixed number of characters so any line ending is handled.
            movieid = int(f.readline().strip().rstrip(':'))
            first = counter
            for line in f:
                line = line.strip()
                if not line:
                    continue
                userid, stars, date = line.split(',')
                year, month, day = date.split('-')
                epoch = mktime((int(year), int(month), int(day), 0, 0, 0, 0, 0, 0))
                userids[counter] = int(userid)
                movieratings[counter] = int(stars)
                dates[counter] = int(epoch)
                counter += 1
        movieids[first:counter] = movieid

    # Drop unused capacity so the zero-filled padding is not mistaken for
    # ratings (it would otherwise add a bogus user id 0).
    if counter < numratings:
        movieratings = movieratings[:counter]
        movieids = movieids[:counter]
        userids = userids[:counter]
        dates = dates[:counter]

    print('Finding unique users')
    allusers = N.unique(userids)

    print('Zero-indexing users and movies')
    movieids -= 1
    if allusers.size:
        # Lookup table: original user id -> rank among the sorted unique ids.
        convertusers = N.zeros(N.max(allusers) + 1, dtype=N.int32)
        convertusers[allusers] = N.arange(allusers.size)
        userids = convertusers[userids]
    return movieratings, movieids, userids, dates, allusers
def savearrays(savedir, movieratings, movieids, userids, dates, allusers, index='movie'):
    """
    Write the rating arrays as raw binary files under ``<savedir>/arrays/``.

    Parameters
    ----------
    savedir : str
        Directory that will contain the ``arrays`` folder; ``''`` means the
        current working directory.
    movieratings, movieids, userids, dates : NumPy arrays
        Parallel arrays; each is written as its raw bytes to the files
        ``ratings``, ``movieids``, ``userids`` and ``dates``.
    allusers : NumPy array
        Written as raw bytes to ``arrays/originalUserIDs`` when
        ``index == 'movie'``.
    index : {'movie', 'user'}
        Selects the output sub-directory (``movie_indexed`` or
        ``user_indexed``) and which id array is handed to
        ``flixdata.makeBoundedIndex``; the resulting index is stored in
        ``movieindex_BOUNDS`` or ``userindex_BOUNDS``.

    Raises
    ------
    ValueError
        If *index* is neither ``'movie'`` nor ``'user'``.  Raised before
        anything is created on disk.
    OSError
        If an output directory cannot be created.
    """
    import os

    if index == 'movie':
        subdir, keys, boundsname = 'movie_indexed', movieids, 'movieindex_BOUNDS'
    elif index == 'user':
        subdir, keys, boundsname = 'user_indexed', userids, 'userindex_BOUNDS'
    else:
        raise ValueError('only user or movie indexing, bud')

    # Imported after validation so a bad *index* is reported without
    # requiring the project module.
    from flixdata import makeBoundedIndex

    # Build every path with os.path.join so the directory that is created and
    # the directory that is written to are always the same one.
    arraydir = os.path.join(savedir, 'arrays')
    outdir = os.path.join(arraydir, subdir)
    for path in (arraydir, outdir):
        try:
            os.mkdir(path)
        except OSError:
            # An already-existing directory is fine; anything else is not.
            if not os.path.isdir(path):
                raise

    for name, arr in (('ratings', movieratings),
                      ('movieids', movieids),
                      ('userids', userids),
                      ('dates', dates)):
        with open(os.path.join(outdir, name), 'wb') as f:
            f.write(arr.tobytes())

    idx = makeBoundedIndex(keys)
    with open(os.path.join(outdir, boundsname), 'wb') as f:
        # we'll only be slicing NumPy arrays, so save it as NumPy
        # (presumably idx is an ndarray; its .dump method is used here)
        idx.dump(f)

    if index == 'movie':
        with open(os.path.join(arraydir, 'originalUserIDs'), 'wb') as f:
            f.write(allusers.tobytes())
def reindexmoviearrays(movieratings, movieids, userids, dates):
    """
    Reorder the four parallel rating arrays so that they are sorted by user id.

    The permutation comes from ``N.argsort(userids)`` and is applied to every
    array.  Returns the tuple ``(movieratings, movieids, userids, dates)`` in
    the new order; the inputs are not modified.
    """
    order = N.argsort(userids)
    columns = (movieratings, movieids, userids, dates)
    return tuple(column[order] for column in columns)
def makeQuizArrays(qualfile, origuserIDfile='arrays/originalUserIDs'):
    """
    Given the text quiz file, make rating arrays

    Parameters
    ----------
    qualfile : str
        Passed to ``flixdata.readProbeFile``.  The result is assumed to be a
        2-D integer NumPy array whose column 0 holds one-based movie ids and
        column 1 holds original user ids -- TODO confirm against flixdata.
    origuserIDfile : str, optional
        Raw int32 file of original user ids.  The default is the relative
        path ``savearrays`` writes to when called with ``savedir=''``.

    Returns
    -------
    The array from ``readProbeFile`` with column 0 decremented by one and
    column 1 replaced by each user's position in the original-id file.
    """
    from flixdata import readProbeFile

    qualdata = readProbeFile(qualfile)
    qualdata[:, 0] -= 1

    # get the original user IDs
    origUserIDs = N.fromfile(origuserIDfile, dtype='int32')

    # Lookup table from original id to its position in the file.  It starts
    # as the identity, so ids absent from the file map to themselves.
    convert = N.r_[0:N.max(origUserIDs) + 1]
    convert[origUserIDs] = N.r_[0:len(origUserIDs)]
    qualdata[:, 1] = convert[qualdata[:, 1]]

    # TODO: the original ended with "etc etc" and returned nothing; the
    # converted array is returned so the work done here is usable.
    return qualdata