SUSTech-STA326 · Fujump · May 28, 2024 · May 28, 2024 · May 28, 2024
diff --git a/.gitignore b/.gitignore
@@ -158,3 +158,14 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+.gitignore
+.idea
+src/__pycache__
+src/runs
+src/checkpoints
+res/.ipynb_checkpoints
+
+res
+*.out
+src/data/*
diff --git a/plot.ipynb b/plot.ipynb
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+numpy==1.26.4
+pandas==2.2.2
+tensorboardX==2.6.2.2
+torch==2.3.0
diff --git a/src/data.py b/src/data.py
@@ -0,0 +1,110 @@
+import torch
+import random
+import pandas as pd
+from copy import deepcopy
+from torch.utils.data import DataLoader, Dataset
+
+random.seed(0)
+
+
+class UserItemRatingDataset(Dataset):
+    """Wrapper, convert <user, item, rating> Tensor into Pytorch Dataset"""
+    def __init__(self, user_tensor, item_tensor, target_tensor):
+        """
+        args:
+
+            target_tensor: torch.Tensor, the corresponding rating for <user, item> pair
+        """
+        self.user_tensor = user_tensor
+        self.item_tensor = item_tensor
+        self.target_tensor = target_tensor
+
+    def __getitem__(self, index):
+        return self.user_tensor[index], self.item_tensor[index], self.target_tensor[index]
+
+    def __len__(self):
+        return self.user_tensor.size(0)
+
+
+class SampleGenerator(object):
+    """Construct dataset for NCF"""
+
+    def __init__(self, ratings):
+        """
+        args:
+            ratings: pd.DataFrame, which contains 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
+        """
+        assert 'userId' in ratings.columns
+        assert 'itemId' in ratings.columns
+        assert 'rating' in ratings.columns
+
+        self.ratings = ratings
+        # explicit feedback using _normalize and implicit using _binarize
+        # self.preprocess_ratings = self._normalize(ratings)
+        self.preprocess_ratings = self._binarize(ratings)
+        self.user_pool = set(self.ratings['userId'].unique())
+        self.item_pool = set(self.ratings['itemId'].unique())
+        # create negative item samples for NCF learning
+        self.negatives = self._sample_negative(ratings)
+        self.train_ratings, self.test_ratings = self._split_loo(self.preprocess_ratings)
+
+    def _normalize(self, ratings):
+        """normalize into [0, 1] from [0, max_rating], explicit feedback"""
+        ratings = deepcopy(ratings)
+        max_rating = ratings.rating.max()
+        ratings['rating'] = ratings.rating * 1.0 / max_rating
+        return ratings
+
+    def _binarize(self, ratings):
+        """binarize into 0 or 1, imlicit feedback"""
+        ratings = deepcopy(ratings)
+        ratings['rating'][ratings['rating'] > 0] = 1.0
+        return ratings
+
+    def _split_loo(self, ratings):
+        """leave one out train/test split """
+        ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
+        test = ratings[ratings['rank_latest'] == 1]
+        train = ratings[ratings['rank_latest'] > 1]
+        assert train['userId'].nunique() == test['userId'].nunique()
+        return train[['userId', 'itemId', 'rating']], test[['userId', 'itemId', 'rating']]
+
+    def _sample_negative(self, ratings):
+        """return all negative items & 100 sampled negative items"""
+        interact_status = ratings.groupby('userId')['itemId'].apply(set).reset_index().rename(
+            columns={'itemId': 'interacted_items'})
+        interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
+        interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, 99))
+        return interact_status[['userId', 'negative_items', 'negative_samples']]
+
+    def instance_a_train_loader(self, num_negatives, batch_size):
+        """instance train loader for one training epoch"""
+        users, items, ratings = [], [], []
+        train_ratings = pd.merge(self.train_ratings, self.negatives[['userId', 'negative_items']], on='userId')
+        train_ratings['negatives'] = train_ratings['negative_items'].apply(lambda x: random.sample(x, num_negatives))
+        for row in train_ratings.itertuples():
+            users.append(int(row.userId))
+            items.append(int(row.itemId))
+            ratings.append(float(row.rating))
+            for i in range(num_negatives):
+                users.append(int(row.userId))
+                items.append(int(row.negatives[i]))
+                ratings.append(float(0))  # negative samples get 0 rating
+        dataset = UserItemRatingDataset(user_tensor=torch.LongTensor(users),
+                                        item_tensor=torch.LongTensor(items),
+                                        target_tensor=torch.FloatTensor(ratings))
+        return DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+    @property
+    def evaluate_data(self):
+        """create evaluate data"""
+        test_ratings = pd.merge(self.test_ratings, self.negatives[['userId', 'negative_samples']], on='userId')
+        test_users, test_items, negative_users, negative_items = [], [], [], []
+        for row in test_ratings.itertuples():
+            test_users.append(int(row.userId))
+            test_items.append(int(row.itemId))
+            for i in range(len(row.negative_samples)):
+                negative_users.append(int(row.userId))
+                negative_items.append(int(row.negative_samples[i]))
+        return [torch.LongTensor(test_users), torch.LongTensor(test_items), torch.LongTensor(negative_users),
+                torch.LongTensor(negative_items)]
diff --git a/src/data/ml-1m/README b/src/data/ml-1m/README
@@ -0,0 +1,170 @@
+SUMMARY
+================================================================================
+
+These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
+made by 6,040 MovieLens users who joined MovieLens in 2000.
+
+USAGE LICENSE
+================================================================================
+
+Neither the University of Minnesota nor any of the researchers
+involved can guarantee the correctness of the data, its suitability
+for any particular purpose, or the validity of results based on the
+use of the data set.  The data set may be used for any research
+purposes under the following conditions:
+
+     * The user may not state or imply any endorsement from the
+       University of Minnesota or the GroupLens Research Group.
+
+     * The user must acknowledge the use of the data set in
+       publications resulting from the use of the data set
+       (see below for citation information).
+
+     * The user may not redistribute the data without separate
+       permission.
+
+     * The user may not use this information for any commercial or
+       revenue-bearing purposes without first obtaining permission
+       from a faculty member of the GroupLens Research Project at the
+       University of Minnesota.
+
+If you have any further questions or comments, please contact GroupLens
+<grouplens-info@cs.umn.edu>. 
+
+CITATION
+================================================================================
+
+To acknowledge use of the dataset in publications, please cite the following
+paper:
+
+F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History
+and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4,
+Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872
+
+
+ACKNOWLEDGEMENTS
+================================================================================
+
+Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
+set.
+
+FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
+================================================================================
+
+The GroupLens Research Project is a research group in the Department of 
+Computer Science and Engineering at the University of Minnesota. Members of 
+the GroupLens Research Project are involved in many research projects related 
+to the fields of information filtering, collaborative filtering, and 
+recommender systems. The project is lead by professors John Riedl and Joseph 
+Konstan. The project began to explore automated collaborative filtering in 
+1992, but is most well known for its world wide trial of an automated 
+collaborative filtering system for Usenet news in 1996. Since then the project 
+has expanded its scope to research overall information filtering solutions, 
+integrating in content-based methods as well as improving current collaborative 
+filtering technology.
+
+Further information on the GroupLens Research project, including research 
+publications, can be found at the following web site:
+
+        http://www.grouplens.org/
+
+GroupLens Research currently operates a movie recommender based on 
+collaborative filtering:
+
+        http://www.movielens.org/
+
+RATINGS FILE DESCRIPTION
+================================================================================
+
+All ratings are contained in the file "ratings.dat" and are in the
+following format:
+
+UserID::MovieID::Rating::Timestamp
+
+- UserIDs range between 1 and 6040 
+- MovieIDs range between 1 and 3952
+- Ratings are made on a 5-star scale (whole-star ratings only)
+- Timestamp is represented in seconds since the epoch as returned by time(2)
+- Each user has at least 20 ratings
+
+USERS FILE DESCRIPTION
+================================================================================
+
+User information is in the file "users.dat" and is in the following
+format:
+
+UserID::Gender::Age::Occupation::Zip-code
+
+All demographic information is provided voluntarily by the users and is
+not checked for accuracy.  Only users who have provided some demographic
+information are included in this data set.
+
+- Gender is denoted by a "M" for male and "F" for female
+- Age is chosen from the following ranges:
+
+	*  1:  "Under 18"
+	* 18:  "18-24"
+	* 25:  "25-34"
+	* 35:  "35-44"
+	* 45:  "45-49"
+	* 50:  "50-55"
+	* 56:  "56+"
+
+- Occupation is chosen from the following choices:
+
+	*  0:  "other" or not specified
+	*  1:  "academic/educator"
+	*  2:  "artist"
+	*  3:  "clerical/admin"
+	*  4:  "college/grad student"
+	*  5:  "customer service"
+	*  6:  "doctor/health care"
+	*  7:  "executive/managerial"
+	*  8:  "farmer"
+	*  9:  "homemaker"
+	* 10:  "K-12 student"
+	* 11:  "lawyer"
+	* 12:  "programmer"
+	* 13:  "retired"
+	* 14:  "sales/marketing"
+	* 15:  "scientist"
+	* 16:  "self-employed"
+	* 17:  "technician/engineer"
+	* 18:  "tradesman/craftsman"
+	* 19:  "unemployed"
+	* 20:  "writer"
+
+MOVIES FILE DESCRIPTION
+================================================================================
+
+Movie information is in the file "movies.dat" and is in the following
+format:
+
+MovieID::Title::Genres
+
+- Titles are identical to titles provided by the IMDB (including
+year of release)
+- Genres are pipe-separated and are selected from the following genres:
+
+	* Action
+	* Adventure
+	* Animation
+	* Children's
+	* Comedy
+	* Crime
+	* Documentary
+	* Drama
+	* Fantasy
+	* Film-Noir
+	* Horror
+	* Musical
+	* Mystery
+	* Romance
+	* Sci-Fi
+	* Thriller
+	* War
+	* Western
+
+- Some MovieIDs do not correspond to a movie due to accidental duplicate
+entries and/or test entries
+- Movies are mostly entered by hand, so errors and inconsistencies may exist