Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions ablation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pandas as pd
import numpy as np
from mlp import MLPEngine
from data import SampleGenerator
import os
import matplotlib.pyplot as plt

# Create the output directories used later for model checkpoints and plots.
os.makedirs('checkpoints', exist_ok=True)
os.makedirs('images', exist_ok=True)

# Load Data
# MovieLens-1M uses '::' as the field separator; pandas needs the (slower)
# python engine for multi-character separators.
ml1m_dir = 'data/ml-1m/ratings.dat'
ml1m_rating = pd.read_csv(ml1m_dir, sep='::', header=None, names=['uid', 'mid', 'rating', 'timestamp'], engine='python')

# Reindex: map the raw uid/mid values onto dense, 0-based userId/itemId
# ranges (required so they can be used directly as embedding indices).
# NOTE: the original chained a no-argument .reindex() after
# drop_duplicates(); that call is a no-op and has been removed.
user_id = ml1m_rating[['uid']].drop_duplicates()
user_id['userId'] = np.arange(len(user_id))
ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id['itemId'] = np.arange(len(item_id))
ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]

# DataLoader for training
generator = SampleGenerator(ratings=ml1m_rating)
evaluate_data = generator.evaluate_data

# MLP architectures for the ablation study: progressively deeper towers.
# Each 'layers' entry lists the hidden-layer widths of one variant.
_layer_variants = [
    [16],
    [16, 8],
    [16, 64, 8],
    [16, 64, 16, 8],
    [16, 64, 32, 16, 8],
]
mlp_configs = [
    {'alias': 'mlp_{}'.format(idx), 'layers': widths}
    for idx, widths in enumerate(_layer_variants)
]

# One trained engine per configuration, collected for the plots below.
mlp_engines = []

# Train and evaluate every MLP configuration in turn.
for cfg in mlp_configs:
    # Shared hyper-parameters; only 'alias' and 'layers' vary per run.
    config = dict(
        alias=cfg['alias'],
        num_epoch=30,
        batch_size=1024,
        optimizer='adam',
        adam_lr=1e-3,
        num_users=6040,
        num_items=3706,
        latent_dim=8,
        num_negative=4,
        layers=cfg['layers'],
        l2_regularization=0.0000001,  # MLP model is sensitive to hyper params
        weight_init_gaussian=True,
        use_cuda=True,
        device_id=0,
        pretrain=False,
        pretrain_mf='checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
        model_dir='checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model',
    )

    print(f"Training MLP with config: {config['alias']}")
    engine = MLPEngine(config)
    mlp_engines.append(engine)

    # One fresh train loader per epoch: negatives are re-sampled each time.
    for epoch in range(config['num_epoch']):
        train_loader = generator.instance_a_train_loader(config['num_negative'], config['batch_size'])
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        engine.evaluate(evaluate_data, epoch_id=epoch)


# Plot the per-epoch curves (HR@10, NDCG@10, training loss) for every
# configuration; `key` names the attribute on the engine holding that
# metric's history (assumed populated by evaluate/train_an_epoch —
# TODO confirm against MLPEngine).
for metric, key in [('HR@10', 'hr_list'), ('NDCG@10', 'ndcg_list'), ('Training Loss', 'train_loss')]:
    plt.figure(figsize=(10, 8))
    for engine in mlp_engines:
        plt.plot(range(engine.config['num_epoch']), getattr(engine, key), label=engine.config['alias'])
    plt.xlabel('Epoch', fontsize=14)
    plt.ylabel(metric, fontsize=14)
    plt.legend(fontsize=14)
    plt.grid(True)
    plt.savefig(f"images/mlp_{metric.replace(' ', '_')}_epoch.png", dpi=300)
    # Release the figure; without this all three figures stay open in
    # memory for the rest of the run.
    plt.close()

# Persist the final-epoch metric of each configuration to a text file.
with open('ablation_results.txt', 'w') as f:
    for engine in mlp_engines:
        f.write(f"Config: {engine.config['alias']}\n")
        f.write(f"HR@10: {engine.hr_list[-1]}\n")
        f.write(f"NDCG@10: {engine.ndcg_list[-1]}\n")
        f.write(f"Training Loss: {engine.train_loss[-1]}\n\n")
111 changes: 111 additions & 0 deletions data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import torch
import random
import pandas as pd
from copy import deepcopy
from torch.utils.data import DataLoader, Dataset

random.seed(0)


class UserItemRatingDataset(Dataset):
    """Torch Dataset over parallel <user, item, rating> tensors.

    Element i is the aligned triple (user_tensor[i], item_tensor[i],
    target_tensor[i]); all three tensors must have the same length.
    """

    def __init__(self, user_tensor, item_tensor, target_tensor):
        """
        args:
            target_tensor: torch.Tensor, the corresponding rating for <user, item> pair
        """
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor

    def __getitem__(self, index):
        # Return the triple at `index`, one value from each parallel tensor.
        triple = (self.user_tensor[index],
                  self.item_tensor[index],
                  self.target_tensor[index])
        return triple

    def __len__(self):
        # len() of a 1-D tensor equals its size along dim 0.
        return len(self.user_tensor)


class SampleGenerator(object):
    """Construct train/evaluation datasets for NCF from an implicit-feedback
    ratings DataFrame.

    On construction this: binarizes ratings to 0/1, builds per-user pools of
    never-interacted ("negative") items, pre-samples 99 evaluation negatives
    per user, and performs a leave-one-out train/test split (each user's
    latest interaction by timestamp becomes the test item).

    NOTE: negative sampling uses the module-level `random` state (seeded to 0
    at import time), so the sampling sequence is reproducible per process.
    """

    def __init__(self, ratings):
        """
        args:
            ratings: pd.DataFrame, which contains 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
        """
        assert 'userId' in ratings.columns
        assert 'itemId' in ratings.columns
        assert 'rating' in ratings.columns

        self.ratings = ratings
        # explicit feedback using _normalize and implicit using _binarize
        # self.preprocess_ratings = self._normalize(ratings)
        self.preprocess_ratings = self._binarize(ratings)
        self.user_pool = set(self.ratings['userId'].unique())
        self.item_pool = set(self.ratings['itemId'].unique())
        # create negative item samples for NCF learning
        self.negatives = self._sample_negative(ratings)
        self.train_ratings, self.test_ratings = self._split_loo(self.preprocess_ratings)

    def _normalize(self, ratings):
        """normalize into [0, 1] from [0, max_rating], explicit feedback

        Operates on a deep copy; the caller's DataFrame is not modified.
        (Currently unused — `__init__` uses `_binarize` instead.)
        """
        ratings = deepcopy(ratings)
        max_rating = ratings.rating.max()
        ratings['rating'] = ratings.rating * 1.0 / max_rating
        return ratings

    def _binarize(self, ratings):
        """binarize into 0 or 1, implicit feedback

        Every observed positive rating becomes 1.0; operates on a deep copy,
        so the caller's DataFrame is not modified.
        """
        ratings = deepcopy(ratings)
        ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0
        return ratings

    def _split_loo(self, ratings):
        """leave one out train/test split

        Ranks each user's interactions by recency; the single most recent one
        (rank 1) is the test item, everything else is train.

        NOTE(review): this adds a 'rank_latest' column to the DataFrame it is
        given (here `self.preprocess_ratings`, which is already a private
        copy) — the original `self.ratings` is unaffected.
        """
        ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
        test = ratings[ratings['rank_latest'] == 1]
        train = ratings[ratings['rank_latest'] > 1]
        # Every user must appear in both splits (requires >= 2 ratings/user).
        assert train['userId'].nunique() == test['userId'].nunique()
        return train[['userId', 'itemId', 'rating']], test[['userId', 'itemId', 'rating']]

    def _sample_negative(self, ratings):
        """return all negative items & 99 sampled negative items per user

        'negative_items' is the full list of items the user never interacted
        with; 'negative_samples' is a fixed draw of 99 of them, used by
        `evaluate_data` (rank-the-positive-against-99-negatives protocol).
        Requires every user to have at least 99 non-interacted items.
        """
        interact_status = ratings.groupby('userId')['itemId'].apply(set).reset_index().rename(
            columns={'itemId': 'interacted_items'})
        interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
        interact_status['negative_items'] = interact_status['negative_items'].apply(list)  # ensure negative_items is a list (random.sample needs a sequence)

        interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, 99))
        return interact_status[['userId', 'negative_items', 'negative_samples']]

    def instance_a_train_loader(self, num_negatives, batch_size):
        """instance train loader for one training epoch

        Re-samples `num_negatives` fresh negative items per positive pair on
        every call, so each epoch sees different negatives.

        NOTE(review): random.sample raises ValueError if a user has fewer
        than `num_negatives` non-interacted items.
        """
        users, items, ratings = [], [], []
        train_ratings = pd.merge(self.train_ratings, self.negatives[['userId', 'negative_items']], on='userId')
        train_ratings['negatives'] = train_ratings['negative_items'].apply(lambda x: random.sample(x, num_negatives))
        for row in train_ratings.itertuples():
            # One positive example...
            users.append(int(row.userId))
            items.append(int(row.itemId))
            ratings.append(float(row.rating))
            # ...followed by its num_negatives negative examples.
            for i in range(num_negatives):
                users.append(int(row.userId))
                items.append(int(row.negatives[i]))
                ratings.append(float(0))  # negative samples get 0 rating
        dataset = UserItemRatingDataset(user_tensor=torch.LongTensor(users),
                                        item_tensor=torch.LongTensor(items),
                                        target_tensor=torch.FloatTensor(ratings))
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    @property
    def evaluate_data(self):
        """create evaluate data

        Returns [test_users, test_items, negative_users, negative_items] as
        LongTensors: one held-out positive per user plus that user's fixed 99
        pre-sampled negatives (recomputed from self.negatives on each access).
        """
        test_ratings = pd.merge(self.test_ratings, self.negatives[['userId', 'negative_samples']], on='userId')
        test_users, test_items, negative_users, negative_items = [], [], [], []
        for row in test_ratings.itertuples():
            test_users.append(int(row.userId))
            test_items.append(int(row.itemId))
            for i in range(len(row.negative_samples)):
                negative_users.append(int(row.userId))
                negative_items.append(int(row.negative_samples[i]))
        return [torch.LongTensor(test_users), torch.LongTensor(test_items), torch.LongTensor(negative_users),
                torch.LongTensor(negative_items)]
170 changes: 170 additions & 0 deletions data/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
SUMMARY
================================================================================

These files contain 1,000,209 anonymous ratings of approximately 3,900 movies
made by 6,040 MovieLens users who joined MovieLens in 2000.

USAGE LICENSE
================================================================================

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set. The data set may be used for any research
purposes under the following conditions:

* The user may not state or imply any endorsement from the
University of Minnesota or the GroupLens Research Group.

* The user must acknowledge the use of the data set in
publications resulting from the use of the data set
(see below for citation information).

* The user may not redistribute the data without separate
permission.

* The user may not use this information for any commercial or
revenue-bearing purposes without first obtaining permission
from a faculty member of the GroupLens Research Project at the
University of Minnesota.

If you have any further questions or comments, please contact GroupLens
<grouplens-info@cs.umn.edu>.

CITATION
================================================================================

To acknowledge use of the dataset in publications, please cite the following
paper:

F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History
and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4,
Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872


ACKNOWLEDGEMENTS
================================================================================

Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
set.

FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
================================================================================

The GroupLens Research Project is a research group in the Department of
Computer Science and Engineering at the University of Minnesota. Members of
the GroupLens Research Project are involved in many research projects related
to the fields of information filtering, collaborative filtering, and
recommender systems. The project is lead by professors John Riedl and Joseph
Konstan. The project began to explore automated collaborative filtering in
1992, but is most well known for its world wide trial of an automated
collaborative filtering system for Usenet news in 1996. Since then the project
has expanded its scope to research overall information filtering solutions,
integrating in content-based methods as well as improving current collaborative
filtering technology.

Further information on the GroupLens Research project, including research
publications, can be found at the following web site:

http://www.grouplens.org/

GroupLens Research currently operates a movie recommender based on
collaborative filtering:

http://www.movielens.org/

RATINGS FILE DESCRIPTION
================================================================================

All ratings are contained in the file "ratings.dat" and are in the
following format:

UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

USERS FILE DESCRIPTION
================================================================================

User information is in the file "users.dat" and is in the following
format:

UserID::Gender::Age::Occupation::Zip-code

All demographic information is provided voluntarily by the users and is
not checked for accuracy. Only users who have provided some demographic
information are included in this data set.

- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:

* 1: "Under 18"
* 18: "18-24"
* 25: "25-34"
* 35: "35-44"
* 45: "45-49"
* 50: "50-55"
* 56: "56+"

- Occupation is chosen from the following choices:

* 0: "other" or not specified
* 1: "academic/educator"
* 2: "artist"
* 3: "clerical/admin"
* 4: "college/grad student"
* 5: "customer service"
* 6: "doctor/health care"
* 7: "executive/managerial"
* 8: "farmer"
* 9: "homemaker"
* 10: "K-12 student"
* 11: "lawyer"
* 12: "programmer"
* 13: "retired"
* 14: "sales/marketing"
* 15: "scientist"
* 16: "self-employed"
* 17: "technician/engineer"
* 18: "tradesman/craftsman"
* 19: "unemployed"
* 20: "writer"

MOVIES FILE DESCRIPTION
================================================================================

Movie information is in the file "movies.dat" and is in the following
format:

MovieID::Title::Genres

- Titles are identical to titles provided by the IMDB (including
year of release)
- Genres are pipe-separated and are selected from the following genres:

* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western

- Some MovieIDs do not correspond to a movie due to accidental duplicate
entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist
Loading