Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
99 changes: 99 additions & 0 deletions Builder/DataBuilder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset


class DataProcessor:
    """Builds implicit-feedback train/test files with sampled negatives.

    Reads tab-separated ``user  item  rating  timestamp`` files, converts
    every observed interaction into a positive (rating 1) and, for the
    training set, appends randomly sampled unobserved items per user as
    negatives (rating 0). Results are written as ``<path>.final``.
    """

    # Number of negative samples drawn per user (capped by availability).
    NUM_NEGATIVES = 8

    def __init__(self, train_path, test_path):
        self.train_path = train_path
        self.test_path = test_path

    def preprocess_data(self):
        """Create ``<train_path>.final`` and ``<test_path>.final`` on disk."""
        train_data = pd.read_csv(self.train_path, sep='\t',
                                 names=['user', 'item', 'rating', 'timestamp'])
        train_data.drop(columns='timestamp', inplace=True)
        # Implicit feedback: every observed interaction counts as positive.
        train_data['rating'] = 1

        # Candidate item universe spans the observed item-id range.
        all_items = set(range(int(train_data['item'].min()),
                              int(train_data['item'].max()) + 1))

        # Positive item sets per user, built in one pass (the original
        # re-filtered the whole frame once per user: O(users * rows)).
        pos_by_user = train_data.groupby('user')['item'].apply(set)

        neg_rows = []
        for u_id, pos_items in pos_by_user.items():
            candidates = list(all_items - pos_items)
            # Cap the sample size: np.random.choice(..., replace=False)
            # raises ValueError when fewer than NUM_NEGATIVES candidates exist.
            n_samples = min(self.NUM_NEGATIVES, len(candidates))
            for item in np.random.choice(candidates, n_samples, replace=False):
                neg_rows.append([u_id, item, 0])

        df_negatives = pd.DataFrame(neg_rows, columns=['user', 'item', 'rating'])

        # Append negatives to the positives and persist.
        final_data = pd.concat([train_data, df_negatives], ignore_index=True)
        final_data.to_csv(f'{self.train_path}.final', sep='\t', header=False, index=False)

        test_data = pd.read_csv(self.test_path, sep='\t',
                                names=['user', 'item', 'rating', 'timestamp'])
        test_data.drop(columns='timestamp', inplace=True)
        test_data['rating'] = 1
        test_data.to_csv(f'{self.test_path}.final', sep='\t', header=False, index=False)


class RatingDataset(Dataset):
    """Torch dataset over a tab-separated ``user  item  rating`` file."""

    def __init__(self, file):
        # The whole rating file is held in memory as a single frame.
        self.data = pd.read_csv(file, sep='\t', names=['user', 'item', 'rating'])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        # Ids as int64 for embedding lookups, rating as float32 for the loss.
        return (
            torch.tensor(record['user'], dtype=torch.int64),
            torch.tensor(record['item'], dtype=torch.int64),
            torch.tensor(record['rating'], dtype=torch.float32),
        )


class NegativeDataset(Dataset):
    """Dataset of pre-computed negative samples keyed by (user, item).

    Input file format, one line per positive pair:
    ``(user, item)<TAB>neg1<TAB>neg2...``.
    """

    def __init__(self, file):
        self.negatives = self.load_negative(file)
        # Cache the key order once; the original rebuilt
        # list(self.negatives.keys()) on every __getitem__ (O(n) per access).
        self._keys = list(self.negatives)

    @staticmethod
    def load_negative(n_file):
        """Parse the negatives file into ``{(user, item): [neg ids]}``."""
        import ast  # local import keeps this fix self-contained

        negs = {}
        with open(n_file, 'r') as f:
            for line in f:
                fields = line.strip().split('\t')
                # literal_eval replaces eval(): same result for a "(u, i)"
                # tuple literal, but cannot execute arbitrary code from the file.
                user, item = ast.literal_eval(fields[0])
                negs[(user, item)] = [int(x) for x in fields[1:]]
        return negs

    def __len__(self):
        return len(self.negatives)

    def __getitem__(self, idx):
        user, item = self._keys[idx]
        return (
            torch.tensor(user, dtype=torch.int64),
            torch.tensor(item, dtype=torch.int64),
            torch.tensor(self.negatives[(user, item)], dtype=torch.int64),
        )
79 changes: 79 additions & 0 deletions Builder/Models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import warnings

warnings.filterwarnings('ignore')


class GMF(nn.Module):
    """Generalized Matrix Factorization: score = ReLU(sum(u_emb * i_emb))."""

    def __init__(self, num_users, num_items, num_factors):
        super(GMF, self).__init__()
        # Use the GPU only when one is present; the original hard-coded
        # .cuda() and crashed on CPU-only machines.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.user_emb = nn.Embedding(num_embeddings=num_users, embedding_dim=num_factors).to(self.device)
        self.item_emb = nn.Embedding(num_embeddings=num_items, embedding_dim=num_factors).to(self.device)
        self.apply(self.normalize)

    def forward(self, user_ids, item_ids):
        """Return one non-negative interaction score per (user, item) pair."""
        user_ids = user_ids.to(self.device)
        item_ids = item_ids.to(self.device)
        user_embedding = self.user_emb(user_ids)
        item_embedding = self.item_emb(item_ids)
        output = (user_embedding * item_embedding).sum(1)
        # NOTE(review): ReLU zeroes every negative score; a sigmoid output is
        # the more common NCF head — kept as-is to preserve behaviour.
        return F.relu(output)

    def normalize(self, module):
        """Initialise embedding/linear weights ~ N(0, 0.01); zero linear biases."""
        if isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight.data, mean=0.0, std=0.01)
        elif isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight.data, mean=0.0, std=0.01)
            if module.bias is not None:
                torch.nn.init.constant_(module.bias.data, 0)

class MLP(nn.Module):
    """Multi-layer perceptron over concatenated user/item embeddings.

    ``layers`` lists the width of each fully-connected layer; the embedding
    dimension is ``layers[0] // 2`` (8 when ``layers`` is empty, so the
    concatenated vector of 16 feeds the output layer directly).
    """

    def __init__(self, num_users, num_items, layers):
        super(MLP, self).__init__()
        # Use the GPU only when one is present; the original hard-coded
        # .cuda() and crashed on CPU-only machines.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        emb_dim = layers[0] // 2 if layers else 8
        self.user_emb = nn.Embedding(num_users, emb_dim).to(self.device)
        self.item_emb = nn.Embedding(num_items, emb_dim).to(self.device)
        self.fc_layers = nn.ModuleList()
        for idx in range(1, len(layers)):
            self.fc_layers.append(nn.Linear(layers[idx - 1], layers[idx]).to(self.device))
        self.output_layer = nn.Linear(layers[-1] if layers else 16, 1).to(self.device)

    def forward(self, user_ids, item_ids):
        """Return a score in (0, 1) per pair, shape (batch, 1)."""
        user_ids = user_ids.to(self.device)
        item_ids = item_ids.to(self.device)
        user_embedding = self.user_emb(user_ids)
        item_embedding = self.item_emb(item_ids)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)
        for layer in self.fc_layers:
            vector = torch.relu(layer(vector))
        output = self.output_layer(vector)
        return output.sigmoid()

def MLP_with_hidden_layers(num_users, num_items, num_factors, num_hidden_layers):
    """Build an MLP whose hidden layers all have width ``num_factors``.

    ``num_hidden_layers == 0`` yields an MLP with no hidden layers.
    """
    hidden = [num_factors] * num_hidden_layers if num_hidden_layers > 0 else []
    return MLP(num_users, num_items, hidden)


class NeuMF(nn.Module):
    """Neural Matrix Factorization: fuses GMF and MLP scores via a linear head."""

    def __init__(self, num_users, num_items, num_factors, mlp_layers):
        super(NeuMF, self).__init__()
        # Use the GPU only when one is present; the original hard-coded
        # .cuda() and crashed on CPU-only machines.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.gmf = GMF(num_users, num_items, num_factors).to(self.device)
        # NOTE(review): ``mlp_layers`` is accepted but never used — the hidden
        # depth is fixed at 3; confirm whether callers expect it honoured.
        self.mlp = MLP_with_hidden_layers(num_users, num_items, num_factors, 3).to(self.device)
        # Two inputs to the head: one GMF score + one MLP score per pair.
        self.output_layer = nn.Linear(2, 1).to(self.device)

    def forward(self, user_ids, item_ids):
        """Return a probability-like score in (0, 1) per pair, shape (batch, 1)."""
        user_ids = user_ids.to(self.device)
        item_ids = item_ids.to(self.device)
        gmf_output = self.gmf(user_ids, item_ids).unsqueeze(1)   # (batch, 1)
        mlp_output = self.mlp(user_ids, item_ids)                # (batch, 1)
        combined = torch.cat((gmf_output, mlp_output), dim=-1)   # (batch, 2)
        return torch.sigmoid(self.output_layer(combined))
63 changes: 63 additions & 0 deletions Builder/Process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import torch
import numpy as np
import matplotlib.pyplot as plt

# Fall back to CPU when no GPU is present; the original hard-coded "cuda:0"
# and crashed on CPU-only machines.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def train(model, train_loader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss."""
    model.train()
    running_loss = 0.0
    for users, items, targets in train_loader:
        # Move the batch onto the module-level training device.
        users = users.to(device)
        items = items.to(device)
        targets = targets.float().to(device)

        optimizer.zero_grad()
        preds = model(users, items)  # labels never enter the model
        batch_loss = criterion(preds.squeeze(), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_loader)

def evaluate(model, test_loader, negative_loader, top_k=10):
    """Compute mean HR@K and NDCG@K over paired positive/negative batches.

    ``test_loader`` yields ``(user_ids, pos_item_ids, ratings)`` and
    ``negative_loader`` yields ``(user_ids, _, neg_item_ids)`` with the
    batches aligned one-to-one. Returns ``(mean_HR, mean_NDCG)``.
    """
    model.eval()
    HR_list = []
    NDCG_list = []
    # Run on whatever device the model lives on instead of relying on the
    # module-level global.
    eval_device = next(model.parameters()).device

    with torch.no_grad():
        for (user_ids, pos_item_ids, _), (_, _, neg_item_ids) in zip(test_loader, negative_loader):
            # Score the positive item together with its negatives.
            items = torch.cat([pos_item_ids.unsqueeze(1), neg_item_ids], dim=1)
            user_ids = user_ids.unsqueeze(1).expand(-1, items.size(1))

            items = items.to(eval_device)
            user_ids = user_ids.to(eval_device)
            pos_item_ids = pos_item_ids.to(eval_device)

            predictions = model(user_ids.reshape(-1), items.reshape(-1)).reshape(-1, items.size(1))

            # Guard: topk raises when top_k exceeds the candidate count.
            k = min(top_k, items.size(1))
            _, indices = torch.topk(predictions, k=k, dim=1)
            recommended_items = items.gather(1, indices)

            # (batch, k) boolean: does each recommended slot hold the positive?
            hits = recommended_items == pos_item_ids.unsqueeze(1)
            HR_list.append(hits.any(dim=1).float().mean().item())

            # NDCG: users whose positive was not recommended contribute 0.
            # (The original averaged over hit users only, which inflated the
            # score and produced NaN on batches with zero hits.)
            ranks = hits.float().argmax(dim=1)  # first-hit position; 0 when no hit
            ndcg_per_user = torch.where(
                hits.any(dim=1),
                1.0 / torch.log2(ranks.float() + 2.0),
                torch.zeros_like(ranks, dtype=torch.float32),
            )
            NDCG_list.append(ndcg_per_user.mean().item())

    # Average the per-batch metrics.
    mean_HR = np.mean(HR_list)
    mean_NDCG = np.mean(NDCG_list)

    return mean_HR, mean_NDCG
Loading