Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
99 changes: 99 additions & 0 deletions Builder/DataBuilder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset


class DataProcessor:
    """Builds implicit-feedback train/test files with sampled negatives.

    Reads tab-separated ``user  item  rating  timestamp`` files, converts
    every observed interaction into a positive (rating 1) and, for the
    training set, appends randomly sampled unobserved items per user as
    negatives (rating 0). Results are written as ``<path>.final``.
    """

    # Number of negative samples drawn per user (capped by availability).
    NUM_NEGATIVES = 8

    def __init__(self, train_path, test_path):
        self.train_path = train_path
        self.test_path = test_path

    def preprocess_data(self):
        """Create ``<train_path>.final`` and ``<test_path>.final`` on disk."""
        train_data = pd.read_csv(self.train_path, sep='\t',
                                 names=['user', 'item', 'rating', 'timestamp'])
        train_data.drop(columns='timestamp', inplace=True)
        # Implicit feedback: every observed interaction counts as positive.
        train_data['rating'] = 1

        # Candidate item universe spans the observed item-id range.
        all_items = set(range(int(train_data['item'].min()),
                              int(train_data['item'].max()) + 1))

        # Positive item sets per user, built in one pass (the original
        # re-filtered the whole frame once per user: O(users * rows)).
        pos_by_user = train_data.groupby('user')['item'].apply(set)

        neg_rows = []
        for u_id, pos_items in pos_by_user.items():
            candidates = list(all_items - pos_items)
            # Cap the sample size: np.random.choice(..., replace=False)
            # raises ValueError when fewer than NUM_NEGATIVES candidates exist.
            n_samples = min(self.NUM_NEGATIVES, len(candidates))
            for item in np.random.choice(candidates, n_samples, replace=False):
                neg_rows.append([u_id, item, 0])

        df_negatives = pd.DataFrame(neg_rows, columns=['user', 'item', 'rating'])

        # Append negatives to the positives and persist.
        final_data = pd.concat([train_data, df_negatives], ignore_index=True)
        final_data.to_csv(f'{self.train_path}.final', sep='\t', header=False, index=False)

        test_data = pd.read_csv(self.test_path, sep='\t',
                                names=['user', 'item', 'rating', 'timestamp'])
        test_data.drop(columns='timestamp', inplace=True)
        test_data['rating'] = 1
        test_data.to_csv(f'{self.test_path}.final', sep='\t', header=False, index=False)


class RatingDataset(Dataset):
    """Torch dataset over a tab-separated ``user  item  rating`` file."""

    def __init__(self, file):
        # The whole rating file is held in memory as a single frame.
        self.data = pd.read_csv(file, sep='\t', names=['user', 'item', 'rating'])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        # Ids as int64 for embedding lookups, rating as float32 for the loss.
        return (
            torch.tensor(record['user'], dtype=torch.int64),
            torch.tensor(record['item'], dtype=torch.int64),
            torch.tensor(record['rating'], dtype=torch.float32),
        )


class NegativeDataset(Dataset):
    """Dataset of pre-computed negative samples keyed by (user, item).

    Input file format, one line per positive pair:
    ``(user, item)<TAB>neg1<TAB>neg2...``.
    """

    def __init__(self, file):
        self.negatives = self.load_negative(file)
        # Cache the key order once; the original rebuilt
        # list(self.negatives.keys()) on every __getitem__ (O(n) per access).
        self._keys = list(self.negatives)

    @staticmethod
    def load_negative(n_file):
        """Parse the negatives file into ``{(user, item): [neg ids]}``."""
        import ast  # local import keeps this fix self-contained

        negs = {}
        with open(n_file, 'r') as f:
            for line in f:
                fields = line.strip().split('\t')
                # literal_eval replaces eval(): same result for a "(u, i)"
                # tuple literal, but cannot execute arbitrary code from the file.
                user, item = ast.literal_eval(fields[0])
                negs[(user, item)] = [int(x) for x in fields[1:]]
        return negs

    def __len__(self):
        return len(self.negatives)

    def __getitem__(self, idx):
        user, item = self._keys[idx]
        return (
            torch.tensor(user, dtype=torch.int64),
            torch.tensor(item, dtype=torch.int64),
            torch.tensor(self.negatives[(user, item)], dtype=torch.int64),
        )
79 changes: 79 additions & 0 deletions Builder/Models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import warnings

warnings.filterwarnings('ignore')


class GMF(nn.Module):
    """Generalized Matrix Factorization: score = ReLU(sum(u_emb * i_emb))."""

    def __init__(self, num_users, num_items, num_factors):
        super(GMF, self).__init__()
        # Use the GPU only when one is present; the original hard-coded
        # .cuda() and crashed on CPU-only machines.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.user_emb = nn.Embedding(num_embeddings=num_users, embedding_dim=num_factors).to(self.device)
        self.item_emb = nn.Embedding(num_embeddings=num_items, embedding_dim=num_factors).to(self.device)
        self.apply(self.normalize)

    def forward(self, user_ids, item_ids):
        """Return one non-negative interaction score per (user, item) pair."""
        user_ids = user_ids.to(self.device)
        item_ids = item_ids.to(self.device)
        user_embedding = self.user_emb(user_ids)
        item_embedding = self.item_emb(item_ids)
        output = (user_embedding * item_embedding).sum(1)
        # NOTE(review): ReLU zeroes every negative score; a sigmoid output is
        # the more common NCF head — kept as-is to preserve behaviour.
        return F.relu(output)

    def normalize(self, module):
        """Initialise embedding/linear weights ~ N(0, 0.01); zero linear biases."""
        if isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight.data, mean=0.0, std=0.01)
        elif isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight.data, mean=0.0, std=0.01)
            if module.bias is not None:
                torch.nn.init.constant_(module.bias.data, 0)

class MLP(nn.Module):
    """Multi-layer perceptron over concatenated user/item embeddings.

    ``layers`` lists the width of each fully-connected layer; the embedding
    dimension is ``layers[0] // 2`` (8 when ``layers`` is empty, so the
    concatenated vector of 16 feeds the output layer directly).
    """

    def __init__(self, num_users, num_items, layers):
        super(MLP, self).__init__()
        # Use the GPU only when one is present; the original hard-coded
        # .cuda() and crashed on CPU-only machines.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        emb_dim = layers[0] // 2 if layers else 8
        self.user_emb = nn.Embedding(num_users, emb_dim).to(self.device)
        self.item_emb = nn.Embedding(num_items, emb_dim).to(self.device)
        self.fc_layers = nn.ModuleList()
        for idx in range(1, len(layers)):
            self.fc_layers.append(nn.Linear(layers[idx - 1], layers[idx]).to(self.device))
        self.output_layer = nn.Linear(layers[-1] if layers else 16, 1).to(self.device)

    def forward(self, user_ids, item_ids):
        """Return a score in (0, 1) per pair, shape (batch, 1)."""
        user_ids = user_ids.to(self.device)
        item_ids = item_ids.to(self.device)
        user_embedding = self.user_emb(user_ids)
        item_embedding = self.item_emb(item_ids)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)
        for layer in self.fc_layers:
            vector = torch.relu(layer(vector))
        output = self.output_layer(vector)
        return output.sigmoid()

def MLP_with_hidden_layers(num_users, num_items, num_factors, num_hidden_layers):
    """Build an MLP whose hidden layers all have width ``num_factors``.

    ``num_hidden_layers == 0`` yields an MLP with no hidden layers.
    """
    hidden = [num_factors] * num_hidden_layers if num_hidden_layers > 0 else []
    return MLP(num_users, num_items, hidden)


class NeuMF(nn.Module):
    """Neural Matrix Factorization: fuses GMF and MLP scores via a linear head."""

    def __init__(self, num_users, num_items, num_factors, mlp_layers):
        super(NeuMF, self).__init__()
        # Use the GPU only when one is present; the original hard-coded
        # .cuda() and crashed on CPU-only machines.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.gmf = GMF(num_users, num_items, num_factors).to(self.device)
        # NOTE(review): ``mlp_layers`` is accepted but never used — the hidden
        # depth is fixed at 3; confirm whether callers expect it honoured.
        self.mlp = MLP_with_hidden_layers(num_users, num_items, num_factors, 3).to(self.device)
        # Two inputs to the head: one GMF score + one MLP score per pair.
        self.output_layer = nn.Linear(2, 1).to(self.device)

    def forward(self, user_ids, item_ids):
        """Return a probability-like score in (0, 1) per pair, shape (batch, 1)."""
        user_ids = user_ids.to(self.device)
        item_ids = item_ids.to(self.device)
        gmf_output = self.gmf(user_ids, item_ids).unsqueeze(1)   # (batch, 1)
        mlp_output = self.mlp(user_ids, item_ids)                # (batch, 1)
        combined = torch.cat((gmf_output, mlp_output), dim=-1)   # (batch, 2)
        return torch.sigmoid(self.output_layer(combined))
63 changes: 63 additions & 0 deletions Builder/Process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import torch
import numpy as np
import matplotlib.pyplot as plt

# Fall back to CPU when no GPU is present; the original hard-coded "cuda:0"
# and crashed on CPU-only machines.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def train(model, train_loader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss."""
    model.train()
    running_loss = 0.0
    for users, items, targets in train_loader:
        # Move the batch onto the module-level training device.
        users = users.to(device)
        items = items.to(device)
        targets = targets.float().to(device)

        optimizer.zero_grad()
        preds = model(users, items)  # labels never enter the model
        batch_loss = criterion(preds.squeeze(), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_loader)

def evaluate(model, test_loader, negative_loader, top_k=10):
    """Compute mean HR@K and NDCG@K over paired positive/negative batches.

    ``test_loader`` yields ``(user_ids, pos_item_ids, ratings)`` and
    ``negative_loader`` yields ``(user_ids, _, neg_item_ids)`` with the
    batches aligned one-to-one. Returns ``(mean_HR, mean_NDCG)``.
    """
    model.eval()
    HR_list = []
    NDCG_list = []
    # Run on whatever device the model lives on instead of relying on the
    # module-level global.
    eval_device = next(model.parameters()).device

    with torch.no_grad():
        for (user_ids, pos_item_ids, _), (_, _, neg_item_ids) in zip(test_loader, negative_loader):
            # Score the positive item together with its negatives.
            items = torch.cat([pos_item_ids.unsqueeze(1), neg_item_ids], dim=1)
            user_ids = user_ids.unsqueeze(1).expand(-1, items.size(1))

            items = items.to(eval_device)
            user_ids = user_ids.to(eval_device)
            pos_item_ids = pos_item_ids.to(eval_device)

            predictions = model(user_ids.reshape(-1), items.reshape(-1)).reshape(-1, items.size(1))

            # Guard: topk raises when top_k exceeds the candidate count.
            k = min(top_k, items.size(1))
            _, indices = torch.topk(predictions, k=k, dim=1)
            recommended_items = items.gather(1, indices)

            # (batch, k) boolean: does each recommended slot hold the positive?
            hits = recommended_items == pos_item_ids.unsqueeze(1)
            HR_list.append(hits.any(dim=1).float().mean().item())

            # NDCG: users whose positive was not recommended contribute 0.
            # (The original averaged over hit users only, which inflated the
            # score and produced NaN on batches with zero hits.)
            ranks = hits.float().argmax(dim=1)  # first-hit position; 0 when no hit
            ndcg_per_user = torch.where(
                hits.any(dim=1),
                1.0 / torch.log2(ranks.float() + 2.0),
                torch.zeros_like(ranks, dtype=torch.float32),
            )
            NDCG_list.append(ndcg_per_user.mean().item())

    # Average the per-batch metrics.
    mean_HR = np.mean(HR_list)
    mean_NDCG = np.mean(NDCG_list)

    return mean_HR, mean_NDCG
Loading