Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions recommendation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Two dependencies need to be set up before running the code below:<br>
1.nltk<br>
Run the following python code:

import nltk<br>
nltk.download('wordnet')<br>
nltk.download('punkt')<br>
nltk.download('vader_lexicon')

If the above code fails to run, you can try to manually download the relevant dataset and place it in the appropriate location.<br>
For example, after creating a folder named wordnet under the path (D:\nltk_data\corpora), you can download and unzip the wordnet dataset, and finally move the dataset to that folder.

2.KAFKA<br>
The specific process of building kafka in local windows environment is as follows:<br>
First, install the kafka-python package via "pip install kafka-python -i https://pypi.tuna.tsinghua.edu.cn/simple".<br>
Secondly, download kafka via "http://mirrors.cloud.tencent.com/apache/kafka/".<br>
Next, extract the Kafka archive into a folder. Keep the path short, otherwise startup errors can occur. Enter the folder and open a separate cmd window for each of the following commands; keep each window open after the command succeeds.<br>
(1) zookeeper-server-start.bat ..\..\config\zookeeper.properties <br>
(2) kafka-server-start.bat ..\..\config\server.properties # load configuration <br>
(3) kafka-console-producer.bat --broker-list 127.0.0.1:9092 --topic test # Start the producer <br>
(4) kafka-console-consumer.bat --bootstrap-server 127.0.0.1:9092 --topic test --from-beginning # Start the consumer on port 9092. <br>
Finally, run a couple of simple demos you can find on the web to check if the installation was successful.
166 changes: 166 additions & 0 deletions recommendation/calculateRank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import pymysql as mysql
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

#tmp_department="Business Administration"
# Placeholder: no web-comment data exists yet, so every professor gets the
# same empty comment (which scores the neutral 0.5 sentiment in webCommentScore).
tmp_comment=""

def mysqlConnect():
    """Open and return a connection to the local `findmyprof` MySQL database."""
    return mysql.connect(
        host='localhost',
        user='root',
        password='111',
        database='findmyprof',
    )

def universityScore(university):
    """Score the university attribute (weighted 40% of the total ranking).

    Args:
        university: Numeric university rating (int, float, or numeric string).

    Returns:
        float: rating * 0.4, or 0 when the rating is missing/zero/empty.
    """
    weight1 = 0.4
    if not university:
        # None, 0, or "" contributes nothing to the total score.
        return 0
    # Convert directly to float: the previous int() round-trip truncated
    # fractional ratings (e.g. 0.8 -> 0) and raised on strings like "0.8".
    return float(university) * weight1

def calculateSimilar(phrase1, phrase2):
    """Return the best WordNet path similarity between the two phrases.

    Tokenizes each phrase, gathers every synset of every token, and takes
    the maximum pairwise path similarity as a proxy for topical relatedness.

    Args:
        phrase1: e.g. the student's department/program text.
        phrase2: e.g. the professor's research-area text.

    Returns:
        float: best pairwise path similarity in [0, 1]; 0.0 when either
        phrase yields no synsets. (The previous -1 sentinel leaked out and
        made the weighted research-area score negative downstream.)
    """
    synsets1 = [s for tok in word_tokenize(phrase1) for s in wordnet.synsets(tok)]
    synsets2 = [s for tok in word_tokenize(phrase2) for s in wordnet.synsets(tok)]

    max_sim = 0.0  # floor at 0 so "no comparable synsets" cannot score negative
    for synset1 in synsets1:
        for synset2 in synsets2:
            sim = synset1.path_similarity(synset2)
            # path_similarity returns None for incomparable synset pairs.
            if sim is not None and sim > max_sim:
                max_sim = sim
    return max_sim

def researchAreaScore(researchArea, tmp_department):
    """Score how well the professor's research area matches the student's
    department (weighted 37% of the total ranking).

    The Degree field (e.g. "PhD in Finance") currently stands in for the
    research area, so the text after "in " is used for matching.

    Args:
        researchArea: degree/research-area text, or None.
        tmp_department: the student's department/program text.

    Returns:
        float: similarity * 0.37, or 0 when researchArea is None.
    """
    weight2 = 0.37
    if researchArea is None:
        return 0
    # Take the part after "in "; fall back to the whole string when "in "
    # is absent instead of raising IndexError (the previous [1] crashed).
    parts = re.split("in ", researchArea)
    phrase2 = parts[1] if len(parts) > 1 else researchArea
    max_sim = calculateSimilar(tmp_department, phrase2)
    return max_sim * weight2

def evaluate_sentiment(comment):
    """Map a web comment about a professor to a goodness score in [0, 1].

    Uses VADER sentiment intensity: 0.5 is neutral, above 0.5 positive,
    below 0.5 negative.
    """
    # Lazily create and cache the analyzer on the function itself:
    # constructing SentimentIntensityAnalyzer reloads the VADER lexicon,
    # which the previous code repeated on every single call.
    sid = getattr(evaluate_sentiment, "_sid", None)
    if sid is None:
        sid = SentimentIntensityAnalyzer()
        evaluate_sentiment._sid = sid
    sentiment_scores = sid.polarity_scores(comment)
    good_bad_score = (sentiment_scores['pos'] - sentiment_scores['neg']) / 2 + 0.5
    # Clamp to [0, 1] to keep the downstream weighting well-behaved.
    return max(0, min(1, good_bad_score))

def webCommentScore(webComment):
    """Score the professor's web comments (weighted 16% of the total ranking).

    Returns 0 when no comment is available; otherwise the sentiment goodness
    in [0, 1] scaled by the weight.
    """
    weight3 = 0.16
    if webComment is None:
        return 0
    return evaluate_sentiment(webComment) * weight3

def educationScore(education):
    """Score the professor's academic background (weighted 7% of the ranking).

    The education field is not yet quantified, so any non-None value gets
    the full weight and None scores 0.
    """
    weight4 = 0.07
    return 0 if education is None else 1.0 * weight4

def calculateRank(user_degree, name, university, researchArea, webComment, education):
    """Combine the four weighted attribute scores into a single ranking value."""
    return (
        universityScore(university)
        + researchAreaScore(researchArea, user_degree)
        + webCommentScore(webComment)
        + educationScore(education)
    )

def writeInDataBase(ranks):
    """Rebuild the `result` table and persist the {name: rank} mapping.

    Args:
        ranks: dict mapping professor English name -> ranking score (float).
    """
    mydb = mysqlConnect()
    try:
        mycursor = mydb.cursor()
        # IF EXISTS prevents a crash on the very first run, when the
        # result table has never been created.
        mycursor.execute("drop table if exists result;")
        mycursor.execute("create table result ( `English Name` VARCHAR(100),ranking FLOAT)")
        # The table is freshly created, so the old TRUNCATE step is unnecessary.
        sql = "insert into result values(%s,%s);"
        for name, rank in ranks.items():
            # Parameterized insert; round to 3 decimals for presentation.
            mycursor.execute(sql, (name, round(rank, 3)))
        # One commit for the whole batch instead of one per row.
        mydb.commit()
        mycursor.close()
    finally:
        # Release the connection even if an SQL statement fails.
        mydb.close()


def getData(user_degree):
    """Fetch professor rows from the `ust_web` table and rank each one.

    Args:
        user_degree: the student's program/department text.

    Returns:
        dict: {professor English name: ranking score}.
    """
    mydb = mysqlConnect()
    mycursor = mydb.cursor()

    sql1 = "select `English Name`,Degree,Education from ust_web limit 32;"
    mycursor.execute(sql1)
    rows = mycursor.fetchall()

    university = 1  # all professors are currently HKU, so use a fixed rating
    result = {}
    for name, degree, education in rows:
        result[name] = calculateRank(user_degree, name, university,
                                     degree, tmp_comment, education)

    mycursor.close()
    mydb.close()

    return result



def main(id, user_degree):
    """Compute rankings for one user and store them in the `result` table."""
    ranks = getData(user_degree)
    print("已计算出结果")
    writeInDataBase(ranks)
    print("已将" + "为ID:" + id + "的用户推荐的结果写入数据库result表内!")
139 changes: 139 additions & 0 deletions recommendation/calculateRank_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import csv
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# NOTE(review): absolute Windows paths — adjust per machine.
file_path='E:/FindMyProf/prof_data.csv'# path to the professor data CSV
save_path='E:/FindMyProf/result.csv'# path where the ranking result CSV is written

def universityScore(university):
    """Score the university attribute (weighted 40% of the total ranking).

    Args:
        university: Numeric university rating (int, float, or numeric string).

    Returns:
        float: rating * 0.4, or 0 when the rating is missing/zero/empty.
    """
    weight1 = 0.4
    if not university:
        # None, 0, or "" contributes nothing to the total score.
        return 0
    # Convert directly to float: the previous int() round-trip truncated
    # fractional ratings (e.g. 0.8 -> 0) and raised on strings like "0.8".
    return float(university) * weight1

def calculateSimilar(phrase1, phrase2):
    """Return the best WordNet path similarity between the two phrases.

    Tokenizes each phrase, gathers every synset of every token, and takes
    the maximum pairwise path similarity as a proxy for topical relatedness.

    Args:
        phrase1: e.g. the student's department/program text.
        phrase2: e.g. the professor's research-area text.

    Returns:
        float: best pairwise path similarity in [0, 1]; 0.0 when either
        phrase yields no synsets. (The previous -1 sentinel leaked out and
        made the weighted research-area score negative downstream.)
    """
    synsets1 = [s for tok in word_tokenize(phrase1) for s in wordnet.synsets(tok)]
    synsets2 = [s for tok in word_tokenize(phrase2) for s in wordnet.synsets(tok)]

    max_sim = 0.0  # floor at 0 so "no comparable synsets" cannot score negative
    for synset1 in synsets1:
        for synset2 in synsets2:
            sim = synset1.path_similarity(synset2)
            # path_similarity returns None for incomparable synset pairs.
            if sim is not None and sim > max_sim:
                max_sim = sim
    return max_sim

def researchAreaScore(researchArea, tmp_department):
    """Score how well the professor's research area matches the student's
    department (weighted 37% of the total ranking).

    The Degree field (e.g. "PhD in Finance") currently stands in for the
    research area, so the text after "in " is used for matching.

    Args:
        researchArea: degree/research-area text, or None.
        tmp_department: the student's department/program text.

    Returns:
        float: similarity * 0.37, or 0 when researchArea is None.
    """
    weight2 = 0.37
    if researchArea is None:
        return 0
    # Take the part after "in "; fall back to the whole string when "in "
    # is absent instead of raising IndexError (the previous [1] crashed).
    parts = re.split("in ", researchArea)
    phrase2 = parts[1] if len(parts) > 1 else researchArea
    max_sim = calculateSimilar(tmp_department, phrase2)
    return max_sim * weight2

def evaluate_sentiment(comment):
    """Map a web comment about a professor to a goodness score in [0, 1].

    Uses VADER sentiment intensity: 0.5 is neutral, above 0.5 positive,
    below 0.5 negative.
    """
    # Lazily create and cache the analyzer on the function itself:
    # constructing SentimentIntensityAnalyzer reloads the VADER lexicon,
    # which the previous code repeated on every single call.
    sid = getattr(evaluate_sentiment, "_sid", None)
    if sid is None:
        sid = SentimentIntensityAnalyzer()
        evaluate_sentiment._sid = sid
    sentiment_scores = sid.polarity_scores(comment)
    good_bad_score = (sentiment_scores['pos'] - sentiment_scores['neg']) / 2 + 0.5
    # Clamp to [0, 1] to keep the downstream weighting well-behaved.
    return max(0, min(1, good_bad_score))

def webCommentScore(webComment):
    """Score the professor's web comments (weighted 16% of the total ranking).

    None means "no comment available" and scores 0; any string is run
    through the sentiment evaluator and scaled by the weight.
    """
    weight3 = 0.16
    if webComment is None:
        return 0
    sentiment = evaluate_sentiment(webComment)
    return sentiment * weight3

def educationScore(education):
    """Score the professor's academic background (weighted 7% of the ranking).

    The education field is not yet quantified: any non-None value receives
    the full weight, while None scores 0.
    """
    weight4 = 0.07
    if education is None:
        return 0
    return 1.0 * weight4

def calculateRank(user_degree, name, university, researchArea, webComment, education):
    """Sum the four weighted attribute scores into one ranking value."""
    total = universityScore(university)
    total += researchAreaScore(researchArea, user_degree)
    total += webCommentScore(webComment)
    total += educationScore(education)
    return total

def getData(user_degree):
    """Read the professor CSV and compute a ranking for each data row.

    Expected columns: row[0]=ID, row[1]=English Name, row[2]=Degree,
    row[3]=Education; the first row is a header and is skipped.

    Args:
        user_degree: the student's program/department text.

    Returns:
        dict: {"<ID>!<English Name>": ranking score}.
    """
    university = 1    # all professors are currently HKU, so use a fixed rating
    tmp_comment = ""  # no web comments available yet
    result = {}

    # newline='' is required by the csv module so quoted fields containing
    # newlines are parsed correctly (the previous open() omitted it).
    # NOTE(review): encoding is left platform-default to match the file as
    # produced on this machine — confirm and pin it if the CSV is UTF-8.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        for i, row in enumerate(reader):
            if i == 0:
                continue  # skip the header row
            rank = calculateRank(user_degree, row[1], university,
                                 row[2], tmp_comment, row[3])
            result[row[0] + "!" + row[1]] = rank

    return result

def write_csv_file(ranks):
    """Write the rankings to result.csv as (ID, English Name, ranking) rows.

    Args:
        ranks: dict whose keys have the form "<ID>!<English Name>" and whose
            values are ranking scores (rounded to 3 decimals on output).
    """
    # newline='' stops the csv module emitting blank lines on Windows.
    with open(save_path, 'w+', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['ID','English Name', 'ranking'])
        for key, rank in ranks.items():
            # partition splits on the FIRST "!" only, so a professor name
            # that itself contains "!" is no longer truncated (the previous
            # re.split(...)[1] kept just the segment before the second "!").
            keyId, _, name = key.partition("!")
            writer.writerow([keyId, name, round(rank, 3)])

def main(id, user_degree):
    """Compute rankings for one user and write them to result.csv."""
    result = getData(user_degree)
    print("已计算出结果")
    write_csv_file(result)
    print("已将" + "为ID:" + id + "的用户推荐的结果写入result.csv内!")

if __name__ == "__main__":
    # Debug entry point. Guarded so that `import calculateRank_csv`
    # (as done by consumer.py) no longer triggers a full ranking run
    # as an import side effect — previously main() ran unconditionally.
    main("1234", "Business Administration")
15 changes: 15 additions & 0 deletions recommendation/consumer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import calculateRank_csv
import resultSorted
import re
from kafka import KafkaConsumer

consumer = KafkaConsumer("test", bootstrap_servers='localhost:9092')

# Each Kafka message is a comma-separated record "ID,degree,n":
# compute rankings for every professor, then keep the top n.
for message in consumer:
    print(message)
    fields = re.split(",", message.value.decode())
    ID, degree, n = fields[0], fields[1], fields[2]
    calculateRank_csv.main(ID, degree)   # rank all professors for this user
    resultSorted.sort_csv_by_column(n)   # keep the n highest-ranked professors
12 changes: 12 additions & 0 deletions recommendation/consumer_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import calculateRank
import re
from kafka import KafkaConsumer

consumer = KafkaConsumer("test", bootstrap_servers='localhost:9092')

# Each Kafka message is a comma-separated record "ID,degree":
# run the database-backed ranking for that user.
for message in consumer:
    print(message)
    parts = re.split(",", message.value.decode())
    calculateRank.main(parts[0], parts[1])
17 changes: 17 additions & 0 deletions recommendation/producer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')  # connect to Kafka

# Collect the user's request interactively.
# (Academic status / faculty / major / research-interest prompts are
# reserved for a later version.)
user_id = input("What is your ID?")
program = input("What is your program?")
n = input("How many recommendations do you need to show you? Please fill in the integer.")

# Send one comma-separated record "id,program,n" to the `test` topic.
record = ",".join([user_id, program, n])
producer.send('test', record.encode('utf-8'))

producer.close()
16 changes: 16 additions & 0 deletions recommendation/producer_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')  # connect to Kafka

# Collect the user's request interactively.
# (Academic status / faculty / major / research-interest prompts are
# reserved for a later version.)
user_id = input("What is your ID?")
program = input("What is your program?")

# Send one comma-separated record "id,program" to the `test` topic.
record = ",".join([user_id, program])
producer.send('test', record.encode('utf-8'))

producer.close()
Loading