Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions recommendation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Two dependencies need to be set up before running the code below:<br>
1.nltk<br>
Run the following python code:

import nltk<br>
nltk.download('wordnet')<br>
nltk.download('punkt')<br>
nltk.download('vader_lexicon')

If the above code fails to run, you can try to manually download the relevant dataset and place it in the appropriate location.<br>
For example, after creating a folder named wordnet under the path (D:\nltk_data\corpora), you can download and unzip the wordnet dataset, and finally move the dataset to that folder.

2.KAFKA<br>
The specific process of building kafka in local windows environment is as follows:<br>
First, install the kafka-python package via "pip install kafka-python -i https://pypi.tuna.tsinghua.edu.cn/simple".<br>
Secondly, download kafka via "http://mirrors.cloud.tencent.com/apache/kafka/".<br>
Next, extract the Kafka archive into a folder. Keep the path short, otherwise startup errors can occur. Enter the folder and open a separate cmd window for each of the following commands; keep each window open after the command succeeds.<br>
(1) zookeeper-server-start.bat ..\..\config\zookeeper.properties <br>
(2) kafka-server-start.bat ..\..\config\server.properties # load configuration <br>
(3) kafka-console-producer.bat --broker-list 127.0.0.1:9092 --topic test # Start the producer <br>
(4) kafka-console-consumer.bat --bootstrap-server 127.0.0.1:9092 --topic test --from-beginning # Start the consumer on port 9092. <br>
Finally, run a couple of simple demos you can find on the web to check if the installation was successful.
166 changes: 166 additions & 0 deletions recommendation/calculateRank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import pymysql as mysql
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

#tmp_department="Business Administration"
# Placeholder: no web-comment data exists yet, so every professor gets the
# same empty comment (which scores the neutral 0.5 sentiment in webCommentScore).
tmp_comment=""

def mysqlConnect():
    """Open and return a connection to the local `findmyprof` MySQL database."""
    return mysql.connect(
        host='localhost',
        user='root',
        password='111',
        database='findmyprof',
    )

def universityScore(university):
    """Score the university attribute (weighted 40% of the total ranking).

    Args:
        university: Numeric university rating (int, float, or numeric string).

    Returns:
        float: rating * 0.4, or 0 when the rating is missing/zero/empty.
    """
    weight1 = 0.4
    if not university:
        # None, 0, or "" contributes nothing to the total score.
        return 0
    # Convert directly to float: the previous int() round-trip truncated
    # fractional ratings (e.g. 0.8 -> 0) and raised on strings like "0.8".
    return float(university) * weight1

def calculateSimilar(phrase1, phrase2):
    """Return the best WordNet path similarity between the two phrases.

    Tokenizes each phrase, gathers every synset of every token, and takes
    the maximum pairwise path similarity as a proxy for topical relatedness.

    Args:
        phrase1: e.g. the student's department/program text.
        phrase2: e.g. the professor's research-area text.

    Returns:
        float: best pairwise path similarity in [0, 1]; 0.0 when either
        phrase yields no synsets. (The previous -1 sentinel leaked out and
        made the weighted research-area score negative downstream.)
    """
    synsets1 = [s for tok in word_tokenize(phrase1) for s in wordnet.synsets(tok)]
    synsets2 = [s for tok in word_tokenize(phrase2) for s in wordnet.synsets(tok)]

    max_sim = 0.0  # floor at 0 so "no comparable synsets" cannot score negative
    for synset1 in synsets1:
        for synset2 in synsets2:
            sim = synset1.path_similarity(synset2)
            # path_similarity returns None for incomparable synset pairs.
            if sim is not None and sim > max_sim:
                max_sim = sim
    return max_sim

def researchAreaScore(researchArea, tmp_department):
    """Score how well the professor's research area matches the student's
    department (weighted 37% of the total ranking).

    The Degree field (e.g. "PhD in Finance") currently stands in for the
    research area, so the text after "in " is used for matching.

    Args:
        researchArea: degree/research-area text, or None.
        tmp_department: the student's department/program text.

    Returns:
        float: similarity * 0.37, or 0 when researchArea is None.
    """
    weight2 = 0.37
    if researchArea is None:
        return 0
    # Take the part after "in "; fall back to the whole string when "in "
    # is absent instead of raising IndexError (the previous [1] crashed).
    parts = re.split("in ", researchArea)
    phrase2 = parts[1] if len(parts) > 1 else researchArea
    max_sim = calculateSimilar(tmp_department, phrase2)
    return max_sim * weight2

def evaluate_sentiment(comment):
    """Map a web comment about a professor to a goodness score in [0, 1].

    Uses VADER sentiment intensity: 0.5 is neutral, above 0.5 positive,
    below 0.5 negative.
    """
    # Lazily create and cache the analyzer on the function itself:
    # constructing SentimentIntensityAnalyzer reloads the VADER lexicon,
    # which the previous code repeated on every single call.
    sid = getattr(evaluate_sentiment, "_sid", None)
    if sid is None:
        sid = SentimentIntensityAnalyzer()
        evaluate_sentiment._sid = sid
    sentiment_scores = sid.polarity_scores(comment)
    good_bad_score = (sentiment_scores['pos'] - sentiment_scores['neg']) / 2 + 0.5
    # Clamp to [0, 1] to keep the downstream weighting well-behaved.
    return max(0, min(1, good_bad_score))

def webCommentScore(webComment):
    """Score the professor's web comments (weighted 16% of the total ranking).

    Returns 0 when no comment is available; otherwise the sentiment goodness
    in [0, 1] scaled by the weight.
    """
    weight3 = 0.16
    if webComment is None:
        return 0
    return evaluate_sentiment(webComment) * weight3

def educationScore(education):
    """Score the professor's academic background (weighted 7% of the ranking).

    The education field is not yet quantified, so any non-None value gets
    the full weight and None scores 0.
    """
    weight4 = 0.07
    return 0 if education is None else 1.0 * weight4

def calculateRank(user_degree, name, university, researchArea, webComment, education):
    """Combine the four weighted attribute scores into a single ranking value."""
    return (
        universityScore(university)
        + researchAreaScore(researchArea, user_degree)
        + webCommentScore(webComment)
        + educationScore(education)
    )

def writeInDataBase(ranks):
    """Rebuild the `result` table and persist the {name: rank} mapping.

    Args:
        ranks: dict mapping professor English name -> ranking score (float).
    """
    mydb = mysqlConnect()
    try:
        mycursor = mydb.cursor()
        # IF EXISTS prevents a crash on the very first run, when the
        # result table has never been created.
        mycursor.execute("drop table if exists result;")
        mycursor.execute("create table result ( `English Name` VARCHAR(100),ranking FLOAT)")
        # The table is freshly created, so the old TRUNCATE step is unnecessary.
        sql = "insert into result values(%s,%s);"
        for name, rank in ranks.items():
            # Parameterized insert; round to 3 decimals for presentation.
            mycursor.execute(sql, (name, round(rank, 3)))
        # One commit for the whole batch instead of one per row.
        mydb.commit()
        mycursor.close()
    finally:
        # Release the connection even if an SQL statement fails.
        mydb.close()


def getData(user_degree):
    """Fetch professor rows from the `ust_web` table and rank each one.

    Args:
        user_degree: the student's program/department text.

    Returns:
        dict: {professor English name: ranking score}.
    """
    mydb = mysqlConnect()
    mycursor = mydb.cursor()

    sql1 = "select `English Name`,Degree,Education from ust_web limit 32;"
    mycursor.execute(sql1)
    rows = mycursor.fetchall()

    university = 1  # all professors are currently HKU, so use a fixed rating
    result = {}
    for name, degree, education in rows:
        result[name] = calculateRank(user_degree, name, university,
                                     degree, tmp_comment, education)

    mycursor.close()
    mydb.close()

    return result



def main(id, user_degree):
    """Compute rankings for one user and store them in the `result` table."""
    ranks = getData(user_degree)
    print("已计算出结果")
    writeInDataBase(ranks)
    print("已将" + "为ID:" + id + "的用户推荐的结果写入数据库result表内!")
139 changes: 139 additions & 0 deletions recommendation/calculateRank_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import csv
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# NOTE(review): absolute Windows paths — adjust per machine.
file_path='E:/FindMyProf/prof_data.csv'# path to the professor data CSV
save_path='E:/FindMyProf/result.csv'# path where the ranking result CSV is written

def universityScore(university):
    """Score the university attribute (weighted 40% of the total ranking).

    Args:
        university: Numeric university rating (int, float, or numeric string).

    Returns:
        float: rating * 0.4, or 0 when the rating is missing/zero/empty.
    """
    weight1 = 0.4
    if not university:
        # None, 0, or "" contributes nothing to the total score.
        return 0
    # Convert directly to float: the previous int() round-trip truncated
    # fractional ratings (e.g. 0.8 -> 0) and raised on strings like "0.8".
    return float(university) * weight1

def calculateSimilar(phrase1, phrase2):
    """Return the best WordNet path similarity between the two phrases.

    Tokenizes each phrase, gathers every synset of every token, and takes
    the maximum pairwise path similarity as a proxy for topical relatedness.

    Args:
        phrase1: e.g. the student's department/program text.
        phrase2: e.g. the professor's research-area text.

    Returns:
        float: best pairwise path similarity in [0, 1]; 0.0 when either
        phrase yields no synsets. (The previous -1 sentinel leaked out and
        made the weighted research-area score negative downstream.)
    """
    synsets1 = [s for tok in word_tokenize(phrase1) for s in wordnet.synsets(tok)]
    synsets2 = [s for tok in word_tokenize(phrase2) for s in wordnet.synsets(tok)]

    max_sim = 0.0  # floor at 0 so "no comparable synsets" cannot score negative
    for synset1 in synsets1:
        for synset2 in synsets2:
            sim = synset1.path_similarity(synset2)
            # path_similarity returns None for incomparable synset pairs.
            if sim is not None and sim > max_sim:
                max_sim = sim
    return max_sim

def researchAreaScore(researchArea, tmp_department):
    """Score how well the professor's research area matches the student's
    department (weighted 37% of the total ranking).

    The Degree field (e.g. "PhD in Finance") currently stands in for the
    research area, so the text after "in " is used for matching.

    Args:
        researchArea: degree/research-area text, or None.
        tmp_department: the student's department/program text.

    Returns:
        float: similarity * 0.37, or 0 when researchArea is None.
    """
    weight2 = 0.37
    if researchArea is None:
        return 0
    # Take the part after "in "; fall back to the whole string when "in "
    # is absent instead of raising IndexError (the previous [1] crashed).
    parts = re.split("in ", researchArea)
    phrase2 = parts[1] if len(parts) > 1 else researchArea
    max_sim = calculateSimilar(tmp_department, phrase2)
    return max_sim * weight2

def evaluate_sentiment(comment):
    """Map a web comment about a professor to a goodness score in [0, 1].

    Uses VADER sentiment intensity: 0.5 is neutral, above 0.5 positive,
    below 0.5 negative.
    """
    # Lazily create and cache the analyzer on the function itself:
    # constructing SentimentIntensityAnalyzer reloads the VADER lexicon,
    # which the previous code repeated on every single call.
    sid = getattr(evaluate_sentiment, "_sid", None)
    if sid is None:
        sid = SentimentIntensityAnalyzer()
        evaluate_sentiment._sid = sid
    sentiment_scores = sid.polarity_scores(comment)
    good_bad_score = (sentiment_scores['pos'] - sentiment_scores['neg']) / 2 + 0.5
    # Clamp to [0, 1] to keep the downstream weighting well-behaved.
    return max(0, min(1, good_bad_score))

def webCommentScore(webComment):
    """Score the professor's web comments (weighted 16% of the total ranking).

    None means "no comment available" and scores 0; any string is run
    through the sentiment evaluator and scaled by the weight.
    """
    weight3 = 0.16
    if webComment is None:
        return 0
    sentiment = evaluate_sentiment(webComment)
    return sentiment * weight3

def educationScore(education):
    """Score the professor's academic background (weighted 7% of the ranking).

    The education field is not yet quantified: any non-None value receives
    the full weight, while None scores 0.
    """
    weight4 = 0.07
    if education is None:
        return 0
    return 1.0 * weight4

def calculateRank(user_degree, name, university, researchArea, webComment, education):
    """Sum the four weighted attribute scores into one ranking value."""
    total = universityScore(university)
    total += researchAreaScore(researchArea, user_degree)
    total += webCommentScore(webComment)
    total += educationScore(education)
    return total

def getData(user_degree):
    """Read the professor CSV and compute a ranking for each data row.

    Expected columns: row[0]=ID, row[1]=English Name, row[2]=Degree,
    row[3]=Education; the first row is a header and is skipped.

    Args:
        user_degree: the student's program/department text.

    Returns:
        dict: {"<ID>!<English Name>": ranking score}.
    """
    university = 1    # all professors are currently HKU, so use a fixed rating
    tmp_comment = ""  # no web comments available yet
    result = {}

    # newline='' is required by the csv module so quoted fields containing
    # newlines are parsed correctly (the previous open() omitted it).
    # NOTE(review): encoding is left platform-default to match the file as
    # produced on this machine — confirm and pin it if the CSV is UTF-8.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        for i, row in enumerate(reader):
            if i == 0:
                continue  # skip the header row
            rank = calculateRank(user_degree, row[1], university,
                                 row[2], tmp_comment, row[3])
            result[row[0] + "!" + row[1]] = rank

    return result

def write_csv_file(ranks):
    """Write the rankings to result.csv as (ID, English Name, ranking) rows.

    Args:
        ranks: dict whose keys have the form "<ID>!<English Name>" and whose
            values are ranking scores (rounded to 3 decimals on output).
    """
    # newline='' stops the csv module emitting blank lines on Windows.
    with open(save_path, 'w+', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['ID','English Name', 'ranking'])
        for key, rank in ranks.items():
            # partition splits on the FIRST "!" only, so a professor name
            # that itself contains "!" is no longer truncated (the previous
            # re.split(...)[1] kept just the segment before the second "!").
            keyId, _, name = key.partition("!")
            writer.writerow([keyId, name, round(rank, 3)])

def main(id, user_degree):
    """Compute rankings for one user and write them to result.csv."""
    result = getData(user_degree)
    print("已计算出结果")
    write_csv_file(result)
    print("已将" + "为ID:" + id + "的用户推荐的结果写入result.csv内!")

if __name__ == "__main__":
    # Debug entry point. Guarded so that `import calculateRank_csv`
    # (as done by consumer.py) no longer triggers a full ranking run
    # as an import side effect — previously main() ran unconditionally.
    main("1234", "Business Administration")
15 changes: 15 additions & 0 deletions recommendation/consumer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import calculateRank_csv
import resultSorted
import re
from kafka import KafkaConsumer

consumer = KafkaConsumer("test", bootstrap_servers='localhost:9092')

# Each Kafka message is a comma-separated record "ID,degree,n":
# compute rankings for every professor, then keep the top n.
for message in consumer:
    print(message)
    fields = re.split(",", message.value.decode())
    ID, degree, n = fields[0], fields[1], fields[2]
    calculateRank_csv.main(ID, degree)   # rank all professors for this user
    resultSorted.sort_csv_by_column(n)   # keep the n highest-ranked professors
12 changes: 12 additions & 0 deletions recommendation/consumer_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import calculateRank
import re
from kafka import KafkaConsumer

consumer = KafkaConsumer("test", bootstrap_servers='localhost:9092')

# Each Kafka message is a comma-separated record "ID,degree":
# run the database-backed ranking for that user.
for message in consumer:
    print(message)
    parts = re.split(",", message.value.decode())
    calculateRank.main(parts[0], parts[1])
17 changes: 17 additions & 0 deletions recommendation/producer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')  # connect to Kafka

# Collect the user's request interactively.
# (Academic status / faculty / major / research-interest prompts are
# reserved for a later version.)
user_id = input("What is your ID?")
program = input("What is your program?")
n = input("How many recommendations do you need to show you? Please fill in the integer.")

# Send one comma-separated record "id,program,n" to the `test` topic.
record = ",".join([user_id, program, n])
producer.send('test', record.encode('utf-8'))

producer.close()
16 changes: 16 additions & 0 deletions recommendation/producer_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')  # connect to Kafka

# Collect the user's request interactively.
# (Academic status / faculty / major / research-interest prompts are
# reserved for a later version.)
user_id = input("What is your ID?")
program = input("What is your program?")

# Send one comma-separated record "id,program" to the `test` topic.
record = ",".join([user_id, program])
producer.send('test', record.encode('utf-8'))

producer.close()
Loading