diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 00000000..0a5775df
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,25 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+
+        {
+            "type": "firefox",
+            "request": "launch",
+            "reAttach": true,
+            "name": "Launch index.html",
+            "file": "${workspaceFolder}/index.html"
+        },
+        {
+            "type": "node",
+            "request": "launch",
+            "name": "Launch Program",
+            "skipFiles": [
+                "<node_internals>/**"
+            ],
+            "program": "${file}"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/projectEnv/__pycache__/app.cpython-311.pyc b/projectEnv/__pycache__/app.cpython-311.pyc
index 444f35ba..00c7783a 100644
Binary files a/projectEnv/__pycache__/app.cpython-311.pyc and b/projectEnv/__pycache__/app.cpython-311.pyc differ
diff --git a/projectEnv/__pycache__/search.cpython-310.pyc b/projectEnv/__pycache__/search.cpython-310.pyc
index 2b84d6a8..b4cd999c 100644
Binary files a/projectEnv/__pycache__/search.cpython-310.pyc and b/projectEnv/__pycache__/search.cpython-310.pyc differ
diff --git a/projectEnv/__pycache__/search.cpython-311.pyc b/projectEnv/__pycache__/search.cpython-311.pyc
index e55bb601..5fc0a4ed 100644
Binary files a/projectEnv/__pycache__/search.cpython-311.pyc and b/projectEnv/__pycache__/search.cpython-311.pyc differ
diff --git a/projectEnv/__pycache__/searchCode.cpython-311.pyc b/projectEnv/__pycache__/searchCode.cpython-311.pyc
index 6e10a2c2..8fea131e 100644
Binary files a/projectEnv/__pycache__/searchCode.cpython-311.pyc and b/projectEnv/__pycache__/searchCode.cpython-311.pyc differ
diff --git a/projectEnv/app.py b/projectEnv/app.py
index 53a14f6b..53b59b4d 100644
--- a/projectEnv/app.py
+++ b/projectEnv/app.py
@@ -68,9 +68,3 @@ def loading():
     session['form_data'] = request.form
 
     return render_template("loading.html", gifList=gifList)
-
-
-
-
-
-
diff --git a/projectEnv/databases/__pycache__/dataModels.cpython-311.pyc b/projectEnv/databases/__pycache__/dataModels.cpython-311.pyc
new file mode 100644
index 00000000..e11813df
Binary files /dev/null and b/projectEnv/databases/__pycache__/dataModels.cpython-311.pyc differ
diff --git a/projectEnv/instance/posts.db b/projectEnv/instance/posts.db
index df859b81..ae7a3349 100644
Binary files a/projectEnv/instance/posts.db and b/projectEnv/instance/posts.db differ
diff --git a/projectEnv/search.py b/projectEnv/search.py
deleted file mode 100644
index 83927794..00000000
--- a/projectEnv/search.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import re
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
-
-def sentiment_scores(comment_body):
-
-    # creates SentimentIntensityAnalyzer object
-    sid_obj = SentimentIntensityAnalyzer()
-    scores = sid_obj.polarity_scores(comment_body)
-    if scores['compound'] >= 0.05: #positive
-        return 1
-
-    elif scores['compound'] <= - 0.05: #negative
-        return -1
-
-    else:
-        return 0 #neutral
-
-
-
-
diff --git a/projectEnv/searchCode.py b/projectEnv/searchCode.py
index 2726cbe6..139188d5 100644
--- a/projectEnv/searchCode.py
+++ b/projectEnv/searchCode.py
@@ -40,19 +40,35 @@ def get_data_if_exist(p_key):
 def sortRating(s):
     return s.rating
 
-def getRating(comments):
+def getRating(prpost):
     rating = 0
-    comments.replace_more(limit=None) #replace all unloaded comment obj with loaded comments
-    for comment in comments:
-        rating += sentiment_scores(comment.body)
+    sentimentrating = 0
+    sentind = 0
+    prpost.comments.replace_more(limit=None) #replace all unloaded comment obj with loaded comments
+    # rating from the comment lin reg
+    baserate = linreg(prpost)
+
+    for comment in prpost.comments:
+        sentimentrating += sentiment_scores(comment.body)
+        sentind = sentind + 1
+
+    rating = baserate * ((sentimentrating / sentind) + 1) if sentind else baserate  # guard: a post with no comments keeps its base rate
+
+
+    # temp checks
+    print("base", baserate)
+    print("sentimentrating", sentimentrating / sentind if sentind else 0)
     return rating
+
+
+
 def addToPosts(p_data):
     #look for cached data
     cached= True
     rating = get_data_if_exist(p_data.id)
     if rating == False:
         print("getting ratings", p_data.id)
-        rating = getRating(p_data.comments)
+        rating = getRating(p_data)
         cached = False
         print("not cached")
     #add to processed post
@@ -67,9 +83,7 @@
 
 
 def searchItem(db,prompt):
-
-
-    #purge old processed_ids and posts
+    # purge old processed_ids and posts
 
     post_list.clear()
     total_post_list.clear()
@@ -98,4 +112,4 @@ def loadMore():
 
     post_list.sort(key=sortRating, reverse = True)#sort the post obj in order of rating
 
-    return post_list
\ No newline at end of file
+    return post_list
diff --git a/projectEnv/stattime.py b/projectEnv/stattime.py
new file mode 100644
index 00000000..36b1fed8
--- /dev/null
+++ b/projectEnv/stattime.py
@@ -0,0 +1,38 @@
+# attempting to explore the relationship between the average sentiment-based rating and the number of upvotes
+from searchCode import *
+import praw
+
+
+r = praw.Reddit(
+    client_id='jknOULmDh_Xkmi5xLSpl_A',
+    client_secret='5jOGVzfdgGJgRrxS7oPZAzaBZnndEA',
+    user_agent="smol man",
+)
+
+plist = []
+listyy = []
+prompt = "QC"
+
+for i in r.subreddit("FashionReps").search(query=prompt,
+                                           sort="relevance",
+                                           limit=100,
+                                           time_filter="year"):
+    plist.append((i.score, i.num_comments))
+    listyy.append(getRating(i))  # getRating() now takes the post itself, not its comment forest
+
+
+print(len(plist))
+# all the averaged values for 100 posts
+average_upvotes = sum(upvotes for upvotes, _ in plist) / len(plist)
+average_comments = sum(comments for _, comments in plist) / len(plist)
+average_score = sum(listyy) / len(listyy)
+
+
+
+print(f"average score is {average_score}")
+print(f"Average number of upvotes across {len(plist)} posts: {average_upvotes}")
+print(f"Average number of comments across {len(plist)} posts: {average_comments}")
+
+
+
+
diff --git a/projectEnv/testing.py b/projectEnv/testing.py
new file mode 100644
index 00000000..a0ae3c02
--- /dev/null
+++ b/projectEnv/testing.py
@@ -0,0 +1,39 @@
+import praw
+from praw.models import MoreComments
+from flask_sqlalchemy import SQLAlchemy
+from datetime import datetime
+from praw.models.util import stream_generator
+from searchCode import *
+
+
+
+def searchItems(prompt):
+
+    # purge old processed_ids and posts
+
+    post_list.clear()
+    total_post_list.clear()
+    #grabs posts based on prompt
+    for i in r.subreddit("FashionReps").search(query=prompt,
+                                               sort="relevance",
+                                               limit=24,
+                                               time_filter="year"):
+        total_post_list.append(i)
+    #add first 6 posts
+    print("looked for posts")
+    return total_post_list
+
+postslst = searchItems(prompt='bapesta QC')
+for sub in postslst[:6]:
+    print(linreg(sub))
+
+print(postslst[3].score)
+print(postslst[3].num_comments)
+
+rating = 0
+for comment in postslst[3].comments:
+    if isinstance(comment, MoreComments):  # skip unresolved "load more" stubs
+        continue
+    rating += sentiment_scores(comment.body)
+
+print(rating)
\ No newline at end of file
diff --git a/projectEnv/venv/lib/python3.8/site-packages/vaderSentiment/vaderSentiment.py b/projectEnv/venv/lib/python3.8/site-packages/vaderSentiment/vaderSentiment.py
index e831d9fe..35d9747d 100644
--- a/projectEnv/venv/lib/python3.8/site-packages/vaderSentiment/vaderSentiment.py
+++ b/projectEnv/venv/lib/python3.8/site-packages/vaderSentiment/vaderSentiment.py
@@ -684,4 +684,4 @@ def score_valence(self, sentiments, text):
                                                        str(vs['compound']), translator_name))
         print("----------------------------------------------------")
 
-    print("\n\n Demo Done!")
\ No newline at end of file
+    print("\n\n Demo Done!")
diff --git a/projectEnv/venv/lib/python3.8/site-packages/vaderSentiment/vaderSentimentAiden.py b/projectEnv/venv/lib/python3.8/site-packages/vaderSentiment/vaderSentimentAiden.py
new file mode 100644
index 00000000..35d9747d
--- /dev/null
+++ b/projectEnv/venv/lib/python3.8/site-packages/vaderSentiment/vaderSentimentAiden.py
@@ -0,0 +1,687 @@
+# coding: utf-8
+# Author: C.J. Hutto
+# Thanks to George Berry for reducing the time complexity from something like O(N^4) to O(N).
+# Thanks to Ewan Klein and Pierpaolo Pantone for bringing VADER into NLTK. Those modifications were awesome.
+# For license information, see LICENSE.TXT
+
+"""
+If you use the VADER sentiment analysis tools, please cite:
+Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
+Sentiment Analysis of Social Media Text. Eighth International Conference on
+Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
+"""
+import os
+import re
+import math
+import string
+import codecs
+import json
+from itertools import product
+from inspect import getsourcefile
+from io import open
+
+# ##Constants##
+
+# (empirically derived mean sentiment intensity rating increase for booster words)
+B_INCR = 0.293
+B_DECR = -0.293
+
+# (empirically derived mean sentiment intensity rating increase for using ALLCAPs to emphasize a word)
+C_INCR = 0.733
+N_SCALAR = -0.74
+
+NEGATE = \
+    ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
+     "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
+     "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
+     "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
+     "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
+     "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
+     "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
+     "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]
+
+# booster/dampener 'intensifiers' or 'degree adverbs'
+# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
+
+BOOSTER_DICT = \
+    {"absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR,
+     "completely": B_INCR, "considerable": B_INCR, "considerably": B_INCR,
+     "decidedly": B_INCR, "deeply": B_INCR, "effing": B_INCR, "enormous": B_INCR, "enormously": B_INCR,
+     "entirely": B_INCR, "especially": B_INCR, "exceptional": B_INCR, "exceptionally": B_INCR,
+     "extreme": B_INCR, "extremely": B_INCR,
+     "fabulously": B_INCR, "flipping": B_INCR, "flippin": B_INCR, "frackin": B_INCR, "fracking": B_INCR,
+     "fricking": B_INCR, "frickin": B_INCR, "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR,
+     "fuckin": B_INCR, "fucking": B_INCR, "fuggin": B_INCR, "fugging": B_INCR,
+     "greatly": B_INCR, "hella": B_INCR, "highly": B_INCR, "hugely": B_INCR,
+     "incredible": B_INCR, "incredibly": B_INCR, "intensely": B_INCR,
+     "major": B_INCR, "majorly": B_INCR, "more": B_INCR, "most": B_INCR, "particularly": B_INCR,
+     "purely": B_INCR, "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR,
+     "so": B_INCR, "substantially": B_INCR,
+     "thoroughly": B_INCR, "total": B_INCR, "totally": B_INCR, "tremendous": B_INCR, "tremendously": B_INCR,
+     "uber": B_INCR, "unbelievably": B_INCR, "unusually": B_INCR, "utter": B_INCR, "utterly": B_INCR,
+     "very": B_INCR,
+     "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR, "just enough": B_DECR,
+     "kind of": B_DECR, "kinda": B_DECR, "kindof": B_DECR, "kind-of": B_DECR,
+     "less": B_DECR, "little": B_DECR, "marginal": B_DECR, "marginally": B_DECR,
+     "occasional": B_DECR, "occasionally": B_DECR, "partly": B_DECR,
+     "scarce": B_DECR, "scarcely": B_DECR, "slight": B_DECR, "slightly": B_DECR, "somewhat": B_DECR,
+     "sort of": B_DECR, "sorta": B_DECR, "sortof": B_DECR, "sort-of": B_DECR}
+
+# check for sentiment laden idioms that do not contain lexicon words (future work, not yet implemented)
+SENTIMENT_LADEN_IDIOMS = {"cut the mustard": 2, "hand to mouth": -2,
+                          "back handed": -2, "blow smoke": -2, "blowing smoke": -2,
+                          "upper hand": 1, "break a leg": 2,
+                          "cooking with gas": 2, "in the black": 2, "in the red": -2,
+                          "on the ball": 2, "under the weather": -2}
+
+# check for special case idioms and phrases containing lexicon words
+SPECIAL_CASES = {"the shit": 3, "the bomb": 3, "bad ass": 1.5, "badass": 1.5, "bus stop": 0.0,
+                 "yeah right": -2, "kiss of death": -1.5, "to die for": 3, "beating heart": 3.5}
+
+
+# #Static methods# #
+
+def negated(input_words, include_nt=True):
+    """
+    Determine if input contains negation words
+    """
+    input_words = [str(w).lower() for w in input_words]
+    neg_words = []
+    neg_words.extend(NEGATE)
+    for word in neg_words:
+        if word in input_words:
+            return True
+    if include_nt:
+        for word in input_words:
+            if "n't" in word:
+                return True
+    '''if "least" in input_words:
+        i = input_words.index("least")
+        if i > 0 and input_words[i - 1] != "at":
+            return True'''
+    return False
+
+
+def normalize(score, alpha=15):
+    """
+    Normalize the score to be between -1 and 1 using an alpha that
+    approximates the max expected value
+    """
+    norm_score = score / math.sqrt((score * score) + alpha)
+    if norm_score < -1.0:
+        return -1.0
+    elif norm_score > 1.0:
+        return 1.0
+    else:
+        return norm_score
+
+
+def allcap_differential(words):
+    """
+    Check whether just some words in the input are ALL CAPS
+    :param list words: The words to inspect
+    :returns: `True` if some but not all items in `words` are ALL CAPS
+    """
+    is_different = False
+    allcap_words = 0
+    for word in words:
+        if word.isupper():
+            allcap_words += 1
+    cap_differential = len(words) - allcap_words
+    if 0 < cap_differential < len(words):
+        is_different = True
+    return is_different
+
+
+def scalar_inc_dec(word, valence, is_cap_diff):
+    """
+    Check if the preceding words increase, decrease, or negate/nullify the
+    valence
+    """
+    scalar = 0.0
+    word_lower = word.lower()
+    if word_lower in BOOSTER_DICT:
+        scalar = BOOSTER_DICT[word_lower]
+        if valence < 0:
+            scalar *= -1
+        # check if booster/dampener word is in ALLCAPS (while others aren't)
+        if word.isupper() and is_cap_diff:
+            if valence > 0:
+                scalar += C_INCR
+            else:
+                scalar -= C_INCR
+    return scalar
+
+
+class SentiText(object):
+    """
+    Identify sentiment-relevant string-level properties of input text.
+    """
+
+    def __init__(self, text):
+        if not isinstance(text, str):
+            text = str(text).encode('utf-8')
+        self.text = text
+        self.words_and_emoticons = self._words_and_emoticons()
+        # doesn't separate words from\
+        # adjacent punctuation (keeps emoticons & contractions)
+        self.is_cap_diff = allcap_differential(self.words_and_emoticons)
+
+    @staticmethod
+    def _strip_punc_if_word(token):
+        """
+        Removes all trailing and leading punctuation
+        If the resulting string has two or fewer characters,
+        then it was likely an emoticon, so return original string
+        (ie ":)" stripped would be "", so just return ":)"
+        """
+        stripped = token.strip(string.punctuation)
+        if len(stripped) <= 2:
+            return token
+        return stripped
+
+    def _words_and_emoticons(self):
+        """
+        Removes leading and trailing puncutation
+        Leaves contractions and most emoticons
+        Does not preserve punc-plus-letter emoticons (e.g. :D)
+        """
+        wes = self.text.split()
+        stripped = list(map(self._strip_punc_if_word, wes))
+        return stripped
+
+class SentimentIntensityAnalyzer(object):
+    """
+    Give a sentiment intensity score to sentences.
+    """
+
+    def __init__(self, lexicon_file="vader_lexicon.txt", emoji_lexicon="emoji_utf8_lexicon.txt"):
+        _this_module_file_path_ = os.path.abspath(getsourcefile(lambda: 0))
+        lexicon_full_filepath = os.path.join(os.path.dirname(_this_module_file_path_), lexicon_file)
+        with codecs.open(lexicon_full_filepath, encoding='utf-8') as f:
+            self.lexicon_full_filepath = f.read()
+        self.lexicon = self.make_lex_dict()
+
+        emoji_full_filepath = os.path.join(os.path.dirname(_this_module_file_path_), emoji_lexicon)
+        with codecs.open(emoji_full_filepath, encoding='utf-8') as f:
+            self.emoji_full_filepath = f.read()
+        self.emojis = self.make_emoji_dict()
+
+    def make_lex_dict(self):
+        """
+        Convert lexicon file to a dictionary
+        """
+        lex_dict = {}
+        for line in self.lexicon_full_filepath.rstrip('\n').split('\n'):
+            if not line:
+                continue
+            (word, measure) = line.strip().split('\t')[0:2]
+            lex_dict[word] = float(measure)
+        return lex_dict
+
+    def make_emoji_dict(self):
+        """
+        Convert emoji lexicon file to a dictionary
+        """
+        emoji_dict = {}
+        for line in self.emoji_full_filepath.rstrip('\n').split('\n'):
+            (emoji, description) = line.strip().split('\t')[0:2]
+            emoji_dict[emoji] = description
+        return emoji_dict
+
+    def polarity_scores(self, text):
+        """
+        Return a float for sentiment strength based on the input text.
+        Positive values are positive valence, negative value are negative
+        valence.
+        """
+        # convert emojis to their textual descriptions
+        text_no_emoji = ""
+        prev_space = True
+        for chr in text:
+            if chr in self.emojis:
+                # get the textual description
+                description = self.emojis[chr]
+                if not prev_space:
+                    text_no_emoji += ' '
+                text_no_emoji += description
+                prev_space = False
+            else:
+                text_no_emoji += chr
+                prev_space = chr == ' '
+        text = text_no_emoji.strip()
+
+        sentitext = SentiText(text)
+
+        sentiments = []
+        words_and_emoticons = sentitext.words_and_emoticons
+        for i, item in enumerate(words_and_emoticons):
+            valence = 0
+            # check for vader_lexicon words that may be used as modifiers or negations
+            if item.lower() in BOOSTER_DICT:
+                sentiments.append(valence)
+                continue
+            if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and
+                    words_and_emoticons[i + 1].lower() == "of"):
+                sentiments.append(valence)
+                continue
+
+            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)
+
+        sentiments = self._but_check(words_and_emoticons, sentiments)
+
+        valence_dict = self.score_valence(sentiments, text)
+
+        return valence_dict
+
+    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
+        is_cap_diff = sentitext.is_cap_diff
+        words_and_emoticons = sentitext.words_and_emoticons
+        item_lowercase = item.lower()
+        if item_lowercase in self.lexicon:
+            # get the sentiment valence
+            valence = self.lexicon[item_lowercase]
+
+            # check for "no" as negation for an adjacent lexicon item vs "no" as its own stand-alone lexicon item
+            if item_lowercase == "no" and i != len(words_and_emoticons)-1 and words_and_emoticons[i + 1].lower() in self.lexicon:
+                # don't use valence of "no" as a lexicon item. Instead set it's valence to 0.0 and negate the next item
+                valence = 0.0
+            if (i > 0 and words_and_emoticons[i - 1].lower() == "no") \
+               or (i > 1 and words_and_emoticons[i - 2].lower() == "no") \
+               or (i > 2 and words_and_emoticons[i - 3].lower() == "no" and words_and_emoticons[i - 1].lower() in ["or", "nor"] ):
+                valence = self.lexicon[item_lowercase] * N_SCALAR
+
+            # check if sentiment laden word is in ALL CAPS (while others aren't)
+            if item.isupper() and is_cap_diff:
+                if valence > 0:
+                    valence += C_INCR
+                else:
+                    valence -= C_INCR
+
+            for start_i in range(0, 3):
+                # dampen the scalar modifier of preceding words and emoticons
+                # (excluding the ones that immediately preceed the item) based
+                # on their distance from the current item.
+                if i > start_i and words_and_emoticons[i - (start_i + 1)].lower() not in self.lexicon:
+                    s = scalar_inc_dec(words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff)
+                    if start_i == 1 and s != 0:
+                        s = s * 0.95
+                    if start_i == 2 and s != 0:
+                        s = s * 0.9
+                    valence = valence + s
+                    valence = self._negation_check(valence, words_and_emoticons, start_i, i)
+                    if start_i == 2:
+                        valence = self._special_idioms_check(valence, words_and_emoticons, i)
+
+            valence = self._least_check(valence, words_and_emoticons, i)
+        sentiments.append(valence)
+        return sentiments
+
+    def _least_check(self, valence, words_and_emoticons, i):
+        # check for negation case using "least"
+        if i > 1 and words_and_emoticons[i - 1].lower() not in self.lexicon \
+                and words_and_emoticons[i - 1].lower() == "least":
+            if words_and_emoticons[i - 2].lower() != "at" and words_and_emoticons[i - 2].lower() != "very":
+                valence = valence * N_SCALAR
+        elif i > 0 and words_and_emoticons[i - 1].lower() not in self.lexicon \
+                and words_and_emoticons[i - 1].lower() == "least":
+            valence = valence * N_SCALAR
+        return valence
+
+    @staticmethod
+    def _but_check(words_and_emoticons, sentiments):
+        # check for modification in sentiment due to contrastive conjunction 'but'
+        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
+        if 'but' in words_and_emoticons_lower:
+            bi = words_and_emoticons_lower.index('but')
+            for sentiment in sentiments:
+                si = sentiments.index(sentiment)
+                if si < bi:
+                    sentiments.pop(si)
+                    sentiments.insert(si, sentiment * 0.5)
+                elif si > bi:
+                    sentiments.pop(si)
+                    sentiments.insert(si, sentiment * 1.5)
+        return sentiments
+
+    @staticmethod
+    def _special_idioms_check(valence, words_and_emoticons, i):
+        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
+        onezero = "{0} {1}".format(words_and_emoticons_lower[i - 1], words_and_emoticons_lower[i])
+
+        twoonezero = "{0} {1} {2}".format(words_and_emoticons_lower[i - 2],
+                                          words_and_emoticons_lower[i - 1], words_and_emoticons_lower[i])
+
+        twoone = "{0} {1}".format(words_and_emoticons_lower[i - 2], words_and_emoticons_lower[i - 1])
+
+        threetwoone = "{0} {1} {2}".format(words_and_emoticons_lower[i - 3],
+                                           words_and_emoticons_lower[i - 2], words_and_emoticons_lower[i - 1])
+
+        threetwo = "{0} {1}".format(words_and_emoticons_lower[i - 3], words_and_emoticons_lower[i - 2])
+
+        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
+
+        for seq in sequences:
+            if seq in SPECIAL_CASES:
+                valence = SPECIAL_CASES[seq]
+                break
+
+        if len(words_and_emoticons_lower) - 1 > i:
+            zeroone = "{0} {1}".format(words_and_emoticons_lower[i], words_and_emoticons_lower[i + 1])
+            if zeroone in SPECIAL_CASES:
+                valence = SPECIAL_CASES[zeroone]
+        if len(words_and_emoticons_lower) - 1 > i + 1:
+            zeroonetwo = "{0} {1} {2}".format(words_and_emoticons_lower[i], words_and_emoticons_lower[i + 1],
+                                              words_and_emoticons_lower[i + 2])
+            if zeroonetwo in SPECIAL_CASES:
+                valence = SPECIAL_CASES[zeroonetwo]
+
+        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
+        n_grams = [threetwoone, threetwo, twoone]
+        for n_gram in n_grams:
+            if n_gram in BOOSTER_DICT:
+                valence = valence + BOOSTER_DICT[n_gram]
+        return valence
+
+    @staticmethod
+    def _sentiment_laden_idioms_check(valence, senti_text_lower):
+        # Future Work
+        # check for sentiment laden idioms that don't contain a lexicon word
+        idioms_valences = []
+        for idiom in SENTIMENT_LADEN_IDIOMS:
+            if idiom in senti_text_lower:
+                print(idiom, senti_text_lower)
+                valence = SENTIMENT_LADEN_IDIOMS[idiom]
+                idioms_valences.append(valence)
+        if len(idioms_valences) > 0:
+            valence = sum(idioms_valences) / float(len(idioms_valences))
+        return valence
+
+    @staticmethod
+    def _negation_check(valence, words_and_emoticons, start_i, i):
+        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
+        if start_i == 0:
+            if negated([words_and_emoticons_lower[i - (start_i + 1)]]):  # 1 word preceding lexicon word (w/o stopwords)
+                valence = valence * N_SCALAR
+        if start_i == 1:
+            if words_and_emoticons_lower[i - 2] == "never" and \
+                    (words_and_emoticons_lower[i - 1] == "so" or
+                     words_and_emoticons_lower[i - 1] == "this"):
+                valence = valence * 1.25
+            elif words_and_emoticons_lower[i - 2] == "without" and \
+                    words_and_emoticons_lower[i - 1] == "doubt":
+                valence = valence
+            elif negated([words_and_emoticons_lower[i - (start_i + 1)]]):  # 2 words preceding the lexicon word position
+                valence = valence * N_SCALAR
+        if start_i == 2:
+            if words_and_emoticons_lower[i - 3] == "never" and \
+                    (words_and_emoticons_lower[i - 2] == "so" or words_and_emoticons_lower[i - 2] == "this") or \
+                    (words_and_emoticons_lower[i - 1] == "so" or words_and_emoticons_lower[i - 1] == "this"):
+                valence = valence * 1.25
+            elif words_and_emoticons_lower[i - 3] == "without" and \
+                    (words_and_emoticons_lower[i - 2] == "doubt" or words_and_emoticons_lower[i - 1] == "doubt"):
+                valence = valence
+            elif negated([words_and_emoticons_lower[i - (start_i + 1)]]):  # 3 words preceding the lexicon word position
+                valence = valence * N_SCALAR
+        return valence
+
+    def _punctuation_emphasis(self, text):
+        # add emphasis from exclamation points and question marks
+        ep_amplifier = self._amplify_ep(text)
+        qm_amplifier = self._amplify_qm(text)
+        punct_emph_amplifier = ep_amplifier + qm_amplifier
+        return punct_emph_amplifier
+
+    @staticmethod
+    def _amplify_ep(text):
+        # check for added emphasis resulting from exclamation points (up to 4 of them)
+        ep_count = text.count("!")
+        if ep_count > 4:
+            ep_count = 4
+        # (empirically derived mean sentiment intensity rating increase for
+        # exclamation points)
+        ep_amplifier = ep_count * 0.292
+        return ep_amplifier
+
+    @staticmethod
+    def _amplify_qm(text):
+        # check for added emphasis resulting from question marks (2 or 3+)
+        qm_count = text.count("?")
+        qm_amplifier = 0
+        if qm_count > 1:
+            if qm_count <= 3:
+                # (empirically derived mean sentiment intensity rating increase for
+                # question marks)
+                qm_amplifier = qm_count * 0.18
+            else:
+                qm_amplifier = 0.96
+        return qm_amplifier
+
+    @staticmethod
+    def _sift_sentiment_scores(sentiments):
+        # want separate positive versus negative sentiment scores
+        pos_sum = 0.0
+        neg_sum = 0.0
+        neu_count = 0
+        for sentiment_score in sentiments:
+            if sentiment_score > 0:
+                pos_sum += (float(sentiment_score) + 1)  # compensates for neutral words that are counted as 1
+            if sentiment_score < 0:
+                neg_sum += (float(sentiment_score) - 1)  # when used with math.fabs(), compensates for neutrals
+            if sentiment_score == 0:
+                neu_count += 1
+        return pos_sum, neg_sum, neu_count
+
+    def score_valence(self, sentiments, text):
+        if sentiments:
+            sum_s = float(sum(sentiments))
+            # compute and add emphasis from punctuation in text
+            punct_emph_amplifier = self._punctuation_emphasis(text)
+            if sum_s > 0:
+                sum_s += punct_emph_amplifier
+            elif sum_s < 0:
+                sum_s -= punct_emph_amplifier
+
+            compound = normalize(sum_s)
+            # discriminate between positive, negative and neutral sentiment scores
+            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
+
+            if pos_sum > math.fabs(neg_sum):
+                pos_sum += punct_emph_amplifier
+            elif pos_sum < math.fabs(neg_sum):
+                neg_sum -= punct_emph_amplifier
+
+            total = pos_sum + math.fabs(neg_sum) + neu_count
+            pos = math.fabs(pos_sum / total)
+            neg = math.fabs(neg_sum / total)
+            neu = math.fabs(neu_count / total)
+
+        else:
+            compound = 0.0
+            pos = 0.0
+            neg = 0.0
+            neu = 0.0
+
+        sentiment_dict = \
+            {"neg": round(neg, 3),
+             "neu": round(neu, 3),
+             "pos": round(pos, 3),
+             "compound": round(compound, 4)}
+
+        return sentiment_dict
+
+
+if __name__ == '__main__':
+    # --- examples -------
+    sentences = ["VADER is smart, handsome, and funny.",  # positive sentence example
+                 "VADER is smart, handsome, and funny!",
+                 # punctuation emphasis handled correctly (sentiment intensity adjusted)
+                 "VADER is very smart, handsome, and funny.",
+                 # booster words handled correctly (sentiment intensity adjusted)
+                 "VADER is VERY SMART, handsome, and FUNNY.",  # emphasis for ALLCAPS handled
+                 "VADER is VERY SMART, handsome, and FUNNY!!!",
+                 # combination of signals - VADER appropriately adjusts intensity
+                 "VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!",
+                 # booster words & punctuation make this close to ceiling for score
+                 "VADER is not smart, handsome, nor funny.",  # negation sentence example
+                 "The book was good.",  # positive sentence
+                 "At least it isn't a horrible book.",  # negated negative sentence with contraction
+                 "The book was only kind of good.",
+                 # qualified positive sentence is handled correctly (intensity adjusted)
+                 "The plot was good, but the characters are uncompelling and the dialog is not great.",
+                 # mixed negation sentence
+                 "Today SUX!",  # negative slang with capitalization emphasis
+                 "Today only kinda sux! But I'll get by, lol",
+                 # mixed sentiment example with slang and constrastive conjunction "but"
+                 "Make sure you :) or :D today!",  # emoticons handled
+                 "Catch utf-8 emoji such as 💘 and 💋 and 😁",  # emojis handled
+                 "Not bad at all"  # Capitalized negation
+                 ]
+
+    analyzer = SentimentIntensityAnalyzer()
+
+    print("----------------------------------------------------")
+    print(" - Analyze typical example cases, including handling of:")
+    print("  -- negations")
+    print("  -- punctuation emphasis & punctuation flooding")
+    print("  -- word-shape as emphasis (capitalization difference)")
+    print("  -- degree modifiers (intensifiers such as 'very' and dampeners such as 'kind of')")
+    print("  -- slang words as modifiers such as 'uber' or 'friggin' or 'kinda'")
+    print("  -- contrastive conjunction 'but' indicating a shift in sentiment; sentiment of later text is dominant")
+    print("  -- use of contractions as negations")
+    print("  -- sentiment laden emoticons such as :) and :D")
+    print("  -- utf-8 encoded emojis such as 💘 and 💋 and 😁")
+    print("  -- sentiment laden slang words (e.g., 'sux')")
+    print("  -- sentiment laden initialisms and acronyms (for example: 'lol') \n")
+    for sentence in sentences:
+        vs = analyzer.polarity_scores(sentence)
+        print("{:-<65} {}".format(sentence, str(vs)))
+    print("----------------------------------------------------")
+    print(" - About the scoring: ")
+    print("""  -- The 'compound' score is computed by summing the valence scores of each word in the lexicon, adjusted
+     according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive).
+     This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence.
+     Calling it a 'normalized, weighted composite score' is accurate.""")
+    print("""  -- The 'pos', 'neu', and 'neg' scores are ratios for proportions of text that fall in each category (so these
+     should all add up to be 1... or close to it with float operation).  These are the most useful metrics if
+     you want multidimensional measures of sentiment for a given sentence.""")
+    print("----------------------------------------------------")
+
+    # input("\nPress Enter to continue the demo...\n")  # for DEMO purposes...
+
+    tricky_sentences = ["Sentiment analysis has never been good.",
+                        "Sentiment analysis has never been this good!",
+                        "Most automated sentiment analysis tools are shit.",
+                        "With VADER, sentiment analysis is the shit!",
+                        "Other sentiment analysis tools can be quite bad.",
+                        "On the other hand, VADER is quite bad ass",
+                        "VADER is such a badass!",  # slang with punctuation emphasis
+                        "Without a doubt, excellent idea.",
+                        "Roger Dodger is one of the most compelling variations on this theme.",
+                        "Roger Dodger is at least compelling as a variation on the theme.",
+                        "Roger Dodger is one of the least compelling variations on this theme.",
+                        "Not such a badass after all.",  # Capitalized negation with slang
+                        "Without a doubt, an excellent idea."  # "without {any} doubt" as negation
+                        ]
+    print("----------------------------------------------------")
+    print(" - Analyze examples of tricky sentences that cause trouble to other sentiment analysis tools.")
+    print("  -- special case idioms - e.g., 'never good' vs 'never this good', or 'bad' vs 'bad ass'.")
+    print("  -- special uses of 'least' as negation versus comparison \n")
+    for sentence in tricky_sentences:
+        vs = analyzer.polarity_scores(sentence)
+        print("{:-<69} {}".format(sentence, str(vs)))
+    print("----------------------------------------------------")
+
+    # input("\nPress Enter to continue the demo...\n")  # for DEMO purposes...
+
+    print("----------------------------------------------------")
+    print(
+        " - VADER works best when analysis is done at the sentence level (but it can work on single words or entire novels).")
+    paragraph = "It was one of the worst movies I've seen, despite good reviews. Unbelievably bad acting!! Poor direction. VERY poor production. The movie was bad. Very bad movie. VERY BAD movie!"
+    print(" -- For example, given the following paragraph text from a hypothetical movie review:\n\t'{}'".format(
+        paragraph))
+    print(
+        " -- You could use NLTK to break the paragraph into sentence tokens for VADER, then average the results for the paragraph like this: \n")
+    # simple example to tokenize paragraph into sentences for VADER
+    from nltk import tokenize
+
+    sentence_list = tokenize.sent_tokenize(paragraph)
+    paragraphSentiments = 0.0
+    for sentence in sentence_list:
+        vs = analyzer.polarity_scores(sentence)
+        print("{:-<69} {}".format(sentence, str(vs["compound"])))
+        paragraphSentiments += vs["compound"]
+    print("AVERAGE SENTIMENT FOR PARAGRAPH: \t" + str(round(paragraphSentiments / len(sentence_list), 4)))
+    print("----------------------------------------------------")
+
+    # input("\nPress Enter to continue the demo...\n")  # for DEMO purposes...
+
+    print("----------------------------------------------------")
+    print(" - Analyze sentiment of IMAGES/VIDEO data based on annotation 'tags' or image labels. \n")
+    conceptList = ["balloons", "cake", "candles", "happy birthday", "friends", "laughing", "smiling", "party"]
+    conceptSentiments = 0.0
+    for concept in conceptList:
+        vs = analyzer.polarity_scores(concept)
+        print("{:-<15} {}".format(concept, str(vs['compound'])))
+        conceptSentiments += vs["compound"]
+    print("AVERAGE SENTIMENT OF TAGS/LABELS: \t" + str(round(conceptSentiments / len(conceptList), 4)))
+    print("\t")
+    conceptList = ["riot", "fire", "fight", "blood", "mob", "war", "police", "tear gas"]
+    conceptSentiments = 0.0
+    for concept in conceptList:
+        vs = analyzer.polarity_scores(concept)
+        print("{:-<15} {}".format(concept, str(vs['compound'])))
+        conceptSentiments += vs["compound"]
+    print("AVERAGE SENTIMENT OF TAGS/LABELS: \t" + str(round(conceptSentiments / len(conceptList), 4)))
+    print("----------------------------------------------------")
+
+    # input("\nPress Enter to continue the demo...")  # for DEMO purposes...
+
+    do_translate = input(
+        "\nWould you like to run VADER demo examples with NON-ENGLISH text? \n (Note: requires Internet access and uses the 'requests' library) \n Type 'y' or 'n', then press Enter: ")
+    if do_translate.lower().lstrip().__contains__("y"):
+        import requests
+        print("\n----------------------------------------------------")
+        print(" - Analyze sentiment of NON ENGLISH text...for example:")
+        print("  -- French, German, Spanish, Italian, Russian, Japanese, Arabic, Chinese(Simplified) , Chinese(Traditional)")
+        print("  -- many other languages supported. \n")
+        languages = ["English", "French", "German", "Spanish", "Italian", "Russian", "Japanese", "Arabic", "Chinese(Simplified)", "Chinese(Traditional)"]
+        language_codes = ["en", "fr", "de", "es", "it", "ru", "ja", "ar", "zh-CN", "zh-TW"]
+        nonEnglish_sentences = ["I'm surprised to see just how amazingly helpful VADER is!",
+                                "Je suis surpris de voir comment VADER est incroyablement utile !",
+                                "Ich bin überrascht zu sehen, nur wie erstaunlich nützlich VADER!",
+                                "Me sorprende ver sólo cómo increíblemente útil VADER!",
+                                "Sono sorpreso di vedere solo come incredibilmente utile VADER è!",
+                                "Я удивлен увидеть, как раз как удивительно полезно ВЕЙДЕРА!",
+                                "私はちょうどどのように驚くほど役に立つベイダーを見て驚いています!",
+                                "أنا مندهش لرؤية فقط كيف مثير للدهشة فيدر فائدة!",
+                                "我很惊讶地看到VADER是如此有用!",
+                                "我很驚訝地看到VADER是如此有用!"
+                                ]
+        for sentence in nonEnglish_sentences:
+            to_lang = "en"
+            from_lang = language_codes[nonEnglish_sentences.index(sentence)]
+            if (from_lang == "en") or (from_lang == "en-US"):
+                translation = sentence
+                translator_name = "No translation needed"
+            else:  # please note usage limits for My Memory Translation Service: http://mymemory.translated.net/doc/usagelimits.php
+                # using MY MEMORY NET http://mymemory.translated.net
+                api_url = "http://mymemory.translated.net/api/get?q={}&langpair={}|{}".format(sentence, from_lang,
+                                                                                              to_lang)
+                hdrs = {
+                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
+                    'Accept-Encoding': 'none',
+                    'Accept-Language': 'en-US,en;q=0.8',
+                    'Connection': 'keep-alive'}
+                response = requests.get(api_url, headers=hdrs)
+                response_json = json.loads(response.text)
+                translation = response_json["responseData"]["translatedText"]
+                translator_name = "MemoryNet Translation Service"
+            vs = analyzer.polarity_scores(translation)
+            print("- {: <8}: {: <69}\t {} ({})".format(languages[nonEnglish_sentences.index(sentence)], sentence,
+                                                       str(vs['compound']), translator_name))
+        print("----------------------------------------------------")
+
+    print("\n\n Demo Done!")
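Note on the new rating formula: the sketch below reproduces the arithmetic that getRating() in searchCode.py now performs, with stubbed inputs so it runs without PRAW or Reddit credentials. linreg_stub() is a hypothetical stand-in for the repo's linreg() (whose actual model is defined outside this diff), and the sentiment values imitate sentiment_scores(), which buckets VADER's compound score into -1, 0, or +1.

def linreg_stub(score, num_comments):
    # hypothetical stand-in for linreg(prpost): any base rating derived
    # from post metadata is enough to illustrate the scaling behaviour
    return 0.5 * score + 2.0 * num_comments

def rate(score, num_comments, comment_sentiments):
    baserate = linreg_stub(score, num_comments)
    sentimentrating = sum(comment_sentiments)  # each element is -1, 0, or +1
    sentind = len(comment_sentiments)
    # the average sentiment lies in [-1, 1], so the multiplier lies in [0, 2];
    # a comment-less post keeps its base rate (same guard as in searchCode.py)
    return baserate * ((sentimentrating / sentind) + 1) if sentind else baserate

if __name__ == "__main__":
    print(rate(120, 35, [1, 1, 0, -1, 1]))  # mostly positive comments boost the base rate: 182.0
    print(rate(120, 35, [-1, -1, 0]))       # mostly negative comments suppress it: ~43.3
    print(rate(120, 35, []))                # no comments leave the base rate unchanged: 130.0

Because sentiment_scores() only ever returns -1, 0, or +1, the comment sentiment can at most double or zero out the regression-based base rate, so the linreg() prediction stays the dominant signal in the final ranking.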