} # Verb\n",
+ "PP: {<P> <NP>} # PP -> P NP\n",
+ "VP: {<V> <NP|PP>*} # VP -> V (NP|PP)*''')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "constituency_output_per_sentence = []\n",
+ "for sent in pos_tags_per_sentence:\n",
+ " constituent_structure = constituent_parser.parse(sent)\n",
+ " constituency_output_per_sentence.append(constituent_structure)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[Tree('S', [Tree('NP', [('https', 'NN')]), (':', ':'), Tree('NP', [('//www.telegraph.co.uk/technology/apple/9702716/Apple-Samsung-lawsuit-six-more-products-under-scrutiny.html', 'JJ')]), ('Documents', 'NNS'), Tree('VP', [Tree('V', [('filed', 'VBN')])]), ('to', 'TO'), Tree('NP', [('the', 'DT')]), ('San', 'NNP'), ('Jose', 'NNP'), Tree('NP', [('federal', 'JJ'), ('court', 'NN')]), Tree('P', [('in', 'IN')]), ('California', 'NNP'), Tree('P', [('on', 'IN')]), ('November', 'NNP'), ('23', 'CD'), Tree('NP', [('list', 'NN')]), ('six', 'CD'), ('Samsung', 'NNP'), ('products', 'NNS'), Tree('VP', [Tree('V', [('running', 'VBG')]), Tree('NP', [('the', 'DT')])]), ('``', '``'), ('Jelly', 'RB'), ('Bean', 'NNP'), (\"''\", \"''\"), ('and', 'CC'), ('``', '``'), ('Ice', 'NNP'), ('Cream', 'NNP'), ('Sandwich', 'NNP'), (\"''\", \"''\"), Tree('VP', [Tree('V', [('operating', 'VBG')])]), ('systems', 'NNS'), (',', ','), ('which', 'WDT'), ('Apple', 'NNP'), Tree('VP', [Tree('V', [('claims', 'VBZ')])]), Tree('VP', [Tree('V', [('infringe', 'VB')])]), ('its', 'PRP$'), ('patents', 'NNS'), ('.', '.')]), Tree('S', [Tree('NP', [('The', 'DT')]), ('six', 'CD'), ('phones', 'NNS'), ('and', 'CC'), ('tablets', 'NNS'), Tree('VP', [Tree('V', [('affected', 'VBN')])]), Tree('VP', [Tree('V', [('are', 'VBP')]), Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), (',', ','), Tree('VP', [Tree('V', [('running', 'VBG')]), Tree('NP', [('the', 'DT'), ('new', 'JJ')])]), ('Jelly', 'NNP'), ('Bean', 'NNP'), Tree('NP', [('system', 'NN')]), (',', ','), Tree('NP', [('the', 'DT')]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('8.9', 'CD'), ('Wifi', 'NNP'), Tree('NP', [('tablet', 'NN')]), (',', ','), Tree('NP', [('the', 'DT')]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('2', 'CD'), ('10.1', 'CD'), (',', ','), ('Galaxy', 'NNP'), ('Rugby', 'NNP'), ('Pro', 'NNP'), ('and', 'CC'), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), Tree('NP', [('mini', 'NN')]), ('.', '.')]), Tree('S', [('Apple', 'NNP'), Tree('VP', 
[Tree('V', [('stated', 'VBD')])]), ('it', 'PRP'), Tree('VP', [Tree('V', [('had', 'VBD')])]), ('“', 'NNP'), Tree('VP', [Tree('V', [('acted', 'VBD')])]), ('quickly', 'RB'), ('and', 'CC'), ('diligently', 'RB'), (\"''\", \"''\"), Tree('PP', [Tree('P', [('in', 'IN')]), Tree('NP', [('order', 'NN')])]), ('to', 'TO'), ('``', '``'), Tree('VP', [Tree('V', [('determine', 'VB')]), Tree('PP', [Tree('P', [('that', 'IN')]), Tree('NP', [('these', 'DT')])])]), ('newly', 'RB'), Tree('VP', [Tree('V', [('released', 'VBN')])]), ('products', 'NNS'), Tree('VP', [Tree('V', [('do', 'VBP')])]), Tree('VP', [Tree('V', [('infringe', 'VB')]), Tree('NP', [('many', 'JJ')]), Tree('PP', [Tree('P', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('same', 'JJ')])])]), ('claims', 'NNS'), ('already', 'RB'), Tree('VP', [Tree('V', [('asserted', 'VBN')])]), Tree('P', [('by', 'IN')]), ('Apple', 'NNP'), ('.', '.'), (\"''\", \"''\")]), Tree('S', [Tree('P', [('In', 'IN')]), ('August', 'NNP'), (',', ','), ('Samsung', 'NNP'), Tree('VP', [Tree('V', [('lost', 'VBD')]), Tree('NP', [('a', 'DT')])]), ('US', 'NNP'), Tree('NP', [('patent', 'NN'), ('case', 'NN')]), ('to', 'TO'), ('Apple', 'NNP'), ('and', 'CC'), Tree('VP', [Tree('V', [('was', 'VBD')])]), Tree('VP', [Tree('V', [('ordered', 'VBN')])]), ('to', 'TO'), Tree('VP', [Tree('V', [('pay', 'VB')])]), ('its', 'PRP$'), Tree('NP', [('rival', 'JJ')]), ('$', '$'), ('1.05bn', 'CD'), ('(', '('), Tree('NP', [('£0.66bn', 'NN')]), (')', ')'), Tree('P', [('in', 'IN')]), ('damages', 'NNS'), Tree('P', [('for', 'IN')]), Tree('VP', [Tree('V', [('copying', 'VBG')])]), ('features', 'NNS'), Tree('PP', [Tree('P', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('iPad', 'NN')])]), ('and', 'CC'), Tree('NP', [('iPhone', 'NN')]), Tree('P', [('in', 'IN')]), ('its', 'PRP$'), ('Galaxy', 'NNP'), Tree('NP', [('range', 'NN')]), Tree('P', [('of', 'IN')]), ('devices', 'NNS'), ('.', '.')]), Tree('S', [('Samsung', 'NNP'), (',', ','), ('which', 'WDT'), Tree('VP', [Tree('V', [('is', 'VBZ')]), Tree('NP', 
[('the', 'DT'), ('world', 'NN')])]), (\"'s\", 'POS'), Tree('NP', [('top', 'JJ'), ('mobile', 'NN'), ('phone', 'NN'), ('maker', 'NN')]), (',', ','), Tree('VP', [Tree('V', [('is', 'VBZ')])]), Tree('VP', [Tree('V', [('appealing', 'VBG')]), Tree('NP', [('the', 'DT'), ('ruling', 'NN')])]), ('.', '.')]), Tree('S', [Tree('NP', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN')]), Tree('PP', [Tree('P', [('in', 'IN')]), Tree('NP', [('the', 'DT')])]), ('UK', 'NNP'), Tree('VP', [Tree('V', [('found', 'VBD')])]), Tree('P', [('in', 'IN')]), ('Samsung', 'NNP'), (\"'s\", 'POS'), Tree('NP', [('favour', 'NN')]), ('and', 'CC'), Tree('VP', [Tree('V', [('ordered', 'VBD')])]), ('Apple', 'NNP'), ('to', 'TO'), Tree('VP', [Tree('V', [('publish', 'VB')]), Tree('NP', [('an', 'DT'), ('apology', 'NN')])]), Tree('VP', [Tree('V', [('making', 'VBG')]), Tree('NP', [('clear', 'JJ')]), Tree('PP', [Tree('P', [('that', 'IN')]), Tree('NP', [('the', 'DT'), ('South', 'JJ'), ('Korean', 'JJ'), ('firm', 'NN')])])]), Tree('VP', [Tree('V', [('had', 'VBD')])]), ('not', 'RB'), Tree('VP', [Tree('V', [('copied', 'VBN')])]), ('its', 'PRP$'), Tree('NP', [('iPad', 'NN')]), ('when', 'WRB'), Tree('VP', [Tree('V', [('designing', 'VBG')])]), ('its', 'PRP$'), Tree('NP', [('own', 'JJ')]), ('devices', 'NNS'), ('.', '.')])]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(constituency_output_per_sentence)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Augment the RegexpParser so that it also detects Named Entity Phrases (NEP), e.g., that it detects *Galaxy S III* and *Ice Cream Sandwich*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "constituent_parser_v2 = nltk.RegexpParser('''\n",
+ "NP: {<DT>? <JJ>* <NN>*} # NP\n",
+ "P: {<IN>} # Preposition\n",
+ "V: {<V.*>} # Verb\n",
+ "PP: {<P> <NP>} # PP -> P NP\n",
+ "VP: {<V> <NP|PP>*} # VP -> V (NP|PP)*\n",
+ "NEP: {<NP>*} # More than a NP''')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "constituency_v2_output_per_sentence = []\n",
+ "for const in pos_tags_per_sentence:\n",
+ " structure = constituent_parser_v2.parse(const)\n",
+ " constituency_v2_output_per_sentence.append(structure)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[Tree('S', [Tree('NEP', [Tree('NP', [('https', 'NN')])]), (':', ':'), Tree('NEP', [Tree('NP', [('//www.telegraph.co.uk/technology/apple/9702716/Apple-Samsung-lawsuit-six-more-products-under-scrutiny.html', 'JJ')])]), ('Documents', 'NNS'), Tree('VP', [Tree('V', [('filed', 'VBN')])]), ('to', 'TO'), Tree('NEP', [Tree('NP', [('the', 'DT')])]), ('San', 'NNP'), ('Jose', 'NNP'), Tree('NEP', [Tree('NP', [('federal', 'JJ'), ('court', 'NN')])]), Tree('P', [('in', 'IN')]), ('California', 'NNP'), Tree('P', [('on', 'IN')]), ('November', 'NNP'), ('23', 'CD'), Tree('NEP', [Tree('NP', [('list', 'NN')])]), ('six', 'CD'), ('Samsung', 'NNP'), ('products', 'NNS'), Tree('VP', [Tree('V', [('running', 'VBG')]), Tree('NP', [('the', 'DT')])]), ('``', '``'), ('Jelly', 'RB'), ('Bean', 'NNP'), (\"''\", \"''\"), ('and', 'CC'), ('``', '``'), ('Ice', 'NNP'), ('Cream', 'NNP'), ('Sandwich', 'NNP'), (\"''\", \"''\"), Tree('VP', [Tree('V', [('operating', 'VBG')])]), ('systems', 'NNS'), (',', ','), ('which', 'WDT'), ('Apple', 'NNP'), Tree('VP', [Tree('V', [('claims', 'VBZ')])]), Tree('VP', [Tree('V', [('infringe', 'VB')])]), ('its', 'PRP$'), ('patents', 'NNS'), ('.', '.')]), Tree('S', [Tree('NEP', [Tree('NP', [('The', 'DT')])]), ('six', 'CD'), ('phones', 'NNS'), ('and', 'CC'), ('tablets', 'NNS'), Tree('VP', [Tree('V', [('affected', 'VBN')])]), Tree('VP', [Tree('V', [('are', 'VBP')]), Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), (',', ','), Tree('VP', [Tree('V', [('running', 'VBG')]), Tree('NP', [('the', 'DT'), ('new', 'JJ')])]), ('Jelly', 'NNP'), ('Bean', 'NNP'), Tree('NEP', [Tree('NP', [('system', 'NN')])]), (',', ','), Tree('NEP', [Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('8.9', 'CD'), ('Wifi', 'NNP'), Tree('NEP', [Tree('NP', [('tablet', 'NN')])]), (',', ','), Tree('NEP', [Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('2', 'CD'), ('10.1', 'CD'), (',', ','), ('Galaxy', 'NNP'), ('Rugby', 'NNP'), ('Pro', 'NNP'), 
('and', 'CC'), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), Tree('NEP', [Tree('NP', [('mini', 'NN')])]), ('.', '.')]), Tree('S', [('Apple', 'NNP'), Tree('VP', [Tree('V', [('stated', 'VBD')])]), ('it', 'PRP'), Tree('VP', [Tree('V', [('had', 'VBD')])]), ('“', 'NNP'), Tree('VP', [Tree('V', [('acted', 'VBD')])]), ('quickly', 'RB'), ('and', 'CC'), ('diligently', 'RB'), (\"''\", \"''\"), Tree('PP', [Tree('P', [('in', 'IN')]), Tree('NP', [('order', 'NN')])]), ('to', 'TO'), ('``', '``'), Tree('VP', [Tree('V', [('determine', 'VB')]), Tree('PP', [Tree('P', [('that', 'IN')]), Tree('NP', [('these', 'DT')])])]), ('newly', 'RB'), Tree('VP', [Tree('V', [('released', 'VBN')])]), ('products', 'NNS'), Tree('VP', [Tree('V', [('do', 'VBP')])]), Tree('VP', [Tree('V', [('infringe', 'VB')]), Tree('NP', [('many', 'JJ')]), Tree('PP', [Tree('P', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('same', 'JJ')])])]), ('claims', 'NNS'), ('already', 'RB'), Tree('VP', [Tree('V', [('asserted', 'VBN')])]), Tree('P', [('by', 'IN')]), ('Apple', 'NNP'), ('.', '.'), (\"''\", \"''\")]), Tree('S', [Tree('P', [('In', 'IN')]), ('August', 'NNP'), (',', ','), ('Samsung', 'NNP'), Tree('VP', [Tree('V', [('lost', 'VBD')]), Tree('NP', [('a', 'DT')])]), ('US', 'NNP'), Tree('NEP', [Tree('NP', [('patent', 'NN'), ('case', 'NN')])]), ('to', 'TO'), ('Apple', 'NNP'), ('and', 'CC'), Tree('VP', [Tree('V', [('was', 'VBD')])]), Tree('VP', [Tree('V', [('ordered', 'VBN')])]), ('to', 'TO'), Tree('VP', [Tree('V', [('pay', 'VB')])]), ('its', 'PRP$'), Tree('NEP', [Tree('NP', [('rival', 'JJ')])]), ('$', '$'), ('1.05bn', 'CD'), ('(', '('), Tree('NEP', [Tree('NP', [('£0.66bn', 'NN')])]), (')', ')'), Tree('P', [('in', 'IN')]), ('damages', 'NNS'), Tree('P', [('for', 'IN')]), Tree('VP', [Tree('V', [('copying', 'VBG')])]), ('features', 'NNS'), Tree('PP', [Tree('P', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('iPad', 'NN')])]), ('and', 'CC'), Tree('NEP', [Tree('NP', [('iPhone', 'NN')])]), Tree('P', [('in', 'IN')]), ('its', 'PRP$'), 
('Galaxy', 'NNP'), Tree('NEP', [Tree('NP', [('range', 'NN')])]), Tree('P', [('of', 'IN')]), ('devices', 'NNS'), ('.', '.')]), Tree('S', [('Samsung', 'NNP'), (',', ','), ('which', 'WDT'), Tree('VP', [Tree('V', [('is', 'VBZ')]), Tree('NP', [('the', 'DT'), ('world', 'NN')])]), (\"'s\", 'POS'), Tree('NEP', [Tree('NP', [('top', 'JJ'), ('mobile', 'NN'), ('phone', 'NN'), ('maker', 'NN')])]), (',', ','), Tree('VP', [Tree('V', [('is', 'VBZ')])]), Tree('VP', [Tree('V', [('appealing', 'VBG')]), Tree('NP', [('the', 'DT'), ('ruling', 'NN')])]), ('.', '.')]), Tree('S', [Tree('NEP', [Tree('NP', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN')])]), Tree('PP', [Tree('P', [('in', 'IN')]), Tree('NP', [('the', 'DT')])]), ('UK', 'NNP'), Tree('VP', [Tree('V', [('found', 'VBD')])]), Tree('P', [('in', 'IN')]), ('Samsung', 'NNP'), (\"'s\", 'POS'), Tree('NEP', [Tree('NP', [('favour', 'NN')])]), ('and', 'CC'), Tree('VP', [Tree('V', [('ordered', 'VBD')])]), ('Apple', 'NNP'), ('to', 'TO'), Tree('VP', [Tree('V', [('publish', 'VB')]), Tree('NP', [('an', 'DT'), ('apology', 'NN')])]), Tree('VP', [Tree('V', [('making', 'VBG')]), Tree('NP', [('clear', 'JJ')]), Tree('PP', [Tree('P', [('that', 'IN')]), Tree('NP', [('the', 'DT'), ('South', 'JJ'), ('Korean', 'JJ'), ('firm', 'NN')])])]), Tree('VP', [Tree('V', [('had', 'VBD')])]), ('not', 'RB'), Tree('VP', [Tree('V', [('copied', 'VBN')])]), ('its', 'PRP$'), Tree('NEP', [Tree('NP', [('iPad', 'NN')])]), ('when', 'WRB'), Tree('VP', [Tree('V', [('designing', 'VBG')])]), ('its', 'PRP$'), Tree('NEP', [Tree('NP', [('own', 'JJ')])]), ('devices', 'NNS'), ('.', '.')])]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(constituency_v2_output_per_sentence)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## [total points: 1] Exercise 2: spaCy\n",
+ "Use Spacy to process the same text as you analyzed with NLTK."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import spacy\n",
+ "nlp = spacy.load('en_core_web_sm')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'verb, gerund or present participle'"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "doc = nlp(text)\n",
+ "#finding tokens in text\n",
+ "tokens = []\n",
+ "for token in doc:\n",
+ " tokens.append(token)\n",
+ "\n",
+ " \n",
+ "#finding tags per each token\n",
+ "tags = []\n",
+ "for tag in tokens:\n",
+ " tagging = [tag, tag.tag_]\n",
+ " tags.append(tagging) \n",
+ "\n",
+ "\n",
+ "#ner recognition\n",
+ "ner = []\n",
+ "for entity in doc.ents:\n",
+ " tuples = [entity.text, entity.label_]\n",
+ " ner.append(tuples)\n",
+ "\n",
+ "\n",
+ "\n",
+ "#constituency parsing\n",
+ "cp = []\n",
+ "for token in doc:\n",
+ " cp.append(token.dep_)\n",
+ " #I comment displacy out because my laptop could not handle it\n",
+ " #displacy.render(doc, jupyter=True, style='dep')\n",
+ "spacy.explain('VBG') \n",
+ "\n",
+ "# insert code here"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "small tip: You can use **sents = list(doc.sents)** to be able to use the index to access a sentence like **sents[2]** for the third sentence.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## [total points: 7] Exercise 3: Comparison NLTK and spaCy\n",
+ "We will now compare the output of NLTK and spaCy, i.e., in what do they differ?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### [points: 3] Exercise 3a: Part of speech tagging\n",
+ "Compare the output from NLTK and spaCy regarding part of speech tagging.\n",
+ "\n",
+ "* To compare, you probably would like to compare sentence per sentence. Describe if the sentence splitting is different for NLTK than for spaCy. If not, where do they differ?\n",
+ "* After checking the sentence splitting, select a sentence for which you expect interesting results and perhaps differences. Motivate your choice.\n",
+ "* Compare the output in `token.tag` from spaCy to the part of speech tagging from NLTK for each token in your selected sentence. Are there any differences? This is not a trick question; it is possible that there are no differences."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Exercise 3a - Answers\n",
+ "\n",
+ "1) It looks like sentence splitting with NLTK and spaCy differs in the method used to split. With NLTK, all the tokens end up with quotation marks around them, whereas spaCy's do not. This is presumably the case because NLTK uses the split function and spaCy treats the tokens as objects.\n",
+ "\n",
+ "2) \"A similar case in the UK found in Samsung's favour and ordered Apple to publish an apology making clear that the South Korean firm had not copied its iPad when designing its own devices.\" We think this sentence would have some interesting outcomes because there are a couple of parts of this sentence that are ambiguous, for example \" 's \" could be the contraction of \"is\" as well as a possessive marker, or \"ordered\" could be either \"to command\" or ordered in the sense of ordering something from a restaurant.\n",
+ "\n",
+ "3) There was only one difference found in this sentence, which is that NLTK says iPad is a singular noun (NN) while spaCy says it is a singular proper noun (NNP).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### [points: 2] Exercise 3b: Named Entity Recognition (NER)\n",
+ "* Describe differences between the output from NLTK and spaCy for Named Entity Recognition. Which one do you think performs better?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Exercise 3b - Answers\n",
+ "\n",
+ "The output of NLTK is in the form of trees instead of the individual words as in spaCy. This could potentially provide more information as it shows the entity that is most dominant in each row, while showing the entities of each word in said row. spaCy only shows the entity and the word it is referring to, meaning that with spaCy it's more likely that you would need to compare with the original sentences to make more sense of the output. However, spaCy's output being a list of lists instead of a list of trees makes it easier to work with and manipulate compared to the tree structure of NLTK.\n",
+ "\n",
+ "Another difference is that NLTK shows the entity of each word in each sentence while spaCy only shows a few entity types. For example, NLTK shows \"A\" is a determiner, while that cannot be found in the spaCy output. However, spaCy performs better in the identification. For example, spaCy recognizes \"November 23\" as a Date entity, while NLTK splits November and 23 into two separate words."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### [points: 2] Exercise 3c: Constituency/dependency parsing\n",
+ "Choose one sentence from the text and run constituency parsing using NLTK and dependency parsing using spaCy.\n",
+ "* describe briefly the difference between constituency parsing and dependency parsing\n",
+ "* describe differences between the output from NLTK and spaCy."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Exercise 3c - Answers\n",
+ "\n",
+ "The main differences between constituency and dependency parsing is that constituency parsers break down texts into sub-phrases and dependency parsers connect words according to their relationships."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Constituency sentence\n",
+ "[Tree('S', [Tree('NEP', [Tree('NP', [('The', 'DT')])]), ('six', 'CD'), ('phones', 'NNS'), ('and', 'CC'), ('tablets', 'NNS'), Tree('VP', [Tree('V', [('affected', 'VBN')])]), Tree('VP', [Tree('V', [('are', 'VBP')]), Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), (',', ','), Tree('VP', [Tree('V', [('running', 'VBG')]), Tree('NP', [('the', 'DT'), ('new', 'JJ')])]), ('Jelly', 'NNP'), ('Bean', 'NNP'), Tree('NEP', [Tree('NP', [('system', 'NN')])]), (',', ','), Tree('NEP', [Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('8.9', 'CD'), ('Wifi', 'NNP'), Tree('NEP', [Tree('NP', [('tablet', 'NN')])]), (',', ','), Tree('NEP', [Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('2', 'CD'), ('10.1', 'CD'), (',', ','), ('Galaxy', 'NNP'), ('Rugby', 'NNP'), ('Pro', 'NNP'), ('and', 'CC'), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), Tree('NEP', [Tree('NP', [('mini', 'NN')])]), ('.', '.')])]\n",
+ "\n",
+ "\n",
+ "Dependency sentence\n",
+ "[[The, 'DT'], [six, 'CD'], [phones, 'NNS'], [and, 'CC'], [tablets, 'NNS'], [affected, 'VBN'], [are, 'VBP'], [the, 'DT'], [Galaxy, 'NNP'], [S, 'NNP'], [III, 'NNP'], [,, ','], [running, 'VBG'], [the, 'DT'], [new, 'JJ'], [Jelly, 'NNP'], [Bean, 'NNP'], [system, 'NN'], [,, ','], [the, 'DT'], [Galaxy, 'NNP'], [Tab, 'NNP'], [8.9, 'CD'], [Wifi, 'NNP'], [tablet, 'NN'], [,, ','], [the, 'DT'], [Galaxy, 'NNP'], [Tab, 'NNP'], [2, 'CD'], [10.1, 'CD'], [,, ','], [Galaxy, 'NNP'], [Rugby, 'NNP'], [Pro, 'NNP'], [and, 'CC'], [Galaxy, 'NNP'], [S, 'NNP'], [III, 'NNP'], [mini, 'NN'], [., '.']]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Constituency sentence\")\n",
+ "print(constituency_v2_output_per_sentence[1:2])\n",
+ "print(\"\\n\")\n",
+ "print(\"Dependency sentence\")\n",
+ "print(tags[42:83])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Even though both are very similar, outputting almost the same result, we can see some differences, for example, the way that spaCy handles punctuation."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# End of this notebook"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/lab_sessions/lab1/Lab1-assignment-toolkits.ipynb b/lab_sessions/lab1/Lab1-assignment-toolkits.ipynb
deleted file mode 100644
index 9207bf59..00000000
--- a/lab_sessions/lab1/Lab1-assignment-toolkits.ipynb
+++ /dev/null
@@ -1,378 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Lab1-Assignment\n",
- "\n",
- "Copyright: Vrije Universiteit Amsterdam, Faculty of Humanities, CLTL\n",
- "\n",
- "This notebook describes the assignment for Lab 1 of the text mining course. \n",
- "\n",
- "**Points**: each exercise is prefixed with the number of points you can obtain for the exercise.\n",
- "\n",
- "We assume you have worked through the following notebooks:\n",
- "* **Lab1.1-introduction**\n",
- "* **Lab1.2-introduction-to-NLTK**\n",
- "* **Lab1.3-introduction-to-spaCy** \n",
- "\n",
- "In this assignment, you will process an English text (**Lab1-apple-samsung-example.txt**) with both NLTK and spaCy and discuss the similarities and differences."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Credits\n",
- "The notebooks in this block have been originally created by [Marten Postma](https://martenpostma.github.io). Adaptations were made by [Filip Ilievski](http://ilievski.nl)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Tip: how to read a file from disk\n",
- "Let's open the file **Lab1-apple-samsung-example.txt** from disk."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pathlib import Path"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "cur_dir = Path().resolve() # this should provide you with the folder in which this notebook is placed\n",
- "path_to_file = Path.joinpath(cur_dir, 'Lab1-apple-samsung-example.txt')\n",
- "print(path_to_file)\n",
- "print('does path exist? ->', Path.exists(path_to_file))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "If the output from the code cell above states that **does path exist? -> False**, please check that the file **Lab1-apple-samsung-example.txt** is in the same directory as this notebook."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open(path_to_file) as infile:\n",
- " text = infile.read()\n",
- "\n",
- "print('number of characters', len(text))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## [total points: 4] Exercise 1: NLTK\n",
- "In this exercise, we use NLTK to apply **Part-of-speech (POS) tagging**, **Named Entity Recognition (NER)**, and **Constituency parsing**. The following code snippet already performs sentence splitting and tokenization. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import nltk\n",
- "from nltk.tokenize import sent_tokenize\n",
- "from nltk import word_tokenize"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentences_nltk = sent_tokenize(text)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tokens_per_sentence = []\n",
- "for sentence_nltk in sentences_nltk:\n",
- " sent_tokens = word_tokenize(sentence_nltk)\n",
- " tokens_per_sentence.append(sent_tokens)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We will use lists to keep track of the output of the NLP tasks. We can hence inspect the output for each task using the index of the sentence."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sent_id = 1\n",
- "print('SENTENCE', sentences_nltk[sent_id])\n",
- "print('TOKENS', tokens_per_sentence[sent_id])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### [point: 1] Exercise 1a: Part-of-speech (POS) tagging\n",
- "Use `nltk.pos_tag` to perform part-of-speech tagging on each sentence.\n",
- "\n",
- "Use `print` to **show** the output in the notebook (and hence also in the exported PDF!)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "pos_tags_per_sentence = []\n",
- "for tokens in tokens_per_sentence:\n",
- " print()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(pos_tags_per_sentence)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### [point: 1] Exercise 1b: Named Entity Recognition (NER)\n",
- "Use `nltk.chunk.ne_chunk` to perform Named Entity Recognition (NER) on each sentence.\n",
- "\n",
- "Use `print` to **show** the output in the notebook (and hence also in the exported PDF!)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ner_tags_per_sentence = []"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(ner_tags_per_sentence)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### [points: 2] Exercise 1c: Constituency parsing\n",
- "Use the `nltk.RegexpParser` to perform constituency parsing on each sentence.\n",
- "\n",
- "Use `print` to **show** the output in the notebook (and hence also in the exported PDF!)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "constituent_parser = nltk.RegexpParser('''\n",
- "NP: {? * *} # NP\n",
- "P: {} # Preposition\n",
- "V: {} # Verb\n",
- "PP: { } # PP -> P NP\n",
- "VP: { *} # VP -> V (NP|PP)*''')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "constituency_output_per_sentence = []"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(constituency_output_per_sentence)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Augment the RegexpParser so that it also detects Named Entity Phrases (NEP), e.g., that it detects *Galaxy S III* and *Ice Cream Sandwich*"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "constituent_parser_v2 = nltk.RegexpParser('''\n",
- "NP: {? * *} # NP\n",
- "P: {} # Preposition\n",
- "V: {} # Verb\n",
- "PP: { } # PP -> P NP\n",
- "VP: { *} # VP -> V (NP|PP)*\n",
- "NEP: {} # ???''')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "constituency_v2_output_per_sentence = []"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(constituency_v2_output_per_sentence)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## [total points: 1] Exercise 2: spaCy\n",
- "Use Spacy to process the same text as you analyzed with NLTK."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import spacy\n",
- "nlp = spacy.load('en_core_web_sm')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "doc = nlp(text) # insert code here"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "small tip: You can use **sents = list(doc.sents)** to be able to use the index to access a sentence like **sents[2]** for the third sentence.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## [total points: 7] Exercise 3: Comparison NLTK and spaCy\n",
- "We will now compare the output of NLTK and spaCy, i.e., in what do they differ?"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### [points: 3] Exercise 3a: Part of speech tagging\n",
- "Compare the output from NLTK and spaCy regarding part of speech tagging.\n",
- "\n",
- "* To compare, you probably would like to compare sentence per sentence. Describe if the sentence splitting is different for NLTK than for spaCy. If not, where do they differ?\n",
- "* After checking the sentence splitting, select a sentence for which you expect interesting results and perhaps differences. Motivate your choice.\n",
- "* Compare the output in `token.tag` from spaCy to the part of speech tagging from NLTK for each token in your selected sentence. Are there any differences? This is not a trick question; it is possible that there are no differences."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### [points: 2] Exercise 3b: Named Entity Recognition (NER)\n",
- "* Describe differences between the output from NLTK and spaCy for Named Entity Recognition. Which one do you think performs better?"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### [points: 2] Exercise 3c: Constituency/dependency parsing\n",
- "Choose one sentence from the text and run constituency parsing using NLTK and dependency parsing using spaCy.\n",
- "* describe briefly the difference between constituency parsing and dependency parsing\n",
- "* describe differences between the output from NLTK and spaCy."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# End of this notebook"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/lab_sessions/lab1/Lab1-assignment-toolkits.ipynb.zip b/lab_sessions/lab1/Lab1-assignment-toolkits.ipynb.zip
new file mode 100644
index 00000000..f5427ebb
Binary files /dev/null and b/lab_sessions/lab1/Lab1-assignment-toolkits.ipynb.zip differ
diff --git a/lab_sessions/lab1/Lab1-assignment-toolkits.ipynb/Lab1-assignment-toolkits.ipynb b/lab_sessions/lab1/Lab1-assignment-toolkits.ipynb/Lab1-assignment-toolkits.ipynb
new file mode 100644
index 00000000..7d719b61
--- /dev/null
+++ b/lab_sessions/lab1/Lab1-assignment-toolkits.ipynb/Lab1-assignment-toolkits.ipynb
@@ -0,0 +1,536 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Lab1-Assignment\n",
+ "\n",
+ "Copyright: Vrije Universiteit Amsterdam, Faculty of Humanities, CLTL\n",
+ "\n",
+ "This notebook describes the assignment for Lab 1 of the text mining course. \n",
+ "\n",
+ "**Points**: each exercise is prefixed with the number of points you can obtain for the exercise.\n",
+ "\n",
+ "We assume you have worked through the following notebooks:\n",
+ "* **Lab1.1-introduction**\n",
+ "* **Lab1.2-introduction-to-NLTK**\n",
+ "* **Lab1.3-introduction-to-spaCy** \n",
+ "\n",
+ "In this assignment, you will process an English text (**Lab1-apple-samsung-example.txt**) with both NLTK and spaCy and discuss the similarities and differences."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Credits\n",
+ "The notebooks in this block have been originally created by [Marten Postma](https://martenpostma.github.io). Adaptations were made by [Filip Ilievski](http://ilievski.nl)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Tip: how to read a file from disk\n",
+ "Let's open the file **Lab1-apple-samsung-example.txt** from disk."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pathlib import Path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\zahra_6hcxkfv\\ba-text-mining\\lab_sessions\\lab1\\Lab1-apple-samsung-example.txt\n",
+ "does path exist? -> True\n"
+ ]
+ }
+ ],
+ "source": [
+ "cur_dir = Path().resolve() # this should provide you with the folder in which this notebook is placed\n",
+ "path_to_file = Path.joinpath(cur_dir, 'Lab1-apple-samsung-example.txt')\n",
+ "print(path_to_file)\n",
+ "print('does path exist? ->', Path.exists(path_to_file))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If the output from the code cell above states that **does path exist? -> False**, please check that the file **Lab1-apple-samsung-example.txt** is in the same directory as this notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "number of characters 1139\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open(path_to_file) as infile:\n",
+ " text = infile.read()\n",
+ "\n",
+ "print('number of characters', len(text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## [total points: 4] Exercise 1: NLTK\n",
+ "In this exercise, we use NLTK to apply **Part-of-speech (POS) tagging**, **Named Entity Recognition (NER)**, and **Constituency parsing**. The following code snippet already performs sentence splitting and tokenization. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import nltk\n",
+ "from nltk.tokenize import sent_tokenize\n",
+ "from nltk import word_tokenize"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sentences_nltk = sent_tokenize(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokens_per_sentence = []\n",
+ "for sentence_nltk in sentences_nltk:\n",
+ " sent_tokens = word_tokenize(sentence_nltk)\n",
+ " tokens_per_sentence.append(sent_tokens)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will use lists to keep track of the output of the NLP tasks. We can hence inspect the output for each task using the index of the sentence."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SENTENCE The six phones and tablets affected are the Galaxy S III, running the new Jelly Bean system, the Galaxy Tab 8.9 Wifi tablet, the Galaxy Tab 2 10.1, Galaxy Rugby Pro and Galaxy S III mini.\n",
+ "TOKENS ['The', 'six', 'phones', 'and', 'tablets', 'affected', 'are', 'the', 'Galaxy', 'S', 'III', ',', 'running', 'the', 'new', 'Jelly', 'Bean', 'system', ',', 'the', 'Galaxy', 'Tab', '8.9', 'Wifi', 'tablet', ',', 'the', 'Galaxy', 'Tab', '2', '10.1', ',', 'Galaxy', 'Rugby', 'Pro', 'and', 'Galaxy', 'S', 'III', 'mini', '.']\n"
+ ]
+ }
+ ],
+ "source": [
+ "sent_id = 1\n",
+ "print('SENTENCE', sentences_nltk[sent_id])\n",
+ "print('TOKENS', tokens_per_sentence[sent_id])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### [point: 1] Exercise 1a: Part-of-speech (POS) tagging\n",
+ "Use `nltk.pos_tag` to perform part-of-speech tagging on each sentence.\n",
+ "\n",
+ "Use `print` to **show** the output in the notebook (and hence also in the exported PDF!)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pos_tags_per_sentence = []\n",
+ "for tokens in tokens_per_sentence:\n",
+ " pos_tags_per_sentence.append(nltk.pos_tag(tokens))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[('https', 'NN'), (':', ':'), ('//www.telegraph.co.uk/technology/apple/9702716/Apple-Samsung-lawsuit-six-more-products-under-scrutiny.html', 'JJ'), ('Documents', 'NNS'), ('filed', 'VBN'), ('to', 'TO'), ('the', 'DT'), ('San', 'NNP'), ('Jose', 'NNP'), ('federal', 'JJ'), ('court', 'NN'), ('in', 'IN'), ('California', 'NNP'), ('on', 'IN'), ('November', 'NNP'), ('23', 'CD'), ('list', 'NN'), ('six', 'CD'), ('Samsung', 'NNP'), ('products', 'NNS'), ('running', 'VBG'), ('the', 'DT'), ('``', '``'), ('Jelly', 'RB'), ('Bean', 'NNP'), (\"''\", \"''\"), ('and', 'CC'), ('``', '``'), ('Ice', 'NNP'), ('Cream', 'NNP'), ('Sandwich', 'NNP'), (\"''\", \"''\"), ('operating', 'VBG'), ('systems', 'NNS'), (',', ','), ('which', 'WDT'), ('Apple', 'NNP'), ('claims', 'VBZ'), ('infringe', 'VB'), ('its', 'PRP$'), ('patents', 'NNS'), ('.', '.')], [('The', 'DT'), ('six', 'CD'), ('phones', 'NNS'), ('and', 'CC'), ('tablets', 'NNS'), ('affected', 'VBN'), ('are', 'VBP'), ('the', 'DT'), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), (',', ','), ('running', 'VBG'), ('the', 'DT'), ('new', 'JJ'), ('Jelly', 'NNP'), ('Bean', 'NNP'), ('system', 'NN'), (',', ','), ('the', 'DT'), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('8.9', 'CD'), ('Wifi', 'NNP'), ('tablet', 'NN'), (',', ','), ('the', 'DT'), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('2', 'CD'), ('10.1', 'CD'), (',', ','), ('Galaxy', 'NNP'), ('Rugby', 'NNP'), ('Pro', 'NNP'), ('and', 'CC'), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), ('mini', 'NN'), ('.', '.')], [('Apple', 'NNP'), ('stated', 'VBD'), ('it', 'PRP'), ('had', 'VBD'), ('“', 'NNP'), ('acted', 'VBD'), ('quickly', 'RB'), ('and', 'CC'), ('diligently', 'RB'), (\"''\", \"''\"), ('in', 'IN'), ('order', 'NN'), ('to', 'TO'), ('``', '``'), ('determine', 'VB'), ('that', 'IN'), ('these', 'DT'), ('newly', 'RB'), ('released', 'VBN'), ('products', 'NNS'), ('do', 'VBP'), ('infringe', 'VB'), ('many', 'JJ'), ('of', 'IN'), ('the', 'DT'), ('same', 'JJ'), ('claims', 'NNS'), ('already', 'RB'), ('asserted', 'VBN'), 
('by', 'IN'), ('Apple', 'NNP'), ('.', '.'), (\"''\", \"''\")], [('In', 'IN'), ('August', 'NNP'), (',', ','), ('Samsung', 'NNP'), ('lost', 'VBD'), ('a', 'DT'), ('US', 'NNP'), ('patent', 'NN'), ('case', 'NN'), ('to', 'TO'), ('Apple', 'NNP'), ('and', 'CC'), ('was', 'VBD'), ('ordered', 'VBN'), ('to', 'TO'), ('pay', 'VB'), ('its', 'PRP$'), ('rival', 'JJ'), ('$', '$'), ('1.05bn', 'CD'), ('(', '('), ('£0.66bn', 'NN'), (')', ')'), ('in', 'IN'), ('damages', 'NNS'), ('for', 'IN'), ('copying', 'VBG'), ('features', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('iPad', 'NN'), ('and', 'CC'), ('iPhone', 'NN'), ('in', 'IN'), ('its', 'PRP$'), ('Galaxy', 'NNP'), ('range', 'NN'), ('of', 'IN'), ('devices', 'NNS'), ('.', '.')], [('Samsung', 'NNP'), (',', ','), ('which', 'WDT'), ('is', 'VBZ'), ('the', 'DT'), ('world', 'NN'), (\"'s\", 'POS'), ('top', 'JJ'), ('mobile', 'NN'), ('phone', 'NN'), ('maker', 'NN'), (',', ','), ('is', 'VBZ'), ('appealing', 'VBG'), ('the', 'DT'), ('ruling', 'NN'), ('.', '.')], [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN'), ('in', 'IN'), ('the', 'DT'), ('UK', 'NNP'), ('found', 'VBD'), ('in', 'IN'), ('Samsung', 'NNP'), (\"'s\", 'POS'), ('favour', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), ('Apple', 'NNP'), ('to', 'TO'), ('publish', 'VB'), ('an', 'DT'), ('apology', 'NN'), ('making', 'VBG'), ('clear', 'JJ'), ('that', 'IN'), ('the', 'DT'), ('South', 'JJ'), ('Korean', 'JJ'), ('firm', 'NN'), ('had', 'VBD'), ('not', 'RB'), ('copied', 'VBN'), ('its', 'PRP$'), ('iPad', 'NN'), ('when', 'WRB'), ('designing', 'VBG'), ('its', 'PRP$'), ('own', 'JJ'), ('devices', 'NNS'), ('.', '.')]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(pos_tags_per_sentence)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### [point: 1] Exercise 1b: Named Entity Recognition (NER)\n",
+ "Use `nltk.chunk.ne_chunk` to perform Named Entity Recognition (NER) on each sentence.\n",
+ "\n",
+ "Use `print` to **show** the output in the notebook (and hence also in the exported PDF!)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ner_tags_per_sentence = []\n",
+ "for sentence in sentences_nltk:\n",
+ " ner_tags_per_sentence.append(nltk.chunk.ne_chunk(nltk.pos_tag(tokens)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[Tree('S', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN'), ('in', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('UK', 'NNP')]), ('found', 'VBD'), ('in', 'IN'), Tree('GPE', [('Samsung', 'NNP')]), (\"'s\", 'POS'), ('favour', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), Tree('PERSON', [('Apple', 'NNP')]), ('to', 'TO'), ('publish', 'VB'), ('an', 'DT'), ('apology', 'NN'), ('making', 'VBG'), ('clear', 'JJ'), ('that', 'IN'), ('the', 'DT'), Tree('LOCATION', [('South', 'JJ'), ('Korean', 'JJ')]), ('firm', 'NN'), ('had', 'VBD'), ('not', 'RB'), ('copied', 'VBN'), ('its', 'PRP$'), ('iPad', 'NN'), ('when', 'WRB'), ('designing', 'VBG'), ('its', 'PRP$'), ('own', 'JJ'), ('devices', 'NNS'), ('.', '.')]), Tree('S', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN'), ('in', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('UK', 'NNP')]), ('found', 'VBD'), ('in', 'IN'), Tree('GPE', [('Samsung', 'NNP')]), (\"'s\", 'POS'), ('favour', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), Tree('PERSON', [('Apple', 'NNP')]), ('to', 'TO'), ('publish', 'VB'), ('an', 'DT'), ('apology', 'NN'), ('making', 'VBG'), ('clear', 'JJ'), ('that', 'IN'), ('the', 'DT'), Tree('LOCATION', [('South', 'JJ'), ('Korean', 'JJ')]), ('firm', 'NN'), ('had', 'VBD'), ('not', 'RB'), ('copied', 'VBN'), ('its', 'PRP$'), ('iPad', 'NN'), ('when', 'WRB'), ('designing', 'VBG'), ('its', 'PRP$'), ('own', 'JJ'), ('devices', 'NNS'), ('.', '.')]), Tree('S', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN'), ('in', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('UK', 'NNP')]), ('found', 'VBD'), ('in', 'IN'), Tree('GPE', [('Samsung', 'NNP')]), (\"'s\", 'POS'), ('favour', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), Tree('PERSON', [('Apple', 'NNP')]), ('to', 'TO'), ('publish', 'VB'), ('an', 'DT'), ('apology', 'NN'), ('making', 'VBG'), ('clear', 'JJ'), ('that', 'IN'), ('the', 'DT'), Tree('LOCATION', [('South', 'JJ'), ('Korean', 'JJ')]), ('firm', 'NN'), ('had', 'VBD'), ('not', 'RB'), ('copied', 'VBN'), ('its', 'PRP$'), ('iPad', 'NN'), ('when', 'WRB'), 
('designing', 'VBG'), ('its', 'PRP$'), ('own', 'JJ'), ('devices', 'NNS'), ('.', '.')]), Tree('S', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN'), ('in', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('UK', 'NNP')]), ('found', 'VBD'), ('in', 'IN'), Tree('GPE', [('Samsung', 'NNP')]), (\"'s\", 'POS'), ('favour', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), Tree('PERSON', [('Apple', 'NNP')]), ('to', 'TO'), ('publish', 'VB'), ('an', 'DT'), ('apology', 'NN'), ('making', 'VBG'), ('clear', 'JJ'), ('that', 'IN'), ('the', 'DT'), Tree('LOCATION', [('South', 'JJ'), ('Korean', 'JJ')]), ('firm', 'NN'), ('had', 'VBD'), ('not', 'RB'), ('copied', 'VBN'), ('its', 'PRP$'), ('iPad', 'NN'), ('when', 'WRB'), ('designing', 'VBG'), ('its', 'PRP$'), ('own', 'JJ'), ('devices', 'NNS'), ('.', '.')]), Tree('S', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN'), ('in', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('UK', 'NNP')]), ('found', 'VBD'), ('in', 'IN'), Tree('GPE', [('Samsung', 'NNP')]), (\"'s\", 'POS'), ('favour', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), Tree('PERSON', [('Apple', 'NNP')]), ('to', 'TO'), ('publish', 'VB'), ('an', 'DT'), ('apology', 'NN'), ('making', 'VBG'), ('clear', 'JJ'), ('that', 'IN'), ('the', 'DT'), Tree('LOCATION', [('South', 'JJ'), ('Korean', 'JJ')]), ('firm', 'NN'), ('had', 'VBD'), ('not', 'RB'), ('copied', 'VBN'), ('its', 'PRP$'), ('iPad', 'NN'), ('when', 'WRB'), ('designing', 'VBG'), ('its', 'PRP$'), ('own', 'JJ'), ('devices', 'NNS'), ('.', '.')]), Tree('S', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN'), ('in', 'IN'), ('the', 'DT'), Tree('ORGANIZATION', [('UK', 'NNP')]), ('found', 'VBD'), ('in', 'IN'), Tree('GPE', [('Samsung', 'NNP')]), (\"'s\", 'POS'), ('favour', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), Tree('PERSON', [('Apple', 'NNP')]), ('to', 'TO'), ('publish', 'VB'), ('an', 'DT'), ('apology', 'NN'), ('making', 'VBG'), ('clear', 'JJ'), ('that', 'IN'), ('the', 'DT'), Tree('LOCATION', [('South', 'JJ'), ('Korean', 'JJ')]), ('firm', 'NN'), ('had', 'VBD'), ('not', 
'RB'), ('copied', 'VBN'), ('its', 'PRP$'), ('iPad', 'NN'), ('when', 'WRB'), ('designing', 'VBG'), ('its', 'PRP$'), ('own', 'JJ'), ('devices', 'NNS'), ('.', '.')])]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(ner_tags_per_sentence)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### [points: 2] Exercise 1c: Constituency parsing\n",
+ "Use the `nltk.RegexpParser` to perform constituency parsing on each sentence.\n",
+ "\n",
+ "Use `print` to **show** the output in the notebook (and hence also in the exported PDF!)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "constituent_parser = nltk.RegexpParser('''\n",
+ "NP: {? * *} # NP\n",
+ "P: {} # Preposition\n",
+ "V: {} # Verb\n",
+ "PP: { } # PP -> P NP\n",
+ "VP: { *} # VP -> V (NP|PP)*''')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "constituency_output_per_sentence = []\n",
+ "for sent in pos_tags_per_sentence:\n",
+ " constituent_structure = constituent_parser.parse(sent)\n",
+ " constituency_output_per_sentence.append(constituent_structure)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[Tree('S', [Tree('NP', [('https', 'NN')]), (':', ':'), Tree('NP', [('//www.telegraph.co.uk/technology/apple/9702716/Apple-Samsung-lawsuit-six-more-products-under-scrutiny.html', 'JJ')]), ('Documents', 'NNS'), Tree('VP', [Tree('V', [('filed', 'VBN')])]), ('to', 'TO'), Tree('NP', [('the', 'DT')]), ('San', 'NNP'), ('Jose', 'NNP'), Tree('NP', [('federal', 'JJ'), ('court', 'NN')]), Tree('P', [('in', 'IN')]), ('California', 'NNP'), Tree('P', [('on', 'IN')]), ('November', 'NNP'), ('23', 'CD'), Tree('NP', [('list', 'NN')]), ('six', 'CD'), ('Samsung', 'NNP'), ('products', 'NNS'), Tree('VP', [Tree('V', [('running', 'VBG')]), Tree('NP', [('the', 'DT')])]), ('``', '``'), ('Jelly', 'RB'), ('Bean', 'NNP'), (\"''\", \"''\"), ('and', 'CC'), ('``', '``'), ('Ice', 'NNP'), ('Cream', 'NNP'), ('Sandwich', 'NNP'), (\"''\", \"''\"), Tree('VP', [Tree('V', [('operating', 'VBG')])]), ('systems', 'NNS'), (',', ','), ('which', 'WDT'), ('Apple', 'NNP'), Tree('VP', [Tree('V', [('claims', 'VBZ')])]), Tree('VP', [Tree('V', [('infringe', 'VB')])]), ('its', 'PRP$'), ('patents', 'NNS'), ('.', '.')]), Tree('S', [Tree('NP', [('The', 'DT')]), ('six', 'CD'), ('phones', 'NNS'), ('and', 'CC'), ('tablets', 'NNS'), Tree('VP', [Tree('V', [('affected', 'VBN')])]), Tree('VP', [Tree('V', [('are', 'VBP')]), Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), (',', ','), Tree('VP', [Tree('V', [('running', 'VBG')]), Tree('NP', [('the', 'DT'), ('new', 'JJ')])]), ('Jelly', 'NNP'), ('Bean', 'NNP'), Tree('NP', [('system', 'NN')]), (',', ','), Tree('NP', [('the', 'DT')]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('8.9', 'CD'), ('Wifi', 'NNP'), Tree('NP', [('tablet', 'NN')]), (',', ','), Tree('NP', [('the', 'DT')]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('2', 'CD'), ('10.1', 'CD'), (',', ','), ('Galaxy', 'NNP'), ('Rugby', 'NNP'), ('Pro', 'NNP'), ('and', 'CC'), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), Tree('NP', [('mini', 'NN')]), ('.', '.')]), Tree('S', [('Apple', 'NNP'), Tree('VP', 
[Tree('V', [('stated', 'VBD')])]), ('it', 'PRP'), Tree('VP', [Tree('V', [('had', 'VBD')])]), ('“', 'NNP'), Tree('VP', [Tree('V', [('acted', 'VBD')])]), ('quickly', 'RB'), ('and', 'CC'), ('diligently', 'RB'), (\"''\", \"''\"), Tree('PP', [Tree('P', [('in', 'IN')]), Tree('NP', [('order', 'NN')])]), ('to', 'TO'), ('``', '``'), Tree('VP', [Tree('V', [('determine', 'VB')]), Tree('PP', [Tree('P', [('that', 'IN')]), Tree('NP', [('these', 'DT')])])]), ('newly', 'RB'), Tree('VP', [Tree('V', [('released', 'VBN')])]), ('products', 'NNS'), Tree('VP', [Tree('V', [('do', 'VBP')])]), Tree('VP', [Tree('V', [('infringe', 'VB')]), Tree('NP', [('many', 'JJ')]), Tree('PP', [Tree('P', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('same', 'JJ')])])]), ('claims', 'NNS'), ('already', 'RB'), Tree('VP', [Tree('V', [('asserted', 'VBN')])]), Tree('P', [('by', 'IN')]), ('Apple', 'NNP'), ('.', '.'), (\"''\", \"''\")]), Tree('S', [Tree('P', [('In', 'IN')]), ('August', 'NNP'), (',', ','), ('Samsung', 'NNP'), Tree('VP', [Tree('V', [('lost', 'VBD')]), Tree('NP', [('a', 'DT')])]), ('US', 'NNP'), Tree('NP', [('patent', 'NN'), ('case', 'NN')]), ('to', 'TO'), ('Apple', 'NNP'), ('and', 'CC'), Tree('VP', [Tree('V', [('was', 'VBD')])]), Tree('VP', [Tree('V', [('ordered', 'VBN')])]), ('to', 'TO'), Tree('VP', [Tree('V', [('pay', 'VB')])]), ('its', 'PRP$'), Tree('NP', [('rival', 'JJ')]), ('$', '$'), ('1.05bn', 'CD'), ('(', '('), Tree('NP', [('£0.66bn', 'NN')]), (')', ')'), Tree('P', [('in', 'IN')]), ('damages', 'NNS'), Tree('P', [('for', 'IN')]), Tree('VP', [Tree('V', [('copying', 'VBG')])]), ('features', 'NNS'), Tree('PP', [Tree('P', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('iPad', 'NN')])]), ('and', 'CC'), Tree('NP', [('iPhone', 'NN')]), Tree('P', [('in', 'IN')]), ('its', 'PRP$'), ('Galaxy', 'NNP'), Tree('NP', [('range', 'NN')]), Tree('P', [('of', 'IN')]), ('devices', 'NNS'), ('.', '.')]), Tree('S', [('Samsung', 'NNP'), (',', ','), ('which', 'WDT'), Tree('VP', [Tree('V', [('is', 'VBZ')]), Tree('NP', 
[('the', 'DT'), ('world', 'NN')])]), (\"'s\", 'POS'), Tree('NP', [('top', 'JJ'), ('mobile', 'NN'), ('phone', 'NN'), ('maker', 'NN')]), (',', ','), Tree('VP', [Tree('V', [('is', 'VBZ')])]), Tree('VP', [Tree('V', [('appealing', 'VBG')]), Tree('NP', [('the', 'DT'), ('ruling', 'NN')])]), ('.', '.')]), Tree('S', [Tree('NP', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN')]), Tree('PP', [Tree('P', [('in', 'IN')]), Tree('NP', [('the', 'DT')])]), ('UK', 'NNP'), Tree('VP', [Tree('V', [('found', 'VBD')])]), Tree('P', [('in', 'IN')]), ('Samsung', 'NNP'), (\"'s\", 'POS'), Tree('NP', [('favour', 'NN')]), ('and', 'CC'), Tree('VP', [Tree('V', [('ordered', 'VBD')])]), ('Apple', 'NNP'), ('to', 'TO'), Tree('VP', [Tree('V', [('publish', 'VB')]), Tree('NP', [('an', 'DT'), ('apology', 'NN')])]), Tree('VP', [Tree('V', [('making', 'VBG')]), Tree('NP', [('clear', 'JJ')]), Tree('PP', [Tree('P', [('that', 'IN')]), Tree('NP', [('the', 'DT'), ('South', 'JJ'), ('Korean', 'JJ'), ('firm', 'NN')])])]), Tree('VP', [Tree('V', [('had', 'VBD')])]), ('not', 'RB'), Tree('VP', [Tree('V', [('copied', 'VBN')])]), ('its', 'PRP$'), Tree('NP', [('iPad', 'NN')]), ('when', 'WRB'), Tree('VP', [Tree('V', [('designing', 'VBG')])]), ('its', 'PRP$'), Tree('NP', [('own', 'JJ')]), ('devices', 'NNS'), ('.', '.')])]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(constituency_output_per_sentence)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Augment the RegexpParser so that it also detects Named Entity Phrases (NEP), e.g., that it detects *Galaxy S III* and *Ice Cream Sandwich*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "constituent_parser_v2 = nltk.RegexpParser('''\n",
+ "NP: {? * *} # NP\n",
+ "P: {} # Preposition\n",
+ "V: {} # Verb\n",
+ "PP: { } # PP -> P NP\n",
+ "VP: { *} # VP -> V (NP|PP)*\n",
+ "NEP: {*} # More than a NP''')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "constituency_v2_output_per_sentence = []\n",
+ "for const in pos_tags_per_sentence:\n",
+ " structure = constituent_parser_v2.parse(const)\n",
+ " constituency_v2_output_per_sentence.append(structure)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[Tree('S', [Tree('NEP', [Tree('NP', [('https', 'NN')])]), (':', ':'), Tree('NEP', [Tree('NP', [('//www.telegraph.co.uk/technology/apple/9702716/Apple-Samsung-lawsuit-six-more-products-under-scrutiny.html', 'JJ')])]), ('Documents', 'NNS'), Tree('VP', [Tree('V', [('filed', 'VBN')])]), ('to', 'TO'), Tree('NEP', [Tree('NP', [('the', 'DT')])]), ('San', 'NNP'), ('Jose', 'NNP'), Tree('NEP', [Tree('NP', [('federal', 'JJ'), ('court', 'NN')])]), Tree('P', [('in', 'IN')]), ('California', 'NNP'), Tree('P', [('on', 'IN')]), ('November', 'NNP'), ('23', 'CD'), Tree('NEP', [Tree('NP', [('list', 'NN')])]), ('six', 'CD'), ('Samsung', 'NNP'), ('products', 'NNS'), Tree('VP', [Tree('V', [('running', 'VBG')]), Tree('NP', [('the', 'DT')])]), ('``', '``'), ('Jelly', 'RB'), ('Bean', 'NNP'), (\"''\", \"''\"), ('and', 'CC'), ('``', '``'), ('Ice', 'NNP'), ('Cream', 'NNP'), ('Sandwich', 'NNP'), (\"''\", \"''\"), Tree('VP', [Tree('V', [('operating', 'VBG')])]), ('systems', 'NNS'), (',', ','), ('which', 'WDT'), ('Apple', 'NNP'), Tree('VP', [Tree('V', [('claims', 'VBZ')])]), Tree('VP', [Tree('V', [('infringe', 'VB')])]), ('its', 'PRP$'), ('patents', 'NNS'), ('.', '.')]), Tree('S', [Tree('NEP', [Tree('NP', [('The', 'DT')])]), ('six', 'CD'), ('phones', 'NNS'), ('and', 'CC'), ('tablets', 'NNS'), Tree('VP', [Tree('V', [('affected', 'VBN')])]), Tree('VP', [Tree('V', [('are', 'VBP')]), Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), (',', ','), Tree('VP', [Tree('V', [('running', 'VBG')]), Tree('NP', [('the', 'DT'), ('new', 'JJ')])]), ('Jelly', 'NNP'), ('Bean', 'NNP'), Tree('NEP', [Tree('NP', [('system', 'NN')])]), (',', ','), Tree('NEP', [Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('8.9', 'CD'), ('Wifi', 'NNP'), Tree('NEP', [Tree('NP', [('tablet', 'NN')])]), (',', ','), Tree('NEP', [Tree('NP', [('the', 'DT')])]), ('Galaxy', 'NNP'), ('Tab', 'NNP'), ('2', 'CD'), ('10.1', 'CD'), (',', ','), ('Galaxy', 'NNP'), ('Rugby', 'NNP'), ('Pro', 'NNP'), 
('and', 'CC'), ('Galaxy', 'NNP'), ('S', 'NNP'), ('III', 'NNP'), Tree('NEP', [Tree('NP', [('mini', 'NN')])]), ('.', '.')]), Tree('S', [('Apple', 'NNP'), Tree('VP', [Tree('V', [('stated', 'VBD')])]), ('it', 'PRP'), Tree('VP', [Tree('V', [('had', 'VBD')])]), ('“', 'NNP'), Tree('VP', [Tree('V', [('acted', 'VBD')])]), ('quickly', 'RB'), ('and', 'CC'), ('diligently', 'RB'), (\"''\", \"''\"), Tree('PP', [Tree('P', [('in', 'IN')]), Tree('NP', [('order', 'NN')])]), ('to', 'TO'), ('``', '``'), Tree('VP', [Tree('V', [('determine', 'VB')]), Tree('PP', [Tree('P', [('that', 'IN')]), Tree('NP', [('these', 'DT')])])]), ('newly', 'RB'), Tree('VP', [Tree('V', [('released', 'VBN')])]), ('products', 'NNS'), Tree('VP', [Tree('V', [('do', 'VBP')])]), Tree('VP', [Tree('V', [('infringe', 'VB')]), Tree('NP', [('many', 'JJ')]), Tree('PP', [Tree('P', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('same', 'JJ')])])]), ('claims', 'NNS'), ('already', 'RB'), Tree('VP', [Tree('V', [('asserted', 'VBN')])]), Tree('P', [('by', 'IN')]), ('Apple', 'NNP'), ('.', '.'), (\"''\", \"''\")]), Tree('S', [Tree('P', [('In', 'IN')]), ('August', 'NNP'), (',', ','), ('Samsung', 'NNP'), Tree('VP', [Tree('V', [('lost', 'VBD')]), Tree('NP', [('a', 'DT')])]), ('US', 'NNP'), Tree('NEP', [Tree('NP', [('patent', 'NN'), ('case', 'NN')])]), ('to', 'TO'), ('Apple', 'NNP'), ('and', 'CC'), Tree('VP', [Tree('V', [('was', 'VBD')])]), Tree('VP', [Tree('V', [('ordered', 'VBN')])]), ('to', 'TO'), Tree('VP', [Tree('V', [('pay', 'VB')])]), ('its', 'PRP$'), Tree('NEP', [Tree('NP', [('rival', 'JJ')])]), ('$', '$'), ('1.05bn', 'CD'), ('(', '('), Tree('NEP', [Tree('NP', [('£0.66bn', 'NN')])]), (')', ')'), Tree('P', [('in', 'IN')]), ('damages', 'NNS'), Tree('P', [('for', 'IN')]), Tree('VP', [Tree('V', [('copying', 'VBG')])]), ('features', 'NNS'), Tree('PP', [Tree('P', [('of', 'IN')]), Tree('NP', [('the', 'DT'), ('iPad', 'NN')])]), ('and', 'CC'), Tree('NEP', [Tree('NP', [('iPhone', 'NN')])]), Tree('P', [('in', 'IN')]), ('its', 'PRP$'), 
('Galaxy', 'NNP'), Tree('NEP', [Tree('NP', [('range', 'NN')])]), Tree('P', [('of', 'IN')]), ('devices', 'NNS'), ('.', '.')]), Tree('S', [('Samsung', 'NNP'), (',', ','), ('which', 'WDT'), Tree('VP', [Tree('V', [('is', 'VBZ')]), Tree('NP', [('the', 'DT'), ('world', 'NN')])]), (\"'s\", 'POS'), Tree('NEP', [Tree('NP', [('top', 'JJ'), ('mobile', 'NN'), ('phone', 'NN'), ('maker', 'NN')])]), (',', ','), Tree('VP', [Tree('V', [('is', 'VBZ')])]), Tree('VP', [Tree('V', [('appealing', 'VBG')]), Tree('NP', [('the', 'DT'), ('ruling', 'NN')])]), ('.', '.')]), Tree('S', [Tree('NEP', [Tree('NP', [('A', 'DT'), ('similar', 'JJ'), ('case', 'NN')])]), Tree('PP', [Tree('P', [('in', 'IN')]), Tree('NP', [('the', 'DT')])]), ('UK', 'NNP'), Tree('VP', [Tree('V', [('found', 'VBD')])]), Tree('P', [('in', 'IN')]), ('Samsung', 'NNP'), (\"'s\", 'POS'), Tree('NEP', [Tree('NP', [('favour', 'NN')])]), ('and', 'CC'), Tree('VP', [Tree('V', [('ordered', 'VBD')])]), ('Apple', 'NNP'), ('to', 'TO'), Tree('VP', [Tree('V', [('publish', 'VB')]), Tree('NP', [('an', 'DT'), ('apology', 'NN')])]), Tree('VP', [Tree('V', [('making', 'VBG')]), Tree('NP', [('clear', 'JJ')]), Tree('PP', [Tree('P', [('that', 'IN')]), Tree('NP', [('the', 'DT'), ('South', 'JJ'), ('Korean', 'JJ'), ('firm', 'NN')])])]), Tree('VP', [Tree('V', [('had', 'VBD')])]), ('not', 'RB'), Tree('VP', [Tree('V', [('copied', 'VBN')])]), ('its', 'PRP$'), Tree('NEP', [Tree('NP', [('iPad', 'NN')])]), ('when', 'WRB'), Tree('VP', [Tree('V', [('designing', 'VBG')])]), ('its', 'PRP$'), Tree('NEP', [Tree('NP', [('own', 'JJ')])]), ('devices', 'NNS'), ('.', '.')])]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(constituency_v2_output_per_sentence)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## [total points: 1] Exercise 2: spaCy\n",
+ "Use Spacy to process the same text as you analyzed with NLTK."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import spacy\n",
+ "nlp = spacy.load('en_core_web_sm')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+    "[['San Jose', 'GPE'], ['California', 'GPE'], ['November 23', 'DATE'], ['six', 'CARDINAL'], ['Samsung', 'ORG'], ['Apple', 'ORG'], ['six', 'CARDINAL'], ['the Galaxy S III', 'ORG'], ['Jelly Bean', 'ORG'], ['Apple', 'ORG'], ['Apple', 'ORG'], ['August', 'DATE'], ['Samsung', 'ORG'], ['US', 'GPE'], ['Apple', 'ORG'], ['$1.05bn (£0.66bn', 'MONEY'], ['iPad', 'ORG'], ['iPhone', 'ORG'], ['Samsung', 'ORG'], ['UK', 'GPE'], ['Samsung', 'ORG'], ['Apple', 'ORG'], ['South Korean', 'NORP'], ['iPad', 'ORG']]\n",
+    "[https://www.telegraph.co.uk/technology/apple/9702716/Apple-Samsung-lawsuit-six-more-products-under-scrutiny.html, \n",
+    "\n",
+    ", Documents, filed, to, the, San, Jose, federal, court, in, California, on, November, 23, list, six, Samsung, products, running, the, \", Jelly, Bean, \", and, \", Ice, Cream, Sandwich, \", operating, systems, ,, which, Apple, claims, infringe, its, patents, ., \n",
+    ", The, six, phones, and, tablets, affected, are, the, Galaxy, S, III, ,, running, the, new, Jelly, Bean, system, ,, the, Galaxy, Tab, 8.9, Wifi, tablet, ,, the, Galaxy, Tab, 2, 10.1, ,, Galaxy, Rugby, Pro, and, Galaxy, S, III, mini, ., \n",
+    ", Apple, stated, it, had, “acted, quickly, and, diligently, \", in, order, to, \", determine, that, these, newly, released, products, do, infringe, many, of, the, same, claims, already, asserted, by, Apple, ., \", \n",
+    ", In, August, ,, Samsung, lost, a, US, patent, case, to, Apple, and, was, ordered, to, pay, its, rival, $, 1.05bn, (, £0.66bn, ), in, damages, for, copying, features, of, the, iPad, and, iPhone, in, its, Galaxy, range, of, devices, ., Samsung, ,, which, is, the, world, 's, top, mobile, phone, maker, ,, is, appealing, the, ruling, ., \n",
+    ", A, similar, case, in, the, UK, found, in, Samsung, 's, favour, and, ordered, Apple, to, publish, an, apology, making, clear, that, the, South, Korean, firm, had, not, copied, its, iPad, when, designing, its, own, devices, .]\n",
+    "[[https://www.telegraph.co.uk/technology/apple/9702716/Apple-Samsung-lawsuit-six-more-products-under-scrutiny.html, 'NNP'], [\n",
+    "\n",
+    ", '_SP'], [Documents, 'NNS'], [filed, 'VBD'], [to, 'IN'], [the, 'DT'], [San, 'NNP'], [Jose, 'NNP'], [federal, 'JJ'], [court, 'NN'], [in, 'IN'], [California, 'NNP'], [on, 'IN'], [November, 'NNP'], [23, 'CD'], [list, 'NN'], [six, 'CD'], [Samsung, 'NNP'], [products, 'NNS'], [running, 'VBG'], [the, 'DT'], [\", '``'], [Jelly, 'NNP'], [Bean, 'NNP'], [\", \"''\"], [and, 'CC'], [\", '``'], [Ice, 'NNP'], [Cream, 'NNP'], [Sandwich, 'NNP'], [\", \"''\"], [operating, 'NN'], [systems, 'NNS'], [,, ','], [which, 'WDT'], [Apple, 'NNP'], [claims, 'NNS'], [infringe, 'VBP'], [its, 'PRP$'], [patents, 'NNS'], [., '.'], [\n",
+    ", '_SP'], [The, 'DT'], [six, 'CD'], [phones, 'NNS'], [and, 'CC'], [tablets, 'NNS'], [affected, 'VBN'], [are, 'VBP'], [the, 'DT'], [Galaxy, 'NNP'], [S, 'NNP'], [III, 'NNP'], [,, ','], [running, 'VBG'], [the, 'DT'], [new, 'JJ'], [Jelly, 'NNP'], [Bean, 'NNP'], [system, 'NN'], [,, ','], [the, 'DT'], [Galaxy, 'NNP'], [Tab, 'NNP'], [8.9, 'CD'], [Wifi, 'NNP'], [tablet, 'NNP'], [,, ','], [the, 'DT'], [Galaxy, 'NNP'], [Tab, 'NNP'], [2, 'CD'], [10.1, 'CD'], [,, ','], [Galaxy, 'NNP'], [Rugby, 'NNP'], [Pro, 'NNP'], [and, 'CC'], [Galaxy, 'NNP'], [S, 'NNP'], [III, 'NNP'], [mini, 'NN'], [., '.'], [\n",
+    ", '_SP'], [Apple, 'NNP'], [stated, 'VBD'], [it, 'PRP'], [had, 'VBD'], [“acted, 'VBN'], [quickly, 'RB'], [and, 'CC'], [diligently, 'RB'], [\", \"''\"], [in, 'IN'], [order, 'NN'], [to, 'TO'], [\", '``'], [determine, 'VB'], [that, 'IN'], [these, 'DT'], [newly, 'RB'], [released, 'VBN'], [products, 'NNS'], [do, 'VBP'], [infringe, 'VB'], [many, 'JJ'], [of, 'IN'], [the, 'DT'], [same, 'JJ'], [claims, 'NNS'], [already, 'RB'], [asserted, 'VBN'], [by, 'IN'], [Apple, 'NNP'], [., '.'], [\", \"''\"], [\n",
+    ", '_SP'], [In, 'IN'], [August, 'NNP'], [,, ','], [Samsung, 'NNP'], [lost, 'VBD'], [a, 'DT'], [US, 'NNP'], [patent, 'NN'], [case, 'NN'], [to, 'IN'], [Apple, 'NNP'], [and, 'CC'], [was, 'VBD'], [ordered, 'VBN'], [to, 'TO'], [pay, 'VB'], [its, 'PRP$'], [rival, 'JJ'], [$, '$'], [1.05bn, 'CD'], [(, '-LRB-'], [£0.66bn, 'NNP'], [), '-RRB-'], [in, 'IN'], [damages, 'NNS'], [for, 'IN'], [copying, 'VBG'], [features, 'NNS'], [of, 'IN'], [the, 'DT'], [iPad, 'NNP'], [and, 'CC'], [iPhone, 'NNP'], [in, 'IN'], [its, 'PRP$'], [Galaxy, 'NNP'], [range, 'NN'], [of, 'IN'], [devices, 'NNS'], [., '.'], [Samsung, 'NNP'], [,, ','], [which, 'WDT'], [is, 'VBZ'], [the, 'DT'], [world, 'NN'], ['s, 'POS'], [top, 'JJ'], [mobile, 'JJ'], [phone, 'NN'], [maker, 'NN'], [,, ','], [is, 'VBZ'], [appealing, 'VBG'], [the, 'DT'], [ruling, 'NN'], [., '.'], [\n",
+    ", '_SP'], [A, 'DT'], [similar, 'JJ'], [case, 'NN'], [in, 'IN'], [the, 'DT'], [UK, 'NNP'], [found, 'VBD'], [in, 'IN'], [Samsung, 'NNP'], ['s, 'POS'], [favour, 'NN'], [and, 'CC'], [ordered, 'VBD'], [Apple, 'NNP'], [to, 'TO'], [publish, 'VB'], [an, 'DT'], [apology, 'NN'], [making, 'VBG'], [clear, 'JJ'], [that, 'IN'], [the, 'DT'], [South, 'JJ'], [Korean, 'JJ'], [firm, 'NN'], [had, 'VBD'], [not, 'RB'], [copied, 'VBN'], [its, 'PRP$'], [iPad, 'NNP'], [when, 'WRB'], [designing, 'VBG'], [its, 'PRP$'], [own, 'JJ'], [devices, 'NNS'], [., '.']]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'verb, gerund or present participle'"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "doc = nlp(text)\n",
+ "#finding tokens in text\n",
+ "tokens = []\n",
+ "for token in doc:\n",
+ " tokens.append(token)\n",
+ " \n",
+ "#finding tags per each token\n",
+ "tags = []\n",
+ "for tag in tokens:\n",
+ " tagging = [tag, tag.tag_]\n",
+ " tags.append(tagging) \n",
+ "print(tags)\n",
+ "\n",
+ "#ner recognition\n",
+ "ner = []\n",
+ "for entity in doc.ents:\n",
+ " tuples = [entity.text, entity.label_]\n",
+ " ner.append(tuples)\n",
+ "print(ner)\n",
+ "\n",
+ "\n",
+ "#constituency parsing\n",
+ "cp = []\n",
+ "for token in doc:\n",
+ " cp.append(token.dep_)\n",
+ " #I comment displacy out because my laptop could not handle it\n",
+ " #displacy.render(doc, jupyter=True, style='dep')\n",
+ "spacy.explain('VBG') \n",
+ "\n",
+ "# insert code here"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "small tip: You can use **sents = list(doc.sents)** to be able to use the index to access a sentence like **sents[2]** for the third sentence.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## [total points: 7] Exercise 3: Comparison NLTK and spaCy\n",
+ "We will now compare the output of NLTK and spaCy, i.e., in what do they differ?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### [points: 3] Exercise 3a: Part of speech tagging\n",
+ "Compare the output from NLTK and spaCy regarding part of speech tagging.\n",
+ "\n",
+ "* To compare, you probably would like to compare sentence per sentence. Describe if the sentence splitting is different for NLTK than for spaCy. If not, where do they differ?\n",
+ "* After checking the sentence splitting, select a sentence for which you expect interesting results and perhaps differences. Motivate your choice.\n",
+ "* Compare the output in `token.tag` from spaCy to the part of speech tagging from NLTK for each token in your selected sentence. Are there any differences? This is not a trick question; it is possible that there are no differences."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Exercise 3a - Answers \n",
+ "1) it looks like the difference between sentence splitting with NLTK and spaCy differs in what method they use to split. With NLTK, all the tokens end up with quotationmarks around them, whereas spaCy does not. This is presumably the case because NLTK uses the split function and spaCy treats the tokens as objects.\n",
+ "\n",
+ "2) \"A similar case in the UK found in Samsung's favour and ordered Apple to publish an apology making clear that the South Korean firm had not copied its iPad when designing its own devices.\" We think this sentence would have some interesting outcomes because there are a couple of parts of this sentences that are ambiguous, for example \" 's\" could be the concatenation of \"is\" as well as a possessive pronoun, or \"ordered\" could be either \"to command\" or ordered in the sense of ordering something from a restaurant.\n",
+ "\n",
+<<<<<<< HEAD
+ "3) There was only one difference found in this sentence, which is that nltk says iPad is a singular noun (NN) while spaCy says it is a proper noun singular (NNP)."
+=======
+ "3) There was only one difference found in this sentence, which is that nltk says iPad is a singular noun (NN) while spaCy says it is a proper noun singular (NNP). "
+>>>>>>> 2fd72c3af0aea7a6c8dc166475a45a4496b3d4b1
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### [points: 2] Exercise 3b: Named Entity Recognition (NER)\n",
+ "* Describe differences between the output from NLTK and spaCy for Named Entity Recognition. Which one do you think performs better?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Exercise 3b - Answers\n",
+ "The output of NLTK is in the form trees instead of the individual words as in spaCy. This could potentially provide more information as it shows the entity that is most dominant in each row, while showing the entities of each word in said row. While spaCy only shows the entity and the word it is referring to, meaning with spaCy, it's more likely that you would need to compare with the original sentences to make more sense of the output. However, spaCy's output being a list of lists instead of a list of trees makes it easier to work with and manipulate compared to the tree structure of NLTK.\n",
+ "\n",
+ "Another difference is NLTK shows the entity of each word in each sentence while spaCy only shows a few entity types. For example, NLTK shows \"A\" is a determiner, while that cannot be found in the spaCy output. However, spaCy performs better in the identification. For example, spaCy recognizes \"Novermber 23\" as a Date entity, while NLTK splits November and 23 into two seperate words. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### [points: 2] Exercise 3c: Constituency/dependency parsing\n",
+ "Choose one sentence from the text and run constituency parsing using NLTK and dependency parsing using spaCy.\n",
+ "* describe briefly the difference between constituency parsing and dependency parsing\n",
+ "* describe differences between the output from NLTK and spaCy."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# End of this notebook"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/lab_sessions/lab3/Lab3-assignment-sentiment - Jupyter Notebook.pdf b/lab_sessions/lab3/Lab3-assignment-sentiment - Jupyter Notebook.pdf
new file mode 100644
index 00000000..4b29172e
Binary files /dev/null and b/lab_sessions/lab3/Lab3-assignment-sentiment - Jupyter Notebook.pdf differ
diff --git a/lab_sessions/lab3/Lab3-assignment-sentiment.ipynb b/lab_sessions/lab3/Lab3-assignment-sentiment.ipynb
index e5967d8e..cbe9ab3a 100644
--- a/lab_sessions/lab3/Lab3-assignment-sentiment.ipynb
+++ b/lab_sessions/lab3/Lab3-assignment-sentiment.ipynb
@@ -95,6 +95,27 @@
"```"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "_**Answers:**_\n",
+ "\n",
+ "**Sentence 1:** The world love is a very positive word while apples is neutral so this makes the sentence really positive.\n",
+ "\n",
+ "**Sentence 2:** Love is a strong positive word but it is negated by the \"don't\" which makes this count as highly negative. This combined with the neurtrality of apples, the sentence becomes negative.\n",
+ "\n",
+ "**Sentence 3:** This is similar to the first sentence, but the smiley emoticon is positive which makes the compound more positive than the first sentence.\n",
+ "\n",
+ "**Sentence 4:** This sentence is generally a statement which is neutral but the \"ruins\" is negative which makes the compound, and hence sentence, negative.\n",
+ "\n",
+ "**Sentence 5:** Similar to sentence 4, the sentence is more of a statement but sinc the \"not\" negates the negativity of \"ruins\" making the description more positive. However, the \"considered\" is more factual, inclreasing the neutrality compared to sentence 4, which \"certainly\" is an added positive, which is why the compound is more than the compound of sentence 4.\n",
+ "\n",
+ "**Sentence 6:** The full statement is an observation which makes it neutral, however, the word \"lies\" is considered as he told a lie which is negative. Meaning the compound of this sentence is incorrect as it should only be neutral as there's nothing negative in the sentence. \n",
+ "\n",
+ "**Sentence 7:** Overall the sentence is a statement which makes it mostly neutral, however the word \"like\" is considered as a positive word, even though the sentence would probably be considered as a negative by a human as it means the house is not special."
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -134,7 +155,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
@@ -143,7 +164,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
@@ -152,9 +173,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 52,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 {'sentiment_label': 'positive', 'text_of_tweet': 'Very happy to score my first goal with the blues. Come on!! Next round @ChelseaFC @EmiratesFACup', 'tweet_url': 'https://twitter.com/saulniguez/status/1499135951003697156'}\n"
+ ]
+ }
+ ],
"source": [
"for id_, tweet_info in my_tweets.items():\n",
" print(id_, tweet_info)\n",
@@ -174,7 +203,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
@@ -208,28 +237,146 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 59,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0.00 0.00 0.00 1\n",
+ " negative 0.77 0.50 0.61 20\n",
+ " neutral 0.40 0.40 0.40 10\n",
+ " positive 0.48 0.72 0.58 18\n",
+ "she’s just a little guy who’s just here to vibe 0.00 0.00 0.00 1\n",
+ "\n",
+ " accuracy 0.54 50\n",
+ " macro avg 0.33 0.32 0.32 50\n",
+ " weighted avg 0.56 0.54 0.53 50\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\zahra_6hcxkfv\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
+ "C:\\Users\\zahra_6hcxkfv\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
+ "C:\\Users\\zahra_6hcxkfv\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+ " _warn_prf(average, modifier, msg_start, len(result))\n"
+ ]
+ }
+ ],
"source": [
+ "import nltk\n",
+ "from nltk.sentiment import vader\n",
+ "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
+ "import spacy\n",
+ "import pathlib\n",
+ "import sklearn\n",
+ "import numpy\n",
+ "import nltk\n",
+ "from nltk.corpus import stopwords\n",
+ "from collections import Counter\n",
+ "from sklearn.metrics import classification_report\n",
+ "vader_model = SentimentIntensityAnalyzer()\n",
+ "\n",
+ "nlp = spacy.load('en_core_web_sm') # 'en_core_web_sm'\n",
+ "cwd = pathlib.Path.cwd()\n",
+ "tweets = cwd.joinpath('my_tweets.json')\n",
+ "#print('path:', tweets)\n",
+ "#print('this will print True if the folder exists:', tweets.exists())\n",
+ "\n",
"tweets = []\n",
"all_vader_output = []\n",
"gold = []\n",
"\n",
+ "\n",
"# settings (to change for different experiments)\n",
"to_lemmatize = True \n",
"pos = set()\n",
"\n",
"for id_, tweet_info in my_tweets.items():\n",
" the_tweet = tweet_info['text_of_tweet']\n",
- " vader_output = ''# run vader\n",
- " vader_label = ''# convert vader output to category\n",
+ " vader_output = run_vader(the_tweet, lemmatize=True) # run vader\n",
+ " vader_label = vader_output_to_label(vader_output)# convert vader output to category\n",
" \n",
" tweets.append(the_tweet)\n",
" all_vader_output.append(vader_label)\n",
" gold.append(tweet_info['sentiment_label'])\n",
" \n",
- "# use scikit-learn's classification report"
+ "# use scikit-learn's classification report\n",
+ "report = classification_report(gold,all_vader_output)\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "***Answers***\n",
+ "\n",
+ "a. The precision for the tweets analyzed using Vader seems to be overall not high. Indeed, the weighted average barely\n",
+ "goes beyond 50% settling at 56%. Out of all the categories, negatives seem to perform better than neutral and positive.\n",
+ "Whereas negative see a precision of 77%, neutral and positive do not reach the 50%, being 40% and 48% respectively.\n",
+ "For what concerns recall, instead, the general weighted recall is also not high, reaching 54%. In the lead of \n",
+ "highest recall is positive with 72%, followed by negative and neutral, 50% and 40% respectively. To sum up the\n",
+ "two, we analyze the f1 score, which we can therefore consider as the most describing metric. Overall, the positive category\n",
+ "seem to have a higher f1 score, namely 58%, neutral have 40% and negative 61%. The weighted average in this case is 53%. \n",
+ "The results have a low f1, which is confirmed by the low accuracy of 54%. Considering the imbalance between the \n",
+ "scores of the same categories across precision and recall, the most relevant metric to consider is the f1, which suggest that \n",
+ "the category positives is the most relevant since it is the one with the highest score. Its high recall must mean that there\n",
+ "are probably more positively categorized instances than actual positive instances. It might be due to the tweets set chosen."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "misclassified 1 NAH BRUH THIS ONE OF THE CRAZIEST THINGS IGE EVER SEEN BRON IS THE GOAT\n",
+ "vader: negative\n",
+ "real class: positive\n"
+ ]
+ }
+ ],
+ "source": [
+ "errors = []\n",
+ "scores = vader_model.polarity_scores(the_tweet)\n",
+ "if tweet_info['sentiment_label'] != vader_output_to_label(scores):\n",
+ " errors.append((id_, vader_output_to_label(scores)))\n",
+ "\n",
+ "all_list = []\n",
+ "index = 1\n",
+ "for e in errors:\n",
+ " id_, sentiment_labeling = e\n",
+ " print(\"misclassified\", index, my_tweets[id_]['text_of_tweet'])\n",
+ " print(\"vader: \", sentiment_labeling)\n",
+ " print(\"real class: \", my_tweets[id_]['sentiment_label'])\n",
+ " index += 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "***Answers***\n",
+ "b.\n",
+ "The only instance we got incorrectly classified was the following:\n",
+ "\n",
+ "misclassified 1 NAH BRUH THIS ONE OF THE CRAZIEST THINGS IGE EVER SEEN BRON IS THE GOAT\n",
+ "vader: negative\n",
+ "real class: positive\n",
+ "\n",
+ "The reason why it is misclassified is that the statement is sarcastic, therefore using words in a non-strict semantic manner. Indeed, literally \"nah\" is considered as negative in the lexicon, with a score of -0.4 and \"craziest\" is also considered as negative, scoring -0.2. Therefore, the statement is interpreted as overall negative."
]
},
{
@@ -254,13 +401,451 @@
"* - Are all parts of speech equally important for sentiment analysis? Explain why or why not."
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#imports\n",
+ "import nltk\n",
+ "#nltk.download('vader_lexicon', quiet=False) #only need to run it once\n",
+ "from nltk.sentiment import vader\n",
+ "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
+ "import spacy\n",
+ "import pathlib\n",
+ "import sklearn\n",
+ "import numpy\n",
+ "import nltk\n",
+ "from nltk.corpus import stopwords\n",
+ "from collections import Counter\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "path: c:\\Users\\zahra_6hcxkfv\\ba-text-mining\\lab_sessions\\lab3\\airlinetweets\n",
+ "this will print True if the folder exists: True\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Get the tweets\n",
+ "cwd = pathlib.Path.cwd()\n",
+ "airline_tweets_folder = cwd.joinpath('airlinetweets')\n",
+ "print('path:', airline_tweets_folder)\n",
+ "print('this will print True if the folder exists:', \n",
+ " airline_tweets_folder.exists())\n",
+ "\n",
+ "airline_tweets_train = load_files(\"C:/Users/zahra_6hcxkfv/ba-text-mining/lab_sessions/lab3/airlinetweets/airlinetweets\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Vader\n",
+ "nlp = spacy.load('en_core_web_sm') \n",
+ "\n",
+ "def run_vader(textual_unit, \n",
+ " lemmatize=False, \n",
+ " parts_of_speech_to_consider=None,\n",
+ " verbose=0):\n",
+ " \"\"\"\n",
+ " Run VADER on a sentence from spacy\n",
+ " \n",
+ " :param str textual unit: a textual unit, e.g., sentence, sentences (one string) (by looping over doc.sents)\n",
+ " :param bool lemmatize: If True, provide lemmas to VADER instead of words\n",
+ " :param set parts_of_speech_to_consider:\n",
+ " -None or empty set: all parts of speech are provided\n",
+ " -non-empty set: only these parts of speech are considered.\n",
+ " :param int verbose: if set to 1, information is printed about input and output\n",
+ " :rtype: dict\n",
+ " :return: vader output dict\n",
+ " \"\"\"\n",
+ " doc = nlp(textual_unit)\n",
+ " \n",
+ " input_to_vader = []\n",
+ "\n",
+ " for sent in doc.sents:\n",
+ " for token in sent:\n",
+ " to_add = token.text\n",
+ " if lemmatize:\n",
+ " to_add = token.lemma_\n",
+ " if to_add == '-PRON-': \n",
+ " to_add = token.text\n",
+ " if parts_of_speech_to_consider:\n",
+ " if token.pos_ in parts_of_speech_to_consider:\n",
+ " input_to_vader.append(to_add) \n",
+ " else:\n",
+ " input_to_vader.append(to_add)\n",
+ "\n",
+ " scores = vader_model.polarity_scores(' '.join(input_to_vader))\n",
+ " \n",
+ " if verbose >= 1:\n",
+ " print()\n",
+ " print('INPUT SENTENCE', sent)\n",
+ " print('INPUT TO VADER', input_to_vader)\n",
+ " print('VADER OUTPUT', scores)\n",
+ "\n",
+ " return scores"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# output to label\n",
+ "def vader_output_to_label(vader_output):\n",
+ " \"\"\"\n",
+ " map vader output e.g.,\n",
+ " {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}\n",
+ " to one of the following values:\n",
+ " a) positive float -> 'positive'\n",
+ " b) 0.0 -> 'neutral'\n",
+ " c) negative float -> 'negative'\n",
+ " \n",
+ " :param dict vader_output: output dict from vader\n",
+ " \n",
+ " :rtype: str\n",
+ " :return: 'negative' | 'neutral' | 'positive'\n",
+ " \"\"\"\n",
+ " compound = vader_output['compound']\n",
+ " \n",
+ " if compound < 0:\n",
+ " return 'negative'\n",
+ " elif compound == 0.0:\n",
+ " return 'neutral'\n",
+ " elif compound > 0.0:\n",
+ " return 'positive'\n",
+ " \n",
+ "assert vader_output_to_label( {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.0}) == 'neutral'\n",
+ "assert vader_output_to_label( {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.01}) == 'positive'\n",
+ "assert vader_output_to_label( {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': -0.01}) == 'negative'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run VADER (as it is) on the set of airline tweets \n",
+ "vader_model = SentimentIntensityAnalyzer() #reset model\n",
+ "\n",
+ "tweets = []\n",
+ "all_vader_output = []\n",
+ "gold = []\n",
+ "\n",
+ "for i in range(len(airline_tweets_train.data)):\n",
+ " the_tweet = str(airline_tweets_train.data[i])\n",
+ " vader_output = run_vader(the_tweet)\n",
+ " vader_label = vader_output_to_label(vader_output) \n",
+ " \n",
+ " tweets.append(the_tweet)\n",
+ " all_vader_output.append(vader_label)\n",
+ " gold.append(airline_tweets_train.target_names[airline_tweets_train.target[i]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.80 0.51 0.63 1750\n",
+ " neutral 0.60 0.51 0.55 1515\n",
+ " positive 0.56 0.88 0.68 1490\n",
+ "\n",
+ " accuracy 0.63 4755\n",
+ " macro avg 0.65 0.63 0.62 4755\n",
+ "weighted avg 0.66 0.63 0.62 4755\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# analyize\n",
+ "report = classification_report(gold,all_vader_output)\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run VADER on the set of airline tweets after having lemmatized the text\n",
+ "vader_model = SentimentIntensityAnalyzer() #reset model\n",
+ "\n",
+ "tweets = []\n",
+ "all_vader_output = []\n",
+ "gold = []\n",
+ "\n",
+ "for i in range(len(airline_tweets_train.data)):\n",
+ " the_tweet = str(airline_tweets_train.data[i])\n",
+ " vader_output = run_vader(the_tweet,lemmatize=True)\n",
+ " vader_label = vader_output_to_label(vader_output) \n",
+ " \n",
+ " tweets.append(the_tweet)\n",
+ " all_vader_output.append(vader_label)\n",
+ " gold.append(airline_tweets_train.target_names[airline_tweets_train.target[i]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# analyize\n",
+ "report = classification_report(gold,all_vader_output)\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run VADER on the set of airline tweets with only adjectives\n",
+ "vader_model = SentimentIntensityAnalyzer() #reset model\n",
+ "\n",
+ "tweets = []\n",
+ "all_vader_output = []\n",
+ "gold = []\n",
+ "\n",
+ "for i in range(len(airline_tweets_train.data)):\n",
+ " the_tweet = str(airline_tweets_train.data[i])\n",
+ " vader_output = run_vader(the_tweet, parts_of_speech_to_consider={'ADJ'})\n",
+ " vader_label = vader_output_to_label(vader_output) \n",
+ " \n",
+ " tweets.append(the_tweet)\n",
+ " all_vader_output.append(vader_label)\n",
+ " gold.append(airline_tweets_train.target_names[airline_tweets_train.target[i]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# analyize\n",
+ "report = classification_report(gold,all_vader_output)\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run VADER on the set of airline tweets with only adjectives and after having lemmatized the text\n",
+ "vader_model = SentimentIntensityAnalyzer() #reset model\n",
+ "\n",
+ "tweets = []\n",
+ "all_vader_output = []\n",
+ "gold = []\n",
+ "\n",
+ "for i in range(len(airline_tweets_train.data)):\n",
+ " the_tweet = str(airline_tweets_train.data[i])\n",
+ " vader_output = run_vader(the_tweet, lemmatize=True, parts_of_speech_to_consider={'ADJ'})\n",
+ " vader_label = vader_output_to_label(vader_output) \n",
+ " \n",
+ " tweets.append(the_tweet)\n",
+ " all_vader_output.append(vader_label)\n",
+ " gold.append(airline_tweets_train.target_names[airline_tweets_train.target[i]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# analyize\n",
+ "report = classification_report(gold,all_vader_output)\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run VADER on the set of airline tweets with only nouns\n",
+ "vader_model = SentimentIntensityAnalyzer() #reset model\n",
+ "\n",
+ "tweets = []\n",
+ "all_vader_output = []\n",
+ "gold = []\n",
+ "\n",
+ "for i in range(len(airline_tweets_train.data)):\n",
+ " the_tweet = str(airline_tweets_train.data[i])\n",
+ " vader_output = run_vader(the_tweet, parts_of_speech_to_consider={'NOUN'})\n",
+ " vader_label = vader_output_to_label(vader_output) \n",
+ " \n",
+ " tweets.append(the_tweet)\n",
+ " all_vader_output.append(vader_label)\n",
+ " gold.append(airline_tweets_train.target_names[airline_tweets_train.target[i]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# analyize\n",
+ "report = classification_report(gold,all_vader_output)\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run VADER on the set of airline tweets with only nouns and after having lemmatized the text\n",
+ "vader_model = SentimentIntensityAnalyzer() #reset model\n",
+ "\n",
+ "tweets = []\n",
+ "all_vader_output = []\n",
+ "gold = []\n",
+ "\n",
+ "for i in range(len(airline_tweets_train.data)):\n",
+ " the_tweet = str(airline_tweets_train.data[i])\n",
+ " vader_output = run_vader(the_tweet, lemmatize=True, parts_of_speech_to_consider={'NOUN'})\n",
+ " vader_label = vader_output_to_label(vader_output) \n",
+ " \n",
+ " tweets.append(the_tweet)\n",
+ " all_vader_output.append(vader_label)\n",
+ " gold.append(airline_tweets_train.target_names[airline_tweets_train.target[i]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# analyize\n",
+ "report = classification_report(gold,all_vader_output)\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run VADER on the set of airline tweets with only verbs\n",
+ "vader_model = SentimentIntensityAnalyzer() #reset model\n",
+ "\n",
+ "tweets = []\n",
+ "all_vader_output = []\n",
+ "gold = []\n",
+ "\n",
+ "for i in range(len(airline_tweets_train.data)):\n",
+ " the_tweet = str(airline_tweets_train.data[i])\n",
+ " vader_output = run_vader(the_tweet, parts_of_speech_to_consider={'VERB'})\n",
+ " vader_label = vader_output_to_label(vader_output) \n",
+ " \n",
+ " tweets.append(the_tweet)\n",
+ " all_vader_output.append(vader_label)\n",
+ " gold.append(airline_tweets_train.target_names[airline_tweets_train.target[i]])"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here\n"
+ "# analyize\n",
+ "report = classification_report(gold,all_vader_output)\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run VADER on the set of airline tweets with only verbs and after having lemmatized the text\n",
+ "vader_model = SentimentIntensityAnalyzer() #reset model\n",
+ "\n",
+ "tweets = []\n",
+ "all_vader_output = []\n",
+ "gold = []\n",
+ "\n",
+ "for i in range(len(airline_tweets_train.data)):\n",
+ " the_tweet = str(airline_tweets_train.data[i])\n",
+ " vader_output = run_vader(the_tweet, lemmatize=True, parts_of_speech_to_consider={'VERB'})\n",
+ " vader_label = vader_output_to_label(vader_output) \n",
+ " \n",
+ " tweets.append(the_tweet)\n",
+ " all_vader_output.append(vader_label)\n",
+ " gold.append(airline_tweets_train.target_names[airline_tweets_train.target[i]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# analyize\n",
+ "report = classification_report(gold,all_vader_output)\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Comparison:**\n",
+ "Firstly, we will compare how well the models perform when with and without lemmatizing the text. Since the data is balanced considering the 3 classes have similar instances, namely 1750, 1515, and 1490 for negative, neutral, and positive respectively, we will focus on the macro average instead of the weighted average, however, the micro average, or the accuracy, will be used for an overall comparison of performance, regardless of class. \n",
+ "\n",
+ "For the basic model, the precision has a macro average of 0.65, while the recall is 0.63, leaving the f1_score at 0.62, and the accuracy, or micro average at 0.63. While, for the basic model with lemmatizing the text, the precision is 0.65, recall is 0.63, f1_score of 0.62, and accuracy of 0.62. This means the difference between the models is only evident in the accuracy score, however since the difference is only 0.01, we will consider that lemmatizing the text has no effect on the basic model.\n",
+ "\n",
+ "For the model concerning the adjectives, the model without lemmatizing the text has a precision of 0.65, recall of 0.52, and f1_score of 0.48, which the accuracy is 0.50. While the same model but with lemmatized text, the precision is 0.65, recall is 0.52, f1_score is 0.48, and accuracy is 0.50. Similar to the previous situation, the comparison between lemmatized and none-lemmatized shows that lemmatizing the text has no effect on the model.\n",
+ "\n",
+ "For the model concerning the nouns, the model without lemmatizing the text has a precision of 0.54, recall of 0.43, and f1_score of 0.38, which the accuracy is 0.42. While the same model but with lemmatized text, the precision is 0.53, recall is 0.43, f1_score is 0.39, and accuracy is 0.42. Similar to the previous situations, the comparison between lemmatized and none-lemmatized shows that lemmatizing the text has no effect on the model.\n",
+ "\n",
+ "Finally, For the model concerning the verbs, the model without lemmatizing the text has a precision of 0.58, recall of 0.48, and f1_score of 0.45, which the accuracy is 0.47. While the same model but with lemmatized text, the precision is 0.56, recall is 0.47, f1_score is 0.45, and accuracy is 0.47. Similar to the previous situations, the comparison between lemmatized and none-lemmatized shows that lemmatizing the text has no effect on the model.\n",
+ "\n",
+ "Therefore, we can conclude that lemmatizing the text only has no effect on the model's performance, especially in the accuracy as it's always the same in both models of lemmatizing and none-lemmatizing.\n",
+ "\n",
+ "Secondly, we compare the performances between the models, in which the comparison will be divided upon comparing the models with lemmatized text and models without. In the case of none-lemmatized text, the model with the most precision is the model that filters on the adjectives part-of-speech, and the basic model with the same 0.65 precision. However, the nouns and verbs models have significantly lower precision with a difference of 0.70 and 0.11 difference. The verbs model is the higher of the two with again a 0.02 difference between it and the nouns model. However, when considering the recall of the models, the basic model performs significantly better than the other three, with a 0.11 difference between the adjectives model, 0.20 with the nouns model, and 0.15 with the verbs model. However, when we check the f1_scores of the models, we can conclude that the basic model performs much better than the other models, which is supported by the accuracy comparison.\n",
+ "\n",
+ "Now looking at the models with lemmatized text, we can see the same pattern of the basic model performing better than the others overall, with the adjectives model being second, followed by the verbs model then the nouns model.\n",
+ "\n",
+ "A reason for this observation could be that since tweets are normally short, only focusing on parts of the sentence does not allow the model to perform well, and the reason for the nouns model performing the worst is because the nouns usually do not give a good indication of the label of a sentence or tweet, but rather give context. Similarly, verbs do not give a good indication of the nature of the tweet, unless it's an extreme verb like crashed would be negative or smiled would be positive but most verbs would be neutral like walked, said, etc. Similarly, with lemmatizing text, it doesn't have much of an effect on the label of the text as for example crash and crashed are both negative, and hence returning the word to its base/dictionary form doesn't have much of an effect on how positive, negative, or neutral it is."
]
},
{
@@ -283,11 +868,388 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pathlib\n",
+ "import sklearn\n",
+ "import numpy\n",
+ "import nltk\n",
+ "from nltk.corpus import stopwords\n",
+ "from collections import Counter\n",
+ "from sklearn.datasets import load_files\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "from sklearn.feature_extraction.text import TfidfTransformer\n",
+ "from sklearn.naive_bayes import MultinomialNB\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here\n"
+ "cwd = pathlib.Path.cwd()\n",
+ "airline_tweets_folder = cwd.joinpath('airlinetweets')\n",
+ "airline_tweets_train = load_files(\"C:/Users/zahra_6hcxkfv/ba-text-mining/lab_sessions/lab3/airlinetweets/airlinetweets\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\zahra_6hcxkfv\\anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:388: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens [\"'d\", \"'ll\", \"'re\", \"'s\", \"'ve\", 'could', 'might', 'must', \"n't\", 'need', 'sha', 'wo', 'would'] not in stop_words.\n",
+ " warnings.warn('Your stop_words may be inconsistent with '\n"
+ ]
+ }
+ ],
+ "source": [
+ "# initialize airline object, and then turn airline tweets train data into a vector \n",
+ "\n",
+ "airline_vec = CountVectorizer(min_df=2, # If a token appears fewer times than this, across all documents, it will be ignored\n",
+ " tokenizer=nltk.word_tokenize, # we use the nltk tokenizer\n",
+ " stop_words=stopwords.words('english')) # stopwords are removed\n",
+ "\n",
+ "airline_counts = airline_vec.fit_transform(airline_tweets_train.data)\n",
+ "# Convert raw frequency counts into TF-IDF values\n",
+ "tfidf_transformer = TfidfTransformer()\n",
+ "airline_tfidf = tfidf_transformer.fit_transform(airline_counts)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# airline tweets 80/20 split, tf-idf, min_df=2\n",
+ "docs_train, docs_test, y_train, y_test = train_test_split(\n",
+ " airline_tfidf, # the tf-idf model\n",
+ " airline_tweets_train.target, # the category values for each tweet \n",
+ " test_size = 0.20 # we use 80% for training and 20% for development\n",
+ " ) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "clf = MultinomialNB().fit(docs_train, y_train)\n",
+ "y_pred = clf.predict(docs_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Results for airline tweets with 80/20 split, tf-idf, min_df=2\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.810 0.905 0.854 357\n",
+ " 1 0.809 0.699 0.750 296\n",
+ " 2 0.845 0.839 0.842 298\n",
+ "\n",
+ " accuracy 0.820 951\n",
+ " macro avg 0.821 0.814 0.815 951\n",
+ "weighted avg 0.820 0.820 0.818 951\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "report = classification_report(y_test,y_pred,digits = 3)\n",
+ "\n",
+ "print(\"Results for airline tweets with 80/20 split, tf-idf, min_df=2\")\n",
+ "print(report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# airline tweets 80/20 split, bag-of-words\n",
+ "docs_train1, docs_test1, y_train1, y_test1 = train_test_split(\n",
+ " airline_counts, # the bag-of-words model\n",
+ " airline_tweets_train.target, # the category values for each tweet \n",
+ " test_size = 0.20 # we use 80% for training and 20% for development\n",
+ " ) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Results for airline tweets with 80/20 split, bag-of-words\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.863 0.919 0.890 357\n",
+ " 1 0.870 0.754 0.808 285\n",
+ " 2 0.852 0.893 0.872 309\n",
+ "\n",
+ " accuracy 0.861 951\n",
+ " macro avg 0.862 0.855 0.857 951\n",
+ "weighted avg 0.862 0.861 0.860 951\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "clf1 = MultinomialNB().fit(docs_train1, y_train1)\n",
+ "y_pred1 = clf1.predict(docs_test1)\n",
+ "\n",
+ "report1 = classification_report(y_test1,y_pred1,digits = 3)\n",
+ "\n",
+ "print(\"Results for airline tweets with 80/20 split, bag-of-words\")\n",
+ "print(report1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\zahra_6hcxkfv\\anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:388: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens [\"'d\", \"'ll\", \"'re\", \"'s\", \"'ve\", 'could', 'might', 'must', \"n't\", 'need', 'sha', 'wo', 'would'] not in stop_words.\n",
+ " warnings.warn('Your stop_words may be inconsistent with '\n"
+ ]
+ }
+ ],
+ "source": [
+ "# airline tweets 80/20 split, tf-idf, min_df=5\n",
+ "airline_vec2 = CountVectorizer(min_df=5, # If a token appears fewer times than this, across all documents, it will be ignored\n",
+ " tokenizer=nltk.word_tokenize, # we use the nltk tokenizer\n",
+ " stop_words=stopwords.words('english')) # stopwords are removed\n",
+ "airline_counts2 = airline_vec2.fit_transform(airline_tweets_train.data)\n",
+ "tfidf_transformer = TfidfTransformer()\n",
+ "airline_tfidf2 = tfidf_transformer.fit_transform(airline_counts2)\n",
+ "\n",
+ "docs_train2, docs_test2, y_train2, y_test2 = train_test_split(\n",
+ " airline_tfidf2, # the tf-idf model\n",
+ " airline_tweets_train.target, # the category values for each tweet \n",
+ " test_size = 0.20 # we use 80% for training and 20% for development\n",
+ " ) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Results for airline tweets with 80/20 split, tf-idf, min_df=5\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.828 0.897 0.861 349\n",
+ " 1 0.803 0.747 0.774 273\n",
+ " 2 0.850 0.824 0.836 329\n",
+ "\n",
+ " accuracy 0.829 951\n",
+ " macro avg 0.827 0.823 0.824 951\n",
+ "weighted avg 0.828 0.829 0.828 951\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "clf2 = MultinomialNB().fit(docs_train2, y_train2)\n",
+ "y_pred2 = clf2.predict(docs_test2)\n",
+ "\n",
+ "report2 = classification_report(y_test2,y_pred2,digits = 3)\n",
+ "\n",
+ "print(\"Results for airline tweets with 80/20 split, tf-idf, min_df=5\")\n",
+ "print(report2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\zahra_6hcxkfv\\anaconda3\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:388: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens [\"'d\", \"'ll\", \"'re\", \"'s\", \"'ve\", 'could', 'might', 'must', \"n't\", 'need', 'sha', 'wo', 'would'] not in stop_words.\n",
+ " warnings.warn('Your stop_words may be inconsistent with '\n"
+ ]
+ }
+ ],
+ "source": [
+ "# airline tweets 80/20 split, tf-idf, min_df=10\n",
+ "airline_vec3 = CountVectorizer(min_df=10, # If a token appears fewer times than this, across all documents, it will be ignored\n",
+ " tokenizer=nltk.word_tokenize, # we use the nltk tokenizer\n",
+ " stop_words=stopwords.words('english')) # stopwords are removed\n",
+ "airline_counts3 = airline_vec3.fit_transform(airline_tweets_train.data)\n",
+ "tfidf_transformer = TfidfTransformer()\n",
+ "airline_tfidf3 = tfidf_transformer.fit_transform(airline_counts3)\n",
+ "\n",
+ "docs_train3, docs_test3, y_train3, y_test3 = train_test_split(\n",
+ " airline_tfidf3, # the tf-idf model\n",
+ " airline_tweets_train.target, # the category values for each tweet \n",
+ " test_size = 0.20 # we use 80% for training and 20% for development\n",
+ " ) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Results for airline tweets with 80/20 split, tf-idf, min_df=10\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.802 0.868 0.833 340\n",
+ " 1 0.787 0.710 0.747 307\n",
+ " 2 0.804 0.809 0.807 304\n",
+ "\n",
+ " accuracy 0.798 951\n",
+ " macro avg 0.798 0.796 0.795 951\n",
+ "weighted avg 0.798 0.798 0.797 951\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "clf3 = MultinomialNB().fit(docs_train3, y_train3)\n",
+ "y_pred3 = clf3.predict(docs_test3)\n",
+ "\n",
+ "report3 = classification_report(y_test3,y_pred3,digits = 3)\n",
+ "\n",
+ "print(\"Results for airline tweets with 80/20 split, tf-idf, min_df=10\")\n",
+ "print(report3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Q5a Classification reports gathered below**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Results for airline tweets with 80/20 split, tf-idf, min_df=2\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.810 0.905 0.854 357\n",
+ " 1 0.809 0.699 0.750 296\n",
+ " 2 0.845 0.839 0.842 298\n",
+ "\n",
+ " accuracy 0.820 951\n",
+ " macro avg 0.821 0.814 0.815 951\n",
+ "weighted avg 0.820 0.820 0.818 951\n",
+ "\n",
+ "Results for airline tweets with 80/20 split, bag-of-words\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.863 0.919 0.890 357\n",
+ " 1 0.870 0.754 0.808 285\n",
+ " 2 0.852 0.893 0.872 309\n",
+ "\n",
+ " accuracy 0.861 951\n",
+ " macro avg 0.862 0.855 0.857 951\n",
+ "weighted avg 0.862 0.861 0.860 951\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Results for airline tweets with 80/20 split, tf-idf, min_df=2\")\n",
+ "print(report)\n",
+ "\n",
+ "print(\"Results for airline tweets with 80/20 split, bag-of-words\")\n",
+ "print(report1)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Results for airline tweets with 80/20 split, tf-idf, min_df=5\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.828 0.897 0.861 349\n",
+ " 1 0.803 0.747 0.774 273\n",
+ " 2 0.850 0.824 0.836 329\n",
+ "\n",
+ " accuracy 0.829 951\n",
+ " macro avg 0.827 0.823 0.824 951\n",
+ "weighted avg 0.828 0.829 0.828 951\n",
+ "\n",
+ "Results for airline tweets with 80/20 split, tf-idf, min_df=10\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.802 0.868 0.833 340\n",
+ " 1 0.787 0.710 0.747 307\n",
+ " 2 0.804 0.809 0.807 304\n",
+ "\n",
+ " accuracy 0.798 951\n",
+ " macro avg 0.798 0.796 0.795 951\n",
+ "weighted avg 0.798 0.798 0.797 951\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Results for airline tweets with 80/20 split, tf-idf, min_df=5\")\n",
+ "print(report2)\n",
+ "\n",
+ "print(\"Results for airline tweets with 80/20 split, tf-idf, min_df=10\")\n",
+ "print(report3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Q5b**\n",
+ "The bag-of-words approach seems to get the best results, as indicated by the f1-score. It has higher f1-scores than all three of the tf-idf results for each category (so positive/neutral/negative). The accuracy score is also higher with the bag-of-words approach. \n",
+ "\n",
+    "The frequency threshold does have a slight influence on the categories separately, as indicated by the f1-scores. However, it doesn't seem to have too much of an influence on the bigger picture, as the three tf-idf strategies reach broadly similar accuracies (0.820/0.829/0.798 for min_df=2/5/10). Note that train_test_split is not seeded with a random_state here, so these numbers vary slightly between runs."
]
},
{
@@ -306,9 +1268,261 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 49,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Important words in negative documents\n",
+ "0 157.88531551543522 united\n",
+ "0 113.873151027286 .\n",
+ "0 98.12839667842987 ``\n",
+ "0 96.98103486178553 @\n",
+ "0 55.646084640149795 flight\n",
+ "0 50.57437534912116 ?\n",
+ "0 45.08738792587808 #\n",
+ "0 40.9325096519484 !\n",
+ "0 40.838513332793184 n't\n",
+ "0 32.74447996440087 ''\n",
+ "0 23.833706773832752 service\n",
+ "0 23.210892142295194 's\n",
+ "0 21.920904111130767 virginamerica\n",
+ "0 21.64121036839044 delayed\n",
+ "0 21.513437607220986 bag\n",
+ "0 20.864141433090914 customer\n",
+ "0 19.976657169737457 cancelled\n",
+ "0 19.562853880284525 plane\n",
+ "0 19.38648922748153 get\n",
+ "0 18.807408968936883 'm\n",
+ "0 17.8767887626248 hours\n",
+ "0 17.339827966380682 gate\n",
+ "0 17.136748919179777 time\n",
+ "0 16.8898741166855 hour\n",
+ "0 16.284697707068062 :\n",
+ "0 16.091271557196947 ...\n",
+ "0 15.031165703634063 still\n",
+ "0 14.78565494443839 late\n",
+ "0 14.38157785062803 waiting\n",
+ "0 14.270728388434843 delay\n",
+ "0 14.106676687003848 -\n",
+ "0 14.048054804174056 would\n",
+ "0 13.973175821464023 airline\n",
+ "0 13.704077810861364 worst\n",
+ "0 13.225436376160797 ;\n",
+ "0 12.994879457538246 one\n",
+ "0 12.8463767214656 help\n",
+ "0 12.700666931191659 ca\n",
+ "0 12.570260053250085 &\n",
+ "0 12.493197591572123 like\n",
+ "0 12.201763472981684 2\n",
+ "0 12.159394339632682 never\n",
+ "0 12.122287226866415 http\n",
+ "0 12.066575620815499 flightled\n",
+ "0 11.681112620185138 amp\n",
+ "0 11.636176779563996 luggage\n",
+ "0 11.529198400849474 flights\n",
+ "0 10.434622254794563 3\n",
+ "0 10.386831355109425 us\n",
+ "0 10.36691559934203 've\n",
+ "0 10.268781466544054 lost\n",
+ "0 10.150832812637914 due\n",
+ "0 10.09600063608324 really\n",
+ "0 10.069743275064472 (\n",
+ "0 9.896173639260597 yes\n",
+ "0 9.78795312849608 $\n",
+ "0 9.758816798417383 check\n",
+ "0 9.684889771656284 day\n",
+ "0 9.515670414292101 ever\n",
+ "0 9.494152936113075 bags\n",
+ "0 9.192768246817778 another\n",
+ "0 9.180143001124854 wait\n",
+ "0 8.928911596700171 staff\n",
+ "0 8.885035668122713 trying\n",
+ "0 8.795252528528078 last\n",
+ "0 8.777868321587897 seat\n",
+ "0 8.725595491460044 sitting\n",
+ "0 8.590979072677047 people\n",
+ "0 8.51335227641699 're\n",
+ "0 8.491220761854903 4\n",
+ "0 8.34006752094781 back\n",
+ "0 8.317136184244964 u\n",
+ "0 8.291400706088426 crew\n",
+ "0 8.289674522692911 min\n",
+ "0 8.255088421344547 fly\n",
+ "0 7.905840040125858 days\n",
+ "0 7.898436460768168 even\n",
+ "0 7.824836969301282 )\n",
+ "0 7.7570188109564375 told\n",
+ "0 7.687649893772306 passengers\n",
+ "-----------------------------------------\n",
+ "Important words in neutral documents\n",
+ "1 106.74416685021379 @\n",
+ "1 85.55178935803296 ?\n",
+ "1 64.33473342047513 jetblue\n",
+ "1 58.236039158375924 southwestair\n",
+ "1 57.22238628789153 .\n",
+ "1 57.21567730661497 ``\n",
+ "1 53.52651643286803 americanair\n",
+ "1 52.795415385494366 :\n",
+ "1 42.55674521095053 usairways\n",
+ "1 39.62028005150853 flight\n",
+ "1 38.28634781761471 #\n",
+ "1 37.723268163692246 http\n",
+ "1 37.16391306126767 united\n",
+ "1 31.093361530510123 's\n",
+ "1 24.110408462169747 !\n",
+ "1 21.070703187030954 dm\n",
+ "1 19.60821076052033 virginamerica\n",
+ "1 18.6059731356974 please\n",
+ "1 18.02489933908256 get\n",
+ "1 17.227430920347608 flights\n",
+ "1 15.91307124146993 hi\n",
+ "1 15.911158402028057 help\n",
+ "1 14.584689752367657 -\n",
+ "1 14.333476375126558 tomorrow\n",
+ "1 13.79903281086604 sent\n",
+ "1 13.63270399375317 need\n",
+ "1 13.35529792702796 fleek\n",
+ "1 13.32094189356208 ...\n",
+ "1 13.303526373974764 )\n",
+ "1 13.217389847073207 fleet\n",
+ "1 11.94839555768357 “\n",
+ "1 11.472747456362454 n't\n",
+ "1 11.382299644638712 ”\n",
+ "1 11.36242903332639 (\n",
+ "1 11.360915085705978 would\n",
+ "1 11.219327353746957 know\n",
+ "1 10.447016255415344 flying\n",
+ "1 9.407954682678547 go\n",
+ "1 9.331175617715033 change\n",
+ "1 9.194737695754412 follow\n",
+ "1 9.053605626019746 ''\n",
+ "1 9.04224764652315 like\n",
+ "1 9.005863628227607 ;\n",
+ "1 8.80392036264528 way\n",
+ "1 8.65017066137527 thanks\n",
+ "1 8.632745627526369 cancelled\n",
+ "1 8.572412857108285 number\n",
+ "1 8.532252043453179 us\n",
+ "1 8.512340402980653 'm\n",
+ "1 8.409429443444152 fly\n",
+ "1 8.311088601505958 one\n",
+ "1 7.936366730329809 chance\n",
+ "1 7.908331528538644 see\n",
+ "1 7.774116522442779 next\n",
+ "1 7.633574062154693 &\n",
+ "1 7.401479846467683 check\n",
+ "1 7.376487608304668 rt\n",
+ "1 7.158026685264434 airport\n",
+ "1 7.074543762370153 new\n",
+ "1 6.986760074900994 guys\n",
+ "1 6.725286099287157 yes\n",
+ "1 6.717290578344065 destinationdragons\n",
+ "1 6.685023021374769 could\n",
+ "1 6.681067836437876 morning\n",
+ "1 6.561948042444977 going\n",
+ "1 6.555344416742465 time\n",
+ "1 6.468203922253822 amp\n",
+ "1 6.4301097851170015 done\n",
+ "1 6.327480515292292 travel\n",
+ "1 6.31487231718577 today\n",
+ "1 6.237171062090071 use\n",
+ "1 6.160307635051282 ticket\n",
+ "1 6.0086109566745 make\n",
+ "1 5.990107095930149 show\n",
+ "1 5.7416340899952445 question\n",
+ "1 5.717327512391316 tickets\n",
+ "1 5.670165741711722 want\n",
+ "1 5.645958813817033 back\n",
+ "1 5.621089932775184 possible\n",
+ "1 5.379225346500409 airways\n",
+ "-----------------------------------------\n",
+ "Important words in positive documents\n",
+ "2 170.41065472376397 !\n",
+ "2 105.34559865116026 @\n",
+ "2 87.08671988934591 thank\n",
+ "2 85.85639418312397 .\n",
+ "2 80.29923663292738 thanks\n",
+ "2 69.02270678380802 jetblue\n",
+ "2 64.32300622754985 ``\n",
+ "2 63.89281845623399 southwestair\n",
+ "2 51.81047104693908 americanair\n",
+ "2 50.812185156240766 #\n",
+ "2 41.466098978285736 great\n",
+ "2 38.22599819199688 usairways\n",
+ "2 36.104636928252354 united\n",
+ "2 33.229992907532285 :\n",
+ "2 29.011687679036786 flight\n",
+ "2 23.746560301355373 much\n",
+ "2 22.98609714661519 love\n",
+ "2 22.392827506348493 awesome\n",
+ "2 22.090965886216374 )\n",
+ "2 21.61502025886306 virginamerica\n",
+ "2 21.12122168088914 service\n",
+ "2 19.91472480469344 best\n",
+ "2 18.512372845318037 guys\n",
+ "2 16.524053081422093 http\n",
+ "2 15.818610694850367 customer\n",
+ "2 15.718435863988267 -\n",
+ "2 15.3869937939704 good\n",
+ "2 14.36787634677145 airline\n",
+ "2 14.24820610047806 amazing\n",
+ "2 13.762124215787367 got\n",
+ "2 12.89794305028997 appreciate\n",
+ "2 12.529538946719295 time\n",
+ "2 12.184895442356465 ;\n",
+ "2 11.951751969589301 response\n",
+ "2 11.863837938595449 today\n",
+ "2 10.954258437212932 help\n",
+ "2 10.765085472475702 flying\n",
+ "2 10.70227078638209 's\n",
+ "2 10.146285503305545 us\n",
+ "2 10.111658456076476 crew\n",
+ "2 9.275996664552881 made\n",
+ "2 9.217239759099176 always\n",
+ "2 8.539939122437557 &\n",
+ "2 8.352461861801059 n't\n",
+ "2 8.34407612169727 gate\n",
+ "2 8.185579605673533 ever\n",
+ "2 7.995051029613868 yes\n",
+ "2 7.920512369520521 fly\n",
+ "2 7.795843265754015 southwest\n",
+ "2 7.761913801299537 well\n",
+ "2 7.516819755308916 get\n",
+ "2 7.5153367139093294 're\n",
+ "2 7.46557435250544 new\n",
+ "2 7.39648935580001 helpful\n",
+ "2 7.385909758120159 ...\n",
+ "2 7.170159451775658 job\n",
+ "2 7.128486845789566 home\n",
+ "2 7.081562565475099 quick\n",
+ "2 7.0755647905993175 amp\n",
+ "2 7.0513998814028565 happy\n",
+ "2 7.050383563626415 day\n",
+ "2 6.999809130559078 work\n",
+ "2 6.923738921044472 nice\n",
+ "2 6.90887633533235 back\n",
+ "2 6.839891804096854 'll\n",
+ "2 6.724980417603007 see\n",
+ "2 6.631172901062412 thx\n",
+ "2 6.606746034631406 team\n",
+ "2 6.545656748801527 like\n",
+ "2 6.5046345692923975 ''\n",
+ "2 6.464945926645512 tonight\n",
+ "2 6.399491336766902 finally\n",
+ "2 6.359483518210278 everything\n",
+ "2 6.336300555961514 first\n",
+ "2 6.306434295311223 rock\n",
+ "2 6.292176905728015 excellent\n",
+ "2 6.037730057769342 really\n",
+ "2 6.013271564042919 follow\n",
+ "2 5.997316437954213 done\n",
+ "2 5.861557241404781 class\n"
+ ]
+ }
+ ],
"source": [
"def important_features_per_class(vectorizer,classifier,n=80):\n",
" class_labels = classifier.classes_\n",
@@ -329,7 +1543,20 @@
" print(class_labels[2], coef, feat) \n",
"\n",
"# example of how to call from notebook:\n",
- "#important_features_per_class(airline_vec, clf)"
+ "important_features_per_class(airline_vec, clf)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Answers:*\n",
+ "\n",
+    "1. For the negative documents, I expected words in their negated form, shown by the output \"n't\", and more punctuation. For the neutral documents, I expected words that would be used more frequently, so the names of the airlines were a good fit here. Lastly, for the positive documents, I expected words of appreciation or just positive words in general, and the output contained words related to \"thank you\" and also punctuation. Punctuation in both positive and negative tweets was to be expected, as it is normally used when conveying emotions; since it is not needed in a neutral setting, it also doesn't appear there as much.\n",
+ "\n",
+    "2. There are no words that I didn't expect, given that all of the outputs relate to their categories. Perhaps, I didn't expect punctuation, such as question marks, to have such a high scoring in neutral documents. However, this can be linked to users simply asking a direct question to the airline.\n",
+ "\n",
+    "3. Even though punctuation can be very expressive, it is easier to understand in context. For example, exclamation marks have a high scoring in both positive and negative settings, so it is hard to analyze why they are there with just this information. Therefore, I would remove punctuation in order to improve the readability and understandability of the model. I would also consider removing \"@\" symbols, as they are just used to tag the airline in a tweet, and the names of the airlines themselves, as they will always appear no matter the context."
]
},
{
@@ -386,7 +1613,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.4"
+ "version": "3.8.8"
}
},
"nbformat": 4,
diff --git a/lab_sessions/lab3/Lab3.2-Sentiment-analysis-with-VADER.ipynb b/lab_sessions/lab3/Lab3.2-Sentiment-analysis-with-VADER.ipynb
index cb33fda6..bd82e169 100644
--- a/lab_sessions/lab3/Lab3.2-Sentiment-analysis-with-VADER.ipynb
+++ b/lab_sessions/lab3/Lab3.2-Sentiment-analysis-with-VADER.ipynb
@@ -177,12 +177,12 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
- "nlp = spacy.load('en') # 'en_core_web_sm'"
+ "nlp = spacy.load('en_core_web_sm') # 'en_core_web_sm'"
]
},
{
@@ -493,7 +493,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.8.8"
}
},
"nbformat": 4,
diff --git a/lab_sessions/lab3/airlinetweets.zip b/lab_sessions/lab3/airlinetweets.zip
deleted file mode 100644
index 98b60dac..00000000
Binary files a/lab_sessions/lab3/airlinetweets.zip and /dev/null differ
diff --git a/lab_sessions/lab3/group1_lab3.zip b/lab_sessions/lab3/group1_lab3.zip
new file mode 100644
index 00000000..8df4b957
Binary files /dev/null and b/lab_sessions/lab3/group1_lab3.zip differ
diff --git a/lab_sessions/lab3/my_tweets.json b/lab_sessions/lab3/my_tweets.json
index d4142d11..6dd6c320 100644
--- a/lab_sessions/lab3/my_tweets.json
+++ b/lab_sessions/lab3/my_tweets.json
@@ -1,252 +1,252 @@
{
"1": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "Very happy to score my first goal with the blues. Come on!! Next round @ChelseaFC @EmiratesFACup",
+ "tweet_url": "https://twitter.com/saulniguez/status/1499135951003697156"
},
"2": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "Kim Kardashian has officially been granted her request to be legally single, restoring her maiden name and dropping West.",
+ "tweet_url": "https://twitter.com/i/topics/1095352268046495745"
},
"3": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "No one has an hour to listen to it",
+ "tweet_url": "https://twitter.com/Ievitatingg/status/1499102665128747017"
},
"4": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "I hope Kim and Pete use this in court to get restraining orders against Kanye.",
+ "tweet_url": "https://twitter.com/Lanalbb/status/1499124738999959553"
},
"5": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "Grateful for everything that’s missed me . I know it wasn’t for me",
+ "tweet_url": "https://twitter.com/sza/status/1499115352260706308"
},
"6": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "@LettyA ahh ive always wanted to see rent :( love the soundtrack!!",
+ "tweet_url": "https://twitter.com/starkissed/status/1467813579"
},
"7": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "@Viennah Yay! I'm happy for you with your job! But that also means less time for me and you...",
+ "tweet_url": "https://twitter.com/antzpantz/status/1467819650"
},
"8": {
"sentiment_label": "",
"text_of_tweet": "",
- "tweet_url": ""
+ "tweet_url": "https://twitter.com/anjaaminki/status/1468161209"
},
"9": {
- "sentiment_label": "",
- "text_of_tweet": "",
+ "sentiment_label": "negative",
+ "text_of_tweet": "i don't wanna go to work i wanna go to bed!",
"tweet_url": ""
},
"10": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "I just made the prettiest latte! I even used the milk steamer and put a dollop of foam on top, and sprinkled cinnamon/sugar on it! ",
+ "tweet_url": "https://twitter.com/stephoodle/status/1551681354"
},
"11": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "listening to some songs from the Punk goes Pop 2 album. awesome! haha!",
+ "tweet_url": "https://twitter.com/heypatmarie/status/1551744472"
},
"12": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "On this day, 12 years ago: Lady Gaga performs 'Telephone' and 'Brown Eyes' on Friday Night With Jonathan Ross.",
+ "tweet_url": "https://twitter.com/ThrowbacksGaga/status/1499308610819608577"
},
"13": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "I am microbiologist and I eat raw cookie dough",
+ "tweet_url": "https://twitter.com/sciliz/status/1498814095038029825"
},
"14": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "now what if labrinth and zendaya are secretly working on an album or EP together... thinking",
+ "tweet_url": "https://twitter.com/thedayacure/status/1499076393136791554"
},
"15": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "It’s so fucking awkward that CNN will cut from an interview with a Ukrainian family that has had their home bombed and lost everything to a commercial with upbeat music for fucking Applebees so quickly.",
+ "tweet_url": "https://twitter.com/IWDominateLoL/status/1496885266468028418?s=20&t=M1CKJCli-wxv8xWAZOLwPQ"
},
"16": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "Reasons to hate are remembered better than reasons to love",
+ "tweet_url": "https://twitter.com/elonmusk/status/1494400631712501764?s=20&t=UxsWf65IRVtxt7_TMVbTqw"
},
"17": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "So amazing that we decompress from a single cell to over 30 trillion cells",
+ "tweet_url": "https://twitter.com/elonmusk/status/1494821520815779843?s=20&t=Jh2P0cAsEWRoT9DT-5o_9A"
},
"18": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "Yoooo! What a dream that would be… @Labrinth ???",
+ "tweet_url": "https://twitter.com/Zendaya/status/1498477238676639744?s=20&t=evwQi_cNzevlZyDZknQmyw"
},
"19": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "Additionally, new tickets have been added for certain cities on the tour! Tickets are available now at http://billieeilish.com/tour",
+ "tweet_url": "https://twitter.com/billieeilish/status/1491546831611781120?s=20&t=zlWsVPBlRuk9T2qwAYxA8w"
},
"20": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "It's hard to beat a person who never gives up. ~ Babe Ruth #quotes #success #startup",
+ "tweet_url": "https://twitter.com/exceptionalfood/status/1499493315296862216?s=20&t=Jh2P0cAsEWRoT9DT-5o_9A"
},
"21": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "It just hit me that one of my oldest and closest (or so I thought) friend, who has my number wished me a happy birthday on twitter...wow, so personal. Feel so loved!",
+ "tweet_url": "https://twitter.com/brwngrls_diary/status/1499493859042242564?s=20&t=Jh2P0cAsEWRoT9DT-5o_9A"
},
"22": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "shoutout to the annoying kid in eighth grade who wouldn’t stop talking about demons souls he’s probably so happy now",
+ "tweet_url": "https://twitter.com/onlineicon01/status/1499493845041561602?s=20&t=Jh2P0cAsEWRoT9DT-5o_9A"
},
"23": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "IKEA charging £40 for delivery is VERY VERY crazy.",
+ "tweet_url": "https://twitter.com/iAmCroe/status/1499357461630926852?s=20&t=Jh2P0cAsEWRoT9DT-5o_9A"
},
"24": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "who plays fifa I need a fifa buddy",
+ "tweet_url": "https://twitter.com/n4sserrr/status/1499496057042055174?s=20&t=Jh2P0cAsEWRoT9DT-5o_9A"
},
"25": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "5 Million hectares of forest is lost to deforestation a year. Beef is responsible for 41% of global deforestation. Deforestation contributes about 4.8 Billion tonnes of Carbon Dioxide a year, turning the Amazon Rainforest into a carbon source. #ClimateCrisis #CIimateEmergency.",
+ "tweet_url": "https://twitter.com/KaraJarina/status/1499419995469762570?s=20&t=Jh2P0cAsEWRoT9DT-5o_9A"
},
"26": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "These stories are rampant in warehouses, from Best Buy to UPS to Amazon. These companies push workers to injury, fight them through workers comp to get medical care, then trash them entirely when they're disabled.",
+ "tweet_url": "https://twitter.com/SarafromMI/status/1499470934889095169?s=20&t=Jh2P0cAsEWRoT9DT-5o_9A"
},
"27": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "Im goin on a mf trip. Who's with me?",
+ "tweet_url": "https://twitter.com/SpacePa21927768/status/1499422158501691392?s=20&t=Jh2P0cAsEWRoT9DT-5o_9A"
},
"28": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "alex karev left us 2 years ago today",
+ "tweet_url": "https://twitter.com/swiftsmer/status/1500189717983375360"
},
"29": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "today marks two years since the writers ruined alex karev's character and dumped jo wilson for a letter. i’ll never really forgive them",
+ "tweet_url": "https://twitter.com/iANIST0N/status/1500178119956828161?s=20&t=OvQNlp7CqErI33W7meNpnw"
},
"30": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "Yeah we keep on winning everyday!",
+ "tweet_url": "https://twitter.com/MermaidMe4/status/1500282758253346817?s=20&t=54v1p-o8wNPOseaRdxM8Gg"
},
"31": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "Too many damn commercials. #48hours #whatallykostialdidntknow",
+ "tweet_url": "https://twitter.com/brandon_orgeron/status/1500312637346914316?s=20&t=JarPJO2coTIilE4FD_Rfhg"
},
"32": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "oh jeez this guy has the playbook for being a gaslighting jackhole #48Hours",
+ "tweet_url": "https://twitter.com/flutiefan/status/1500313070652071939?s=20&t=JarPJO2coTIilE4FD_Rfhg"
},
"33": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "what are good cat food brands? or like wet food or treats?",
+ "tweet_url": "https://twitter.com/bigfootjinx/status/1500196301778001921?s=20&t=G2Eq8kqCFxruhzcnUoMFbw"
},
"34": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "everyone say good morning jinx",
+ "tweet_url": "https://twitter.com/bigfootjinx/status/1499832101138419718?s=20&t=G2Eq8kqCFxruhzcnUoMFbw"
},
"35": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "she said NO!!!",
+ "tweet_url": "https://twitter.com/bigfootjinx/status/1499835404387708928?s=20&t=G2Eq8kqCFxruhzcnUoMFbw"
},
"36": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "she’s just a little guy who’s just here to vibe",
+ "tweet_url": "https://twitter.com/bigfootjinx/status/1499121832133201920?s=20&t=G2Eq8kqCFxruhzcnUoMFbw"
},
"37": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "Pépito is back home (20:52:07)",
+ "tweet_url": "https://twitter.com/PepitoTheCat/status/1468307355649589250?s=20&t=04zp-JS5SI9x_azDLa0FIg"
},
"38": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "Man. Steph Curry is just so much better at shooting than the rest of the world it's unfair",
+ "tweet_url": "https://twitter.com/MKBHD/status/1495598546799706112?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"39": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "neutral",
+ "text_of_tweet": "Free video idea: Curling robot does the same slide with vs without the sweepers. To show how much of a difference the sweeping ACTUALLY makes.",
+ "tweet_url": "https://twitter.com/MKBHD/status/1495580338659147779?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"40": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "I've lost track of how many times I've watched Planet Earth and Blue Planet. They're still incredible. Every time.",
+ "tweet_url": "https://twitter.com/MKBHD/status/1492896729687109639?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"41": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "bella hadid stays using her platform correctly. love to see it",
+ "tweet_url": "https://twitter.com/itsnotnabiha/status/1500298294957903872?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"42": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "i’m sorry but after harry’s recent stalker incidents it is really weird to actively search out his venmo… people really have no sense of privacy on here",
+ "tweet_url": "https://twitter.com/filmrry/status/1500144478937567233?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"43": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "Visa & Mastercard had no problem servicing Epstein and his kid fucking clients.",
+ "tweet_url": "https://twitter.com/SaltyCracker9/status/1500275075022344194?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"44": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "negative",
+ "text_of_tweet": "My grandma had Parkinson’s. This looks familiar unfortunately. Can’t believe he’s not stepping down. He needs help.",
+ "tweet_url": "https://twitter.com/RobertaByTheSea/status/1500251346443853824?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"45": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "This is what Texas is all about! Best fan base in the nation!!!",
+ "tweet_url": "https://twitter.com/AGENT0__/status/1500220404052471818?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"46": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "What a crowd for #1 Texas vs. #3 LSU (via @BarstoolMintzy)",
+ "tweet_url": "https://twitter.com/BaseballBros/status/1500283697219653634?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"47": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "You Beto believe Texas is ready for change!",
+ "tweet_url": "https://twitter.com/TeamBeto/status/1500274585538682884?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"48": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "The first poster for The Texas Chainsaw Massacre 2022 movie is still fucking amazing, regardless of the movie quality.",
+ "tweet_url": "https://twitter.com/Struggler2Dark/status/1500269361642323968?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"49": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "BRON BRON GOING CRAZY!!!!",
+ "tweet_url": "https://twitter.com/NoLifeShaq/status/1500316127918006277?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
},
"50": {
- "sentiment_label": "",
- "text_of_tweet": "",
- "tweet_url": ""
+ "sentiment_label": "positive",
+ "text_of_tweet": "NAH BRUH THIS ONE OF THE CRAZIEST THINGS IGE EVER SEEN BRON IS THE GOAT",
+ "tweet_url": "https://twitter.com/6ixLeBron/status/1500316241399058434?s=20&t=ohhIQA6qGcdblYvf2dpfRw"
}
-}
\ No newline at end of file
+}