diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index 6b0e116..3c6af14 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -46,11 +46,189 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetiddateflagusertext
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
201467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
301467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
401467811372Mon Apr 06 22:20:00 PDT 2009NO_QUERYjoy_wolf@Kwesidei not the whole crew
.....................
1999501556976068Sun Apr 19 01:19:30 PDT 2009NO_QUERYnikibennnOne more day of holidays
1999601556976167Sun Apr 19 01:19:32 PDT 2009NO_QUERYeifflesummerfeeling so down right now .. i hate you DAMN H...
1999701556976222Sun Apr 19 01:19:34 PDT 2009NO_QUERYlomobabesgeez,i hv to READ the whole book of personalit...
1999801556976246Sun Apr 19 01:19:34 PDT 2009NO_QUERYthatsblue2uI threw my sign at donnie and he bent over to ...
1999901556976546Sun Apr 19 01:19:44 PDT 2009NO_QUERYAlicepire@heather2711 Good thing I didn't find any then...
\n", + "

20000 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " target id date flag \\\n", + "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n", + "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n", + "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY \n", + "... ... ... ... ... \n", + "19995 0 1556976068 Sun Apr 19 01:19:30 PDT 2009 NO_QUERY \n", + "19996 0 1556976167 Sun Apr 19 01:19:32 PDT 2009 NO_QUERY \n", + "19997 0 1556976222 Sun Apr 19 01:19:34 PDT 2009 NO_QUERY \n", + "19998 0 1556976246 Sun Apr 19 01:19:34 PDT 2009 NO_QUERY \n", + "19999 0 1556976546 Sun Apr 19 01:19:44 PDT 2009 NO_QUERY \n", + "\n", + " user text \n", + "0 scotthamilton is upset that he can't update his Facebook by ... \n", + "1 mattycus @Kenichan I dived many times for the ball. Man... \n", + "2 ElleCTF my whole body feels itchy and like its on fire \n", + "3 Karoli @nationwideclass no, it's not behaving at all.... \n", + "4 joy_wolf @Kwesidei not the whole crew \n", + "... ... ... \n", + "19995 nikibennn One more day of holidays \n", + "19996 eifflesummer feeling so down right now .. i hate you DAMN H... \n", + "19997 lomobabes geez,i hv to READ the whole book of personalit... \n", + "19998 thatsblue2u I threw my sign at donnie and he bent over to ... \n", + "19999 Alicepire @heather2711 Good thing I didn't find any then... 
# %% Load the tweets and draw the working sample used by the rest of the notebook
import pandas as pd

# Column names for the raw CSV, in file order.
TWEET_COLUMNS = ['target', 'id', 'date', 'flag', 'user', 'text']
# Number of tweets kept for the (memory-hungry) bag-of-words steps further down.
SAMPLE_SIZE = 20000
# Fixed seed so Restart & Run All reproduces the same sample.
SEED = 42

# header=None + names=...: the previous version let pandas consume the first
# record as a header row and then overwrote the column names, silently dropping
# one tweet.
# NOTE(review): assumes tweets.csv has no header row -- confirm against the file.
tweet = pd.read_csv(
    'tweets.csv',
    encoding='latin-1',
    header=None,
    names=TWEET_COLUMNS,
)
tweet['text'] = tweet['text'].astype('string')

# Random sample instead of the first rows: the head of the file shown in the
# previous output only contains target == 0, which would leave a classifier with
# a single class to learn from.
# .copy() makes tweet1 an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
tweet1 = (
    tweet
    .sample(n=min(SAMPLE_SIZE, len(tweet)), random_state=SEED)
    .reset_index(drop=True)
    .copy()
)
tweet1.head()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetiddateflagusertexttext_processed
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...[upset, update, facebook, texting, might, cry,...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...[kenichan, dived, many, time, ball, managed, s...
201467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire[whole, body, feel, itchy, like, fire]
301467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....[nationwideclass, behaving, mad, see]
401467811372Mon Apr 06 22:20:00 PDT 2009NO_QUERYjoy_wolf@Kwesidei not the whole crew[kwesidei, whole, crew]
........................
1999501556976068Sun Apr 19 01:19:30 PDT 2009NO_QUERYnikibennnOne more day of holidays[one, day, holiday]
1999601556976167Sun Apr 19 01:19:32 PDT 2009NO_QUERYeifflesummerfeeling so down right now .. i hate you DAMN H...[feeling, right, hate, damn, humprey]
1999701556976222Sun Apr 19 01:19:34 PDT 2009NO_QUERYlomobabesgeez,i hv to READ the whole book of personalit...[geez, hv, read, whole, book, personality, typ...
1999801556976246Sun Apr 19 01:19:34 PDT 2009NO_QUERYthatsblue2uI threw my sign at donnie and he bent over to ...[threw, sign, donnie, bent, get, wa, thingee, ...
1999901556976546Sun Apr 19 01:19:44 PDT 2009NO_QUERYAlicepire@heather2711 Good thing I didn't find any then...[heather, good, thing, find, none, one, like, ...
\n", + "

20000 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " target id date flag \\\n", + "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n", + "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n", + "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY \n", + "... ... ... ... ... \n", + "19995 0 1556976068 Sun Apr 19 01:19:30 PDT 2009 NO_QUERY \n", + "19996 0 1556976167 Sun Apr 19 01:19:32 PDT 2009 NO_QUERY \n", + "19997 0 1556976222 Sun Apr 19 01:19:34 PDT 2009 NO_QUERY \n", + "19998 0 1556976246 Sun Apr 19 01:19:34 PDT 2009 NO_QUERY \n", + "19999 0 1556976546 Sun Apr 19 01:19:44 PDT 2009 NO_QUERY \n", + "\n", + " user text \\\n", + "0 scotthamilton is upset that he can't update his Facebook by ... \n", + "1 mattycus @Kenichan I dived many times for the ball. Man... \n", + "2 ElleCTF my whole body feels itchy and like its on fire \n", + "3 Karoli @nationwideclass no, it's not behaving at all.... \n", + "4 joy_wolf @Kwesidei not the whole crew \n", + "... ... ... \n", + "19995 nikibennn One more day of holidays \n", + "19996 eifflesummer feeling so down right now .. i hate you DAMN H... \n", + "19997 lomobabes geez,i hv to READ the whole book of personalit... \n", + "19998 thatsblue2u I threw my sign at donnie and he bent over to ... \n", + "19999 Alicepire @heather2711 Good thing I didn't find any then... \n", + "\n", + " text_processed \n", + "0 [upset, update, facebook, texting, might, cry,... \n", + "1 [kenichan, dived, many, time, ball, managed, s... \n", + "2 [whole, body, feel, itchy, like, fire] \n", + "3 [nationwideclass, behaving, mad, see] \n", + "4 [kwesidei, whole, crew] \n", + "... ... \n", + "19995 [one, day, holiday] \n", + "19996 [feeling, right, hate, damn, humprey] \n", + "19997 [geez, hv, read, whole, book, personality, typ... \n", + "19998 [threw, sign, donnie, bent, get, wa, thingee, ... 
# %% Build the `text_processed` column: one list of normalised tokens per tweet
from ipynb.fs.full.challenge_1 import clean_up, tokenize, stem_and_lemmatize, remove_stopwords


def preprocess(text):
    """
    Turn one raw tweet into a list of normalised tokens.

    Args:
        text: The raw tweet text.

    Returns:
        A list of strings: the tweet cleaned, tokenized, stripped of English
        stop words and finally passed through stem_and_lemmatize.
    """
    # Stop words are removed BEFORE normalisation: the stop-word list holds
    # plain words ('was', 'has', 'does'), so filtering after lemmatization lets
    # their mangled forms ('wa', 'ha', 'doe') through -- they were the most
    # frequent "words" in the previous run.
    return stem_and_lemmatize(remove_stopwords(tokenize(clean_up(text))))


# assign() returns a new frame instead of writing into what may be a slice of
# `tweet`, which is what triggered SettingWithCopyWarning before.
tweet1 = tweet1.assign(text_processed=[preprocess(text) for text in tweet1['text']])
tweet1.head()
'lost',\n", + " 'wan',\n", + " 'long',\n", + " 'look',\n", + " 'ugh',\n", + " 'phone',\n", + " 'yeah',\n", + " 'year',\n", + " 'show',\n", + " 'yet',\n", + " 'early',\n", + " 'keep',\n", + " 'missed',\n", + " 'trying',\n", + " 'house',\n", + " 'damn',\n", + " 'another',\n", + " 'away',\n", + " 'say',\n", + " 'guy',\n", + " 'great',\n", + " 'little',\n", + " 'soon',\n", + " 'something',\n", + " 'poor',\n", + " 'bored',\n", + " 'life',\n", + " 'someone',\n", + " 'missing',\n", + " 'next',\n", + " 'man',\n", + " 'already',\n", + " 'take',\n", + " 'nice',\n", + " 'game',\n", + " 'haha',\n", + " 'first',\n", + " 'weekend',\n", + " 'nothing',\n", + " 'ok',\n", + " 'done',\n", + " 'let',\n", + " 'watch',\n", + " 'guess',\n", + " 'start',\n", + " 'wait',\n", + " 'looking',\n", + " 'went',\n", + " 'tweet',\n", + " 'left',\n", + " 'girl',\n", + " 'weather',\n", + " 'x',\n", + " 'hard',\n", + " 'n',\n", + " 'thought',\n", + " 'lt',\n", + " 'please',\n", + " 'made',\n", + " 'ever',\n", + " 'always',\n", + " 'car',\n", + " 'ready',\n", + " 'thanks',\n", + " 'old',\n", + " 'mean',\n", + " 'hear',\n", + " 'late',\n", + " 'outside',\n", + " 'rain',\n", + " 'help',\n", + " 'class',\n", + " 'play',\n", + " 'woke',\n", + " 'lot',\n", + " 'head',\n", + " 'gone',\n", + " 'omg',\n", + " 'b',\n", + " 'sure',\n", + " 'happy',\n", + " 'yesterday',\n", + " 'doe',\n", + " 'baby',\n", + " 'since',\n", + " 'ta',\n", + " 'pretty',\n", + " 'cry',\n", + " 'break',\n", + " 'waiting',\n", + " 'stupid',\n", + " 'many',\n", + " 'everyone',\n", + " 'bit',\n", + " 'maybe',\n", + " 'stuck',\n", + " 'two',\n", + " 'didnt',\n", + " 'call',\n", + " 'snow',\n", + " 'around',\n", + " 'movie',\n", + " 'end',\n", + " 'lonely',\n", + " 'coming',\n", + " 'best',\n", + " 'stop',\n", + " 'hot',\n", + " 'sound',\n", + " 'anyone',\n", + " 'anymore',\n", + " 'may',\n", + " 'stuff',\n", + " 'actually',\n", + " 'must',\n", + " 'yes',\n", + " 'headache',\n", + " 'r',\n", + " 'tho',\n", + " 'almost',\n", + " 
'anything',\n", + " 'saturday',\n", + " 'wanted',\n", + " 'leave',\n", + " 'dog',\n", + " 'sun',\n", + " 'enough',\n", + " 'might',\n", + " 'hey',\n", + " 'tell',\n", + " 'wow',\n", + " 'big',\n", + " 'making',\n", + " 'live',\n", + " 'found',\n", + " 'also',\n", + " 'finally',\n", + " 'party',\n", + " 'c',\n", + " 'th',\n", + " 'without',\n", + " 'spring',\n", + " 'put',\n", + " 'try',\n", + " 'tried',\n", + " 'boo',\n", + " 'monday',\n", + " 'god',\n", + " 'room',\n", + " 'later',\n", + " 'ur',\n", + " 'believe',\n", + " 'aww',\n", + " 'till',\n", + " 'homework',\n", + " 'job',\n", + " 'p',\n", + " 'said',\n", + " 'w',\n", + " 'saw',\n", + " 'probably',\n", + " 'read',\n", + " 'awake',\n", + " 'alone',\n", + " 'shit',\n", + " 'wrong',\n", + " 'sunday',\n", + " 'boy',\n", + " 'talk',\n", + " 'ill',\n", + " 'kid',\n", + " 'give',\n", + " 'least',\n", + " 'world',\n", + " 'crap',\n", + " 'minute',\n", + " 'eye',\n", + " 'family',\n", + " 'money',\n", + " 'mom',\n", + " 'thinking',\n", + " 'thats',\n", + " 'video',\n", + " 'computer',\n", + " 'month',\n", + " 'update',\n", + " 'company',\n", + " 'pain',\n", + " 'follower',\n", + " 'food',\n", + " 'far',\n", + " 'use',\n", + " 'playing',\n", + " 'run',\n", + " 'throat',\n", + " 'test',\n", + " 'totally',\n", + " 'dream',\n", + " 'every',\n", + " 'music',\n", + " 'else',\n", + " 'everything',\n", + " 'cause',\n", + " 'yay',\n", + " 'hell',\n", + " 'beautiful',\n", + " 'coffee',\n", + " 'raining',\n", + " 'mine',\n", + " 'broke',\n", + " 'okay',\n", + " 'sitting',\n", + " 'forgot',\n", + " 'awesome',\n", + " 'busy',\n", + " 'finished',\n", + " 'able',\n", + " 'boring',\n", + " 'idea',\n", + " 'wishing',\n", + " 'hair',\n", + " 'eat',\n", + " 'either',\n", + " 'sore',\n", + " 'internet',\n", + " 'ago',\n", + " 'asot',\n", + " 'seems',\n", + " 'luck',\n", + " 'picture',\n", + " 'holiday',\n", + " 'unfortunately',\n", + " 'song',\n", + " 'stay',\n", + " 'real',\n", + " 'dinner',\n", + " 'ya',\n", + " 'gt',\n", + " 
'check',\n", + " 'hungry',\n", + " 'tv',\n", + " 'kinda',\n", + " 'buy',\n", + " 'pic',\n", + " 'rest',\n", + " 'sigh',\n", + " 'cool',\n", + " 'f',\n", + " 'dead',\n", + " 'cat',\n", + " 'easter',\n", + " 'study',\n", + " 'awww',\n", + " 'plan',\n", + " 'site',\n", + " 'sooo',\n", + " 'place',\n", + " 'female',\n", + " 'dad',\n", + " 'taking',\n", + " 'birthday',\n", + " 'hit',\n", + " 'wake',\n", + " 'laptop',\n", + " 'amazing',\n", + " 'half',\n", + " 'ticket',\n", + " 'post',\n", + " 'book',\n", + " 'forward',\n", + " 'face',\n", + " 'nick',\n", + " 'season',\n", + " 'fuck',\n", + " 'april',\n", + " 'pm',\n", + " 'sleeping',\n", + " 'exam',\n", + " 'online',\n", + " 'name',\n", + " 'seen',\n", + " 'worst',\n", + " 'paper',\n", + " 'took',\n", + " 'super',\n", + " 'came',\n", + " 'problem',\n", + " 'page',\n", + " 'walk',\n", + " 'iphone',\n", + " 'leaving',\n", + " 'foot',\n", + " 'tuesday',\n", + " 'brother',\n", + " 'friday',\n", + " 'meet',\n", + " 'reply',\n", + " 'died',\n", + " 'seriously',\n", + " 'watched',\n", + " 'instead',\n", + " 'lunch',\n", + " 'xx',\n", + " 'kind',\n", + " 'news',\n", + " 'write',\n", + " 'til',\n", + " 'store',\n", + " 'glad',\n", + " 'office',\n", + " 'jealous',\n", + " 'horrible',\n", + " 'hand',\n", + " 'whole',\n", + " 'hug',\n", + " 'move',\n", + " 'min',\n", + " 'hoping',\n", + " 'supposed',\n", + " 'started',\n", + " 'anyway',\n", + " 'asleep',\n", + " 'broken',\n", + " 'message',\n", + " 'fan',\n", + " 'sweet',\n", + " 'wont',\n", + " 'word',\n", + " 'drive',\n", + " 'starting',\n", + " 'snowing',\n", + " 'drink',\n", + " 'heart',\n", + " 'hopefully',\n", + " 'crazy',\n", + " 'eating',\n", + " 'upset',\n", + " 'follow',\n", + " 'beach',\n", + " 'turn',\n", + " 'thank',\n", + " 'due',\n", + " 'send',\n", + " 'told',\n", + " 'open',\n", + " 'apparently',\n", + " 'clean',\n", + " 'mad',\n", + " 'ive',\n", + " 'meeting',\n", + " 'remember',\n", + " 'ate',\n", + " 'fucking',\n", + " 'doesnt',\n", + " 'sleepy',\n", + " 
'seem',\n", + " 'wtf',\n", + " 'part',\n", + " 'summer',\n", + " 'reason',\n", + " 'photo',\n", + " 'fail',\n", + " 'g',\n", + " 'trip',\n", + " 'email',\n", + " 'com',\n", + " 'dude',\n", + " 'win',\n", + " 'e',\n", + " 'shop',\n", + " 'link',\n", + " 'reading',\n", + " 'care',\n", + " 'bummed',\n", + " 'soo',\n", + " 'le',\n", + " 'aw',\n", + " 'worse',\n", + " 'happened',\n", + " 'full',\n", + " 'hr',\n", + " 'change',\n", + " 'ah',\n", + " 'project',\n", + " 'running',\n", + " 'cleaning',\n", + " 'sunny',\n", + " 'red',\n", + " 'text',\n", + " 'v',\n", + " 'sister',\n", + " 'used',\n", + " 'inside',\n", + " 'heard',\n", + " 'sadly',\n", + " 'studying',\n", + " 'soooo',\n", + " 'l',\n", + " 'episode',\n", + " 'fall',\n", + " 'shopping',\n", + " 'true',\n", + " 'set',\n", + " 'stomach',\n", + " 'slow',\n", + " 'cut',\n", + " 'vacation',\n", + " 'fell',\n", + " 'seeing',\n", + " 'wondering',\n", + " 'close',\n", + " 'window',\n", + " 'story',\n", + " 'finish',\n", + " 'hahaha',\n", + " 'excited',\n", + " 'moment',\n", + " 'bought',\n", + " 'k',\n", + " 'dear',\n", + " 'called',\n", + " 'account',\n", + " 'short',\n", + " 'st',\n", + " 'cuz',\n", + " 'course',\n", + " 'listening',\n", + " 'talking',\n", + " 'city',\n", + " 'top',\n", + " 'shame',\n", + " 'happen',\n", + " 'understand',\n", + " 'second',\n", + " 'mind',\n", + " 'sometimes',\n", + " 'ride',\n", + " 'town',\n", + " 'person',\n", + " 'flight',\n", + " 'shower',\n", + " 'lose',\n", + " 'carter',\n", + " 'finger',\n", + " 'water',\n", + " 'hi',\n", + " 'following',\n", + " 'sent',\n", + " 'couple',\n", + " 'catch',\n", + " 'mood',\n", + " 'ppl',\n", + " 'lmao',\n", + " 'band',\n", + " 'shoe',\n", + " 'fast',\n", + " 'breakfast',\n", + " 'co',\n", + " 'blue',\n", + " 'driving',\n", + " 'terrible',\n", + " 'cute',\n", + " 'team',\n", + " 'ipod',\n", + " 'slept',\n", + " 'bar',\n", + " 'mum',\n", + " 'nite',\n", + " 'chance',\n", + " 'kill',\n", + " 'bus',\n", + " 'parent',\n", + " 'nd',\n", + " 
'doctor',\n", + " 'past',\n", + " 'figure',\n", + " 'hang',\n", + " 'sat',\n", + " 'tweetdeck',\n", + " 'heading',\n", + " 'tummy',\n", + " 'enjoy',\n", + " 'closed',\n", + " 'dying',\n", + " 'scared',\n", + " 'uni',\n", + " 'ache',\n", + " 'definitely',\n", + " 'point',\n", + " 'drunk',\n", + " 'load',\n", + " 'quite',\n", + " 'whats',\n", + " 'death',\n", + " 'pay',\n", + " 'wonder',\n", + " 'power',\n", + " 'free',\n", + " 'pick',\n", + " 'bout',\n", + " 'argh',\n", + " 'writing',\n", + " 'btw',\n", + " 'airport',\n", + " 'uk',\n", + " 'moving',\n", + " 'dentist',\n", + " 'essay',\n", + " 'mileycyrus',\n", + " 'concert',\n", + " 'facebook',\n", + " 'spent',\n", + " 'depressing',\n", + " 'leg',\n", + " 'worried',\n", + " 'weird',\n", + " 'nap',\n", + " 'mac',\n", + " 'isnt',\n", + " 'coachella',\n", + " 'ouch',\n", + " 'together',\n", + " 'record',\n", + " 'issue',\n", + " 'bummer',\n", + " 'body',\n", + " 'three',\n", + " 'tea',\n", + " 'lame',\n", + " 'website',\n", + " 'upload',\n", + " 'funny',\n", + " 'yea',\n", + " 'park',\n", + " 'thursday',\n", + " 'goodnight',\n", + " 'train',\n", + " 'final',\n", + " 'bring',\n", + " 'answer',\n", + " 'hospital',\n", + " 'blog',\n", + " 'bug',\n", + " 'ahh',\n", + " 'afternoon',\n", + " 'behind',\n", + " 'star',\n", + " 'losing',\n", + " 'realized',\n", + " 'fight',\n", + " 'chocolate',\n", + " 'flu',\n", + " 'none',\n", + " 'huge',\n", + " 'lovely',\n", + " 'fact',\n", + " 'battery',\n", + " 'enjoying',\n", + " 'college',\n", + " 'idk',\n", + " 'puppy',\n", + " 'glass',\n", + " 'longer',\n", + " 'goin',\n", + " 'english',\n", + " 'fair',\n", + " 'stopped',\n", + " 'la',\n", + " 'traffic',\n", + " 'completely',\n", + " 'rainy',\n", + " 'fix',\n", + " 'happens',\n", + " 'hill',\n", + " 'hmm',\n", + " 'side',\n", + " 'waking',\n", + " 'hello',\n", + " 'hold',\n", + " 'nobody',\n", + " 'nose',\n", + " 'line',\n", + " 'lady',\n", + " 'fixed',\n", + " 'road',\n", + " 'mr',\n", + " 'loss',\n", + " 'gym',\n", + " 'em',\n", + " 
'lucky',\n", + " 'using',\n", + " 'knew',\n", + " 'taken',\n", + " 'camera',\n", + " 'rock',\n", + " 'small',\n", + " 'listen',\n", + " 'white',\n", + " 'h',\n", + " 'box',\n", + " 'havent',\n", + " 'pizza',\n", + " 'easy',\n", + " 'fb',\n", + " 'son',\n", + " 'gave',\n", + " 'kitty',\n", + " 'scary',\n", + " 'disappointed',\n", + " 'tour',\n", + " 'google',\n", + " 'dropped',\n", + " 'london',\n", + " 'web',\n", + " 'goodbye',\n", + " 'shot',\n", + " 'low',\n", + " 'saying',\n", + " 'hubby',\n", + " 'boyfriend',\n", + " 'light',\n", + " 'mile',\n", + " 'italy',\n", + " 'spend',\n", + " 'looked',\n", + " 'allergy',\n", + " 'killing',\n", + " 'ask',\n", + " 'high',\n", + " 'service',\n", + " 'club',\n", + " 'nope',\n", + " 'yr',\n", + " 'favorite',\n", + " 'rather',\n", + " 'dvd',\n", + " 'agree',\n", + " 'trouble',\n", + " 'dang',\n", + " 'earlier',\n", + " 'turned',\n", + " 'wearing',\n", + " 'gutted',\n", + " 'near',\n", + " 'air',\n", + " 'cancelled',\n", + " 'depressed',\n", + " 'killed',\n", + " 'chat',\n", + " 'round',\n", + " 'officially',\n", + " 'sold',\n", + " 'visit',\n", + " 'stayed',\n", + " 'empty',\n", + " 'joke',\n", + " 'winter',\n", + " 'door',\n", + " 'ice',\n", + " 'tweeting',\n", + " 'fine',\n", + " 'drinking',\n", + " 'pc',\n", + " 'serious',\n", + " 'hanging',\n", + " 'gah',\n", + " 'youtube',\n", + " 'bloody',\n", + " 'anywhere',\n", + " 'wet',\n", + " 'plus',\n", + " 'failed',\n", + " 'usually',\n", + " 'lately',\n", + " 'awful',\n", + " 'sign',\n", + " 'tommcfly',\n", + " 'yep',\n", + " 'xd',\n", + " 'houston',\n", + " 'tax',\n", + " 'forever',\n", + " 'arm',\n", + " 'although',\n", + " 'gosh',\n", + " 'deal',\n", + " 'list',\n", + " 'date',\n", + " 'cream',\n", + " 'warm',\n", + " 'bike',\n", + " 'shoot',\n", + " 'passed',\n", + " 'knee',\n", + " 'lil',\n", + " 'file',\n", + " 'chicken',\n", + " 'pissed',\n", + " 'felt',\n", + " 'staying',\n", + " 'album',\n", + " 'library',\n", + " 'card',\n", + " 'germany',\n", + " 'dark',\n", + " 
'age',\n", + " 'die',\n", + " 'street',\n", + " 'dress',\n", + " 'tree',\n", + " 'cup',\n", + " 'loved',\n", + " 'four',\n", + " 'single',\n", + " 'decided',\n", + " 'except',\n", + " 'interesting',\n", + " 'texas',\n", + " 'event',\n", + " 'race',\n", + " 'brain',\n", + " 'report',\n", + " 'exhausted',\n", + " 'blah',\n", + " 'ball',\n", + " 'cake',\n", + " 'black',\n", + " 'business',\n", + " 'smell',\n", + " 'bye',\n", + " 'babe',\n", + " 'lazy',\n", + " 'dunno',\n", + " 'assignment',\n", + " 'state',\n", + " 'spending',\n", + " 'matter',\n", + " 'version',\n", + " 'ran',\n", + " 'itunes',\n", + " 'egg',\n", + " 'caught',\n", + " 'evening',\n", + " 'da',\n", + " 'shirt',\n", + " 'question',\n", + " 'especially',\n", + " 'math',\n", + " 'neck',\n", + " 'wife',\n", + " 'daughter',\n", + " 'film',\n", + " 'degree',\n", + " 'fml',\n", + " 'wednesday',\n", + " 'ring',\n", + " 'tom',\n", + " 'number',\n", + " 'support',\n", + " 'somewhere',\n", + " 'cell',\n", + " 'add',\n", + " 'usual',\n", + " 'order',\n", + " 'yankee',\n", + " 'j',\n", + " 'annoying',\n", + " 'bag',\n", + " 'cough',\n", + " 'expensive',\n", + " 'bird',\n", + " 'save',\n", + " 'ear',\n", + " 'asked',\n", + " 'wear',\n", + " 'fat',\n", + " 'laying',\n", + " 'radio',\n", + " 'hero',\n", + " 'living',\n", + " 'mail',\n", + " 'confused',\n", + " 'cheese',\n", + " 'worth',\n", + " 'rip',\n", + " 'freezing',\n", + " 'green',\n", + " 'quiet',\n", + " 'ruined',\n", + " 'button',\n", + " 'apple',\n", + " 'garden',\n", + " 'aint',\n", + " 'note',\n", + " 'case',\n", + " 'gorgeous',\n", + " 'wedding',\n", + " 'chuck',\n", + " 'burnt',\n", + " 'breaking',\n", + " 'blood',\n", + " 'comment',\n", + " 'voice',\n", + " 'kutner',\n", + " 'chicago',\n", + " 'macbook',\n", + " 'couldnt',\n", + " 'fucked',\n", + " 'bc',\n", + " 'art',\n", + " 'bet',\n", + " 'mouth',\n", + " 'possible',\n", + " 'meant',\n", + " 'exactly',\n", + " 'beat',\n", + " 'dance',\n", + " 'coz',\n", + " 'speak',\n", + " 'area',\n", + " 
# %% Vocabulary: the most frequent tokens across all processed tweets
import random

import nltk
from nltk.probability import FreqDist

# Size of the bag-of-words vocabulary.
VOCABULARY_SIZE = 5000
# Share of the feature sets used for training (the rest is held out for testing).
TRAIN_FRACTION = 0.8
# Fixed seed so the train/test split is reproducible.
SEED = 42

fdist = FreqDist(
    word
    for word_list in tweet1['text_processed']
    for word in word_list
)

# most_common() returns (word, count) pairs ordered by decreasing count.
fdist_5000 = [word for word, _ in fdist.most_common(VOCABULARY_SIZE)]

# Show a short preview only; the full list is thousands of entries long.
fdist_5000[:20]


# %% Feature extraction
def find_features(tweet, vocabulary=None):
    """
    Build the bag-of-words feature dict for one tweet.

    Args:
        tweet: An iterable of tokens (one processed tweet).
        vocabulary: The words to use as features. Defaults to the
            module-level `fdist_5000` list.

    Returns:
        A dict mapping every vocabulary word to True if the word occurs in
        the tweet and False otherwise.
    """
    if vocabulary is None:
        vocabulary = fdist_5000
    # Set membership keeps the lookup O(1) per vocabulary word.
    words = set(tweet)
    return {w: (w in words) for w in vocabulary}


# NaiveBayesClassifier.train() expects (features, label) pairs; the previous
# version stored only the feature dicts, so training could not work.
# NOTE(review): the raw `target` value is used as the label -- confirm that is
# the sentiment column you want to predict.
featuresets = [
    (find_features(tokens), label)
    for tokens, label in zip(tweet1['text_processed'], tweet1['target'])
]


# %% Train and evaluate the Naive Bayes classifier
# Shuffle before splitting so both sets are drawn from the same distribution
# even if the rows of tweet1 are ordered.
random.Random(SEED).shuffle(featuresets)
split_at = int(len(featuresets) * TRAIN_FRACTION)

# set that we'll train our classifier with
training_set = featuresets[:split_at]

# set that we'll test against.
testing_set = featuresets[split_at:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
accuracy = nltk.classify.accuracy(classifier, testing_set)
print("Classifier accuracy percent:", accuracy * 100)
classifier.show_most_informative_features(15)
# %% Imports and one-time setup for the text preprocessing helpers
import html
import re
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


def _ensure_nltk_resource(resource_path, package):
    """
    Download an NLTK data package only if it is not installed yet.

    Args:
        resource_path: Path of the resource inside the NLTK data directory,
            e.g. 'corpora/stopwords'.
        package: Name of the package to download when the resource is missing.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Missing locally: fall back to the download the notebook always did.
        nltk.download(package, quiet=True)


# Checked once at import time instead of calling nltk.download() on every run,
# which printed log noise into every notebook that imports these helpers.
_ensure_nltk_resource('tokenizers/punkt', 'punkt')
_ensure_nltk_resource('corpora/wordnet', 'wordnet')
_ensure_nltk_resource('corpora/stopwords', 'stopwords')

# Compiled / constructed once: the helpers below are called once per tweet.
_URL_PATTERN = re.compile(r"http\S+")
_NON_LETTER_PATTERN = re.compile(r"[^a-z ]")
_STEMMER = PorterStemmer()
_LEMMATIZER = WordNetLemmatizer()


# %% clean_up
def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A lowercase string in which HTML entities have been decoded, URLs
        removed, and every character other than a-z or space replaced by a
        single space. Whitespace is not collapsed.
    """
    # Decode entities first: otherwise '&quot;', '&amp;', '&lt;' survive as the
    # junk words 'quot', 'amp', 'lt' once the punctuation is stripped.
    cleaned = html.unescape(s).lower()
    cleaned = _URL_PATTERN.sub("", cleaned)
    # The text is already lowercase, so anything outside a-z/space is noise.
    return _NON_LETTER_PATTERN.sub(" ", cleaned)


clean_up("@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")


# %% tokenize
def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    return nltk.word_tokenize(s)


tokenize(' ironhack s  q website  is ')


# %% stem_and_lemmatize
def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    # Lemmatize first (the lemmatizer is applied to the untouched word), then
    # stem the lemma. The previous version computed the stems but threw them
    # away and returned the lemmas only.
    return [_STEMMER.stem(_LEMMATIZER.lemmatize(word)) for word in l]


stem_and_lemmatize(['ironhack', 's', 'q', 'website', 'is'])


# %% remove_stopwords
@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the English stop words as a frozenset, loading the corpus once."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(l):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after stop words are removed.
    """
    stop_words = _english_stopwords()
    return [w for w in l if w not in stop_words]


remove_stopwords(['ironhack', 's', 'q', 'websit', 'is'])