diff --git a/your-code/main.ipynb b/your-code/main.ipynb index bad6d94..7157c69 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -9,10 +9,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd" + ] }, { "cell_type": "markdown", @@ -23,10 +25,112 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | Id | \n", + "Reputation | \n", + "CreationDate | \n", + "DisplayName | \n", + "LastAccessDate | \n", + "WebsiteUrl | \n", + "Location | \n", + "AboutMe | \n", + "Views | \n", + "UpVotes | \n", + "DownVotes | \n", + "AccountId | \n", + "Age | \n", + "ProfileImageUrl | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-1 | \n", + "1 | \n", + "2010-07-19 06:55:26 | \n", + "Community | \n", + "2010-07-19 06:55:26 | \n", + "http://meta.stackexchange.com/ | \n", + "on the server farm | \n", + "<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>... | \n", + "0 | \n", + "5007 | \n", + "1920 | \n", + "-1 | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "101 | \n", + "2010-07-19 14:01:36 | \n", + "Geoff Dalgas | \n", + "2013-11-12 22:07:23 | \n", + "http://stackoverflow.com | \n", + "Corvallis, OR | \n", + "<p>Developer on the StackOverflow team. Find ... | \n", + "25 | \n", + "3 | \n", + "0 | \n", + "2 | \n", + "37.0 | \n", + "NaN | \n", + "
Hi, I'm not really a person.
\\r\\n\\r\\n... 0 5007 \n", + "1
Developer on the StackOverflow team. Find ... 25 3 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_table = pd.read_csv(\"users_table.csv\")\n", + "users_table.head(2)" + ] }, { "cell_type": "markdown", @@ -37,10 +141,120 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "users_table = users_table.rename(columns={\"Id\": \"userId\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | userId | \n", + "Reputation | \n", + "CreationDate | \n", + "DisplayName | \n", + "LastAccessDate | \n", + "WebsiteUrl | \n", + "Location | \n", + "AboutMe | \n", + "Views | \n", + "UpVotes | \n", + "DownVotes | \n", + "AccountId | \n", + "Age | \n", + "ProfileImageUrl | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-1 | \n", + "1 | \n", + "2010-07-19 06:55:26 | \n", + "Community | \n", + "2010-07-19 06:55:26 | \n", + "http://meta.stackexchange.com/ | \n", + "on the server farm | \n", + "<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>... | \n", + "0 | \n", + "5007 | \n", + "1920 | \n", + "-1 | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "101 | \n", + "2010-07-19 14:01:36 | \n", + "Geoff Dalgas | \n", + "2013-11-12 22:07:23 | \n", + "http://stackoverflow.com | \n", + "Corvallis, OR | \n", + "<p>Developer on the StackOverflow team. Find ... | \n", + "25 | \n", + "3 | \n", + "0 | \n", + "2 | \n", + "37.0 | \n", + "NaN | \n", + "
Hi, I'm not really a person.
\\r\\n\\r\\n... 0 5007 \n", + "1
Developer on the StackOverflow team. Find ... 25 3 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_table.head(2)" + ] }, { "cell_type": "markdown", @@ -51,10 +265,152 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "posts_table = pd.read_csv(\"posts_table.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | Id | \n", + "PostTypeId | \n", + "AcceptedAnswerId | \n", + "CreaionDate | \n", + "Score | \n", + "ViewCount | \n", + "Body | \n", + "OwnerUserId | \n", + "LasActivityDate | \n", + "Title | \n", + "... | \n", + "AnswerCount | \n", + "CommentCount | \n", + "FavoriteCount | \n", + "LastEditorUserId | \n", + "LastEditDate | \n", + "CommunityOwnedDate | \n", + "ParentId | \n", + "ClosedDate | \n", + "OwnerDisplayName | \n", + "LastEditorDisplayName | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1 | \n", + "1 | \n", + "15.0 | \n", + "2010-07-19 19:12:12 | \n", + "23 | \n", + "1278.0 | \n", + "<p>How should I elicit prior distributions fro... | \n", + "8.0 | \n", + "2010-09-15 21:08:26 | \n", + "Eliciting priors from experts | \n", + "... | \n", + "5.0 | \n", + "1 | \n", + "14.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "1 | \n", + "59.0 | \n", + "2010-07-19 19:12:57 | \n", + "22 | \n", + "8198.0 | \n", + "<p>In many different statistical methods there... | \n", + "24.0 | \n", + "2012-11-12 09:21:54 | \n", + "What is normality? | \n", + "... | \n", + "7.0 | \n", + "1 | \n", + "8.0 | \n", + "88.0 | \n", + "2010-08-07 17:56:44 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
2 rows × 21 columns
\n", + "How should I elicit prior distributions fro... 8.0 \n", + "1
In many different statistical methods there... 24.0 \n", + "\n", + " LasActivityDate Title ... AnswerCount \\\n", + "0 2010-09-15 21:08:26 Eliciting priors from experts ... 5.0 \n", + "1 2012-11-12 09:21:54 What is normality? ... 7.0 \n", + "\n", + " CommentCount FavoriteCount LastEditorUserId LastEditDate \\\n", + "0 1 14.0 NaN NaN \n", + "1 1 8.0 88.0 2010-08-07 17:56:44 \n", + "\n", + " CommunityOwnedDate ParentId ClosedDate OwnerDisplayName \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "\n", + " LastEditorDisplayName \n", + "0 NaN \n", + "1 NaN \n", + "\n", + "[2 rows x 21 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_table.head(2)" + ] }, { "cell_type": "markdown", @@ -65,10 +421,173 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "posts_table = posts_table.rename(columns={\"Id\": \"postId\", }).rename(columns={\"OwnerUserId\": \"userId\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Note: rename() is case-sensitive and silently ignores missing keys, so the column must be spelled exactly \"OwnerUserId\"\n", + "\n", + "posts_table = posts_table.rename(columns=({\"Id\":\"postId\"}))\n", + "posts_table = posts_table.rename(columns=({\"OwnerUserId\":\"userId\"}))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "posts_table = posts_table.rename(columns=({\"Id\":\"postId\",\"OwnerUserId\":\"userId\"}))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | postId | \n", + "PostTypeId | \n", + "AcceptedAnswerId | \n", + "CreaionDate | \n", + "Score | \n", + "ViewCount | \n", + "Body | \n", + "OwnerUserId | \n", + "LasActivityDate | \n", + "Title | \n", + "... | \n", + "AnswerCount | \n", + "CommentCount | \n", + "FavoriteCount | \n", + "LastEditorUserId | \n", + "LastEditDate | \n", + "CommunityOwnedDate | \n", + "ParentId | \n", + "ClosedDate | \n", + "OwnerDisplayName | \n", + "LastEditorDisplayName | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1 | \n", + "1 | \n", + "15.0 | \n", + "2010-07-19 19:12:12 | \n", + "23 | \n", + "1278.0 | \n", + "<p>How should I elicit prior distributions fro... | \n", + "8.0 | \n", + "2010-09-15 21:08:26 | \n", + "Eliciting priors from experts | \n", + "... | \n", + "5.0 | \n", + "1 | \n", + "14.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "1 | \n", + "59.0 | \n", + "2010-07-19 19:12:57 | \n", + "22 | \n", + "8198.0 | \n", + "<p>In many different statistical methods there... | \n", + "24.0 | \n", + "2012-11-12 09:21:54 | \n", + "What is normality? | \n", + "... | \n", + "7.0 | \n", + "1 | \n", + "8.0 | \n", + "88.0 | \n", + "2010-08-07 17:56:44 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
2 rows × 21 columns
\n", + "How should I elicit prior distributions fro... 8.0 \n", + "1 8198.0
In many different statistical methods there... 24.0 \n", + "\n", + " LasActivityDate Title ... AnswerCount \\\n", + "0 2010-09-15 21:08:26 Eliciting priors from experts ... 5.0 \n", + "1 2012-11-12 09:21:54 What is normality? ... 7.0 \n", + "\n", + " CommentCount FavoriteCount LastEditorUserId LastEditDate \\\n", + "0 1 14.0 NaN NaN \n", + "1 1 8.0 88.0 2010-08-07 17:56:44 \n", + "\n", + " CommunityOwnedDate ParentId ClosedDate OwnerDisplayName \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "\n", + " LastEditorDisplayName \n", + "0 NaN \n", + "1 NaN \n", + "\n", + "[2 rows x 21 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_table.head(2)" + ] }, { "cell_type": "markdown", @@ -79,12 +598,53 @@ " **posts columns**: postId, Score,userId,ViewCount,CommentCount" ] }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"['userId'] not in index\"", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[25], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m users_df \u001b[39m=\u001b[39m users_table\u001b[39m.\u001b[39mloc[:, [\u001b[39m\"\u001b[39m\u001b[39muserId\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mReputation\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mViews\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mUpVotes\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDownVotes\u001b[39m\u001b[39m\"\u001b[39m]]\n\u001b[1;32m----> 2\u001b[0m posts_df \u001b[39m=\u001b[39m posts_table\u001b[39m.\u001b[39;49mloc[:, [\u001b[39m\"\u001b[39;49m\u001b[39muserId\u001b[39;49m\u001b[39m\"\u001b[39;49m, 
\u001b[39m\"\u001b[39;49m\u001b[39mpostId\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mScore\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mViewCount\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mCommentCount\u001b[39;49m\u001b[39m\"\u001b[39;49m]]\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1067\u001b[0m, in \u001b[0;36m_LocationIndexer.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 1065\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_is_scalar_access(key):\n\u001b[0;32m 1066\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mobj\u001b[39m.\u001b[39m_get_value(\u001b[39m*\u001b[39mkey, takeable\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_takeable)\n\u001b[1;32m-> 1067\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_getitem_tuple(key)\n\u001b[0;32m 1068\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 1069\u001b[0m \u001b[39m# we by definition only have the 0th axis\u001b[39;00m\n\u001b[0;32m 1070\u001b[0m axis \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39maxis \u001b[39mor\u001b[39;00m \u001b[39m0\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1256\u001b[0m, in \u001b[0;36m_LocIndexer._getitem_tuple\u001b[1;34m(self, tup)\u001b[0m\n\u001b[0;32m 1253\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_multi_take_opportunity(tup):\n\u001b[0;32m 1254\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_multi_take(tup)\n\u001b[1;32m-> 1256\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_getitem_tuple_same_dim(tup)\n", + "File 
\u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:924\u001b[0m, in \u001b[0;36m_LocationIndexer._getitem_tuple_same_dim\u001b[1;34m(self, tup)\u001b[0m\n\u001b[0;32m 921\u001b[0m \u001b[39mif\u001b[39;00m com\u001b[39m.\u001b[39mis_null_slice(key):\n\u001b[0;32m 922\u001b[0m \u001b[39mcontinue\u001b[39;00m\n\u001b[1;32m--> 924\u001b[0m retval \u001b[39m=\u001b[39m \u001b[39mgetattr\u001b[39;49m(retval, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mname)\u001b[39m.\u001b[39;49m_getitem_axis(key, axis\u001b[39m=\u001b[39;49mi)\n\u001b[0;32m 925\u001b[0m \u001b[39m# We should never have retval.ndim < self.ndim, as that should\u001b[39;00m\n\u001b[0;32m 926\u001b[0m \u001b[39m# be handled by the _getitem_lowerdim call above.\u001b[39;00m\n\u001b[0;32m 927\u001b[0m \u001b[39massert\u001b[39;00m retval\u001b[39m.\u001b[39mndim \u001b[39m==\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mndim\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1301\u001b[0m, in \u001b[0;36m_LocIndexer._getitem_axis\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m 1298\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(key, \u001b[39m\"\u001b[39m\u001b[39mndim\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mand\u001b[39;00m key\u001b[39m.\u001b[39mndim \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m 1299\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mCannot index with multidimensional key\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m-> 1301\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_getitem_iterable(key, axis\u001b[39m=\u001b[39;49maxis)\n\u001b[0;32m 1303\u001b[0m \u001b[39m# nested tuple slicing\u001b[39;00m\n\u001b[0;32m 1304\u001b[0m \u001b[39mif\u001b[39;00m is_nested_tuple(key, labels):\n", + "File 
\u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1239\u001b[0m, in \u001b[0;36m_LocIndexer._getitem_iterable\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m 1236\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_validate_key(key, axis)\n\u001b[0;32m 1238\u001b[0m \u001b[39m# A collection of keys\u001b[39;00m\n\u001b[1;32m-> 1239\u001b[0m keyarr, indexer \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_listlike_indexer(key, axis)\n\u001b[0;32m 1240\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mobj\u001b[39m.\u001b[39m_reindex_with_indexers(\n\u001b[0;32m 1241\u001b[0m {axis: [keyarr, indexer]}, copy\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, allow_dups\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m\n\u001b[0;32m 1242\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1432\u001b[0m, in \u001b[0;36m_LocIndexer._get_listlike_indexer\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m 1429\u001b[0m ax \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mobj\u001b[39m.\u001b[39m_get_axis(axis)\n\u001b[0;32m 1430\u001b[0m axis_name \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mobj\u001b[39m.\u001b[39m_get_axis_name(axis)\n\u001b[1;32m-> 1432\u001b[0m keyarr, indexer \u001b[39m=\u001b[39m ax\u001b[39m.\u001b[39;49m_get_indexer_strict(key, axis_name)\n\u001b[0;32m 1434\u001b[0m \u001b[39mreturn\u001b[39;00m keyarr, indexer\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\base.py:6070\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[1;34m(self, key, axis_name)\u001b[0m\n\u001b[0;32m 6067\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 6068\u001b[0m keyarr, indexer, new_indexer \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[1;32m-> 6070\u001b[0m 
\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_raise_if_missing(keyarr, indexer, axis_name)\n\u001b[0;32m 6072\u001b[0m keyarr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtake(indexer)\n\u001b[0;32m 6073\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(key, Index):\n\u001b[0;32m 6074\u001b[0m \u001b[39m# GH 42790 - Preserve name from an Index\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\jsctr\\anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\base.py:6133\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[1;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[0;32m 6130\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mKeyError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mNone of [\u001b[39m\u001b[39m{\u001b[39;00mkey\u001b[39m}\u001b[39;00m\u001b[39m] are in the [\u001b[39m\u001b[39m{\u001b[39;00maxis_name\u001b[39m}\u001b[39;00m\u001b[39m]\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 6132\u001b[0m not_found \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[39m.\u001b[39mnonzero()[\u001b[39m0\u001b[39m]]\u001b[39m.\u001b[39munique())\n\u001b[1;32m-> 6133\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mKeyError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mnot_found\u001b[39m}\u001b[39;00m\u001b[39m not in index\u001b[39m\u001b[39m\"\u001b[39m)\n", + "\u001b[1;31mKeyError\u001b[0m: \"['userId'] not in index\"" + ] + } + ], + "source": [ + "users_df = users_table.loc[:, [\"userId\", \"Reputation\", \"Views\", \"UpVotes\", \"DownVotes\"]]\n", + "posts_df = posts_table.loc[:, [\"userId\", \"postId\", \"Score\", \"ViewCount\", \"CommentCount\"]]\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "users_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "posts_df.head()" + ] }, { 
"cell_type": "markdown", @@ -99,7 +659,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "merged_df = pd.merge(users_df, posts_df, on='userId')" + ] }, { "cell_type": "markdown", @@ -113,7 +675,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# I don't know yet, but calling .isnull() on every column (e.g. merged_df.isnull().sum()) would show how many values are missing in each" + ] }, { "cell_type": "markdown", @@ -128,7 +692,10 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# That depends: if only a few values are missing, I'd fill the empty rows with the mean of the whole column,\n", + "# and if too many values are missing (e.g. over 80% of the rows), I'd drop the column." + ] }, { "cell_type": "markdown", @@ -139,10 +706,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "postId int64\n", + "PostTypeId int64\n", + "AcceptedAnswerId float64\n", + "CreaionDate object\n", + "Score int64\n", + "ViewCount float64\n", + "Body object\n", + "OwnerUserId float64\n", + "LasActivityDate object\n", + "Title object\n", + "Tags object\n", + "AnswerCount float64\n", + "CommentCount int64\n", + "FavoriteCount float64\n", + "LastEditorUserId float64\n", + "LastEditDate object\n", + "CommunityOwnedDate object\n", + "ParentId float64\n", + "ClosedDate object\n", + "OwnerDisplayName object\n", + "LastEditorDisplayName object\n", + "dtype: object\n", + "userId int64\n", + "Reputation int64\n", + "CreationDate object\n", + "DisplayName object\n", + "LastAccessDate object\n", + "WebsiteUrl object\n", + "Location object\n", + "AboutMe object\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "AccountId int64\n", + "Age float64\n", + "ProfileImageUrl object\n", + "dtype: object\n" + ] + } + ], + "source": [ + 
"print(posts_table.dtypes)\n", + "print(users_table.dtypes)\n", + "\n", + "#I'd change numerical variables into categorical" + ] } ], "metadata": { @@ -161,7 +777,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.10.9" } }, "nbformat": 4,