ta-data-lis · valeritos · Dec 14, 2023
diff --git a/your_code/main.ipynb b/your_code/main.ipynb
@@ -1,5 +1,16 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import scipy.stats as st\n",
+    "from scipy.stats import poisson\n",
+    "import numpy as np"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -14,11 +25,95 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0.08759775, 0.21330051, 0.25969338, 0.21078446, 0.12831504,\n",
+       "       0.06248942, 0.02536029])"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#H0: our observations follow Poisson\n",
+    "#H1: our observations do not follow Poisson\n",
+    "observed=np.array([35,99,104,110,62,25,10,3])\n",
+    "mu=2.435\n",
+    "poisson_dist=poisson(mu)\n",
+    "\n",
+    "#we have observed but not expected\n",
+    "#we know probability of values 0 to 6 under poisson\n",
+    "poisson_pmfs=np.array([poisson_dist.pmf(i) for i in range(0,7)])\n",
+    "poisson_pmfs\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0.08759775, 0.21330051, 0.25969338, 0.21078446, 0.12831504,\n",
+       "       0.06248942, 0.02536029, 0.01245915])"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#to account for >7 observations\n",
+    "with_tail=np.append(poisson_pmfs,1-poisson_pmfs.sum())\n",
+    "with_tail"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# your answer here"
+    "#448 observations total\n",
+    "expected=with_tail*448"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Power_divergenceResult(statistic=6.491310681109821, pvalue=0.4836889068537269)"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "st.chisquare(f_exp=expected,f_obs=observed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#we do not reject that our observations follow a poisson, because p value is greater than significance 0.05"
    ]
   },
   {
@@ -41,11 +136,56 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2.0"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# your code here"
+    "mu=(0+1+3+4)/4\n",
+    "mu"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are:\n0.4778112197861302",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[37], line 11\u001b[0m\n\u001b[0;32m      7\u001b[0m poisson_pmfs\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39marray([poisson_dist\u001b[38;5;241m.\u001b[39mpmf(i) \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;241m0\u001b[39m,\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m3\u001b[39m,\u001b[38;5;241m4\u001b[39m]])\n\u001b[0;32m      9\u001b[0m expected\u001b[38;5;241m=\u001b[39mpoisson_pmfs\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m60\u001b[39m\n\u001b[1;32m---> 11\u001b[0m st\u001b[38;5;241m.\u001b[39mchisquare(f_obs\u001b[38;5;241m=\u001b[39mobserved,f_exp\u001b[38;5;241m=\u001b[39mexpected)\n",
+      "File \u001b[1;32mc:\\Users\\valer.DESKTOP-8CM37D7\\anaconda3\\Lib\\site-packages\\scipy\\stats\\_stats_py.py:7553\u001b[0m, in \u001b[0;36mchisquare\u001b[1;34m(f_obs, f_exp, ddof, axis)\u001b[0m\n\u001b[0;32m   7428\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mchisquare\u001b[39m(f_obs, f_exp\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, ddof\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m):\n\u001b[0;32m   7429\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Calculate a one-way chi-square test.\u001b[39;00m\n\u001b[0;32m   7430\u001b[0m \n\u001b[0;32m   7431\u001b[0m \u001b[38;5;124;03m    The chi-square test tests the null hypothesis that the categorical data\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   7551\u001b[0m \n\u001b[0;32m   7552\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 7553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m power_divergence(f_obs, f_exp\u001b[38;5;241m=\u001b[39mf_exp, ddof\u001b[38;5;241m=\u001b[39mddof, axis\u001b[38;5;241m=\u001b[39maxis,\n\u001b[0;32m   7554\u001b[0m                             lambda_\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpearson\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "File \u001b[1;32mc:\\Users\\valer.DESKTOP-8CM37D7\\anaconda3\\Lib\\site-packages\\scipy\\stats\\_stats_py.py:7394\u001b[0m, in \u001b[0;36mpower_divergence\u001b[1;34m(f_obs, f_exp, ddof, axis, lambda_)\u001b[0m\n\u001b[0;32m   7388\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m diff_gt_tol:\n\u001b[0;32m   7389\u001b[0m         msg \u001b[38;5;241m=\u001b[39m (\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFor each axis slice, the sum of the observed \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   7390\u001b[0m                \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrequencies must agree with the sum of the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   7391\u001b[0m                \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexpected frequencies to a relative tolerance \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   7392\u001b[0m                \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mof \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrtol\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, but the percent differences are:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   7393\u001b[0m                \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_diff\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m-> 7394\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[0;32m   7396\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m   7397\u001b[0m     \u001b[38;5;66;03m# Ignore 'invalid' errors so the edge case of a data set with length 0\u001b[39;00m\n\u001b[0;32m   7398\u001b[0m     \u001b[38;5;66;03m# is handled without spurious warnings.\u001b[39;00m\n\u001b[0;32m   7399\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m np\u001b[38;5;241m.\u001b[39merrstate(invalid\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m'\u001b[39m):\n",
+      "\u001b[1;31mValueError\u001b[0m: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are:\n0.4778112197861302"
+     ]
+    }
+   ],
+   "source": [
+    "#H0: our observations follow Poisson\n",
+    "#H1: our observations do not follow Poisson\n",
+    "observed=np.array([32,15,9,4])\n",
+    "mu=2\n",
+    "poisson_dist=poisson(mu)\n",
+    "\n",
+    "poisson_pmfs=np.array([poisson_dist.pmf(i) for i in [0,1,3,4]])\n",
+    "\n",
+    "expected=poisson_pmfs*60\n",
+    "\n",
+    "st.chisquare(f_obs=observed,f_exp=expected)"
    ]
   },
   {
@@ -60,11 +200,44 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Power_divergenceResult(statistic=8.306179519542805, pvalue=0.015715783395950887)"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "n=10\n",
+    "p=0.05\n",
+    "observed=np.array([138,53,9])\n",
+    "\n",
+    "from scipy.stats import binom\n",
+    "binom_dist=binom(n,p)\n",
+    "\n",
+    "binom_pmfs=np.array([binom_dist.pmf(i) for i in range(0,2)])\n",
+    "\n",
+    "with_tail=np.append(binom_pmfs,1-binom_pmfs.sum())\n",
+    "\n",
+    "expected=with_tail*200\n",
+    "\n",
+    "st.chisquare(f_obs=observed,f_exp=expected)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# your answer here"
+    "#we can reject that our observations follow a binomial distribution"
    ]
   },
   {
@@ -79,11 +252,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#contingency table\n",
+    "\n",
+    "#H0: physical activity is independent of consumption of sugary drinks\n",
+    "#H1: physical activity is dependent on consumption of sugary drinks\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Chi2ContingencyResult(statistic=10.712198008709638, pvalue=0.004719280137040844, dof=2, expected_freq=array([[24.08421053, 19.91578947],\n",
+       "       [19.70526316, 16.29473684],\n",
+       "       [ 8.21052632,  6.78947368]]))"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "table=np.array([[32,12],[14,22],[6,9]])\n",
+    "st.chi2_contingency(table)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
    "metadata": {},
    "outputs": [],
    "source": [
-    "#your answer here"
+    "#we can reject the null hypothesis, so there seems to be a correlation between physical activity and consumption of sugary drinks"
    ]
   }
  ],
@@ -103,7 +311,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.3"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,