diff --git a/.github/workflows/rhiza_marimo.yml b/.github/workflows/rhiza_marimo.yml
new file mode 100644
index 0000000..4109d6c
--- /dev/null
+++ b/.github/workflows/rhiza_marimo.yml
@@ -0,0 +1,100 @@
+# This file is part of the jebel-quant/rhiza repository
+# (https://github.com/jebel-quant/rhiza).
+#
+# Workflow: Marimo Notebooks
+#
+# Purpose: This workflow discovers and executes all Marimo notebooks in the
+#          repository. It builds a dynamic matrix to run each notebook in
+#          parallel, to surface errors early and keep notebooks reproducible.
+#
+# Trigger: This workflow runs on every push.
+#
+# Components:
+#   - πŸ”Ž Discover notebooks in the configured marimo folder (default: marimo)
+#   - πŸ§ͺ Run each notebook in parallel using a matrix strategy
+#   - βœ… Fail-fast disabled to report all failing notebooks
+
+name: "(RHIZA) MARIMO"
+
+permissions:
+  contents: read
+
+on:
+  push
+
+jobs:
+  # Build a matrix of notebooks to test
+  list-notebooks:
+    runs-on: ubuntu-latest
+    outputs:
+      notebook-list: ${{ steps.notebooks.outputs.matrix }}
+    steps:
+      # Check out the repository code
+      - uses: actions/checkout@v6.0.2
+
+      # Find all Python files in the marimo folder and create a matrix for parallel execution
+      - name: Find notebooks and build matrix
+        id: notebooks
+        run: |
+          # Extract MARIMO_FOLDER from the project configuration (via Makefile)
+          # shellcheck disable=SC2016 # Single quotes intentional - Make syntax, not shell expansion
+          NOTEBOOK_DIR=$(make -s -f Makefile -f - <<< 'print: ; @echo $(or $(MARIMO_FOLDER),marimo)' print)
+
+          echo "Searching notebooks in: $NOTEBOOK_DIR"
+          # Check if directory exists
+          if [ ! -d "$NOTEBOOK_DIR" ]; then
+            echo "Directory $NOTEBOOK_DIR does not exist. Setting empty matrix."
+            echo "matrix=[]" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Find notebooks and handle empty results
+          if [ -z "$(find "$NOTEBOOK_DIR" -maxdepth 1 -name "*.py" 2>/dev/null)" ]; then
+            echo "No notebooks found in $NOTEBOOK_DIR. Setting empty matrix."
+            echo "matrix=[]" >> "$GITHUB_OUTPUT"
+          else
+            notebooks=$(find "$NOTEBOOK_DIR" -maxdepth 1 -name "*.py" -print0 | xargs -0 -n1 echo | jq -R -s -c 'split("\n")[:-1]')
+            echo "matrix=$notebooks" >> "$GITHUB_OUTPUT"
+          fi
+        shell: bash
+
+  # Create one job per notebook using the matrix strategy for parallel execution
+  test-notebooks:
+    if: needs.list-notebooks.outputs.notebook-list != '[]'
+    runs-on: ubuntu-latest
+    needs: list-notebooks
+    strategy:
+      matrix:
+        notebook: ${{ fromJson(needs.list-notebooks.outputs.notebook-list) }}
+      # Don't fail the entire workflow if one notebook fails
+      fail-fast: false
+    name: Run notebook ${{ matrix.notebook }}
+    steps:
+      # Check out the repository code
+      - uses: actions/checkout@v6.0.2
+        with:
+          lfs: true
+
+      # Install uv/uvx
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7.2.1
+        with:
+          version: "0.9.28"
+
+      # Execute the notebook in a fresh, self-contained environment
+      - name: Run notebook
+        env:
+          UV_EXTRA_INDEX_URL: ${{ secrets.UV_EXTRA_INDEX_URL }}
+        run: |
+          uvx uv run "${{ matrix.notebook }}"
+          # uvx    β†’ creates a fresh ephemeral environment
+          # uv run β†’ runs the notebook as a script in that ephemeral env
+          # No project packages are pre-installed.
+          # βœ… This forces the notebook to handle its dependencies explicitly
+          #    (e.g. via inline script metadata, or a pip install inside the script).
+          # βœ… It's a true integration smoke test.
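+          # Here, each notebook declares its own dependencies via inline
+          # script metadata (PEP 723), which `uv run` resolves on the fly.
+          # For example, the header of marimo/Ch03-linreg-lab.py in this
+          # repository reads:
+          #   # /// script
+          #   # dependencies = ["marimo==0.19.6", "numpy==2.3.1", ...]
+          #   # requires-python = ">=3.12"
+          #   # ///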
+          # Benefits of this pattern:
+          #   - Confirms the notebook can bootstrap itself in a fresh environment
+          #   - Catches missing dependency declarations or install steps early
+          #   - Ensures CI and other users can run the notebook without manual setup
+        shell: bash
diff --git a/Makefile b/Makefile
index 979800a..3a48ca7 100644
--- a/Makefile
+++ b/Makefile
@@ -24,5 +24,5 @@ jupyter: install ## Install and start jupyter Lab
 
 .PHONY: marimo
 marimo: install ## Install and start marimo
-	@uv run pip install marimo
-	@uv run marimo edit --no-token --headless .
+	@uv run pip install marimo
+	@uv run marimo edit --no-token --headless .
diff --git a/marimo/Ch03-linreg-lab.py b/marimo/Ch03-linreg-lab.py
new file mode 100644
index 0000000..4a7c668
--- /dev/null
+++ b/marimo/Ch03-linreg-lab.py
@@ -0,0 +1,899 @@
+# /// script
+# dependencies = [
+#     "marimo==0.19.6",
+#     "numpy==2.3.1",
+#     "pandas",
+#     "matplotlib",
+#     "statsmodels",
+#     "ISLP"
+# ]
+# requires-python = ">=3.12"
+# ///
+
+import marimo
+
+__generated_with = "0.19.6"
+app = marimo.App()
+
+with app.setup:
+    import marimo as mo
+    import numpy as np
+    import pandas as pd
+
+    from matplotlib.pyplot import subplots
+
+    import statsmodels.api as sm
+    from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
+    from statsmodels.stats.anova import anova_lm
+
+    from ISLP import load_data
+    from ISLP.models import (ModelSpec as MS,
+                             summarize,
+                             poly)
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    # Linear Regression
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Importing packages
+    We import our standard libraries at the top
+    level.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ### New imports
+    Throughout this lab we will introduce new functions and libraries; however,
+    we import them here to emphasize that these are the new
+    code objects in this lab. Keeping imports near the top
+    of a notebook makes the code more readable, since scanning the first few
+    lines tells us what libraries are used.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    We will provide relevant details about the
+    functions below as they are needed.
+
+    Besides importing whole modules, it is also possible
+    to import only a few items from a given module. This
+    will help keep the *namespace* clean.
+    We will use a few specific objects from the `statsmodels` package,
+    which we import here.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    As one of the import statements above is quite long, we have split it
+    across several lines (within parentheses) to ease readability.
+
+    We will also use some functions written for the labs in this book in the `ISLP`
+    package.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ### Inspecting Objects and Namespaces
+    The function `dir()` provides a list of
+    objects in a namespace.
+    """)
+    return
+
+
+@app.cell
+def _():
+    dir()
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    This shows you everything that `Python` can find at the top level.
+    There are certain objects like `__builtins__` that contain references
+    to built-in functions like `print()`.
+
+    Every `Python` object has its own notion of
+    namespace, also accessible with `dir()`. This will include
+    both the attributes of the object
+    as well as any methods associated with it. For instance, we
+    see `'sum'` in the listing for an array.
+    """)
+    return
+
+
+@app.cell
+def _():
+    A = np.array([3, 5, 11])
+    dir(A)
+    return (A,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    This indicates that the object `A.sum` exists. In this case it is a method
+    that can be used to compute the sum of the array `A`, as can be seen by typing `A.sum?`.
+    """)
+    return
+
+
+@app.cell
+def _(A):
+    A.sum()
+    return
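+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    The trailing `?` is IPython/Jupyter shorthand. In a plain `Python`
+    session, or in a marimo cell, the built-in `help()` function retrieves
+    the same documentation:
+    """)
+    return
+
+
+@app.cell
+def _(A):
+    # standard-library equivalent of the notebook shorthand A.sum?
+    help(A.sum)
+    return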
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Simple Linear Regression
+    In this section we will construct model
+    matrices (also called design matrices) using the `ModelSpec()` transform from `ISLP.models`.
+
+    We will use the `Boston` housing data set, which is contained in the `ISLP` package. The `Boston` dataset records `medv` (median house value) for $506$ neighborhoods
+    around Boston. We will build a regression model to predict `medv` using $12$
+    predictors such as `rm` (average number of rooms per house),
+    `age` (proportion of owner-occupied units built prior to 1940), and `lstat` (percent of
+    households with low socioeconomic status). For this task we will use `statsmodels`,
+    a `Python` package that implements several commonly used
+    regression methods.
+
+    We have included a simple loading function `load_data()` in the
+    `ISLP` package:
+    """)
+    return
+
+
+@app.cell
+def _():
+    Boston = load_data("Boston")
+    Boston.columns
+    return (Boston,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Type `Boston?` to find out more about these data.
+
+    We start by using the `sm.OLS()` function to fit a
+    simple linear regression model. Our response will be
+    `medv`, and `lstat` will be the single predictor.
+    For this model, we can create the model matrix by hand.
+    """)
+    return
+
+
+@app.cell
+def _(Boston):
+    X = pd.DataFrame({'intercept': np.ones(Boston.shape[0]),
+                      'lstat': Boston['lstat']})
+    X[:4]
+    return (X,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    We extract the response and fit the model.
+    """)
+    return
+
+
+@app.cell
+def _(Boston, X):
+    y = Boston['medv']
+    _model = sm.OLS(y, X)
+    results = _model.fit()
+    return results, y
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Note that `sm.OLS()` does
+    not fit the model; it specifies the model, and then `model.fit()` does the actual fitting.
+
+    Our `ISLP` function `summarize()` produces a simple table of the parameter estimates,
+    their standard errors, $t$-statistics and $p$-values.
+    The function takes a single argument, such as the object `results`
+    returned here by the `fit`
+    method, and returns such a summary.
+    """)
+    return
+
+
+@app.cell
+def _(results):
+    summarize(results)
+    return
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Before we describe other methods for working with fitted models, we outline a more useful and general framework for constructing a model matrix `X`.
+    ### Using Transformations: Fit and Transform
+    Our model above has a single predictor, and constructing `X` was straightforward.
+    In practice we often fit models with more than one predictor, typically selected from an array or data frame.
+    We may wish to introduce transformations to the variables before fitting the model, specify interactions between variables, and expand some particular variables into sets of variables (e.g. polynomials).
+    The `sklearn` package has a particular notion
+    for this type of task: a *transform*. A transform is an object
+    that is created with some parameters as arguments. The
+    object has two main methods: `fit()` and `transform()`.
+
+    We provide a general approach for specifying models and constructing
+    the model matrix through the transform `ModelSpec()` in the `ISLP` library.
+    `ModelSpec()`
+    (renamed `MS()` in the preamble) creates a
+    transform object, and then a pair of methods
+    `transform()` and `fit()` are used to construct a
+    corresponding model matrix.
+
+    We first describe this process for our simple regression model using a single predictor `lstat` in
+    the `Boston` data frame, but we will use it repeatedly in more
+    complex tasks in this and other labs in this book.
+    In our case the transform is created by the expression
+    `design = MS(['lstat'])`.
+
+    The `fit()` method takes the original array and may do some
+    initial computations on it, as specified in the transform object.
+    For example, it may compute means and standard deviations for centering and scaling.
+    The `transform()`
+    method applies the fitted transformation to the array of data, and produces the model matrix.
+    """)
+    return
+
+
+@app.cell
+def _(Boston):
+    design = MS(['lstat'])
+    design = design.fit(Boston)
+    X_1 = design.transform(Boston)
+    X_1[:4]
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    In this simple case, the `fit()` method does very little; it simply checks that the variable `'lstat'` specified in `design` exists in `Boston`. Then `transform()` constructs the model matrix with two columns: an `intercept` and the variable `lstat`.
+
+    These two operations can be combined with the
+    `fit_transform()` method.
+    """)
+    return
+
+
+@app.cell
+def _(Boston):
+    design_1 = MS(['lstat'])
+    X_2 = design_1.fit_transform(Boston)
+    X_2[:4]
+    return X_2, design_1
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Note that, as in the previous code chunk where the two steps were done separately, the `design` object is changed as a result of the `fit()` operation. The power of this pipeline will become clearer when we fit more complex models that involve interactions and transformations.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Let's return to our fitted regression model.
+    The object
+    `results` has several methods that can be used for inference.
+    We already presented a function `summarize()` for showing the essentials of the fit.
+    For a full and somewhat exhaustive summary of the fit, we can use the `summary()`
+    method.
+    """)
+    return
+
+
+@app.cell
+def _(results):
+    results.summary()
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    The fitted coefficients can also be retrieved as the
+    `params` attribute of `results`.
+    """)
+    return
+
+
+@app.cell
+def _(results):
+    results.params
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    The `get_prediction()` method can be used to obtain predictions, and to produce confidence intervals and
+    prediction intervals for the prediction of `medv` for given values of `lstat`.
+
+    We first create a new data frame, in this case containing only the variable `lstat`, with the values of this variable at which we wish to make predictions.
+    We then use the `transform()` method of `design` to create the corresponding model matrix.
+    """)
+    return
+
+
+@app.cell
+def _(design_1):
+    new_df = pd.DataFrame({'lstat': [5, 10, 15]})
+    newX = design_1.transform(new_df)
+    newX
+    return (newX,)
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Next we compute the predictions at `newX`, and view them by extracting the `predicted_mean` attribute.
+    """)
+    return
+
+
+@app.cell
+def _(newX, results):
+    new_predictions = results.get_prediction(newX)
+    new_predictions.predicted_mean
+    return (new_predictions,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    We can produce confidence intervals for the predicted values.
+    """)
+    return
+
+
+@app.cell
+def _(new_predictions):
+    new_predictions.conf_int(alpha=0.05)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Prediction intervals are computed by setting `obs=True`:
+    """)
+    return
+
+
+@app.cell
+def _(new_predictions):
+    new_predictions.conf_int(obs=True, alpha=0.05)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    For instance, the 95% confidence interval associated with an
+    `lstat` value of 10 is (24.47, 25.63), and the 95% prediction
+    interval is (12.82, 37.28). As expected, the confidence and
+    prediction intervals are centered around the same point (a predicted
+    value of 25.05 for `medv` when `lstat` equals
+    10), but the latter are substantially wider.
+
+    Next we will plot `medv` and `lstat`
+    using `DataFrame.plot.scatter()`,
+    and wish to
+    add the regression line to the resulting plot.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ### Defining Functions
+    While there is a function
+    within the `ISLP` package that adds a line to an existing plot, we take this opportunity
+    to define our first function to do so.
+    """)
+    return
+
+
+@app.function
+def abline(ax, b, m):
+    """Add a line with slope m and intercept b to ax"""
+    xlim = ax.get_xlim()
+    ylim = [m * xlim[0] + b, m * xlim[1] + b]
+    ax.plot(xlim, ylim)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    A few things are illustrated above. First, we see the syntax for defining a function:
+    `def funcname(...)`. The function has arguments `ax, b, m`,
+    where `ax` is an axis object for an existing plot, `b` is the intercept and
+    `m` is the slope of the desired line. Other plotting options can be passed on to
+    `ax.plot` by including additional optional arguments as follows:
+    """)
+    return
+
+
+@app.function
+def abline_1(ax, b, m, *args, **kwargs):
+    """Add a line with slope m and intercept b to ax"""
+    xlim = ax.get_xlim()
+    ylim = [m * xlim[0] + b, m * xlim[1] + b]
+    ax.plot(xlim, ylim, *args, **kwargs)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    The addition of `*args` allows any number of
+    non-named arguments to `abline_1`, while `**kwargs` allows any
+    number of named arguments (such as `linewidth=3`).
+    In our function, we pass
+    these arguments verbatim to `ax.plot` above. Readers
+    interested in learning more about
+    functions are referred to the section on
+    defining functions in [docs.python.org/tutorial](https://docs.python.org/3/tutorial/controlflow.html#defining-functions).
+
+    Let's use our new function to add this regression line to a plot of
+    `medv` vs. `lstat`.
+    """)
+    return
+
+
+@app.cell
+def _(Boston, results):
+    _ax = Boston.plot.scatter('lstat', 'medv')
+    abline_1(_ax, results.params['intercept'], results.params['lstat'], 'r--', linewidth=3)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Thus, the final call to `ax.plot()` is `ax.plot(xlim, ylim, 'r--', linewidth=3)`.
+    We have used the argument `'r--'` to produce a red dashed line, and added
+    an argument to make it of width 3.
+    There is some evidence for non-linearity in the relationship between `lstat` and `medv`. We will explore this issue later in this lab.
+
+    As mentioned above, there is an existing function to add a line to a plot --- `ax.axline()` --- but knowing how to write such functions empowers us to create more expressive displays.
+    """)
+    return
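+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    For comparison, a quick illustration: `ax.axline()` takes a point and a
+    slope, so the same line can be drawn directly.
+    """)
+    return
+
+
+@app.cell
+def _(Boston, results):
+    _ax = Boston.plot.scatter('lstat', 'medv')
+    # draw the fitted line through its value at lstat=0 (the intercept),
+    # with the fitted slope
+    _ax.axline((0, results.params['intercept']),
+               slope=results.params['lstat'],
+               c='r', ls='--', linewidth=3)
+    return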
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Next we examine some diagnostic plots, several of which were discussed
+    earlier in this chapter.
+    We can find the fitted values and residuals
+    of the fit as attributes of the `results` object.
+    Various influence measures describing the regression model
+    are computed with the `get_influence()` method.
+    As we will not use the `fig` component returned
+    as the first value from `subplots()`, we simply
+    capture the second returned value in `ax` below.
+    """)
+    return
+
+
+@app.cell
+def _(results):
+    _ax = subplots(figsize=(8, 8))[1]
+    _ax.scatter(results.fittedvalues, results.resid)
+    _ax.set_xlabel('Fitted value')
+    _ax.set_ylabel('Residual')
+    _ax.axhline(0, c='k', ls='--')
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    We add a horizontal line at 0 for reference using the
+    `ax.axhline()` method, indicating
+    it should be black (`c='k'`) and have a dashed linestyle (`ls='--'`).
+
+    On the basis of the residual plot, there is some evidence of non-linearity.
+    Leverage statistics can be computed for any number of predictors using the
+    `hat_matrix_diag` attribute of the value returned by the
+    `get_influence()` method.
+    """)
+    return
+
+
+@app.cell
+def _(X_2, results):
+    infl = results.get_influence()
+    _ax = subplots(figsize=(8, 8))[1]
+    _ax.scatter(np.arange(X_2.shape[0]), infl.hat_matrix_diag)
+    _ax.set_xlabel('Index')
+    _ax.set_ylabel('Leverage')
+    np.argmax(infl.hat_matrix_diag)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    The `np.argmax()` function identifies the index of the largest element of an array, optionally computed over an axis of the array.
+    In this case, we maximized over the entire array
+    to determine which observation has the largest leverage statistic.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Multiple Linear Regression
+    In order to fit a multiple linear regression model using least squares, we again use
+    the `ModelSpec()` transform to construct the required
+    model matrix and response. The arguments
+    to `ModelSpec()` can be quite general, but in this case
+    a list of column names suffices. We consider a fit here with
+    the two variables `lstat` and `age`.
+    """)
+    return
+
+
+@app.cell
+def _(Boston, y):
+    X_3 = MS(['lstat', 'age']).fit_transform(Boston)
+    _model1 = sm.OLS(y, X_3)
+    results1 = _model1.fit()
+    summarize(results1)
+    return (results1,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Notice how we have compacted the first line into a succinct expression describing the construction of `X`.
+
+    The `Boston` data set contains 12 predictors in addition to the response `medv`, and so it would be cumbersome
+    to have to type all of these in order to perform a regression using all of the predictors.
+    Instead, we can use the following short-hand:
+    """)
+    return
+
+
+@app.cell
+def _(Boston):
+    terms = Boston.columns.drop('medv')
+    terms
+    return (terms,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    We can now fit the model with all the variables in `terms` using
+    the same model matrix builder.
+    """)
+    return
+
+
+@app.cell
+def _(Boston, terms, y):
+    X_4 = MS(terms).fit_transform(Boston)
+    _model = sm.OLS(y, X_4)
+    results_1 = _model.fit()
+    summarize(results_1)
+    return (X_4,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    What if we would like to perform a regression using all of the variables but one? For
+    example, in the above regression output, `age` has a high $p$-value.
+    So we may wish to run a regression excluding this predictor.
+    The following syntax results in a regression using all predictors except `age`.
+    """)
+    return
+
+
+@app.cell
+def _(Boston, y):
+    minus_age = Boston.columns.drop(['medv', 'age'])
+    Xma = MS(minus_age).fit_transform(Boston)
+    _model1 = sm.OLS(y, Xma)
+    summarize(_model1.fit())
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Multivariate Goodness of Fit
+    We can access the individual components of `results` by name
+    (`dir(results)` shows us what is available). Hence
+    `results.rsquared` gives us the $R^2$,
+    and
+    `np.sqrt(results.scale)` gives us the RSE, as computed below.
+    """)
+    return
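+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    For example, as a quick illustration using our simple regression
+    fit `results`:
+    """)
+    return
+
+
+@app.cell
+def _(results):
+    # R^2 and residual standard error (RSE) of the simple fit
+    (results.rsquared, np.sqrt(results.scale))
+    return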
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Variance inflation factors (discussed earlier in this chapter) are sometimes useful
+    to assess the effect of collinearity in the model matrix of a regression model.
+    We will compute the VIFs in our multiple regression fit, and use the opportunity to introduce the idea of *list comprehension*.
+
+    ### List Comprehension
+    Often we encounter a sequence of objects which we would like to transform
+    for some other task. Below, we compute the VIF for each
+    feature in our `X` matrix and produce a data frame
+    whose index agrees with the columns of `X`.
+    The notion of list comprehension can often make such
+    a task easier.
+
+    List comprehensions are simple and powerful ways to form
+    lists of `Python` objects. The language also supports
+    dictionary and *generator* comprehensions, though these are
+    beyond our scope here. Let's look at an example. We compute the VIF for each of the variables
+    in the model matrix `X`, using the function `variance_inflation_factor()`.
+    """)
+    return
+
+
+@app.cell
+def _(X_4):
+    _vals = [VIF(X_4, i) for i in range(1, X_4.shape[1])]
+    vif = pd.DataFrame({'vif': _vals}, index=X_4.columns[1:])
+    vif
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    The function `VIF()` takes two arguments: a dataframe or array,
+    and a variable column index. In the code above we call `VIF()` on the fly for all columns in `X`.
+    We have excluded column 0 above (the intercept), which is not of interest. In this case the VIFs are not that exciting.
+
+    The object `_vals` above could have been constructed with the following for loop:
+    """)
+    return
+
+
+@app.cell
+def _(X_4):
+    _vals = []
+    for i in range(1, X_4.values.shape[1]):
+        _vals.append(VIF(X_4.values, i))
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    List comprehension allows us to perform such repetitive operations in a more straightforward way.
+    ## Interaction Terms
+    It is easy to include interaction terms in a linear model using `ModelSpec()`.
+    Including a tuple `("lstat","age")` tells the model
+    matrix builder to include an interaction term between
+    `lstat` and `age`.
+    """)
+    return
+
+
+@app.cell
+def _(Boston, y):
+    X_5 = MS(['lstat', 'age', ('lstat', 'age')]).fit_transform(Boston)
+    model2 = sm.OLS(y, X_5)
+    summarize(model2.fit())
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Non-linear Transformations of the Predictors
+    The model matrix builder can include terms beyond
+    just column names and interactions. For instance,
+    the `poly()` function supplied in `ISLP` specifies that
+    columns representing polynomial functions
+    of its first argument are added to the model matrix.
+    """)
+    return
+@app.cell
+def _(Boston, y):
+    X_6 = MS([poly('lstat', degree=2), 'age']).fit_transform(Boston)
+    model3 = sm.OLS(y, X_6)
+    results3 = model3.fit()
+    summarize(results3)
+    return (results3,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    The effectively zero *p*-value associated with the quadratic term
+    (i.e. the third row above) suggests that it leads to an improved model.
+
+    By default, `poly()` creates a basis matrix for inclusion in the
+    model matrix whose
+    columns are *orthogonal polynomials*, which are designed for stable
+    least squares computations. (Actually, `poly()` is a wrapper for the workhorse and standalone function `Poly()` that does the work in building the model matrix.)
+    Alternatively, had we included an argument
+    `raw=True` in the above call to `poly()`, the basis matrix would have consisted simply of
+    `lstat` and `lstat**2`. Since both of these bases
+    represent quadratic polynomials, the fitted values would not
+    change in this case, just the polynomial coefficients. Also by default, the columns
+    created by `poly()` do not include an intercept column, as
+    that is automatically added by `MS()`.
+
+    We use the `anova_lm()` function to further quantify the extent to which the quadratic fit is
+    superior to the linear fit.
+    """)
+    return
+
+
+@app.cell
+def _(results1, results3):
+    anova_lm(results1, results3)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    Here `results1` represents the linear submodel containing
+    predictors `lstat` and `age`,
+    while `results3` corresponds to the larger model above with a quadratic
+    term in `lstat`.
+    The `anova_lm()` function performs a hypothesis test
+    comparing the two models. The null hypothesis is that the quadratic
+    term in the bigger model is not needed, and the alternative hypothesis is that the
+    bigger model is superior. Here the *F*-statistic is 177.28 and
+    the associated *p*-value is zero.
+    In this case the *F*-statistic is the square of the
+    *t*-statistic for the quadratic term in the linear model summary
+    for `results3` --- a consequence of the fact that these nested
+    models differ by one degree of freedom.
+    This provides very clear evidence that the quadratic polynomial in
+    `lstat` improves the linear model.
+    This is not surprising, since earlier we saw evidence for non-linearity in the relationship between `medv`
+    and `lstat`.
+
+    The function `anova_lm()` can take more than two nested models
+    as input, in which case it compares every successive pair of models.
+    That also explains why there are `NaN`s in the first row above, since
+    there is no previous model with which to compare the first.
+    """)
+    return
+
+
+@app.cell
+def _(results3):
+    _ax = subplots(figsize=(8, 8))[1]
+    _ax.scatter(results3.fittedvalues, results3.resid)
+    _ax.set_xlabel('Fitted value')
+    _ax.set_ylabel('Residual')
+    _ax.axhline(0, c='k', ls='--')
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    We see that when the quadratic term is included in the model,
+    there is little discernible pattern in the residuals.
+    In order to create a cubic or higher-degree polynomial fit, we can simply change the degree argument
+    to `poly()`, as sketched below.
+    """)
+    return
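+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    For instance, a cubic fit (a brief illustrative sketch) can be compared
+    with the linear and quadratic fits in a single `anova_lm()` call:
+    """)
+    return
+
+
+@app.cell
+def _(Boston, results1, results3, y):
+    # fit a degree-3 polynomial in lstat and compare all three nested models
+    X_cubic = MS([poly('lstat', degree=3), 'age']).fit_transform(Boston)
+    results_cubic = sm.OLS(y, X_cubic).fit()
+    anova_lm(results1, results3, results_cubic)
+    return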
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Qualitative Predictors
+    Here we use the `Carseats` data, which is included in the
+    `ISLP` package. We will attempt to predict `Sales`
+    (child car seat sales) in 400 locations based on a number of
+    predictors.
+    """)
+    return
+
+
+@app.cell
+def _():
+    Carseats = load_data('Carseats')
+    Carseats.columns
+    return (Carseats,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    The `Carseats`
+    data includes qualitative predictors such as
+    `ShelveLoc`, an indicator of the quality of the shelving
+    location --- that is,
+    the space within a store in which the car seat is displayed. The predictor
+    `ShelveLoc` takes on three possible values, `Bad`, `Medium`, and `Good`.
+    Given a qualitative variable such as `ShelveLoc`, `ModelSpec()` generates dummy
+    variables automatically.
+    These variables are often referred to as a *one-hot encoding* of the categorical
+    feature. Their columns sum to one, so to avoid collinearity with an intercept, the first column is dropped. Below we see that
+    the column `ShelveLoc[Bad]` has been dropped, since `Bad` is the first level of `ShelveLoc`.
+    Below we fit a multiple regression model that includes some interaction terms.
+    """)
+    return
+
+
+@app.cell
+def _(Carseats):
+    allvars = list(Carseats.columns.drop('Sales'))
+    y_1 = Carseats['Sales']
+    final = allvars + [('Income', 'Advertising'), ('Price', 'Age')]
+    X_7 = MS(final).fit_transform(Carseats)
+    _model = sm.OLS(y_1, X_7)
+    summarize(_model.fit())
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    In the first line above, we made `allvars` a list, so that we
+    could add the interaction terms two lines down.
+    Our model-matrix builder has created a `ShelveLoc[Good]`
+    dummy variable that takes on a value of 1 if the
+    shelving location is good, and 0 otherwise. It has also created a `ShelveLoc[Medium]`
+    dummy variable that equals 1 if the shelving location is medium, and 0 otherwise.
+    A bad shelving location corresponds to a zero for each of the two dummy variables.
+    The fact that the coefficient for `ShelveLoc[Good]` in the regression output is
+    positive indicates that a good shelving location is associated with high sales (relative to a bad location).
+    `ShelveLoc[Medium]` has a smaller positive coefficient,
+    indicating that a medium shelving location is associated with higher sales than a bad
+    shelving location, but lower sales than a good shelving location.
+    """)
+    return
+
+
+if __name__ == "__main__":
+    app.run()