From 683f99e06d706e33d99196a1099531f820cda5e9 Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Tue, 3 Feb 2026 21:52:06 +0400 Subject: [PATCH 01/13] Create .gitkeep in marimo folder --- marimo/.gitkeep | 1 + 1 file changed, 1 insertion(+) create mode 100644 marimo/.gitkeep diff --git a/marimo/.gitkeep b/marimo/.gitkeep new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/marimo/.gitkeep @@ -0,0 +1 @@ + From 5fc9de1fc8e2435e134ec45c6a1891e19e91989b Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Tue, 3 Feb 2026 22:43:34 +0400 Subject: [PATCH 02/13] a first notebook for marimo --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 979800a0..3a48ca76 100644 --- a/Makefile +++ b/Makefile @@ -24,5 +24,5 @@ jupyter: install ## Install and start jupyter Lab .PHONY: marimo marimo: install ## Install and start marimo - @uv run pip install marimo - @uv run marimo edit --no-token --headless . + @uv run pip install marimo + @uv run marimo edit --no-token --headless . From 28d3317abac533e214deb52ccb990381e85cf375 Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Tue, 3 Feb 2026 22:44:03 +0400 Subject: [PATCH 03/13] a first notebook --- marimo/Ch03-linreg-lab.py | 909 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 909 insertions(+) create mode 100644 marimo/Ch03-linreg-lab.py diff --git a/marimo/Ch03-linreg-lab.py b/marimo/Ch03-linreg-lab.py new file mode 100644 index 00000000..2d7baa4e --- /dev/null +++ b/marimo/Ch03-linreg-lab.py @@ -0,0 +1,909 @@ +import marimo + +__generated_with = "0.19.7" +app = marimo.App() + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +@app.cell +def _(mo): + mo.md(r""" + # Linear Regression + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Importing packages + We import our standard libraries at this top + level. + """) + return + + +@app.cell +def _(): + import numpy as np + import pandas as pd + from matplotlib.pyplot import subplots + return np, pd, subplots + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### New imports + Throughout this lab we will introduce new functions and libraries. However, + we will import them here to emphasize these are the new + code objects in this lab. Keeping imports near the top + of a notebook makes the code more readable, since scanning the first few + lines tells us what libraries are used. + """) + return + + +@app.cell +def _(): + import statsmodels.api as sm + return (sm,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We will provide relevant details about the + functions below as they are needed. + + Besides importing whole modules, it is also possible + to import only a few items from a given module. This + will help keep the *namespace* clean. + We will use a few specific objects from the `statsmodels` package + which we import here. + """) + return + + +@app.cell +def _(): + from statsmodels.stats.outliers_influence \ + import variance_inflation_factor as VIF + from statsmodels.stats.anova import anova_lm + return VIF, anova_lm + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + As one of the import statements above is quite a long line, we inserted a line break `\` to + ease readability. + + We will also use some functions written for the labs in this book in the `ISLP` + package. + """) + return + + +@app.cell +def _(): + from ISLP import load_data + from ISLP.models import (ModelSpec as MS, + summarize, + poly) + return MS, load_data, poly, summarize + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Inspecting Objects and Namespaces + The + function `dir()` + provides a list of + objects in a namespace. + """) + return + + +@app.cell +def _(): + dir() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This shows you everything that `Python` can find at the top level. + There are certain objects like `__builtins__` that contain references to built-in + functions like `print()`. + + Every python object has its own notion of + namespace, also accessible with `dir()`. This will include + both the attributes of the object + as well as any methods associated with it. For instance, we see `'sum'` in the listing for an + array. + """) + return + + +@app.cell +def _(np): + A = np.array([3,5,11]) + dir(A) + return (A,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This indicates that the object `A.sum` exists. In this case it is a method + that can be used to compute the sum of the array `A` as can be seen by typing `A.sum?`. + """) + return + + +@app.cell +def _(A): + A.sum() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Simple Linear Regression + In this section we will construct model + matrices (also called design matrices) using the `ModelSpec()` transform from `ISLP.models`. + + We will use the `Boston` housing data set, which is contained in the `ISLP` package. The `Boston` dataset records `medv` (median house value) for $506$ neighborhoods + around Boston. We will build a regression model to predict `medv` using $13$ + predictors such as `rm` (average number of rooms per house), + `age` (proportion of owner-occupied units built prior to 1940), and `lstat` (percent of + households with low socioeconomic status). We will use `statsmodels` for this + task, a `Python` package that implements several commonly used + regression methods. + + We have included a simple loading function `load_data()` in the + `ISLP` package: + """) + return + + +@app.cell +def _(load_data): + Boston = load_data("Boston") + Boston.columns + return (Boston,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Type `Boston?` to find out more about these data. + + We start by using the `sm.OLS()` function to fit a + simple linear regression model. Our response will be + `medv` and `lstat` will be the single predictor. + For this model, we can create the model matrix by hand. + """) + return + + +@app.cell +def _(Boston, np, pd): + X = pd.DataFrame({'intercept': np.ones(Boston.shape[0]), + 'lstat': Boston['lstat']}) + X[:4] + return (X,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We extract the response, and fit the model. + """) + return + + +@app.cell +def _(Boston, X, sm): + y = Boston['medv'] + _model = sm.OLS(y, X) + results = _model.fit() + return results, y + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Note that `sm.OLS()` does + not fit the model; it specifies the model, and then `model.fit()` does the actual fitting. + + Our `ISLP` function `summarize()` produces a simple table of the parameter estimates, + their standard errors, t-statistics and p-values. + The function takes a single argument, such as the object `results` + returned here by the `fit` + method, and returns such a summary. + """) + return + + +@app.cell +def _(results, summarize): + summarize(results) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Before we describe other methods for working with fitted models, we outline a more useful and general framework for constructing a model matrix~`X`. + ### Using Transformations: Fit and Transform + Our model above has a single predictor, and constructing `X` was straightforward. + In practice we often fit models with more than one predictor, typically selected from an array or data frame. + We may wish to introduce transformations to the variables before fitting the model, specify interactions between variables, and expand some particular variables into sets of variables (e.g. polynomials). + The `sklearn` package has a particular notion + for this type of task: a *transform*. A transform is an object + that is created with some parameters as arguments. The + object has two main methods: `fit()` and `transform()`. + + We provide a general approach for specifying models and constructing + the model matrix through the transform `ModelSpec()` in the `ISLP` library. + `ModelSpec()` + (renamed `MS()` in the preamble) creates a + transform object, and then a pair of methods + `transform()` and `fit()` are used to construct a + corresponding model matrix. + + We first describe this process for our simple regression model using a single predictor `lstat` in + the `Boston` data frame, but will use it repeatedly in more + complex tasks in this and other labs in this book. + In our case the transform is created by the expression + `design = MS(['lstat'])`. + + The `fit()` method takes the original array and may do some + initial computations on it, as specified in the transform object. + For example, it may compute means and standard deviations for centering and scaling. + The `transform()` + method applies the fitted transformation to the array of data, and produces the model matrix. + """) + return + + +@app.cell +def _(Boston, MS): + design = MS(['lstat']) + design = design.fit(Boston) + X_1 = design.transform(Boston) + X_1[:4] + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + In this simple case, the `fit()` method does very little; it simply checks that the variable `'lstat'` specified in `design` exists in `Boston`. Then `transform()` constructs the model matrix with two columns: an `intercept` and the variable `lstat`. + + These two operations can be combined with the + `fit_transform()` method. + """) + return + + +@app.cell +def _(Boston, MS): + design_1 = MS(['lstat']) + X_2 = design_1.fit_transform(Boston) + X_2[:4] + return X_2, design_1 + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Note that, as in the previous code chunk when the two steps were done separately, the `design` object is changed as a result of the `fit()` operation. The power of this pipeline will become clearer when we fit more complex models that involve interactions and transformations. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Let's return to our fitted regression model. + The object + `results` has several methods that can be used for inference. + We already presented a function `summarize()` for showing the essentials of the fit. + For a full and somewhat exhaustive summary of the fit, we can use the `summary()` + method. + """) + return + + +@app.cell +def _(results): + results.summary() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The fitted coefficients can also be retrieved as the + `params` attribute of `results`. + """) + return + + +@app.cell +def _(results): + results.params + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The `get_prediction()` method can be used to obtain predictions, and produce confidence intervals and + prediction intervals for the prediction of `medv` for given values of `lstat`. + + We first create a new data frame, in this case containing only the variable `lstat`, with the values for this variable at which we wish to make predictions. + We then use the `transform()` method of `design` to create the corresponding model matrix. + """) + return + + +@app.cell +def _(design_1, pd): + new_df = pd.DataFrame({'lstat': [5, 10, 15]}) + newX = design_1.transform(new_df) + newX + return (newX,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Next we compute the predictions at `newX`, and view them by extracting the `predicted_mean` attribute. + """) + return + + +@app.cell +def _(newX, results): + new_predictions = results.get_prediction(newX); + new_predictions.predicted_mean + return (new_predictions,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We can produce confidence intervals for the predicted values. + """) + return + + +@app.cell +def _(new_predictions): + new_predictions.conf_int(alpha=0.05) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Prediction intervals are computed by setting `obs=True`: + """) + return + + +@app.cell +def _(new_predictions): + new_predictions.conf_int(obs=True, alpha=0.05) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + For instance, the 95% confidence interval associated with an + `lstat` value of 10 is (24.47, 25.63), and the 95% prediction + interval is (12.82, 37.28). As expected, the confidence and + prediction intervals are centered around the same point (a predicted + value of 25.05 for `medv` when `lstat` equals + 10), but the latter are substantially wider. + + Next we will plot `medv` and `lstat` + using `DataFrame.plot.scatter()`, \definelongblankMR{plot.scatter()}{plot.slashslashscatter()} + and wish to + add the regression line to the resulting plot. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Defining Functions + While there is a function + within the `ISLP` package that adds a line to an existing plot, we take this opportunity + to define our first function to do so. + """) + return + + +@app.function +def abline(ax, b, m): + """Add a line with slope m and intercept b to ax""" + xlim = _ax.get_xlim() + ylim = [m * xlim[0] + b, m * xlim[1] + b] + _ax.plot(xlim, ylim) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + A few things are illustrated above. First we see the syntax for defining a function: + `def funcname(...)`. The function has arguments `ax, b, m` + where `ax` is an axis object for an existing plot, `b` is the intercept and + `m` is the slope of the desired line. Other plotting options can be passed on to + `ax.plot` by including additional optional arguments as follows: + """) + return + + +@app.function +def abline_1(ax, b, m, *args, **kwargs): + """Add a line with slope m and intercept b to ax""" + xlim = ax.get_xlim() + ylim = [m * xlim[0] + b, m * xlim[1] + b] + ax.plot(xlim, ylim, *args, **kwargs) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The addition of `*args` allows any number of + non-named arguments to `abline`, while `**kwargs` allows any + number of named arguments (such as `linewidth=3`) to `abline`. + In our function, we pass + these arguments verbatim to `ax.plot` above. Readers + interested in learning more about + functions are referred to the section on + defining functions in [docs.python.org/tutorial](https://docs.python.org/3/tutorial/controlflow.html#defining-functions). + + Let’s use our new function to add this regression line to a plot of + `medv` vs. `lstat`. + """) + return + + +@app.cell +def _(Boston, results): + _ax = Boston.plot.scatter('lstat', 'medv') + abline_1(_ax, results.params['intercept'], results.params['lstat'], 'r--', linewidth=3) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Thus, the final call to `ax.plot()` is `ax.plot(xlim, ylim, 'r--', linewidth=3)`. + We have used the argument `'r--'` to produce a red dashed line, and added + an argument to make it of width 3. + There is some evidence for non-linearity in the relationship between `lstat` and `medv`. We will explore this issue later in this lab. + + As mentioned above, there is an existing function to add a line to a plot --- `ax.axline()` --- but knowing how to write such functions empowers us to create more expressive displays. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Next we examine some diagnostic plots, several of which were discussed + in Section~\ref{Ch3:problems.sec}. + We can find the fitted values and residuals + of the fit as attributes of the `results` object. + Various influence measures describing the regression model + are computed with the `get_influence()` method. + As we will not use the `fig` component returned + as the first value from `subplots()`, we simply + capture the second returned value in `ax` below. + """) + return + + +@app.cell +def _(results, subplots): + _ax = subplots(figsize=(8, 8))[1] + _ax.scatter(results.fittedvalues, results.resid) + _ax.set_xlabel('Fitted value') + _ax.set_ylabel('Residual') + _ax.axhline(0, c='k', ls='--') + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We add a horizontal line at 0 for reference using the + `ax.axhline()` method, indicating + it should be black (`c='k'`) and have a dashed linestyle (`ls='--'`). + + On the basis of the residual plot, there is some evidence of non-linearity. + Leverage statistics can be computed for any number of predictors using the + `hat_matrix_diag` attribute of the value returned by the + `get_influence()` method. + """) + return + + +@app.cell +def _(X_2, np, results, subplots): + infl = results.get_influence() + _ax = subplots(figsize=(8, 8))[1] + _ax.scatter(np.arange(X_2.shape[0]), infl.hat_matrix_diag) + _ax.set_xlabel('Index') + _ax.set_ylabel('Leverage') + np.argmax(infl.hat_matrix_diag) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The `np.argmax()` function identifies the index of the largest element of an array, optionally computed over an axis of the array. + In this case, we maximized over the entire array + to determine which observation has the largest leverage statistic. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Multiple Linear Regression + In order to fit a multiple linear regression model using least squares, we again use + the `ModelSpec()` transform to construct the required + model matrix and response. The arguments + to `ModelSpec()` can be quite general, but in this case + a list of column names suffice. We consider a fit here with + the two variables `lstat` and `age`. + """) + return + + +@app.cell +def _(Boston, MS, sm, summarize, y): + X_3 = MS(['lstat', 'age']).fit_transform(Boston) + _model1 = sm.OLS(y, X_3) + results1 = _model1.fit() + summarize(results1) + return (results1,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Notice how we have compacted the first line into a succinct expression describing the construction of `X`. + + The `Boston` data set contains 12 variables, and so it would be cumbersome + to have to type all of these in order to perform a regression using all of the predictors. + Instead, we can use the following short-hand:\definelongblankMR{columns.drop()}{columns.slashslashdrop()} + """) + return + + +@app.cell +def _(Boston): + terms = Boston.columns.drop('medv') + terms + return (terms,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We can now fit the model with all the variables in `terms` using + the same model matrix builder. + """) + return + + +@app.cell +def _(Boston, MS, sm, summarize, terms, y): + X_4 = MS(terms).fit_transform(Boston) + _model = sm.OLS(y, X_4) + results_1 = _model.fit() + summarize(results_1) + return (X_4,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + What if we would like to perform a regression using all of the variables but one? For + example, in the above regression output, `age` has a high $p$-value. + So we may wish to run a regression excluding this predictor. + The following syntax results in a regression using all predictors except `age`. + """) + return + + +@app.cell +def _(Boston, MS, sm, summarize, y): + minus_age = Boston.columns.drop(['medv', 'age']) + Xma = MS(minus_age).fit_transform(Boston) + _model1 = sm.OLS(y, Xma) + summarize(_model1.fit()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Multivariate Goodness of Fit + We can access the individual components of `results` by name + (`dir(results)` shows us what is available). Hence + `results.rsquared` gives us the $R^2$, + and + `np.sqrt(results.scale)` gives us the RSE. + + Variance inflation factors (section~\ref{Ch3:problems.sec}) are sometimes useful + to assess the effect of collinearity in the model matrix of a regression model. + We will compute the VIFs in our multiple regression fit, and use the opportunity to introduce the idea of *list comprehension*. + + ### List Comprehension + Often we encounter a sequence of objects which we would like to transform + for some other task. Below, we compute the VIF for each + feature in our `X` matrix and produce a data frame + whose index agrees with the columns of `X`. + The notion of list comprehension can often make such + a task easier. + + List comprehensions are simple and powerful ways to form + lists of `Python` objects. The language also supports + dictionary and *generator* comprehension, though these are + beyond our scope here. Let's look at an example. We compute the VIF for each of the variables + in the model matrix `X`, using the function `variance_inflation_factor()`. + """) + return + + +@app.cell +def _(VIF, X_4, pd): + _vals = [VIF(X_4, i) for i in range(1, X_4.shape[1])] + vif = pd.DataFrame({'vif': _vals}, index=X_4.columns[1:]) + vif + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The function `VIF()` takes two arguments: a dataframe or array, + and a variable column index. In the code above we call `VIF()` on the fly for all columns in `X`. + We have excluded column 0 above (the intercept), which is not of interest. In this case the VIFs are not that exciting. + + The object `vals` above could have been constructed with the following for loop: + """) + return + + +@app.cell +def _(VIF, X_4): + _vals = [] + for i in range(1, X_4.values.shape[1]): + _vals.append(VIF(X_4.values, i)) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + List comprehension allows us to perform such repetitive operations in a more straightforward way. + ## Interaction Terms + It is easy to include interaction terms in a linear model using `ModelSpec()`. + Including a tuple `("lstat","age")` tells the model + matrix builder to include an interaction term between + `lstat` and `age`. + """) + return + + +@app.cell +def _(Boston, MS, sm, summarize, y): + X_5 = MS(['lstat', 'age', ('lstat', 'age')]).fit_transform(Boston) + model2 = sm.OLS(y, X_5) + summarize(model2.fit()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Non-linear Transformations of the Predictors + The model matrix builder can include terms beyond + just column names and interactions. For instance, + the `poly()` function supplied in `ISLP` specifies that + columns representing polynomial functions + of its first argument are added to the model matrix. + """) + return + + +@app.cell +def _(Boston, MS, poly, sm, summarize, y): + X_6 = MS([poly('lstat', degree=2), 'age']).fit_transform(Boston) + model3 = sm.OLS(y, X_6) + results3 = model3.fit() + summarize(results3) + return (results3,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The effectively zero *p*-value associated with the quadratic term + (i.e. the third row above) suggests that it leads to an improved model. + + By default, `poly()` creates a basis matrix for inclusion in the + model matrix whose + columns are *orthogonal polynomials*, which are designed for stable + least squares computations. {Actually, `poly()` is a wrapper for the workhorse and standalone function `Poly()` that does the work in building the model matrix.} + Alternatively, had we included an argument + `raw=True` in the above call to `poly()`, the basis matrix would consist simply of + `lstat` and `lstat**2`. Since either of these bases + represent quadratic polynomials, the fitted values would not + change in this case, just the polynomial coefficients. Also by default, the columns + created by `poly()` do not include an intercept column as + that is automatically added by `MS()`. + + We use the `anova_lm()` function to further quantify the extent to which the quadratic fit is + superior to the linear fit. + """) + return + + +@app.cell +def _(anova_lm, results1, results3): + anova_lm(results1, results3) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Here `results1` represents the linear submodel containing + predictors `lstat` and `age`, + while `results3` corresponds to the larger model above with a quadratic + term in `lstat`. + The `anova_lm()` function performs a hypothesis test + comparing the two models. The null hypothesis is that the quadratic + term in the bigger model is not needed, and the alternative hypothesis is that the + bigger model is superior. Here the *F*-statistic is 177.28 and + the associated *p*-value is zero. + In this case the *F*-statistic is the square of the + *t*-statistic for the quadratic term in the linear model summary + for `results3` --- a consequence of the fact that these nested + models differ by one degree of freedom. + This provides very clear evidence that the quadratic polynomial in + `lstat` improves the linear model. + This is not surprising, since earlier we saw evidence for non-linearity in the relationship between `medv` + and `lstat`. + + The function `anova_lm()` can take more than two nested models + as input, in which case it compares every successive pair of models. + That also explains why there are `NaN`s in the first row above, since + there is no previous model with which to compare the first. + """) + return + + +@app.cell +def _(results3, subplots): + _ax = subplots(figsize=(8, 8))[1] + _ax.scatter(results3.fittedvalues, results3.resid) + _ax.set_xlabel('Fitted value') + _ax.set_ylabel('Residual') + _ax.axhline(0, c='k', ls='--') + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We see that when the quadratic term is included in the model, + there is little discernible pattern in the residuals. + In order to create a cubic or higher-degree polynomial fit, we can simply change the degree argument + to `poly()`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Qualitative Predictors + Here we use the `Carseats` data, which is included in the + `ISLP` package. We will attempt to predict `Sales` + (child car seat sales) in 400 locations based on a number of + predictors. + """) + return + + +@app.cell +def _(load_data): + Carseats = load_data('Carseats') + Carseats.columns + return (Carseats,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The `Carseats` + data includes qualitative predictors such as + `ShelveLoc`, an indicator of the quality of the shelving + location --- that is, + the space within a store in which the car seat is displayed. The predictor + `ShelveLoc` takes on three possible values, `Bad`, `Medium`, and `Good`. + Given a qualitative variable such as `ShelveLoc`, `ModelSpec()` generates dummy + variables automatically. + These variables are often referred to as a *one-hot encoding* of the categorical + feature. Their columns sum to one, so to avoid collinearity with an intercept, the first column is dropped. Below we see + the column `ShelveLoc[Bad]` has been dropped, since `Bad` is the first level of `ShelveLoc`. + Below we fit a multiple regression model that includes some interaction terms. + """) + return + + +@app.cell +def _(Carseats, MS, sm, summarize): + allvars = list(Carseats.columns.drop('Sales')) + y_1 = Carseats['Sales'] + final = allvars + [('Income', 'Advertising'), ('Price', 'Age')] + X_7 = MS(final).fit_transform(Carseats) + _model = sm.OLS(y_1, X_7) + summarize(_model.fit()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + In the first line above, we made `allvars` a list, so that we + could add the interaction terms two lines down. + Our model-matrix builder has created a `ShelveLoc[Good]` + dummy variable that takes on a value of 1 if the + shelving location is good, and 0 otherwise. It has also created a `ShelveLoc[Medium]` + dummy variable that equals 1 if the shelving location is medium, and 0 otherwise. + A bad shelving location corresponds to a zero for each of the two dummy variables. + The fact that the coefficient for `ShelveLoc[Good]` in the regression output is + positive indicates that a good shelving location is associated with high sales (relative to a bad location). + And `ShelveLoc[Medium]` has a smaller positive coefficient, + indicating that a medium shelving location leads to higher sales than a bad + shelving location, but lower sales than a good shelving location. + """) + return + + +if __name__ == "__main__": + app.run() From 4a67183a1c69f35d7a8909852c572e0ce6f6b27d Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 07:35:52 +0400 Subject: [PATCH 04/13] .gitkeep no longer needed --- marimo/.gitkeep | 1 - 1 file changed, 1 deletion(-) delete mode 100644 marimo/.gitkeep diff --git a/marimo/.gitkeep b/marimo/.gitkeep deleted file mode 100644 index 8b137891..00000000 --- a/marimo/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - From 799bc2689cb9435cc4b2b8da795fb57001eccd1d Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 07:37:14 +0400 Subject: [PATCH 05/13] Add GitHub Actions workflow to test Marimo notebooks --- .github/workflows/rhiza_marimo.yml | 108 +++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 .github/workflows/rhiza_marimo.yml diff --git a/.github/workflows/rhiza_marimo.yml b/.github/workflows/rhiza_marimo.yml new file mode 100644 index 00000000..b4f52c1b --- /dev/null +++ b/.github/workflows/rhiza_marimo.yml @@ -0,0 +1,108 @@ +# This file is part of the jebel-quant/rhiza repository +# (https://github.com/jebel-quant/rhiza). +# +# Workflow: Marimo Notebooks +# +# Purpose: This workflow discovers and executes all Marimo notebooks in the +# repository. It builds a dynamic matrix to run each notebook in +# parallel to surface errors early and keep notebooks reproducible. +# +# Trigger: This workflow runs on every push and on pull requests to main/master +# branches (including from forks) +# +# Components: +# - πŸ”Ž Discover notebooks in book/marimo +# - πŸ§ͺ Run each notebook in parallel using a matrix strategy +# - βœ… Fail-fast disabled to report all failing notebooks + +name: "(RHIZA) MARIMO" + +permissions: + contents: read + +on: + push: + branches: [ main, master ] + pull_request: + branches: [ main, master ] + +jobs: + # Build a matrix of notebooks to test + list-notebooks: + runs-on: ubuntu-latest + outputs: + notebook-list: ${{ steps.notebooks.outputs.matrix }} + steps: + # Check out the repository code + - uses: actions/checkout@v6.0.2 + + # Find all Python files in the marimo folder and create a matrix for parallel execution + - name: Find notebooks and build matrix + id: notebooks + run: | + # Extract MARIMO_FOLDER from the project configuration (via Makefile) + # shellcheck disable=SC2016 # Single quotes intentional - Make syntax, not shell expansion + NOTEBOOK_DIR=$(make -s -f Makefile -f - <<< 'print: ; @echo $(or $(MARIMO_FOLDER),marimo)' print) + + echo "Searching notebooks in: $NOTEBOOK_DIR" + # Check if directory exists + if [ ! -d "$NOTEBOOK_DIR" ]; then + echo "Directory $NOTEBOOK_DIR does not exist. Setting empty matrix." + echo "matrix=[]" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Find notebooks and handle empty results + if [ -z "$(find "$NOTEBOOK_DIR" -maxdepth 1 -name "*.py" 2>/dev/null)" ]; then + echo "No notebooks found in $NOTEBOOK_DIR. Setting empty matrix." + echo "matrix=[]" >> "$GITHUB_OUTPUT" + else + notebooks=$(find "$NOTEBOOK_DIR" -maxdepth 1 -name "*.py" -print0 | xargs -0 -n1 echo | jq -R -s -c 'split("\n")[:-1]') + echo "matrix=$notebooks" >> "$GITHUB_OUTPUT" + fi + shell: bash + + # Create one job per notebook using the matrix strategy for parallel execution + test-notebooks: + if: needs.list-notebooks.outputs.notebook-list != '[]' + runs-on: ubuntu-latest + needs: list-notebooks + strategy: + matrix: + notebook: ${{ fromJson(needs.list-notebooks.outputs.notebook-list) }} + # Don't fail the entire workflow if one notebook fails + fail-fast: false + name: Run notebook ${{ matrix.notebook }} + steps: + # Check out the repository code + - uses: actions/checkout@v6.0.2 + with: + lfs: true + + # Install uv/uvx + - name: Install uv + uses: astral-sh/setup-uv@v7.2.1 + with: + version: "0.9.28" + + - name: Configure git auth for private packages + uses: ./.github/actions/configure-git-auth + with: + token: ${{ secrets.GH_PAT }} + + # Execute the notebook with the appropriate runner based on its content + - name: Run notebook + env: + UV_EXTRA_INDEX_URL: ${{ secrets.UV_EXTRA_INDEX_URL }} + run: | + uvx uv run "${{ matrix.notebook }}" + # uvx β†’ creates a fresh ephemeral environment + # uv run β†’ runs the notebook as a script in that ephemeral env + # No project packages are pre-installed + # βœ… This forces the notebook to explicitly handle dependencies (e.g., uv install ., or pip install inside the script). + # βœ… It’s a true integration smoke test. + # Benefits of this pattern + # Confirms the notebook can bootstrap itself in a fresh environment + # Catches missing uv install or pip steps early + # Ensures CI/other users can run the notebook without manual setup + shell: bash From 61b7a14bc8334e0c7881275d41df5ad3bcd93d4a Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 07:41:16 +0400 Subject: [PATCH 06/13] Update rhiza_marimo.yml --- .github/workflows/rhiza_marimo.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/rhiza_marimo.yml b/.github/workflows/rhiza_marimo.yml index b4f52c1b..c2d5238d 100644 --- a/.github/workflows/rhiza_marimo.yml +++ b/.github/workflows/rhiza_marimo.yml @@ -21,10 +21,7 @@ permissions: contents: read on: - push: - branches: [ main, master ] - pull_request: - branches: [ main, master ] + push jobs: # Build a matrix of notebooks to test From 53002afcb36e8af5991da010d1a7c3539cbcef87 Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 07:42:48 +0400 Subject: [PATCH 07/13] Remove unused Git authentication step from Marimo workflow --- .github/workflows/rhiza_marimo.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/rhiza_marimo.yml b/.github/workflows/rhiza_marimo.yml index c2d5238d..4109d6c5 100644 --- a/.github/workflows/rhiza_marimo.yml +++ b/.github/workflows/rhiza_marimo.yml @@ -82,11 +82,6 @@ jobs: with: version: "0.9.28" - - name: Configure git auth for private packages - uses: ./.github/actions/configure-git-auth - with: - token: ${{ secrets.GH_PAT }} - # Execute the notebook with the appropriate runner based on its content - name: Run notebook env: From 243370436b8617f92376301dcb6504d5df9642bf Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 07:46:28 +0400 Subject: [PATCH 08/13] script header for uv run with dependencies and requirements --- marimo/Ch03-linreg-lab.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/marimo/Ch03-linreg-lab.py b/marimo/Ch03-linreg-lab.py index 2d7baa4e..99dbdd0d 100644 --- a/marimo/Ch03-linreg-lab.py +++ b/marimo/Ch03-linreg-lab.py @@ -1,3 +1,13 @@ +# /// script +# dependencies = [ +# "marimo==0.19.6", +# "numpy==2.3.1", +# "plotly==6.2.0", +# "polars==1.32.2", +# ] +# requires-python = ">=3.12" +# /// + import marimo __generated_with = "0.19.7" From 0a429f2c807ca17155f8b553decfde625395e575 Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 07:50:47 +0400 Subject: [PATCH 09/13] script header for uv run with dependencies and requirements --- marimo/Ch03-linreg-lab.py | 98 +++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 50 deletions(-) diff --git a/marimo/Ch03-linreg-lab.py b/marimo/Ch03-linreg-lab.py index 99dbdd0d..7d1bb74a 100644 --- a/marimo/Ch03-linreg-lab.py +++ b/marimo/Ch03-linreg-lab.py @@ -4,6 +4,8 @@ # "numpy==2.3.1", # "plotly==6.2.0", # "polars==1.32.2", +# "pandas", +# "matplotlib" # ] # requires-python = ">=3.12" # /// @@ -13,15 +15,11 @@ __generated_with = "0.19.7" app = marimo.App() - -@app.cell -def _(): +with app.setup: import marimo as mo - return (mo,) - - + @app.cell -def _(mo): +def _(): mo.md(r""" # Linear Regression """) @@ -29,7 +27,7 @@ def _(mo): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" ## Importing packages We import our standard libraries at this top @@ -47,7 +45,7 @@ def _(): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" ### New imports Throughout this lab we will introduce new functions and libraries. However, @@ -66,7 +64,7 @@ def _(): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" We will provide relevant details about the functions below as they are needed. @@ -89,7 +87,7 @@ def _(): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" As one of the import statements above is quite a long line, we inserted a line break `\` to ease readability. @@ -110,7 +108,7 @@ def _(): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" ### Inspecting Objects and Namespaces The @@ -128,7 +126,7 @@ def _(): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" This shows you everything that `Python` can find at the top level. There are certain objects like `__builtins__` that contain references to built-in @@ -151,7 +149,7 @@ def _(np): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" This indicates that the object `A.sum` exists. In this case it is a method that can be used to compute the sum of the array `A` as can be seen by typing `A.sum?`. @@ -166,7 +164,7 @@ def _(A): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" """) @@ -174,7 +172,7 @@ def _(mo): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" ## Simple Linear Regression In this section we will construct model @@ -202,7 +200,7 @@ def _(load_data): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Type `Boston?` to find out more about these data. @@ -223,7 +221,7 @@ def _(Boston, np, pd): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" We extract the response, and fit the model. """) @@ -239,7 +237,7 @@ def _(Boston, X, sm): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Note that `sm.OLS()` does not fit the model; it specifies the model, and then `model.fit()` does the actual fitting. @@ -260,7 +258,7 @@ def _(results, summarize): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Before we describe other methods for working with fitted models, we outline a more useful and general framework for constructing a model matrix~`X`. ### Using Transformations: Fit and Transform @@ -305,7 +303,7 @@ def _(Boston, MS): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" In this simple case, the `fit()` method does very little; it simply checks that the variable `'lstat'` specified in `design` exists in `Boston`. Then `transform()` constructs the model matrix with two columns: an `intercept` and the variable `lstat`. @@ -324,7 +322,7 @@ def _(Boston, MS): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Note that, as in the previous code chunk when the two steps were done separately, the `design` object is changed as a result of the `fit()` operation. The power of this pipeline will become clearer when we fit more complex models that involve interactions and transformations. """) @@ -332,7 +330,7 @@ def _(mo): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Let's return to our fitted regression model. The object @@ -351,7 +349,7 @@ def _(results): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" The fitted coefficients can also be retrieved as the `params` attribute of `results`. @@ -366,7 +364,7 @@ def _(results): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" The `get_prediction()` method can be used to obtain predictions, and produce confidence intervals and prediction intervals for the prediction of `medv` for given values of `lstat`. @@ -386,7 +384,7 @@ def _(design_1, pd): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Next we compute the predictions at `newX`, and view them by extracting the `predicted_mean` attribute. """) @@ -401,7 +399,7 @@ def _(newX, results): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" We can produce confidence intervals for the predicted values. """) @@ -415,7 +413,7 @@ def _(new_predictions): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Prediction intervals are computed by setting `obs=True`: """) @@ -429,7 +427,7 @@ def _(new_predictions): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" For instance, the 95% confidence interval associated with an `lstat` value of 10 is (24.47, 25.63), and the 95% prediction @@ -447,7 +445,7 @@ def _(mo): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" ### Defining Functions While there is a function @@ -466,7 +464,7 @@ def abline(ax, b, m): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" A few things are illustrated above. First we see the syntax for defining a function: `def funcname(...)`. The function has arguments `ax, b, m` @@ -486,7 +484,7 @@ def abline_1(ax, b, m, *args, **kwargs): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" The addition of `*args` allows any number of non-named arguments to `abline`, while `**kwargs` allows any @@ -511,7 +509,7 @@ def _(Boston, results): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Thus, the final call to `ax.plot()` is `ax.plot(xlim, ylim, 'r--', linewidth=3)`. We have used the argument `'r--'` to produce a red dashed line, and added @@ -524,7 +522,7 @@ def _(mo): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Next we examine some diagnostic plots, several of which were discussed in Section~\ref{Ch3:problems.sec}. @@ -550,7 +548,7 @@ def _(results, subplots): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" We add a horizontal line at 0 for reference using the `ax.axhline()` method, indicating @@ -576,7 +574,7 @@ def _(X_2, np, results, subplots): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" The `np.argmax()` function identifies the index of the largest element of an array, optionally computed over an axis of the array. In this case, we maximized over the entire array @@ -586,7 +584,7 @@ def _(mo): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" ## Multiple Linear Regression In order to fit a multiple linear regression model using least squares, we again use @@ -609,7 +607,7 @@ def _(Boston, MS, sm, summarize, y): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Notice how we have compacted the first line into a succinct expression describing the construction of `X`. @@ -628,7 +626,7 @@ def _(Boston): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" We can now fit the model with all the variables in `terms` using the same model matrix builder. @@ -646,7 +644,7 @@ def _(Boston, MS, sm, summarize, terms, y): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" What if we would like to perform a regression using all of the variables but one? For example, in the above regression output, `age` has a high $p$-value. @@ -666,7 +664,7 @@ def _(Boston, MS, sm, summarize, y): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" ## Multivariate Goodness of Fit We can access the individual components of `results` by name @@ -705,7 +703,7 @@ def _(VIF, X_4, pd): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" The function `VIF()` takes two arguments: a dataframe or array, and a variable column index. In the code above we call `VIF()` on the fly for all columns in `X`. @@ -725,7 +723,7 @@ def _(VIF, X_4): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" List comprehension allows us to perform such repetitive operations in a more straightforward way. ## Interaction Terms @@ -746,7 +744,7 @@ def _(Boston, MS, sm, summarize, y): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" ## Non-linear Transformations of the Predictors The model matrix builder can include terms beyond @@ -768,7 +766,7 @@ def _(Boston, MS, poly, sm, summarize, y): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" The effectively zero *p*-value associated with the quadratic term (i.e. the third row above) suggests that it leads to an improved model. @@ -798,7 +796,7 @@ def _(anova_lm, results1, results3): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" Here `results1` represents the linear submodel containing predictors `lstat` and `age`, @@ -837,7 +835,7 @@ def _(results3, subplots): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" We see that when the quadratic term is included in the model, there is little discernible pattern in the residuals. @@ -848,7 +846,7 @@ def _(mo): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" ## Qualitative Predictors Here we use the `Carseats` data, which is included in the @@ -867,7 +865,7 @@ def _(load_data): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" The `Carseats` data includes qualitative predictors such as @@ -897,7 +895,7 @@ def _(Carseats, MS, sm, summarize): @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" In the first line above, we made `allvars` a list, so that we could add the interaction terms two lines down. From 360e1eaf5f6c4833d8fcbf41921fc744db65d457 Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 08:01:36 +0400 Subject: [PATCH 10/13] Install ISLP library --- marimo/Ch03-linreg-lab.py | 51 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/marimo/Ch03-linreg-lab.py b/marimo/Ch03-linreg-lab.py index 7d1bb74a..6c9b1177 100644 --- a/marimo/Ch03-linreg-lab.py +++ b/marimo/Ch03-linreg-lab.py @@ -5,9 +5,13 @@ # "plotly==6.2.0", # "polars==1.32.2", # "pandas", -# "matplotlib" +# "matplotlib", +# "statsmodels", +# "ISLP" # ] # requires-python = ">=3.12" +# [tool.uv.sources] +# ISLP = { git = "https://github.com/intro-stat-learning/ISLP_labs.git" } # /// import marimo @@ -17,7 +21,10 @@ with app.setup: import marimo as mo - + import numpy as np + import pandas as pd + import statsmodels.api as sm + @app.cell def _(): mo.md(r""" @@ -38,10 +45,10 @@ def _(): @app.cell def _(): - import numpy as np - import pandas as pd + #import numpy as np + #import pandas as pd from matplotlib.pyplot import subplots - return np, pd, subplots + return subplots @app.cell(hide_code=True) @@ -57,12 +64,6 @@ def _(): return -@app.cell -def _(): - import statsmodels.api as sm - return (sm,) - - @app.cell(hide_code=True) def _(): mo.md(r""" @@ -142,7 +143,7 @@ def _(): @app.cell -def _(np): +def _(): A = np.array([3,5,11]) dir(A) return (A,) @@ -213,7 +214,7 @@ def _(): @app.cell -def _(Boston, np, pd): +def _(Boston): X = pd.DataFrame({'intercept': np.ones(Boston.shape[0]), 'lstat': Boston['lstat']}) X[:4] @@ -229,7 +230,7 @@ def _(): @app.cell -def _(Boston, X, sm): +def _(Boston, X): y = Boston['medv'] _model = sm.OLS(y, X) results = _model.fit() @@ -376,7 +377,7 @@ def _(): @app.cell -def _(design_1, pd): +def _(design_1): new_df = pd.DataFrame({'lstat': [5, 10, 15]}) newX = design_1.transform(new_df) newX @@ -458,9 +459,9 @@ def _(): @app.function def abline(ax, b, m): """Add a line with slope m and intercept b to ax""" - xlim = _ax.get_xlim() + xlim = ax.get_xlim() ylim = [m * xlim[0] + b, m * xlim[1] + b] - _ax.plot(xlim, ylim) + ax.plot(xlim, ylim) @app.cell(hide_code=True) @@ -563,7 +564,7 @@ def _(): @app.cell -def _(X_2, np, results, subplots): +def _(X_2, results, subplots): infl = results.get_influence() _ax = subplots(figsize=(8, 8))[1] _ax.scatter(np.arange(X_2.shape[0]), infl.hat_matrix_diag) @@ -598,7 +599,7 @@ def _(): @app.cell -def _(Boston, MS, sm, summarize, y): +def _(Boston, MS, summarize, y): X_3 = MS(['lstat', 'age']).fit_transform(Boston) _model1 = sm.OLS(y, X_3) results1 = _model1.fit() @@ -635,7 +636,7 @@ def _(): @app.cell -def _(Boston, MS, sm, summarize, terms, y): +def _(Boston, MS, summarize, terms, y): X_4 = MS(terms).fit_transform(Boston) _model = sm.OLS(y, X_4) results_1 = _model.fit() @@ -655,7 +656,7 @@ def _(): @app.cell -def _(Boston, MS, sm, summarize, y): +def _(Boston, MS, summarize, y): minus_age = Boston.columns.drop(['medv', 'age']) Xma = MS(minus_age).fit_transform(Boston) _model1 = sm.OLS(y, Xma) @@ -695,7 +696,7 @@ def _(): @app.cell -def _(VIF, X_4, pd): +def _(VIF, X_4): _vals = [VIF(X_4, i) for i in range(1, X_4.shape[1])] vif = pd.DataFrame({'vif': _vals}, index=X_4.columns[1:]) vif @@ -736,7 +737,7 @@ def _(): @app.cell -def _(Boston, MS, sm, summarize, y): +def _(Boston, MS, summarize, y): X_5 = MS(['lstat', 'age', ('lstat', 'age')]).fit_transform(Boston) model2 = sm.OLS(y, X_5) summarize(model2.fit()) @@ -757,7 +758,7 @@ def _(): @app.cell -def _(Boston, MS, poly, sm, summarize, y): +def _(Boston, MS, poly, summarize, y): X_6 = MS([poly('lstat', degree=2), 'age']).fit_transform(Boston) model3 = sm.OLS(y, X_6) results3 = model3.fit() @@ -884,7 +885,7 @@ def _(): @app.cell -def _(Carseats, MS, sm, summarize): +def _(Carseats, MS, summarize): allvars = list(Carseats.columns.drop('Sales')) y_1 = Carseats['Sales'] final = allvars + [('Income', 'Advertising'), ('Price', 'Age')] From 8cd0096d5881adb4a46eb49e52d7c8e338e3b7d2 Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 08:12:17 +0400 Subject: [PATCH 11/13] Install ISLP library --- marimo/Ch03-linreg-lab.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/marimo/Ch03-linreg-lab.py b/marimo/Ch03-linreg-lab.py index 6c9b1177..bf019f9f 100644 --- a/marimo/Ch03-linreg-lab.py +++ b/marimo/Ch03-linreg-lab.py @@ -10,13 +10,11 @@ # "ISLP" # ] # requires-python = ">=3.12" -# [tool.uv.sources] -# ISLP = { git = "https://github.com/intro-stat-learning/ISLP_labs.git" } # /// import marimo -__generated_with = "0.19.7" +__generated_with = "0.19.6" app = marimo.App() with app.setup: @@ -25,6 +23,7 @@ import pandas as pd import statsmodels.api as sm + @app.cell def _(): mo.md(r""" @@ -48,7 +47,7 @@ def _(): #import numpy as np #import pandas as pd from matplotlib.pyplot import subplots - return subplots + return (subplots,) @app.cell(hide_code=True) From 5bf79e5de9324728c1d51af94e5275012dd7048b Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 08:16:51 +0400 Subject: [PATCH 12/13] Remove unused imports and streamline function arguments in Ch03-linreg-lab.py --- marimo/Ch03-linreg-lab.py | 73 ++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 44 deletions(-) diff --git a/marimo/Ch03-linreg-lab.py b/marimo/Ch03-linreg-lab.py index bf019f9f..f5623e20 100644 --- a/marimo/Ch03-linreg-lab.py +++ b/marimo/Ch03-linreg-lab.py @@ -22,8 +22,18 @@ import numpy as np import pandas as pd import statsmodels.api as sm - - + + # plotting + from matplotlib.pyplot import subplots + + from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF + from statsmodels.stats.anova import anova_lm + + from ISLP import load_data + from ISLP.models import (ModelSpec as MS, + summarize, + poly) + @app.cell def _(): mo.md(r""" @@ -42,14 +52,6 @@ def _(): return -@app.cell -def _(): - #import numpy as np - #import pandas as pd - from matplotlib.pyplot import subplots - return (subplots,) - - @app.cell(hide_code=True) def _(): mo.md(r""" @@ -78,14 +80,6 @@ def _(): return -@app.cell -def _(): - from statsmodels.stats.outliers_influence \ - import variance_inflation_factor as VIF - from statsmodels.stats.anova import anova_lm - return VIF, anova_lm - - @app.cell(hide_code=True) def _(): mo.md(r""" @@ -98,15 +92,6 @@ def _(): return -@app.cell -def _(): - from ISLP import load_data - from ISLP.models import (ModelSpec as MS, - summarize, - poly) - return MS, load_data, poly, summarize - - @app.cell(hide_code=True) def _(): mo.md(r""" @@ -193,7 +178,7 @@ def _(): @app.cell -def _(load_data): +def _(): Boston = load_data("Boston") Boston.columns return (Boston,) @@ -252,7 +237,7 @@ def _(): @app.cell -def _(results, summarize): +def _(results): summarize(results) return @@ -294,7 +279,7 @@ def _(): @app.cell -def _(Boston, MS): +def _(Boston): design = MS(['lstat']) design = design.fit(Boston) X_1 = design.transform(Boston) @@ -314,7 +299,7 @@ def _(): @app.cell -def _(Boston, MS): +def _(Boston): design_1 = MS(['lstat']) X_2 = design_1.fit_transform(Boston) X_2[:4] @@ -538,7 +523,7 @@ def _(): @app.cell -def _(results, subplots): +def _(results): _ax = subplots(figsize=(8, 8))[1] _ax.scatter(results.fittedvalues, results.resid) _ax.set_xlabel('Fitted value') @@ -563,7 +548,7 @@ def _(): @app.cell -def _(X_2, results, subplots): +def _(X_2, results): infl = results.get_influence() _ax = subplots(figsize=(8, 8))[1] _ax.scatter(np.arange(X_2.shape[0]), infl.hat_matrix_diag) @@ -598,7 +583,7 @@ def _(): @app.cell -def _(Boston, MS, summarize, y): +def _(Boston, y): X_3 = MS(['lstat', 'age']).fit_transform(Boston) _model1 = sm.OLS(y, X_3) results1 = _model1.fit() @@ -635,7 +620,7 @@ def _(): @app.cell -def _(Boston, MS, summarize, terms, y): +def _(Boston, terms, y): X_4 = MS(terms).fit_transform(Boston) _model = sm.OLS(y, X_4) results_1 = _model.fit() @@ -655,7 +640,7 @@ def _(): @app.cell -def _(Boston, MS, summarize, y): +def _(Boston, y): minus_age = Boston.columns.drop(['medv', 'age']) Xma = MS(minus_age).fit_transform(Boston) _model1 = sm.OLS(y, Xma) @@ -695,7 +680,7 @@ def _(): @app.cell -def _(VIF, X_4): +def _(X_4): _vals = [VIF(X_4, i) for i in range(1, X_4.shape[1])] vif = pd.DataFrame({'vif': _vals}, index=X_4.columns[1:]) vif @@ -715,7 +700,7 @@ def _(): @app.cell -def _(VIF, X_4): +def _(X_4): _vals = [] for i in range(1, X_4.values.shape[1]): _vals.append(VIF(X_4.values, i)) @@ -736,7 +721,7 @@ def _(): @app.cell -def _(Boston, MS, summarize, y): +def _(Boston, y): X_5 = MS(['lstat', 'age', ('lstat', 'age')]).fit_transform(Boston) model2 = sm.OLS(y, X_5) summarize(model2.fit()) @@ -757,7 +742,7 @@ def _(): @app.cell -def _(Boston, MS, poly, summarize, y): +def _(Boston, y): X_6 = MS([poly('lstat', degree=2), 'age']).fit_transform(Boston) model3 = sm.OLS(y, X_6) results3 = model3.fit() @@ -790,7 +775,7 @@ def _(): @app.cell -def _(anova_lm, results1, results3): +def _(results1, results3): anova_lm(results1, results3) return @@ -825,7 +810,7 @@ def _(): @app.cell -def _(results3, subplots): +def _(results3): _ax = subplots(figsize=(8, 8))[1] _ax.scatter(results3.fittedvalues, results3.resid) _ax.set_xlabel('Fitted value') @@ -858,7 +843,7 @@ def _(): @app.cell -def _(load_data): +def _(): Carseats = load_data('Carseats') Carseats.columns return (Carseats,) @@ -884,7 +869,7 @@ def _(): @app.cell -def _(Carseats, MS, summarize): +def _(Carseats): allvars = list(Carseats.columns.drop('Sales')) y_1 = Carseats['Sales'] final = allvars + [('Income', 'Advertising'), ('Price', 'Age')] From 29a7f7e3c7b186b90512378591ffe622eee16f18 Mon Sep 17 00:00:00 2001 From: Thomas Schmelzer Date: Wed, 4 Feb 2026 08:31:30 +0400 Subject: [PATCH 13/13] Remove unused dependencies and reorder imports in Ch03-linreg-lab.py --- marimo/Ch03-linreg-lab.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/marimo/Ch03-linreg-lab.py b/marimo/Ch03-linreg-lab.py index f5623e20..4a7c6687 100644 --- a/marimo/Ch03-linreg-lab.py +++ b/marimo/Ch03-linreg-lab.py @@ -2,8 +2,6 @@ # dependencies = [ # "marimo==0.19.6", # "numpy==2.3.1", -# "plotly==6.2.0", -# "polars==1.32.2", # "pandas", # "matplotlib", # "statsmodels", @@ -21,11 +19,10 @@ import marimo as mo import numpy as np import pandas as pd - import statsmodels.api as sm - - # plotting + from matplotlib.pyplot import subplots - + + import statsmodels.api as sm from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF from statsmodels.stats.anova import anova_lm