Cleared outputs, set some init cells

[ou-jupyter-r-demo.git] / section5.1.ipynb
diff --git a/section5.1.ipynb b/section5.1.ipynb

index 8d94e17a1742c23841b56f0468aea98dad56554e..e24eb064e22c0e837d6d3f08a47ceb6fa581757f 100644 (file)
--- a/section5.1.ipynb
+++ b/section5.1.ipynb
@@ -42,77 +42,7 @@
     },
     "outputs": [],
     "source": [
-    "# From http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/\n",
-    "# Multiple plot function\n",
-    "#\n",
-    "# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)\n",
-    "# - cols:   Number of columns in layout\n",
-    "# - layout: A matrix specifying the layout. If present, 'cols' is ignored.\n",
-    "#\n",
-    "# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),\n",
-    "# then plot 1 will go in the upper left, 2 will go in the upper right, and\n",
-    "# 3 will go all the way across the bottom.\n",
-    "#\n",
-    "multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {\n",
-    "  library(grid)\n",
-    "\n",
-    "  # Make a list from the ... arguments and plotlist\n",
-    "  plots <- c(list(...), plotlist)\n",
-    "\n",
-    "  numPlots = length(plots)\n",
-    "\n",
-    "  # If layout is NULL, then use 'cols' to determine layout\n",
-    "  if (is.null(layout)) {\n",
-    "    # Make the panel\n",
-    "    # ncol: Number of columns of plots\n",
-    "    # nrow: Number of rows needed, calculated from # of cols\n",
-    "    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),\n",
-    "                    ncol = cols, nrow = ceiling(numPlots/cols))\n",
-    "  }\n",
-    "\n",
-    " if (numPlots==1) {\n",
-    "    print(plots[[1]])\n",
-    "\n",
-    "  } else {\n",
-    "    # Set up the page\n",
-    "    grid.newpage()\n",
-    "    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))\n",
-    "\n",
-    "    # Make each plot, in the correct location\n",
-    "    for (i in 1:numPlots) {\n",
-    "      # Get the i,j matrix positions of the regions that contain this subplot\n",
-    "      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))\n",
-    "\n",
-    "      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,\n",
-    "                                      layout.pos.col = matchidx$col))\n",
-    "    }\n",
-    "  }\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "hidden": true,
-    "init_cell": true
-   },
-   "outputs": [],
-   "source": [
-    "# From https://sejohnston.com/2012/08/09/a-quick-and-easy-function-to-plot-lm-results-in-r/\n",
-    "ggplotRegression <- function (fit) {\n",
-    "\n",
-    "require(ggplot2)\n",
-    "\n",
-    "ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + \n",
-    "    geom_point() +\n",
-    "    stat_smooth(method = \"lm\", col = \"red\") +\n",
-    "    labs(title = paste(\"Adj R2 = \",signif(summary(fit)$adj.r.squared, 5),\n",
-    "                     \"Intercept =\",signif(fit$coef[[1]],5 ),\n",
-    "                     \" Slope =\",signif(fit$coef[[2]], 5),\n",
-    "                     \" P =\",signif(summary(fit)$coef[2,4], 5))) + \n",
-    "    theme(plot.title = element_text(size=12))\n",
-    "}"
+    "source('plot_extensions.R')"
     ]
    },
    {
@@ -170,9 +100,9 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "fit <- lm(loss ~ hardness, data = rubber)\n",
-    "summary(fit)\n",
-    "anova(fit)\n",
+    "fit.h <- lm(loss ~ hardness, data = rubber)\n",
+    "summary(fit.h)\n",
+    "anova(fit.h)\n",
      "# af <- anova(fit)\n",
      "# afss <- af$\"Sum Sq\"\n",
      "# print(cbind(af,PctExp=afss/sum(afss)*100))"
@@ -184,18 +114,7 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "ggplot(rubber, aes(x=hardness, y=loss)) + \n",
-    "    geom_point() +\n",
-    "    stat_smooth(method = \"lm\", col = \"red\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ggplotRegression(fit)"
+    "ggplotRegression(fit.h)"
     ]
    },
    {
@@ -240,9 +159,9 @@
     },
     "outputs": [],
     "source": [
-    "fit <- lm(loss ~ strength, data = rubber)\n",
-    "summary(fit)\n",
-    "anova(fit)"
+    "fit.s <- lm(loss ~ strength, data = rubber)\n",
+    "summary(fit.s)\n",
+    "anova(fit.s)"
     ]
    },
    {
@@ -253,7 +172,7 @@
     },
     "outputs": [],
     "source": [
-    "ggplotRegression(fit)"
+    "ggplotRegression(fit.s)"
     ]
    },
    {
@@ -292,9 +211,9 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "fit <- lm(loss ~ hardness + strength, data = rubber)\n",
-    "summary(fit)\n",
-    "anova(fit)"
+    "fit.hs <- lm(loss ~ hardness + strength, data = rubber)\n",
+    "summary(fit.hs)\n",
+    "anova(fit.hs)"
     ]
    },
    {
@@ -303,7 +222,7 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "ggplotRegression(fit)"
+    "ggplotRegression(fit.hs)"
     ]
    },
    {
@@ -319,6 +238,19 @@
      "Associated with each estimated parameter, GenStat gives a standard error (details of the calculation of which need not concern us now) and hence a _t_-statistic (estimate divided by standard error) to be compared with the distribution on `d.f. (Residual)`=27 degrees of freedom. GenStat makes the comparison and gives _p_ values, which in this case are all very small, suggesting strong evidence for the non-zeroness (and hence presence) of each parameter, $\\alpha$, $\\beta_1$ and $\\beta_2$, individually."
     ]
    },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fits <- list(fit.h, fit.s, fit.hs)\n",
+    "data.frame(\n",
+    "    \"Vars\" = sapply(fits, function(x) toString(attr(summary(x)$terms, \"variables\")[-(1:2)]) ),\n",
+    "    \"Adj R^2\" = sapply(fits, function(x) summary(x)$adj.r.squared)\n",
+    ")"
+   ]
+  },
    {
     "cell_type": "markdown",
     "metadata": {},
@@ -372,8 +304,7 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "fit <- lm(loss ~ hardness + strength, data = rubber)\n",
-    "autoplot(fit)"
+    "autoplot(fit.hs)"
     ]
    },
    {
@@ -452,7 +383,7 @@
      "\n",
      "a. Using GenStat, perform the regression of expired ventilation (`ventil`) on oxygen uptake (`oxygen`). Are you at all surprised by how good this regression model seems?\n",
      "\n",
-    "b. Now form a new variable `oxy2`, say, by squaring oxygen. (Create a new column in the `anearobic` dataframe which is `anaerobic$oxygen ^ 2`.) Perform the regression of ventil on `oxygen` and `oxy2`. Comment on the fit of this model according to the printed output (and with recourse to Figure 3.2 in Example 3.1).\n",
+    "b. Now form a new variable `oxy2`, say, by squaring oxygen. (Create a new column in the `anearobic` dataframe which is `anaerobic$oxygen ^ 2`.) Perform the regression of `ventil` on `oxygen` and `oxy2`. Comment on the fit of this model according to the printed output (and with recourse to Figure 3.2 in Example 3.1).\n",
      "\n",
      "c. Make the usual residual plots and comment on the fit of the model again.\n",
      "\n",