section5.1solutions.ipynb

   1 {
   2  "cells": [
   3   {
   4    "cell_type": "markdown",
   5    "metadata": {},
   6    "source": [
   7     "# Section 5.1 solutions"
   8    ]
   9   },
  10   {
  11    "cell_type": "markdown",
  12    "metadata": {
  13     "heading_collapsed": true
  14    },
  15    "source": [
  16     "### Imports and defintions"
  17    ]
  18   },
  19   {
  20    "cell_type": "code",
  21    "execution_count": null,
  22    "metadata": {
  23     "hidden": true,
  24     "init_cell": true
  25    },
  26    "outputs": [],
  27    "source": [
  28     "library(tidyverse)\n",
  29     "# library(cowplot)\n",
  30     "library(repr)\n",
  31     "library(ggfortify)\n",
  32     "\n",
  33     "# Change plot size to 4 x 3\n",
  34     "options(repr.plot.width=6, repr.plot.height=4)"
  35    ]
  36   },
  37   {
  38    "cell_type": "code",
  39    "execution_count": null,
  40    "metadata": {
  41     "hidden": true,
  42     "init_cell": true
  43    },
  44    "outputs": [],
  45    "source": [
  46     "# From http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/\n",
  47     "# Multiple plot function\n",
  48     "#\n",
  49     "# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)\n",
  50     "# - cols:   Number of columns in layout\n",
  51     "# - layout: A matrix specifying the layout. If present, 'cols' is ignored.\n",
  52     "#\n",
  53     "# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),\n",
  54     "# then plot 1 will go in the upper left, 2 will go in the upper right, and\n",
  55     "# 3 will go all the way across the bottom.\n",
  56     "#\n",
  57     "multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {\n",
  58     "  library(grid)\n",
  59     "\n",
  60     "  # Make a list from the ... arguments and plotlist\n",
  61     "  plots <- c(list(...), plotlist)\n",
  62     "\n",
  63     "  numPlots = length(plots)\n",
  64     "\n",
  65     "  # If layout is NULL, then use 'cols' to determine layout\n",
  66     "  if (is.null(layout)) {\n",
  67     "    # Make the panel\n",
  68     "    # ncol: Number of columns of plots\n",
  69     "    # nrow: Number of rows needed, calculated from # of cols\n",
  70     "    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),\n",
  71     "                    ncol = cols, nrow = ceiling(numPlots/cols))\n",
  72     "  }\n",
  73     "\n",
  74     " if (numPlots==1) {\n",
  75     "    print(plots[[1]])\n",
  76     "\n",
  77     "  } else {\n",
  78     "    # Set up the page\n",
  79     "    grid.newpage()\n",
  80     "    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))\n",
  81     "\n",
  82     "    # Make each plot, in the correct location\n",
  83     "    for (i in 1:numPlots) {\n",
  84     "      # Get the i,j matrix positions of the regions that contain this subplot\n",
  85     "      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))\n",
  86     "\n",
  87     "      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,\n",
  88     "                                      layout.pos.col = matchidx$col))\n",
  89     "    }\n",
  90     "  }\n",
  91     "}"
  92    ]
  93   },
  94   {
  95    "cell_type": "code",
  96    "execution_count": null,
  97    "metadata": {
  98     "hidden": true,
  99     "init_cell": true
 100    },
 101    "outputs": [],
 102    "source": [
 103     "# From https://sejohnston.com/2012/08/09/a-quick-and-easy-function-to-plot-lm-results-in-r/\n",
 104     "ggplotRegression <- function (fit) {\n",
 105     "\n",
 106     "require(ggplot2)\n",
 107     "\n",
 108     "ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + \n",
 109     "    geom_point() +\n",
 110     "    stat_smooth(method = \"lm\", col = \"red\") +\n",
 111     "    labs(title = paste(\"Adj R2 = \",signif(summary(fit)$adj.r.squared, 5),\n",
 112     "                     \"Intercept =\",signif(fit$coef[[1]],5 ),\n",
 113     "                     \" Slope =\",signif(fit$coef[[2]], 5),\n",
 114     "                     \" P =\",signif(summary(fit)$coef[2,4], 5))) + \n",
 115     "    theme(plot.title = element_text(size=12))\n",
 116     "}"
 117    ]
 118   },
 119   {
 120    "cell_type": "markdown",
 121    "metadata": {},
 122    "source": [
 123     "## Exercise 5.2"
 124    ]
 125   },
 126   {
 127    "cell_type": "markdown",
 128    "metadata": {},
 129    "source": [
 130     "### Load the `cemheat` dataset."
 131    ]
 132   },
 133   {
 134    "cell_type": "code",
 135    "execution_count": null,
 136    "metadata": {},
 137    "outputs": [],
 138    "source": [
 139     "cemheat <- read.csv('cemheat.csv')\n",
 140     "head(cemheat)"
 141    ]
 142   },
 143   {
 144    "cell_type": "markdown",
 145    "metadata": {},
 146    "source": [
 147     "### Make scatterplots of heat against each of TA and TS in turn, and comment on what you see."
 148    ]
 149   },
 150   {
 151    "cell_type": "code",
 152    "execution_count": null,
 153    "metadata": {},
 154    "outputs": [],
 155    "source": [
 156     "taheat <- ggplot(cemheat, aes(x=TA, y=heat)) + geom_point()\n",
 157     "tsheat <-  ggplot(cemheat, aes(x=TS, y=heat)) + geom_point()\n",
 158     "\n",
 159     "multiplot(taheat, tsheat, cols=2)"
 160    ]
 161   },
 162   {
 163    "cell_type": "markdown",
 164    "metadata": {},
 165    "source": [
 166     "### Use GenStat to fit each individual regression equation (of heat on TA and of heat on TS) in turn, and then to fit the regression equation with two explanatory variables. Does the latter regression equation give you a better model than either of the individual ones?"
 167    ]
 168   },
 169   {
 170    "cell_type": "code",
 171    "execution_count": null,
 172    "metadata": {},
 173    "outputs": [],
 174    "source": [
 175     "fit <- lm(heat ~ TA, data = cemheat)\n",
 176     "summary(fit)\n",
 177     "anova(fit)"
 178    ]
 179   },
 180   {
 181    "cell_type": "code",
 182    "execution_count": null,
 183    "metadata": {},
 184    "outputs": [],
 185    "source": [
 186     "ggplotRegression(fit)"
 187    ]
 188   },
 189   {
 190    "cell_type": "code",
 191    "execution_count": null,
 192    "metadata": {},
 193    "outputs": [],
 194    "source": [
 195     "fit <- lm(heat ~ TS, data = cemheat)\n",
 196     "summary(fit)\n",
 197     "anova(fit)"
 198    ]
 199   },
 200   {
 201    "cell_type": "code",
 202    "execution_count": null,
 203    "metadata": {},
 204    "outputs": [],
 205    "source": [
 206     "ggplotRegression(fit)"
 207    ]
 208   },
 209   {
 210    "cell_type": "code",
 211    "execution_count": null,
 212    "metadata": {},
 213    "outputs": [],
 214    "source": [
 215     "fit <- lm(heat ~ TA + TS, data = cemheat)\n",
 216     "summary(fit)\n",
 217     "anova(fit)"
 218    ]
 219   },
 220   {
 221    "cell_type": "markdown",
 222    "metadata": {},
 223    "source": [
 224     "### According to the regression equation with two explanatory variables fitted in part (b), what is the predicted value of heat when TA = 15 and TS = 55?"
 225    ]
 226   },
 227   {
 228    "cell_type": "code",
 229    "execution_count": null,
 230    "metadata": {},
 231    "outputs": [],
 232    "source": [
 233     "predict(fit, data.frame(\"TA\" = 15, \"TS\" = 55))"
 234    ]
 235   },
 236   {
 237    "cell_type": "markdown",
 238    "metadata": {},
 239    "source": [
 240     "### By looking at the (default) composite residual plots, comment on the appropriateness of the fitted regression model."
 241    ]
 242   },
 243   {
 244    "cell_type": "code",
 245    "execution_count": null,
 246    "metadata": {},
 247    "outputs": [],
 248    "source": [
 249     "autoplot(fit)"
 250    ]
 251   },
 252   {
 253    "cell_type": "markdown",
 254    "metadata": {},
 255    "source": [
 256     "## Exercise 5.3: Fitting a quadratic regression model"
 257    ]
 258   },
 259   {
 260    "cell_type": "code",
 261    "execution_count": null,
 262    "metadata": {},
 263    "outputs": [],
 264    "source": [
 265     "anaerobic <- read.csv('anaerob.csv')\n",
 266     "ggplot(anaerobic, aes(x=oxygen, y=ventil)) + geom_point()"
 267    ]
 268   },
 269   {
 270    "cell_type": "markdown",
 271    "metadata": {},
 272    "source": [
 273     "### Using GenStat, perform the regression of expired ventilation (`ventil`) on oxygen uptake (`oxygen`). Are you at all surprised by how good this regression model seems?"
 274    ]
 275   },
 276   {
 277    "cell_type": "code",
 278    "execution_count": null,
 279    "metadata": {},
 280    "outputs": [],
 281    "source": [
 282     "fit <- lm(ventil ~ oxygen, data = anaerobic)\n",
 283     "summary(fit)\n",
 284     "anova(fit)"
 285    ]
 286   },
 287   {
 288    "cell_type": "code",
 289    "execution_count": null,
 290    "metadata": {},
 291    "outputs": [],
 292    "source": [
 293     "ggplotRegression(fit)"
 294    ]
 295   },
 296   {
 297    "cell_type": "code",
 298    "execution_count": null,
 299    "metadata": {},
 300    "outputs": [],
 301    "source": [
 302     "autoplot(fit)"
 303    ]
 304   },
 305   {
 306    "cell_type": "markdown",
 307    "metadata": {},
 308    "source": [
 309     "### Now form a new variable `oxy2`, say, by squaring oxygen.\n",
 310     "(Create a new column in the `anearobic` dataframe which is `anaerobic$oxygen ^ 2`.) Perform the regression of ventil on `oxygen` and `oxy2`. Comment on the fit of this model according to the printed output (and with recourse to Figure 3.2 in Example 3.1)."
 311    ]
 312   },
 313   {
 314    "cell_type": "code",
 315    "execution_count": null,
 316    "metadata": {},
 317    "outputs": [],
 318    "source": [
 319     "anaerobic$oxy2 <- anaerobic$oxygen^2\n",
 320     "head(anaerobic)"
 321    ]
 322   },
 323   {
 324    "cell_type": "code",
 325    "execution_count": null,
 326    "metadata": {},
 327    "outputs": [],
 328    "source": [
 329     "fit <- lm(ventil ~ oxygen + oxy2, data = anaerobic)\n",
 330     "summary(fit)\n",
 331     "anova(fit)"
 332    ]
 333   },
 334   {
 335    "cell_type": "markdown",
 336    "metadata": {},
 337    "source": [
 338     "### Make the usual residual plots and comment on the fit of the model again."
 339    ]
 340   },
 341   {
 342    "cell_type": "code",
 343    "execution_count": null,
 344    "metadata": {},
 345    "outputs": [],
 346    "source": [
 347     "ggplotRegression(fit)"
 348    ]
 349   },
 350   {
 351    "cell_type": "code",
 352    "execution_count": null,
 353    "metadata": {},
 354    "outputs": [],
 355    "source": [
 356     "autoplot(fit)"
 357    ]
 358   },
 359   {
 360    "cell_type": "code",
 361    "execution_count": null,
 362    "metadata": {},
 363    "outputs": [],
 364    "source": []
 365   }
 366  ],
 367  "metadata": {
 368   "kernelspec": {
 369    "display_name": "R",
 370    "language": "R",
 371    "name": "ir"
 372   },
 373   "language_info": {
 374    "codemirror_mode": "r",
 375    "file_extension": ".r",
 376    "mimetype": "text/x-r-source",
 377    "name": "R",
 378    "pygments_lexer": "r",
 379    "version": "3.4.2"
 380   },
 381   "toc": {
 382    "nav_menu": {},
 383    "number_sections": true,
 384    "sideBar": true,
 385    "skip_h1_title": false,
 386    "title_cell": "Table of Contents",
 387    "title_sidebar": "Contents",
 388    "toc_cell": false,
 389    "toc_position": {
 390     "height": "calc(100% - 180px)",
 391     "left": "10px",
 392     "top": "150px",
 393     "width": "342px"
 394    },
 395    "toc_section_display": true,
 396    "toc_window_display": true
 397   }
 398  },
 399  "nbformat": 4,
 400  "nbformat_minor": 2
 401 }