R - Programming with the Tidyverse

Authors

Michael Luu

Marcio Diniz

Published

September 13, 2022

1 Installing the ‘tidyverse’ package

Before we have access to the ‘tidyverse’ functions that were mentioned, we first need to install the package using the install.packages() function. This is a generic function that you can use to install ANY package available on CRAN that you learn about in the future. You can install a package by putting its name in quotes inside the parentheses, e.g. ‘tidyverse’.

install.packages('tidyverse')
install.packages('janitor')
install.packages('readxl')

2 Loading packages

Now that the tidyverse packages have been installed into R, you will need to load the package to get access to its functions. You won’t have access to any of the functions until you load the package. There are 2 other packages we will load to help facilitate our analysis.

library(tidyverse)
library(janitor)
library(readxl)

3 Case study with the sepsis data

3.1 Reading in the data

## import the spreadsheet with read_excel() from the readxl package, reading
## cells that contain the text 'NA' as missing values, then pass the result to
## clean_names() from the janitor package to clean the variable names
df <- read_excel('data/sepsis_wide.xlsx', na = 'NA') %>%
  clean_names()

3.2 Checking variable data types

Calling glimpse() on the data frame gives us a quick ‘glimpse’, or look, at the data we just imported.

df %>% glimpse()
Rows: 455
Columns: 21
$ id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
$ treat  <dbl> 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, …
$ race   <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, …
$ apache <dbl> 27, 14, 33, 3, 5, 13, 34, 11, 25, 20, 21, 14, 19, 23, 22, 16, 1…
$ death  <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, …
$ temp0  <dbl> 95.4, 101.6, 101.0, 101.0, 101.4, 100.7, 101.1, 100.9, 100.8, 9…
$ temp1  <dbl> 93.9, 99.0, 98.9, 100.2, 101.4, 101.0, 99.7, 100.3, 99.7, 98.0,…
$ temp2  <dbl> 93.6, 99.8, 97.2, 100.0, 101.8, 101.5, 99.5, 100.7, 99.9, 97.0,…
$ temp3  <dbl> 93.0, 99.8, NA, 99.5, 101.4, 102.1, 99.0, 100.3, 99.7, 99.6, 10…
$ temp4  <dbl> 90.3, 99.6, NA, 99.7, 101.4, 102.3, 99.1, 99.1, 99.0, 99.6, 100…
$ temp5  <dbl> 93.2, 100.2, 94.8, 99.1, 101.0, 99.9, 98.7, 98.9, 98.1, 100.0, …
$ temp6  <dbl> 95.0, 100.2, 95.5, 99.3, 102.6, 100.5, 99.1, 99.3, 98.9, 99.0, …
$ temp7  <dbl> 96.2, 99.0, NA, 99.6, 100.8, 101.5, 98.3, 100.1, 98.1, 100.6, 1…
$ temp8  <dbl> 98.8, 99.9, 97.2, 98.0, 100.6, 101.9, 99.1, 100.3, 98.0, 101.0,…
$ temp9  <dbl> 97.5, 99.8, 99.2, 97.8, 99.8, 99.5, 99.0, 99.3, 98.1, 101.4, 10…
$ temp10 <dbl> 97.8, 99.6, NA, 97.6, 99.6, 100.7, 98.1, 98.7, 98.1, 99.8, 100.…
$ temp11 <dbl> 99.1, 99.4, NA, 98.4, 99.6, 101.3, 99.0, 98.5, 97.8, 103.2, 100…
$ temp12 <dbl> 95.6, 101.2, NA, 98.0, 99.8, 101.5, 98.9, 99.0, 97.8, 97.4, 98.…
$ temp13 <dbl> NA, 101.0, NA, 98.4, 100.8, 101.5, NA, 98.6, 98.0, 102.0, 101.0…
$ temp14 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ temp15 <dbl> NA, 100.6, NA, 98.8, 100.0, 101.5, NA, 99.0, 98.0, 101.4, 101.0…

3.3 Initial pre-processing

We can see that treat, race, and death are incorrectly coded as ‘dbl’; we would like to change them into factors using the factor() function. Since we are replacing the previous values with new ones, we use factor() inside the mutate() function and assign the result to the same variable name, which overwrites the existing variable.

## recode the numeric codes of treat, death and race as labelled factors;
## each assignment reuses the existing column name, so the column is replaced
df <- df %>%
  mutate(
    treat = factor(treat, levels = c(0, 1), labels = c('Control', 'Ibuprofenol')),
    death = factor(death, levels = c(0, 1), labels = c('Alive', 'Died')),
    race = factor(
      race,
      levels = c(0, 1, 2),
      labels = c('White', 'African American', 'Hispanic')
    )
  )

## take another glimpse of the data to confirm the variables now have the correct types
df %>% glimpse()
Rows: 455
Columns: 21
$ id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
$ treat  <fct> Control, Ibuprofenol, Control, Ibuprofenol, Control, Ibuprofeno…
$ race   <fct> White, African American, African American, White, White, White,…
$ apache <dbl> 27, 14, 33, 3, 5, 13, 34, 11, 25, 20, 21, 14, 19, 23, 22, 16, 1…
$ death  <fct> Died, Alive, Died, Alive, Alive, Alive, Died, Alive, Alive, Ali…
$ temp0  <dbl> 95.4, 101.6, 101.0, 101.0, 101.4, 100.7, 101.1, 100.9, 100.8, 9…
$ temp1  <dbl> 93.9, 99.0, 98.9, 100.2, 101.4, 101.0, 99.7, 100.3, 99.7, 98.0,…
$ temp2  <dbl> 93.6, 99.8, 97.2, 100.0, 101.8, 101.5, 99.5, 100.7, 99.9, 97.0,…
$ temp3  <dbl> 93.0, 99.8, NA, 99.5, 101.4, 102.1, 99.0, 100.3, 99.7, 99.6, 10…
$ temp4  <dbl> 90.3, 99.6, NA, 99.7, 101.4, 102.3, 99.1, 99.1, 99.0, 99.6, 100…
$ temp5  <dbl> 93.2, 100.2, 94.8, 99.1, 101.0, 99.9, 98.7, 98.9, 98.1, 100.0, …
$ temp6  <dbl> 95.0, 100.2, 95.5, 99.3, 102.6, 100.5, 99.1, 99.3, 98.9, 99.0, …
$ temp7  <dbl> 96.2, 99.0, NA, 99.6, 100.8, 101.5, 98.3, 100.1, 98.1, 100.6, 1…
$ temp8  <dbl> 98.8, 99.9, 97.2, 98.0, 100.6, 101.9, 99.1, 100.3, 98.0, 101.0,…
$ temp9  <dbl> 97.5, 99.8, 99.2, 97.8, 99.8, 99.5, 99.0, 99.3, 98.1, 101.4, 10…
$ temp10 <dbl> 97.8, 99.6, NA, 97.6, 99.6, 100.7, 98.1, 98.7, 98.1, 99.8, 100.…
$ temp11 <dbl> 99.1, 99.4, NA, 98.4, 99.6, 101.3, 99.0, 98.5, 97.8, 103.2, 100…
$ temp12 <dbl> 95.6, 101.2, NA, 98.0, 99.8, 101.5, 98.9, 99.0, 97.8, 97.4, 98.…
$ temp13 <dbl> NA, 101.0, NA, 98.4, 100.8, 101.5, NA, 98.6, 98.0, 102.0, 101.0…
$ temp14 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ temp15 <dbl> NA, 100.6, NA, 98.8, 100.0, 101.5, NA, 99.0, 98.0, 101.4, 101.0…

3.4 Converting the temperature from F to C

## the F to C formula can be written directly inside mutate() for a single column
df %>%
  mutate(temp0 = (temp0 - 32) * 5 / 9)
# A tibble: 455 × 21
      id treat      race  apache death temp0 temp1 temp2 temp3 temp4 temp5 temp6
   <dbl> <fct>      <fct>  <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control    White     27 Died   35.2  93.9  93.6  93    90.3  93.2  95  
 2     2 Ibuprofen… Afri…     14 Alive  38.7  99    99.8  99.8  99.6 100.  100. 
 3     3 Control    Afri…     33 Died   38.3  98.9  97.2  NA    NA    94.8  95.5
 4     4 Ibuprofen… White      3 Alive  38.3 100.  100    99.5  99.7  99.1  99.3
 5     5 Control    White      5 Alive  38.6 101.  102.  101.  101.  101   103. 
 6     6 Ibuprofen… White     13 Alive  38.2 101   102.  102.  102.   99.9 100. 
 7     7 Ibuprofen… White     34 Died   38.4  99.7  99.5  99    99.1  98.7  99.1
 8     8 Control    White     11 Alive  38.3 100.  101.  100.   99.1  98.9  99.3
 9     9 Ibuprofen… White     25 Alive  38.2  99.7  99.9  99.7  99    98.1  98.9
10    10 Control    White     20 Alive  37.2  98    97    99.6  99.6 100    99  
# … with 445 more rows, and 9 more variables: temp7 <dbl>, temp8 <dbl>,
#   temp9 <dbl>, temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>
## alternatively we can write our own function to do this
#' Convert temperatures from Fahrenheit to Celsius
#'
#' @param x A numeric vector of temperatures in degrees Fahrenheit.
#' @return A numeric vector the same length as `x`, in degrees Celsius.
#'   `NA` inputs give `NA` outputs.
fare_to_cel <- function(x) {
  degrees_above_freezing <- x - 32
  degrees_above_freezing * 5 / 9
}

## apply fare_to_cel() to the selected variables with across() inside mutate()
## (use ?across for more information on this function);
## the three approaches below all produce the same result

df %>%
  mutate(
    across(
      .cols = c(
        "temp0", "temp1", "temp2", "temp3",
        "temp4", "temp5", "temp6", "temp7",
        "temp8", "temp9", "temp10", "temp11",
        "temp12", "temp13", "temp14", "temp15"
      ),
      .fns = fare_to_cel
    )
  )
# A tibble: 455 × 21
      id treat      race  apache death temp0 temp1 temp2 temp3 temp4 temp5 temp6
   <dbl> <fct>      <fct>  <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control    White     27 Died   35.2  34.4  34.2  33.9  32.4  34.0  35  
 2     2 Ibuprofen… Afri…     14 Alive  38.7  37.2  37.7  37.7  37.6  37.9  37.9
 3     3 Control    Afri…     33 Died   38.3  37.2  36.2  NA    NA    34.9  35.3
 4     4 Ibuprofen… White      3 Alive  38.3  37.9  37.8  37.5  37.6  37.3  37.4
 5     5 Control    White      5 Alive  38.6  38.6  38.8  38.6  38.6  38.3  39.2
 6     6 Ibuprofen… White     13 Alive  38.2  38.3  38.6  38.9  39.1  37.7  38.1
 7     7 Ibuprofen… White     34 Died   38.4  37.6  37.5  37.2  37.3  37.1  37.3
 8     8 Control    White     11 Alive  38.3  37.9  38.2  37.9  37.3  37.2  37.4
 9     9 Ibuprofen… White     25 Alive  38.2  37.6  37.7  37.6  37.2  36.7  37.2
10    10 Control    White     20 Alive  37.2  36.7  36.1  37.6  37.6  37.8  37.2
# … with 445 more rows, and 9 more variables: temp7 <dbl>, temp8 <dbl>,
#   temp9 <dbl>, temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>
## the same columns selected by position (columns 6 to 21)
df %>%
  mutate(across(.cols = 6:21, .fns = fare_to_cel))
# A tibble: 455 × 21
      id treat      race  apache death temp0 temp1 temp2 temp3 temp4 temp5 temp6
   <dbl> <fct>      <fct>  <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control    White     27 Died   35.2  34.4  34.2  33.9  32.4  34.0  35  
 2     2 Ibuprofen… Afri…     14 Alive  38.7  37.2  37.7  37.7  37.6  37.9  37.9
 3     3 Control    Afri…     33 Died   38.3  37.2  36.2  NA    NA    34.9  35.3
 4     4 Ibuprofen… White      3 Alive  38.3  37.9  37.8  37.5  37.6  37.3  37.4
 5     5 Control    White      5 Alive  38.6  38.6  38.8  38.6  38.6  38.3  39.2
 6     6 Ibuprofen… White     13 Alive  38.2  38.3  38.6  38.9  39.1  37.7  38.1
 7     7 Ibuprofen… White     34 Died   38.4  37.6  37.5  37.2  37.3  37.1  37.3
 8     8 Control    White     11 Alive  38.3  37.9  38.2  37.9  37.3  37.2  37.4
 9     9 Ibuprofen… White     25 Alive  38.2  37.6  37.7  37.6  37.2  36.7  37.2
10    10 Control    White     20 Alive  37.2  36.7  36.1  37.6  37.6  37.8  37.2
# … with 445 more rows, and 9 more variables: temp7 <dbl>, temp8 <dbl>,
#   temp9 <dbl>, temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>
## the same columns selected as a range of names
df %>%
  mutate(across(.cols = temp0:temp15, .fns = fare_to_cel))
# A tibble: 455 × 21
      id treat      race  apache death temp0 temp1 temp2 temp3 temp4 temp5 temp6
   <dbl> <fct>      <fct>  <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control    White     27 Died   35.2  34.4  34.2  33.9  32.4  34.0  35  
 2     2 Ibuprofen… Afri…     14 Alive  38.7  37.2  37.7  37.7  37.6  37.9  37.9
 3     3 Control    Afri…     33 Died   38.3  37.2  36.2  NA    NA    34.9  35.3
 4     4 Ibuprofen… White      3 Alive  38.3  37.9  37.8  37.5  37.6  37.3  37.4
 5     5 Control    White      5 Alive  38.6  38.6  38.8  38.6  38.6  38.3  39.2
 6     6 Ibuprofen… White     13 Alive  38.2  38.3  38.6  38.9  39.1  37.7  38.1
 7     7 Ibuprofen… White     34 Died   38.4  37.6  37.5  37.2  37.3  37.1  37.3
 8     8 Control    White     11 Alive  38.3  37.9  38.2  37.9  37.3  37.2  37.4
 9     9 Ibuprofen… White     25 Alive  38.2  37.6  37.7  37.6  37.2  36.7  37.2
10    10 Control    White     20 Alive  37.2  36.7  36.1  37.6  37.6  37.8  37.2
# … with 445 more rows, and 9 more variables: temp7 <dbl>, temp8 <dbl>,
#   temp9 <dbl>, temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>
## once we are happy with the result, assign it back to df to replace the original data frame

df <- mutate(df, across(temp0:temp15, fare_to_cel))

3.5 Transforming from wide to long and back

We can use the pivot_longer() and pivot_wider() functions to reshape our data between the two data structures. The pivot_longer() function converts a wide data frame to a long data frame. The pivot_wider() function does the opposite, converting a long data frame to a wide data frame.

## create a new data frame called 'long': the temp columns are stacked so that
## the column names go into 'name' and the measurements into 'value'
long <- df %>%
  pivot_longer(
    cols = temp0:temp15,
    names_to = 'name',
    values_to = 'value'
  )
long
# A tibble: 7,280 × 7
      id treat   race  apache death name  value
   <dbl> <fct>   <fct>  <dbl> <fct> <chr> <dbl>
 1     1 Control White     27 Died  temp0  35.2
 2     1 Control White     27 Died  temp1  34.4
 3     1 Control White     27 Died  temp2  34.2
 4     1 Control White     27 Died  temp3  33.9
 5     1 Control White     27 Died  temp4  32.4
 6     1 Control White     27 Died  temp5  34.0
 7     1 Control White     27 Died  temp6  35  
 8     1 Control White     27 Died  temp7  35.7
 9     1 Control White     27 Died  temp8  37.1
10     1 Control White     27 Died  temp9  36.4
# … with 7,270 more rows
## create a new data frame called 'wide' by converting the long data frame back
## to wide format; both are valid formats we can work with in R

wide <- long %>%
  pivot_wider(names_from = 'name', values_from = 'value')
wide
# A tibble: 455 × 21
      id treat      race  apache death temp0 temp1 temp2 temp3 temp4 temp5 temp6
   <dbl> <fct>      <fct>  <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control    White     27 Died   35.2  34.4  34.2  33.9  32.4  34.0  35  
 2     2 Ibuprofen… Afri…     14 Alive  38.7  37.2  37.7  37.7  37.6  37.9  37.9
 3     3 Control    Afri…     33 Died   38.3  37.2  36.2  NA    NA    34.9  35.3
 4     4 Ibuprofen… White      3 Alive  38.3  37.9  37.8  37.5  37.6  37.3  37.4
 5     5 Control    White      5 Alive  38.6  38.6  38.8  38.6  38.6  38.3  39.2
 6     6 Ibuprofen… White     13 Alive  38.2  38.3  38.6  38.9  39.1  37.7  38.1
 7     7 Ibuprofen… White     34 Died   38.4  37.6  37.5  37.2  37.3  37.1  37.3
 8     8 Control    White     11 Alive  38.3  37.9  38.2  37.9  37.3  37.2  37.4
 9     9 Ibuprofen… White     25 Alive  38.2  37.6  37.7  37.6  37.2  36.7  37.2
10    10 Control    White     20 Alive  37.2  36.7  36.1  37.6  37.6  37.8  37.2
# … with 445 more rows, and 9 more variables: temp7 <dbl>, temp8 <dbl>,
#   temp9 <dbl>, temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>

3.6 Extracting specific observations

## filter() extracts the observations that meet a stated criterion;
## here we keep only the patients in the Control group
df %>%
  filter(treat == 'Control')
# A tibble: 231 × 21
      id treat   race     apache death temp0 temp1 temp2 temp3 temp4 temp5 temp6
   <dbl> <fct>   <fct>     <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control White        27 Died   35.2  34.4  34.2  33.9  32.4  34.0  35  
 2     3 Control African…     33 Died   38.3  37.2  36.2  NA    NA    34.9  35.3
 3     5 Control White         5 Alive  38.6  38.6  38.8  38.6  38.6  38.3  39.2
 4     8 Control White        11 Alive  38.3  37.9  38.2  37.9  37.3  37.2  37.4
 5    10 Control White        20 Alive  37.2  36.7  36.1  37.6  37.6  37.8  37.2
 6    12 Control White        14 Alive  38.6  38.7  39.2  39.1  39.1  39.7  38.3
 7    14 Control White        23 Alive  37.9  37.6  37.4  37.4  37.4  37.6  37.6
 8    16 Control White        16 Died   38.1  38.0  37.7  36.9  36.9  36.4  NA  
 9    17 Control African…     17 Alive  38.1  37.7  38.3  37.8  39.3  37.6  38.2
10    18 Control White        14 Alive  38.7  38.7  38.6  38.3  38.3  38.3  38.4
# … with 221 more rows, and 9 more variables: temp7 <dbl>, temp8 <dbl>,
#   temp9 <dbl>, temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>
## white control patients: conditions separated by commas must all hold
df %>%
  filter(treat == 'Control', race == 'White')
# A tibble: 156 × 21
      id treat   race  apache death temp0 temp1 temp2 temp3 temp4 temp5 temp6
   <dbl> <fct>   <fct>  <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control White     27 Died   35.2  34.4  34.2  33.9  32.4  34.0  35  
 2     5 Control White      5 Alive  38.6  38.6  38.8  38.6  38.6  38.3  39.2
 3     8 Control White     11 Alive  38.3  37.9  38.2  37.9  37.3  37.2  37.4
 4    10 Control White     20 Alive  37.2  36.7  36.1  37.6  37.6  37.8  37.2
 5    12 Control White     14 Alive  38.6  38.7  39.2  39.1  39.1  39.7  38.3
 6    14 Control White     23 Alive  37.9  37.6  37.4  37.4  37.4  37.6  37.6
 7    16 Control White     16 Died   38.1  38.0  37.7  36.9  36.9  36.4  NA  
 8    18 Control White     14 Alive  38.7  38.7  38.6  38.3  38.3  38.3  38.4
 9    21 Control White     24 Died   39.3  39.4  39.8  39.9  38.6  36.8  36.8
10    23 Control White     14 Alive  38.4  38.7  38.2  37.8  38.2  36.8  38.1
# … with 146 more rows, and 9 more variables: temp7 <dbl>, temp8 <dbl>,
#   temp9 <dbl>, temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>
## white control patients who died
df %>%
  filter(treat == 'Control', race == 'White', death == 'Died')
# A tibble: 54 × 21
      id treat   race  apache death temp0 temp1 temp2 temp3 temp4 temp5 temp6
   <dbl> <fct>   <fct>  <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control White     27 Died   35.2  34.4  34.2  33.9  32.4  34.0  35  
 2    16 Control White     16 Died   38.1  38.0  37.7  36.9  36.9  36.4  NA  
 3    21 Control White     24 Died   39.3  39.4  39.8  39.9  38.6  36.8  36.8
 4    27 Control White     18 Died   38.3  37.6  37.6  36.8  37.7  36.7  38.0
 5    56 Control White     18 Died   39.1  39.3  39.5  39.5  39.8  39.6  39.9
 6    80 Control White     27 Died   37.7  37.7  37.2  37.9  38.2  37.9  37.8
 7    82 Control White      8 Died   38.1  36.4  36.6  34.4  33.0  34.4  34.8
 8    89 Control White     21 Died   34.6  37.3  37.4  37.6  37.1  37.4  37.4
 9   102 Control White     15 Died   38.6  38.5  38.6  38.3  37.6  37.1  36.7
10   104 Control White      7 Died   38.0  38.0  38.0  38.2  37.7  37.3  39.0
# … with 44 more rows, and 9 more variables: temp7 <dbl>, temp8 <dbl>,
#   temp9 <dbl>, temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>
## white control patients who either have an apache score greater than 10 or died
df %>%
  filter(
    treat == 'Control',
    race == 'White',
    apache > 10 | death == 'Died'
  )
# A tibble: 121 × 21
      id treat   race  apache death temp0 temp1 temp2 temp3 temp4 temp5 temp6
   <dbl> <fct>   <fct>  <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control White     27 Died   35.2  34.4  34.2  33.9  32.4  34.0  35  
 2     8 Control White     11 Alive  38.3  37.9  38.2  37.9  37.3  37.2  37.4
 3    10 Control White     20 Alive  37.2  36.7  36.1  37.6  37.6  37.8  37.2
 4    12 Control White     14 Alive  38.6  38.7  39.2  39.1  39.1  39.7  38.3
 5    14 Control White     23 Alive  37.9  37.6  37.4  37.4  37.4  37.6  37.6
 6    16 Control White     16 Died   38.1  38.0  37.7  36.9  36.9  36.4  NA  
 7    18 Control White     14 Alive  38.7  38.7  38.6  38.3  38.3  38.3  38.4
 8    21 Control White     24 Died   39.3  39.4  39.8  39.9  38.6  36.8  36.8
 9    23 Control White     14 Alive  38.4  38.7  38.2  37.8  38.2  36.8  38.1
10    25 Control White     22 Alive  35.7  36.8  37.4  38.3  38.4  37.9  37.4
# … with 111 more rows, and 9 more variables: temp7 <dbl>, temp8 <dbl>,
#   temp9 <dbl>, temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>

3.7 Selecting specific columns or variables

## keep only the id, treat, race, apache and death columns
df %>%
  select(id, treat, race, apache, death)
# A tibble: 455 × 5
      id treat       race             apache death
   <dbl> <fct>       <fct>             <dbl> <fct>
 1     1 Control     White                27 Died 
 2     2 Ibuprofenol African American     14 Alive
 3     3 Control     African American     33 Died 
 4     4 Ibuprofenol White                 3 Alive
 5     5 Control     White                 5 Alive
 6     6 Ibuprofenol White                13 Alive
 7     7 Ibuprofenol White                34 Died 
 8     8 Control     White                11 Alive
 9     9 Ibuprofenol White                25 Alive
10    10 Control     White                20 Alive
# … with 445 more rows
## keep only id and the temp variables
df %>%
  select(id, temp0:temp15)
# A tibble: 455 × 17
      id temp0 temp1 temp2 temp3 temp4 temp5 temp6 temp7 temp8 temp9 temp10
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
 1     1  35.2  34.4  34.2  33.9  32.4  34.0  35    35.7  37.1  36.4   36.6
 2     2  38.7  37.2  37.7  37.7  37.6  37.9  37.9  37.2  37.7  37.7   37.6
 3     3  38.3  37.2  36.2  NA    NA    34.9  35.3  NA    36.2  37.3   NA  
 4     4  38.3  37.9  37.8  37.5  37.6  37.3  37.4  37.6  36.7  36.6   36.4
 5     5  38.6  38.6  38.8  38.6  38.6  38.3  39.2  38.2  38.1  37.7   37.6
 6     6  38.2  38.3  38.6  38.9  39.1  37.7  38.1  38.6  38.8  37.5   38.2
 7     7  38.4  37.6  37.5  37.2  37.3  37.1  37.3  36.8  37.3  37.2   36.7
 8     8  38.3  37.9  38.2  37.9  37.3  37.2  37.4  37.8  37.9  37.4   37.1
 9     9  38.2  37.6  37.7  37.6  37.2  36.7  37.2  36.7  36.7  36.7   36.7
10    10  37.2  36.7  36.1  37.6  37.6  37.8  37.2  38.1  38.3  38.6   37.7
# … with 445 more rows, and 5 more variables: temp11 <dbl>, temp12 <dbl>,
#   temp13 <dbl>, temp14 <dbl>, temp15 <dbl>
## keep every variable except the outcome, death
df %>%
  select(-death)
# A tibble: 455 × 20
      id treat      race  apache temp0 temp1 temp2 temp3 temp4 temp5 temp6 temp7
   <dbl> <fct>      <fct>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1 Control    White     27  35.2  34.4  34.2  33.9  32.4  34.0  35    35.7
 2     2 Ibuprofen… Afri…     14  38.7  37.2  37.7  37.7  37.6  37.9  37.9  37.2
 3     3 Control    Afri…     33  38.3  37.2  36.2  NA    NA    34.9  35.3  NA  
 4     4 Ibuprofen… White      3  38.3  37.9  37.8  37.5  37.6  37.3  37.4  37.6
 5     5 Control    White      5  38.6  38.6  38.8  38.6  38.6  38.3  39.2  38.2
 6     6 Ibuprofen… White     13  38.2  38.3  38.6  38.9  39.1  37.7  38.1  38.6
 7     7 Ibuprofen… White     34  38.4  37.6  37.5  37.2  37.3  37.1  37.3  36.8
 8     8 Control    White     11  38.3  37.9  38.2  37.9  37.3  37.2  37.4  37.8
 9     9 Ibuprofen… White     25  38.2  37.6  37.7  37.6  37.2  36.7  37.2  36.7
10    10 Control    White     20  37.2  36.7  36.1  37.6  37.6  37.8  37.2  38.1
# … with 445 more rows, and 8 more variables: temp8 <dbl>, temp9 <dbl>,
#   temp10 <dbl>, temp11 <dbl>, temp12 <dbl>, temp13 <dbl>, temp14 <dbl>,
#   temp15 <dbl>
## columns can also be selected by position, e.g. the first 5 columns
df %>%
  select(1:5)
# A tibble: 455 × 5
      id treat       race             apache death
   <dbl> <fct>       <fct>             <dbl> <fct>
 1     1 Control     White                27 Died 
 2     2 Ibuprofenol African American     14 Alive
 3     3 Control     African American     33 Died 
 4     4 Ibuprofenol White                 3 Alive
 5     5 Control     White                 5 Alive
 6     6 Ibuprofenol White                13 Alive
 7     7 Ibuprofenol White                34 Died 
 8     8 Control     White                11 Alive
 9     9 Ibuprofenol White                25 Alive
10    10 Control     White                20 Alive
# … with 445 more rows

3.7.1 Selecting using helper functions

## starts_with(): variables whose names begin with 'ra'
df %>%
  select(starts_with('ra'))
# A tibble: 455 × 1
   race            
   <fct>           
 1 White           
 2 African American
 3 African American
 4 White           
 5 White           
 6 White           
 7 White           
 8 White           
 9 White           
10 White           
# … with 445 more rows
## ends_with(): variables whose names end with 'e'
df %>%
  select(ends_with('e'))
# A tibble: 455 × 2
   race             apache
   <fct>             <dbl>
 1 White                27
 2 African American     14
 3 African American     33
 4 White                 3
 5 White                 5
 6 White                13
 7 White                34
 8 White                11
 9 White                25
10 White                20
# … with 445 more rows
## contains(): variables whose names contain 'temp'
df %>%
  select(contains('temp'))
# A tibble: 455 × 16
   temp0 temp1 temp2 temp3 temp4 temp5 temp6 temp7 temp8 temp9 temp10 temp11
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>
 1  35.2  34.4  34.2  33.9  32.4  34.0  35    35.7  37.1  36.4   36.6   37.3
 2  38.7  37.2  37.7  37.7  37.6  37.9  37.9  37.2  37.7  37.7   37.6   37.4
 3  38.3  37.2  36.2  NA    NA    34.9  35.3  NA    36.2  37.3   NA     NA  
 4  38.3  37.9  37.8  37.5  37.6  37.3  37.4  37.6  36.7  36.6   36.4   36.9
 5  38.6  38.6  38.8  38.6  38.6  38.3  39.2  38.2  38.1  37.7   37.6   37.6
 6  38.2  38.3  38.6  38.9  39.1  37.7  38.1  38.6  38.8  37.5   38.2   38.5
 7  38.4  37.6  37.5  37.2  37.3  37.1  37.3  36.8  37.3  37.2   36.7   37.2
 8  38.3  37.9  38.2  37.9  37.3  37.2  37.4  37.8  37.9  37.4   37.1   36.9
 9  38.2  37.6  37.7  37.6  37.2  36.7  37.2  36.7  36.7  36.7   36.7   36.6
10  37.2  36.7  36.1  37.6  37.6  37.8  37.2  38.1  38.3  38.6   37.7   39.6
# … with 445 more rows, and 4 more variables: temp12 <dbl>, temp13 <dbl>,
#   temp14 <dbl>, temp15 <dbl>
## num_range(): variables named 'temp' followed by a number from 0 to 5
df %>%
  select(num_range('temp', 0:5))
# A tibble: 455 × 6
   temp0 temp1 temp2 temp3 temp4 temp5
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  35.2  34.4  34.2  33.9  32.4  34.0
 2  38.7  37.2  37.7  37.7  37.6  37.9
 3  38.3  37.2  36.2  NA    NA    34.9
 4  38.3  37.9  37.8  37.5  37.6  37.3
 5  38.6  38.6  38.8  38.6  38.6  38.3
 6  38.2  38.3  38.6  38.9  39.1  37.7
 7  38.4  37.6  37.5  37.2  37.3  37.1
 8  38.3  37.9  38.2  37.9  37.3  37.2
 9  38.2  37.6  37.7  37.6  37.2  36.7
10  37.2  36.7  36.1  37.6  37.6  37.8
# … with 445 more rows

3.8 Summarizing data with dplyr

## let's take a simple count of the rows in the data frame
df %>%
  summarise(n = n())
# A tibble: 1 × 1
      n
  <int>
1   455
## summarise() becomes very powerful when used in conjunction with group_by();
## let's count the control and ibuprofenol patients in our dataset
df %>%
  group_by(treat) %>%
  summarise(n = n(), .groups = 'drop')
# A tibble: 2 × 2
  treat           n
  <fct>       <int>
1 Control       231
2 Ibuprofenol   224
## more grouping variables can be added to group_by();
## here we count the people who died or survived within each treatment arm
df %>%
  group_by(treat, death) %>%
  summarise(n = n(), .groups = 'drop')
# A tibble: 4 × 3
  treat       death     n
  <fct>       <fct> <int>
1 Control     Alive   139
2 Control     Died     92
3 Ibuprofenol Alive   140
4 Ibuprofenol Died     84
## summarise() can compute several statistics in one call; here the mean and
## standard deviation of temp0 for each combination of treat and death.
## na.rm = TRUE drops missing values first; TRUE is spelled out because T is
## an ordinary variable that can be reassigned
df %>%
  group_by(treat, death) %>%
  summarise(
    mean_temp0 = mean(temp0, na.rm = TRUE),
    sd_temp0 = sd(temp0, na.rm = TRUE),
    .groups = 'drop'
  )
# A tibble: 4 × 4
  treat       death mean_temp0 sd_temp0
  <fct>       <fct>      <dbl>    <dbl>
1 Control     Alive       38.2    0.765
2 Control     Died        37.8    1.35 
3 Ibuprofenol Alive       38.1    1.09 
4 Ibuprofenol Died        37.8    1.34 
## across() lets us summarise many columns at once with a named list of
## functions. na.rm = TRUE is set inside each function rather than passed
## through across(), because forwarding extra arguments via across()'s `...`
## is deprecated in dplyr 1.1.0 and later. The '{.fn}_{.col}' pattern names
## each output column as <function>_<column>, e.g. mean_temp0

df %>%
  group_by(treat, death) %>%
  summarise(
    across(
      temp0:temp15,
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE)
      ),
      .names = '{.fn}_{.col}'
    ),
    .groups = 'drop'
  )
# A tibble: 4 × 34
  treat    death mean_…¹ sd_te…² mean_…³ sd_te…⁴ mean_…⁵ sd_te…⁶ mean_…⁷ sd_te…⁸
  <fct>    <fct>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 Control  Alive    38.2   0.765    38.1   0.923    38.0   0.764    38.1   0.799
2 Control  Died     37.8   1.35     37.5   1.37     37.5   1.29     37.5   1.32 
3 Ibuprof… Alive    38.1   1.09     37.6   0.906    37.2   0.883    37.1   0.931
4 Ibuprof… Died     37.8   1.34     37.4   1.22     37.1   1.22     36.8   1.04 
# … with 24 more variables: mean_temp4 <dbl>, sd_temp4 <dbl>, mean_temp5 <dbl>,
#   sd_temp5 <dbl>, mean_temp6 <dbl>, sd_temp6 <dbl>, mean_temp7 <dbl>,
#   sd_temp7 <dbl>, mean_temp8 <dbl>, sd_temp8 <dbl>, mean_temp9 <dbl>,
#   sd_temp9 <dbl>, mean_temp10 <dbl>, sd_temp10 <dbl>, mean_temp11 <dbl>,
#   sd_temp11 <dbl>, mean_temp12 <dbl>, sd_temp12 <dbl>, mean_temp13 <dbl>,
#   sd_temp13 <dbl>, mean_temp14 <dbl>, sd_temp14 <dbl>, mean_temp15 <dbl>,
#   sd_temp15 <dbl>, and abbreviated variable names ¹​mean_temp0, ²​sd_temp0, …