7 Data transformation II

7.1 `arrange()` and `desc()`

Arrange (arrange()) and descending (desc()) allow you to change the order of the rows without changing their contents. desc() is almost always used inside arrange(), i.e., arrange(desc(variable)).

library(dplyr)
library(readr)
library(knitr)
library(kableExtra)

dat_gender_tidier_counts <- dat_demographics_messy %>%
  # convert to lower case and remove non-letters
  mutate(gender = str_to_lower(gender),
         gender = str_remove_all(gender, "[^A-Za-z]")) %>%
  # tidy up cases
  mutate(gender = if_else(condition = gender %in% c("female", "male", "nonbinary"), # the logical test applied: is 'gender' one of the following
                          true = gender, # what to do if the test is passed: keep the original gender response
                          false = NA_character_))  %>% # what to do if the test is failed: set it to NA
  # count frequencies of each unique value in the gender column
  count(gender) 

# arrange by increasing N
dat_gender_tidier_counts %>%
  arrange(n) %>%
  kable() %>%
  kable_classic(full_width = FALSE)

gender	n
nonbinary	16
NA	44
male	126
female	314

# arrange by decreasing N
dat_gender_tidier_counts %>%
  arrange(desc(n)) %>%
  kable() %>%
  kable_classic(full_width = FALSE)

gender	n
female	314
male	126
NA	44
nonbinary	16

# arrange by gender in alphabetical order  
dat_gender_tidier_counts %>%
  arrange(gender) %>%
  kable() %>%
  kable_classic(full_width = FALSE)

gender	n
female	314
male	126
nonbinary	16
NA	44

# arrange by gender in reverse alphabetical order  
dat_gender_tidier_counts %>%
  arrange(desc(gender)) %>%
  kable() %>%
  kable_classic(full_width = FALSE)

gender	n
nonbinary	16
male	126
female	314
NA	44

Note that you can also arrange by multiple columns, e.g., dat %in% arrange(timepoint, condition, gender) would arrange timepoint in ascending order (1, 2), then for ties it arranges condition alphabetically (“control”, “intervention”), and for ties it would arrange gender alphabetically (e.g., “female”, “male”, “non-binary”).

7.2 `filter()` derivatives

Sometimes we want to filter() for rows not based on their content but their location, eg at the top or the bottom of the data frame. These filter() derivatives are therefore related to arrange() in that they often relate to row number rather than row contents.

7.2.1 `slice_()` functions

slice_head(n = 5) is equivalent to filter(row_number() <= 5)
slice_tail(n = 5) is equivalent to filter(row_number() > n() - 5)
slice_min() is equivalent to filter(min_rank(x) <= 5)
slice_max(n = 5) is equivalent to filter(min_rank(desc(x)) <= 5)

Sometimes you might want to filter rows randomly rather than at the top or bottom of the data frame. You can do this with slice_sample():

slice_sample(n = 5) : return 5 random rows
slice_sample(prop = .05) : return a random 5% of the rows

7.2.2 `drop_na()`

A common negative-filter() is to remove rows whose values are NA. Because NA is a special value, this is not done with filter(variable != NA) but instead with its own function: is.na():

dat_gender_tidier_counts %>%
  # arrange by decreasing N
  arrange(desc(n)) %>%
  # negative filter rows where gender is not NA
  filter(!is.na(gender)) %>%
  kable() %>%
  kable_classic(full_width = FALSE)

gender	n
female	314
male	126
nonbinary	16

This negative filter is so common that it has its own helper function: drop_na(). Note that there is also a base-R drop.na(), which you should avoid for technical reasons.

You can pass one, more than one, or no column names to drop_na(). If no column names are specified, it requires that all columns are not NA. For example:

# retain all rows
dat_gender_tidier_counts %>%
  kable() %>%
  kable_classic(full_width = FALSE)

gender	n
female	314
male	126
nonbinary	16
NA	44

# drop NA rows in 'gender'
dat_gender_tidier_counts %>%
  # negative filter rows where !is.na(gender) 
  drop_na(gender) %>%
  kable() %>%
  kable_classic(full_width = FALSE)

gender	n
female	314
male	126
nonbinary	16

# drop NA rows in 'gender' and 'n'
dat_gender_tidier_counts %>%
  # negative filter rows where !is.na(gender) & !is.na(n)
  drop_na(gender, n) %>%
  kable() %>%
  kable_classic(full_width = FALSE)

gender	n
female	314
male	126
nonbinary	16

# drop NA rows in all columns in data frame
dat_gender_tidier_counts %>%
  # negative filter rows where all columns are not NA
  drop_na() %>%      # <- note no columns supplied to drop_na()
  kable() %>%
  kable_classic(full_width = FALSE)

gender	n
female	314
male	126
nonbinary	16

7.3 Advanced mutates

7.3.1 Separating columns with `seperate()`

TODO

7.3.2 `lead()` and `lag()`

TODO

7.3.3 `fill()`

TODO

7.4 Modifying multiple columns at once with `across()`

7.4.1 `mutate(across())`

TODO

library(dplyr)
library(knitr)
library(kableExtra)
library(janitor)

dat_regression_betas_rounded <- dat_regression_betas %>%
  mutate(across(
    .cols = c(beta_estimate, beta_ci_lower, beta_ci_upper),
    .fns = ~ round_half_up(.x, digits = 2)
  ))

dat_regression_betas_rounded %>%
  kable() %>%
  kable_classic(full_width = FALSE)

beta_estimate	beta_ci_lower	beta_ci_upper	p
0.37	0.17	0.57	0.0009180
0.30	0.10	0.50	0.0000014
0.12	-0.08	0.32	0.0082030
0.29	0.09	0.49	0.0014797
0.18	-0.02	0.38	0.0043528

dat_regression_betas_rounded <- dat_regression_betas %>%
  mutate(across(
    .cols = everything(),
    .fns = ~ round_half_up(.x, digits = 2)
  ))

dat_regression_betas_rounded %>%
  kable() %>%
  kable_classic(full_width = FALSE)

beta_estimate	beta_ci_lower	beta_ci_upper	p
0.37	0.17	0.57	0.00
0.30	0.10	0.50	0.00
0.12	-0.08	0.32	0.01
0.29	0.09	0.49	0.00
0.18	-0.02	0.38	0.00

dat_regression_betas_rounded <- dat_regression_betas %>%
  mutate(across(
    .cols = starts_with("beta_"),
    .fns = ~ round_half_up(.x, digits = 2)
  ))

dat_regression_betas_rounded %>%
  kable() %>%
  kable_classic(full_width = FALSE)

beta_estimate	beta_ci_lower	beta_ci_upper	p
0.37	0.17	0.57	0.0009180
0.30	0.10	0.50	0.0000014
0.12	-0.08	0.32	0.0082030
0.29	0.09	0.49	0.0014797
0.18	-0.02	0.38	0.0043528

7.4.2 `rename(across())`

TODO use {truffle}

7.5 Exercises

TODO

# Data transformation II ## `arrange()` and `desc()` Arrange (`arrange()`) and descending (`desc()`) allow you to change the order of the rows without changing their contents. `desc()` is almost always used inside `arrange()`, i.e., `arrange(desc(variable))`. ```{r} #| include: false # generate some messy demographics data with a larger N # devtools::install_github("ianhussey/truffle") library(truffle) set.seed(42) dat_demographics_messy <- data.frame(id = 1:500) %>% truffle::truffle_demographics() %>% truffle::dirt_demographics() ``` ```{r} library(dplyr) library(readr) library(knitr) library(kableExtra) dat_gender_tidier_counts <- dat_demographics_messy %>% # convert to lower case and remove non-letters mutate(gender = str_to_lower(gender), gender = str_remove_all(gender, "[^A-Za-z]")) %>% # tidy up cases mutate(gender = if_else(condition = gender %in% c("female", "male", "nonbinary"), # the logical test applied: is 'gender' one of the following true = gender, # what to do if the test is passed: keep the original gender response false = NA_character_)) %>% # what to do if the test is failed: set it to NA # count frequencies of each unique value in the gender column count(gender) # arrange by increasing N dat_gender_tidier_counts %>% arrange(n) %>% kable() %>% kable_classic(full_width = FALSE) # arrange by decreasing N dat_gender_tidier_counts %>% arrange(desc(n)) %>% kable() %>% kable_classic(full_width = FALSE) # arrange by gender in alphabetical order dat_gender_tidier_counts %>% arrange(gender) %>% kable() %>% kable_classic(full_width = FALSE) # arrange by gender in reverse alphabetical order dat_gender_tidier_counts %>% arrange(desc(gender)) %>% kable() %>% kable_classic(full_width = FALSE) ``` Note that you can also arrange by multiple columns, e.g., `dat %in% arrange(timepoint, condition, gender)` would arrange timepoint in ascending order (1, 2), then for ties it arranges condition alphabetically ("control", "intervention"), and for ties it would arrange gender alphabetically (e.g., "female", "male", "non-binary"). ## `filter()` derivatives Sometimes we want to `filter()` for rows not based on their content but their location, eg at the top or the bottom of the data frame. These `filter()` derivatives are therefore related to `arrange()` in that they often relate to row number rather than row contents. ### `slice_()` functions - `slice_head(n = 5)` is equivalent to `filter(row_number() <= 5)` - `slice_tail(n = 5)` is equivalent to `filter(row_number() > n() - 5)` - `slice_min()` is equivalent to `filter(min_rank(x) <= 5)` - `slice_max(n = 5)` is equivalent to `filter(min_rank(desc(x)) <= 5)` Sometimes you might want to filter rows randomly rather than at the top or bottom of the data frame. You can do this with `slice_sample()`: - `slice_sample(n = 5)` : return 5 random rows - `slice_sample(prop = .05)` : return a random 5% of the rows ### `drop_na()` A common negative-`filter()` is to remove rows whose values are `NA`. Because `NA` is a special value, this is not done with `filter(variable != NA)` but instead with its own function: `is.na()`: ```{r} dat_gender_tidier_counts %>% # arrange by decreasing N arrange(desc(n)) %>% # negative filter rows where gender is not NA filter(!is.na(gender)) %>% kable() %>% kable_classic(full_width = FALSE) ``` This negative filter is so common that it has its own helper function: `drop_na()`. Note that there is also a base-R `drop.na()`, which you should avoid for technical reasons. You can pass one, more than one, or no column names to `drop_na()`. If no column names are specified, it requires that all columns are not `NA`. For example: ```{r} # retain all rows dat_gender_tidier_counts %>% kable() %>% kable_classic(full_width = FALSE) # drop NA rows in 'gender' dat_gender_tidier_counts %>% # negative filter rows where !is.na(gender) drop_na(gender) %>% kable() %>% kable_classic(full_width = FALSE) # drop NA rows in 'gender' and 'n' dat_gender_tidier_counts %>% # negative filter rows where !is.na(gender) & !is.na(n) drop_na(gender, n) %>% kable() %>% kable_classic(full_width = FALSE) # drop NA rows in all columns in data frame dat_gender_tidier_counts %>% # negative filter rows where all columns are not NA drop_na() %>% # <- note no columns supplied to drop_na() kable() %>% kable_classic(full_width = FALSE) ``` ## Advanced mutates ### Separating columns with `seperate()` TODO ### `lead()` and `lag()` TODO ### `fill()` TODO ## Modifying multiple columns at once with `across()` ### `mutate(across())` TODO ```{r} #| eval: true #| include: false # make up some values to be rounded library(dplyr) set.seed(44) dat_regression_betas <- data.frame(beta_estimate = rnorm(n = 5, mean = .3, sd = .1)) %>% mutate(beta_ci_lower = beta_estimate - 0.2, beta_ci_upper = beta_estimate + 0.2) %>% mutate(p = runif(n = 5, min = 0.000000001, max = 0.01)) ``` ```{r} library(dplyr) library(knitr) library(kableExtra) library(janitor) dat_regression_betas_rounded <- dat_regression_betas %>% mutate(across( .cols = c(beta_estimate, beta_ci_lower, beta_ci_upper), .fns = ~ round_half_up(.x, digits = 2) )) dat_regression_betas_rounded %>% kable() %>% kable_classic(full_width = FALSE) ``` ```{r} dat_regression_betas_rounded <- dat_regression_betas %>% mutate(across( .cols = everything(), .fns = ~ round_half_up(.x, digits = 2) )) dat_regression_betas_rounded %>% kable() %>% kable_classic(full_width = FALSE) ``` ```{r} dat_regression_betas_rounded <- dat_regression_betas %>% mutate(across( .cols = starts_with("beta_"), .fns = ~ round_half_up(.x, digits = 2) )) dat_regression_betas_rounded %>% kable() %>% kable_classic(full_width = FALSE) ``` ### `rename(across())` TODO use {truffle} ## Exercises TODO

7.1 arrange() and desc()

7.2 filter() derivatives

7.2.1 slice_() functions

7.2.2 drop_na()

7.3 Advanced mutates

7.3.1 Separating columns with seperate()

7.3.2 lead() and lag()

7.3.3 fill()

7.4 Modifying multiple columns at once with across()

7.4.1 mutate(across())

7.4.2 rename(across())

7.5 Exercises

7.1 `arrange()` and `desc()`

7.2 `filter()` derivatives

7.2.1 `slice_()` functions

7.2.2 `drop_na()`

7.3.1 Separating columns with `seperate()`

7.3.2 `lead()` and `lag()`

7.3.3 `fill()`

7.4 Modifying multiple columns at once with `across()`

7.4.1 `mutate(across())`

7.4.2 `rename(across())`