7  Data transformation II

7.1 arrange() and desc()

Arrange (arrange()) and descending (desc()) allow you to change the order of the rows without changing their contents. desc() is almost always used inside arrange(), i.e., arrange(desc(variable)).

library(dplyr)
library(readr)
library(knitr)
library(kableExtra)

dat_gender_tidier_counts <- dat_demographics_messy %>%
  # convert to lower case and remove non-letters
  mutate(gender = str_to_lower(gender),
         gender = str_remove_all(gender, "[^A-Za-z]")) %>%
  # tidy up cases
  mutate(gender = if_else(condition = gender %in% c("female", "male", "nonbinary"), # the logical test applied: is 'gender' one of the following
                          true = gender, # what to do if the test is passed: keep the original gender response
                          false = NA_character_))  %>% # what to do if the test is failed: set it to NA
  # count frequencies of each unique value in the gender column
  count(gender) 

# arrange by increasing N
dat_gender_tidier_counts %>%
  arrange(n) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
gender n
nonbinary 16
NA 44
male 126
female 314
# arrange by decreasing N
dat_gender_tidier_counts %>%
  arrange(desc(n)) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
gender n
female 314
male 126
NA 44
nonbinary 16
# arrange by gender in alphabetical order  
dat_gender_tidier_counts %>%
  arrange(gender) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
gender n
female 314
male 126
nonbinary 16
NA 44
# arrange by gender in reverse alphabetical order  
dat_gender_tidier_counts %>%
  arrange(desc(gender)) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
gender n
nonbinary 16
male 126
female 314
NA 44

Note that you can also arrange by multiple columns, e.g., dat %in% arrange(timepoint, condition, gender) would arrange timepoint in ascending order (1, 2), then for ties it arranges condition alphabetically (“control”, “intervention”), and for ties it would arrange gender alphabetically (e.g., “female”, “male”, “non-binary”).

7.2 filter() derivatives

Sometimes we want to filter() for rows not based on their content but their location, eg at the top or the bottom of the data frame. These filter() derivatives are therefore related to arrange() in that they often relate to row number rather than row contents.

7.2.1 slice_() functions

  • slice_head(n = 5) is equivalent to filter(row_number() <= 5)
  • slice_tail(n = 5) is equivalent to filter(row_number() > n() - 5)
  • slice_min() is equivalent to filter(min_rank(x) <= 5)
  • slice_max(n = 5) is equivalent to filter(min_rank(desc(x)) <= 5)

Sometimes you might want to filter rows randomly rather than at the top or bottom of the data frame. You can do this with slice_sample():

  • slice_sample(n = 5) : return 5 random rows
  • slice_sample(prop = .05) : return a random 5% of the rows

7.2.2 drop_na()

A common negative-filter() is to remove rows whose values are NA. Because NA is a special value, this is not done with filter(variable != NA) but instead with its own function: is.na():

dat_gender_tidier_counts %>%
  # arrange by decreasing N
  arrange(desc(n)) %>%
  # negative filter rows where gender is not NA
  filter(!is.na(gender)) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
gender n
female 314
male 126
nonbinary 16

This negative filter is so common that it has its own helper function: drop_na(). Note that there is also a base-R drop.na(), which you should avoid for technical reasons.

You can pass one, more than one, or no column names to drop_na(). If no column names are specified, it requires that all columns are not NA. For example:

# retain all rows
dat_gender_tidier_counts %>%
  kable() %>%
  kable_classic(full_width = FALSE)
gender n
female 314
male 126
nonbinary 16
NA 44
# drop NA rows in 'gender'
dat_gender_tidier_counts %>%
  # negative filter rows where !is.na(gender) 
  drop_na(gender) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
gender n
female 314
male 126
nonbinary 16
# drop NA rows in 'gender' and 'n'
dat_gender_tidier_counts %>%
  # negative filter rows where !is.na(gender) & !is.na(n)
  drop_na(gender, n) %>%
  kable() %>%
  kable_classic(full_width = FALSE)
gender n
female 314
male 126
nonbinary 16
# drop NA rows in all columns in data frame
dat_gender_tidier_counts %>%
  # negative filter rows where all columns are not NA
  drop_na() %>%      # <- note no columns supplied to drop_na()
  kable() %>%
  kable_classic(full_width = FALSE)
gender n
female 314
male 126
nonbinary 16

7.3 Advanced mutates

7.3.1 Separating columns with seperate()

TODO

7.3.2 lead() and lag()

TODO

7.3.3 fill()

TODO

7.4 Modifying multiple columns at once with across()

7.4.1 mutate(across())

TODO

library(dplyr)
library(knitr)
library(kableExtra)
library(janitor)

dat_regression_betas_rounded <- dat_regression_betas %>%
  mutate(across(
    .cols = c(beta_estimate, beta_ci_lower, beta_ci_upper),
    .fns = ~ round_half_up(.x, digits = 2)
  ))

dat_regression_betas_rounded %>%
  kable() %>%
  kable_classic(full_width = FALSE)
beta_estimate beta_ci_lower beta_ci_upper p
0.37 0.17 0.57 0.0009180
0.30 0.10 0.50 0.0000014
0.12 -0.08 0.32 0.0082030
0.29 0.09 0.49 0.0014797
0.18 -0.02 0.38 0.0043528
dat_regression_betas_rounded <- dat_regression_betas %>%
  mutate(across(
    .cols = everything(),
    .fns = ~ round_half_up(.x, digits = 2)
  ))

dat_regression_betas_rounded %>%
  kable() %>%
  kable_classic(full_width = FALSE)
beta_estimate beta_ci_lower beta_ci_upper p
0.37 0.17 0.57 0.00
0.30 0.10 0.50 0.00
0.12 -0.08 0.32 0.01
0.29 0.09 0.49 0.00
0.18 -0.02 0.38 0.00
dat_regression_betas_rounded <- dat_regression_betas %>%
  mutate(across(
    .cols = starts_with("beta_"),
    .fns = ~ round_half_up(.x, digits = 2)
  ))

dat_regression_betas_rounded %>%
  kable() %>%
  kable_classic(full_width = FALSE)
beta_estimate beta_ci_lower beta_ci_upper p
0.37 0.17 0.57 0.0009180
0.30 0.10 0.50 0.0000014
0.12 -0.08 0.32 0.0082030
0.29 0.09 0.49 0.0014797
0.18 -0.02 0.38 0.0043528

7.4.2 rename(across())

TODO use {truffle}

7.5 Exercises

TODO