class: clear, center, middle background-image: url(images/transformer.gif) background-size: cover .font200.white.bold[Scoped Variable Transformation] --- # Session Prereqs .pull-left[ .font120.bold.center[Packages] ```r library(nycflights13) # data library(dplyr) # data transformation ``` ] .pull-right[ .font120.bold.center[Data] ```r flights ## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 517 515 2 830 ## 2 2013 1 1 533 529 4 850 ## 3 2013 1 1 542 540 2 923 ## 4 2013 1 1 544 545 -1 1004 ## 5 2013 1 1 554 600 -6 812 ## 6 2013 1 1 554 558 -4 740 ## 7 2013 1 1 555 600 -5 913 ## 8 2013 1 1 557 600 -3 709 ## 9 2013 1 1 557 600 -3 838 ## 10 2013 1 1 558 600 -2 753 ## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm> ``` ] --- # Data transformation task <br><br> <img src="images/transform-task.png" style="display: block; margin: auto;" /> --- # dplyr review In the Intro to R course, you learned six key .bold[dplyr] functions that allow you to solve the vast majority of your data manipulation challenges: .pull-left[ * .bold[`filter`]: pick observations based on values * .bold[`select`]: pick variables * .bold[`summarize`]: compute statistical summaries * .bold[`group_by`]: perform operations at different levels of your data * .bold[`arrange`]: reorder data * .bold[`mutate`]: create new variables ] .pull-right[ <br> <img src="images/dplyr.png" width="50%" height="50%" style="display: block; margin: auto;" /> <br> ] --- # Basics .font130[All functions work similarly:] .font120[ * The first argument is a data frame * Subsequent arguments describe what to do * Output is a new data frame ] <br> <img src="images/function-in-out.png" width="1343" style="display: block; margin: auto;" /> --- # dplyr .red[review] .pull-left[ * .bold[`filter`]: pick observations based on values * .opacity20[`select`: pick variables] * .opacity20[`summarize`: compute statistical summaries] * .opacity20[`group_by`: perform operations at different levels of your data] * .opacity20[`arrange`: reorder data] * .opacity20[`mutate`: create new variables] ] .pull-right[ ```r # filter flights with arrival delay > 60 minutes filter(flights, arr_delay > 60) ## # A tibble: 27,789 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 811 630 101 1047 ## 2 2013 1 1 848 1835 853 1001 ## 3 2013 1 1 957 733 144 1056 ## 4 2013 1 1 1114 900 134 1447 ## 5 2013 1 1 1120 944 96 1331 ## 6 2013 1 1 1255 1200 55 1451 ## 7 2013 1 1 1301 1150 71 1518 ## 8 2013 1 1 1337 1220 77 1649 ## 9 2013 1 1 1342 1320 22 1617 ## 10 2013 1 1 1400 1250 70 1645 ## # … with 27,779 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm> ``` ] --- # dplyr .red[review] .pull-left[ * .opacity20[`filter`: pick observations based on values] * .bold[`select`]: pick variables * .opacity20[`summarize`: compute statistical summaries] * .opacity20[`group_by`: perform operations at different levels of your data] * .opacity20[`arrange`: reorder data] * .opacity20[`mutate`: create new variables] ] .pull-right[ ```r # select carrier, year, month, day, and all variables containing "delay" select(flights, carrier, year, month, day, contains("delay")) ## # A tibble: 336,776 x 6 ## carrier year month day dep_delay arr_delay ## <chr> <int> <int> <int> <dbl> <dbl> ## 1 UA 2013 1 1 2 11 ## 2 UA 2013 1 1 4 20 ## 3 AA 2013 1 1 2 33 ## 4 B6 2013 1 1 -1 -18 ## 5 DL 2013 1 1 -6 -25 ## 6 UA 2013 1 1 -4 12 ## 7 B6 2013 1 1 -5 19 ## 8 EV 2013 1 1 -3 -14 ## 9 B6 2013 1 1 -3 -8 ## 10 AA 2013 1 1 -2 8 ## # … with 336,766 more rows ``` ] --- # dplyr .red[review] .pull-left[ * .opacity20[`filter`: pick observations based on values] * .opacity20[`select`: pick variables] * .bold[`summarize`]: compute statistical summaries * .opacity20[`group_by`: perform operations at different levels of your data] * .opacity20[`arrange`: reorder data] * .opacity20[`mutate`: create new variables] ] .pull-right[ ```r # compute median departure delay across all fights summarize(flights, median_delay = median(dep_delay, na.rm = TRUE)) ## # A tibble: 1 x 1 ## median_delay ## <dbl> ## 1 -2 ``` ] --- # dplyr .red[review] .pull-left[ * .opacity20[`filter`: pick observations based on values] * .opacity20[`select`: pick variables] * .bold[`summarize`]: compute statistical summaries * .bold[`group_by`]: perform operations at different levels of your data * .opacity20[`arrange`: reorder data] * .opacity20[`mutate`: create new variables] ] .pull-right[ ```r # compute median departure delay across all fights summarize(flights, median_delay = median(dep_delay, na.rm = TRUE)) ## # A tibble: 1 x 1 ## median_delay ## <dbl> ## 1 -2 # compute median departure delay for each carrier flights %>% group_by(carrier) %>% summarize(median_delay = median(dep_delay, na.rm = TRUE)) ## # A tibble: 16 x 2 ## carrier median_delay ## <chr> <dbl> ## 1 9E -2 ## 2 AA -3 ## 3 AS -3 ## 4 B6 -1 ## 5 DL -2 ## 6 EV -1 ## 7 F9 0.5 ## 8 FL 1 ## 9 HA -4 ## 10 MQ -3 ## 11 OO -6 ## 12 UA 0 ## 13 US -4 ## 14 VX 0 ## 15 WN 1 ## 16 YV -2 ``` ] --- # dplyr .red[review] .pull-left-40[ .center[ <img src="images/wait-one-sec.gif" style="display: block; margin: auto;" /> ] ] .pull-right-60[ ```r #--------------TRADITIONAL APPROACH ------------------------------------------------------- # Step 1: group data grouped_flights <- group_by(flights, carrier) # Step 2: compute median departure delay for each carrier summarize(grouped_flights, median_delay = median(dep_delay, na.rm = TRUE)) #--------------NESTED APPROACH ------------------------------------------------------------ summarize(group_by(flights, carrier), median_delay = median(dep_delay, na.rm = TRUE)) #--------------PIPE OPERATOR APPROACH ----------------------------------------------------- flights %>% group_by(carrier) %>% summarize(median_delay = median(dep_delay, na.rm = TRUE)) ``` ] <br><br><br><br><br><br><br><br><br><br><br><br><br> .center.bold[The .blue[`%>%`] operator takes the output of the left-hand-side function and passes it to the right-hand-side function as the first argument.] --- # dplyr .red[review] .pull-left[ * .bold[`filter`]: pick observations based on values * .opacity20[`select`: pick variables] * .bold[`summarize`]: compute statistical summaries * .bold[`group_by`]: perform operations at different levels of your data * .bold[`arrange`]: reorder data * .opacity20[`mutate`: create new variables] ] .pull-right[ ```r # rank order carriers with greatest delays on Jan 31 flights %>% filter(month == 1, day == 31) %>% group_by(carrier) %>% summarize(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% * arrange(desc(avg_delay)) ## # A tibble: 15 x 2 ## carrier avg_delay ## <chr> <dbl> ## 1 F9 88 ## 2 WN 68.8 ## 3 EV 68.2 ## 4 YV 47 ## 5 MQ 41.6 ## 6 US 38.2 ## 7 B6 36.5 ## 8 AS 34.5 ## 9 9E 34.3 ## 10 FL 26.2 ## 11 UA 18.8 ## 12 AA 16.2 ## 13 DL 12.0 ## 14 VX -1 ## 15 HA -55 ``` ] --- # dplyr .red[review] .pull-left[ * .bold[`filter`]: pick observations based on values * .opacity20[`select`: pick variables] * .bold[`summarize`]: compute statistical summaries * .bold[`group_by`]: perform operations at different levels of your data * .bold[`arrange`]: reorder data * .bold[`mutate`]: create new variables ] .pull-right[ ```r # rank order carriers with greatest difference in delays flights %>% filter(month == 1, day == 31) %>% group_by(carrier) %>% summarize( avg_dep_delay = mean(dep_delay, na.rm = TRUE), avg_arr_delay = mean(arr_delay, na.rm = TRUE) ) %>% * mutate(avg_diff = avg_arr_delay - avg_dep_delay) %>% arrange(desc(avg_diff )) ## # A tibble: 15 x 4 ## carrier avg_dep_delay avg_arr_delay avg_diff ## <chr> <dbl> <dbl> <dbl> ## 1 B6 28.2 36.5 8.35 ## 2 YV 39 47 8 ## 3 EV 60.2 68.2 7.92 ## 4 MQ 34.2 41.6 7.40 ## 5 US 32.8 38.2 5.35 ## 6 FL 21.3 26.2 4.90 ## 7 UA 15.7 18.8 3.09 ## 8 DL 10.8 12.0 1.25 ## 9 AA 15.1 16.2 1.18 ## 10 9E 34.4 34.3 -0.0952 ## 11 WN 71.4 68.8 -2.59 ## 12 F9 94.5 88 -6.5 ## 13 VX 7.4 -1 -8.4 ## 14 AS 62 34.5 -27.5 ## 15 HA -2 -55 -53 ``` ] --- class: yourturn # Your Turn! .pull-left[ .font90[ ### Challenge 1. Which top 5 destination airports (`dest`) have the largest median arrival delays (`arr_delay`)? ```r # hint flights %>% group_by(____) %>% summarize(____) %>% arrange(____) ``` 2. Which carriers (`carrier`) have the fastest average air speed where `\(air\_speed = \frac{distance}{air\_time}\)`? ```r # hint flights %>% mutate(____) %>% group_by(____) %>% summarize(____) %>% arrange(____) ``` ] ] -- .pull-right[ ### Solution ```r # 1 flights %>% group_by(dest) %>% summarize(med_arr_delay = median(arr_delay, na.rm = TRUE)) %>% arrange(desc(med_arr_delay)) ## # A tibble: 105 x 2 ## dest med_arr_delay ## <chr> <dbl> ## 1 CAE 28 ## 2 OKC 16 ## 3 JAC 15 ## 4 TUL 14 ## 5 EYW 7 ## 6 SMF 4 ## 7 CAK 3 ## 8 HDN 2 ## 9 TYS 2 ## 10 ANC 1.5 ## # … with 95 more rows ``` ] --- class: yourturn # Your Turn! .pull-left[ .font90[ ### Challenge 1. Which top 5 destination airports (`dest`) have the largest median arrival delays (`arr_delay`)? ```r # hint flights %>% group_by(____) %>% summarize(____) %>% arrange(____) ``` 2. Which carriers (`carrier`) have the fastest average air speed where `\(air\_speed = \frac{distance}{air\_time}\)`? ```r # hint flights %>% mutate(____) %>% group_by(____) %>% summarize(____) %>% arrange(____) ``` ] ] .pull-right[ ### Solution ```r # 2 flights %>% mutate(air_speed = distance / air_time) %>% group_by(carrier) %>% summarize(avg_air_speed = mean(air_speed, na.rm = TRUE)) %>% arrange(desc(avg_air_speed)) ## # A tibble: 16 x 2 ## carrier avg_air_speed ## <chr> <dbl> ## 1 HA 8.01 ## 2 VX 7.44 ## 3 AS 7.39 ## 4 F9 7.09 ## 5 UA 7.01 ## 6 DL 6.97 ## 7 AA 6.96 ## 8 WN 6.68 ## 9 B6 6.67 ## 10 FL 6.57 ## 11 MQ 6.14 ## 12 OO 6.11 ## 13 EV 6.05 ## 14 9E 5.76 ## 15 US 5.70 ## 16 YV 5.53 ``` ] --- # .red[Scoped] dplyr verbs .font150[Performing manipulations on
multiple
columns can be tedious] -- Say we wanted to compute several departure and arrival time stats for each month: .pull-left[ ```r # compute several departure and arrival time stats for each month flights %>% group_by(month) %>% summarize( dep_actual = mean(dep_time, na.rm = TRUE), dep_sched = mean(sched_dep_time, na.rm = TRUE), dep_delay = mean(dep_delay, na.rm = TRUE), arr_actual = mean(arr_time, na.rm = TRUE), arr_sched = mean(sched_arr_time, na.rm = TRUE), arr_delay = mean(arr_delay, na.rm = TRUE), ) ## # A tibble: 12 x 7 ## month dep_actual dep_sched dep_delay arr_actual arr_sched arr_delay ## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 1 1347. 1341. 10.0 1523. 1548. 6.13 ## 2 2 1348. 1342. 10.8 1522. 1547. 5.61 ## 3 3 1359. 1354. 13.2 1510. 1546. 5.81 ## 4 4 1353. 1351. 13.9 1501. 1537. 11.2 ## 5 5 1351. 1345. 13.0 1503. 1533. 3.52 ## 6 6 1351. 1346. 20.8 1468. 1527. 16.5 ## 7 7 1353. 1347. 21.7 1456. 1521. 16.7 ## 8 8 1350. 1345. 12.6 1495. 1519. 6.04 ## 9 9 1334. 1335. 6.72 1504. 1534. -4.02 ## 10 10 1340. 1336. 6.24 1520. 1539. -0.167 ## 11 11 1344. 1342. 5.44 1523. 1545. 0.461 ## 12 12 1357. 1345. 16.6 1505. 1543. 14.9 ``` ] -- .pull-right[ <img src="images/why-so-difficult.gif" style="display: block; margin: auto;" /> ] --- # .red[Scoped] dplyr verbs .font150[Many __dplyr__ functions have cousins with the following suffixes:] <br> .font140[ - `*_all()`: execute function(s) on all variables - `*_if()`: execute function(s) on variables that meet a certain condition - `*_at()`: execute function(s) for pre-specified variables <br> .center[.content-box-gray[Type `filter_ + tab` to see example options.]] ] --- # Transform .red[all] variables with .red[`*_all()`] .font150.center[ Perform some operation on all variables `df %>%` .red[`xxxx_all`]`(`.blue[`.funs`], .grey[`...`]`)` ] .center[ .content-box-red-dark[.white[scoped dplyr function]] .content-box-blue-dark[.white[function call to execute]] .content-box-grey-dark[.white[additional args for function call]] ] --- # Transform .red[all] variables with .red[`*_all()`] Going back to our prior problem: _say we wanted to compute summary stats across .red[all variables] for each month_: .font140.center[`df %>% summarize_all(.funs, ...)`] ```r flights %>% group_by(month) %>% * summarize_all(mean, na.rm = TRUE) ## # A tibble: 12 x 19 ## month year day dep_time sched_dep_time dep_delay arr_time ## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 1 2013 16.0 1347. 1341. 10.0 1523. ## 2 2 2013 14.7 1348. 1342. 10.8 1522. ## 3 3 2013 16.0 1359. 1354. 13.2 1510. ## 4 4 2013 15.4 1353. 1351. 13.9 1501. ## 5 5 2013 16.0 1351. 1345. 13.0 1503. ## 6 6 2013 15.6 1351. 1346. 20.8 1468. ## 7 7 2013 16.2 1353. 1347. 21.7 1456. ## 8 8 2013 15.8 1350. 1345. 12.6 1495. ## 9 9 2013 15.6 1334. 1335. 6.72 1504. ## 10 10 2013 16.0 1340. 1336. 6.24 1520. ## 11 11 2013 15.3 1344. 1342. 5.44 1523. ## 12 12 2013 15.7 1357. 1345. 16.6 1505. ## # … with 12 more variables: sched_arr_time <dbl>, arr_delay <dbl>, ## # carrier <dbl>, flight <dbl>, tailnum <dbl>, origin <dbl>, dest <dbl>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, ## # time_hour <dttm> ``` --- # Transform .red[all] variables with .red[`*_all()`] ...or _say we wanted to standardize .red[all variables]_: .font140.center[`df %>% mutate_all(.funs, ...)`] -- ```r flights %>% mutate_all(scale) ## Error in mutate_impl(.data, dots): Evaluation error: 'x' must be numeric. flights %>% select(-c(carrier, tailnum, origin, dest, time_hour)) %>% * mutate_all(scale) ## # A tibble: 336,776 x 14 ## year[,1] month[,1] day[,1] dep_time[,1] sched_dep_time[… dep_delay[,1] ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 NaN -1.63 -1.68 -1.70 -1.77 -0.265 ## 2 NaN -1.63 -1.68 -1.67 -1.74 -0.215 ## 3 NaN -1.63 -1.68 -1.65 -1.72 -0.265 ## 4 NaN -1.63 -1.68 -1.65 -1.71 -0.339 ## 5 NaN -1.63 -1.68 -1.63 -1.59 -0.464 ## 6 NaN -1.63 -1.68 -1.63 -1.68 -0.414 ## 7 NaN -1.63 -1.68 -1.63 -1.59 -0.439 ## 8 NaN -1.63 -1.68 -1.62 -1.59 -0.389 ## 9 NaN -1.63 -1.68 -1.62 -1.59 -0.389 ## 10 NaN -1.63 -1.68 -1.62 -1.59 -0.364 ## # … with 336,766 more rows, and 8 more variables: arr_time[,1] <dbl>, ## # sched_arr_time[,1] <dbl>, arr_delay[,1] <dbl>, flight[,1] <dbl>, ## # air_time[,1] <dbl>, distance[,1] <dbl>, hour[,1] <dbl>, ## # minute[,1] <dbl> ``` --- # Transform .red[all] variables with .red[`*_all()`] What if we want to write our own <u>custom function</u> on the fly? .pull-left[ ```r flights %>% select(-c(carrier, tailnum, origin, dest, time_hour)) %>% * mutate_all(function(x) (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)) ## # A tibble: 336,776 x 14 ## year month day dep_time sched_dep_time dep_delay arr_time ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 NaN -1.63 -1.68 -1.70 -1.77 -0.265 -1.26 ## 2 NaN -1.63 -1.68 -1.67 -1.74 -0.215 -1.22 ## 3 NaN -1.63 -1.68 -1.65 -1.72 -0.265 -1.09 ## 4 NaN -1.63 -1.68 -1.65 -1.71 -0.339 -0.934 ## 5 NaN -1.63 -1.68 -1.63 -1.59 -0.464 -1.29 ## 6 NaN -1.63 -1.68 -1.63 -1.68 -0.414 -1.43 ## 7 NaN -1.63 -1.68 -1.63 -1.59 -0.439 -1.10 ## 8 NaN -1.63 -1.68 -1.62 -1.59 -0.389 -1.49 ## 9 NaN -1.63 -1.68 -1.62 -1.59 -0.389 -1.25 ## 10 NaN -1.63 -1.68 -1.62 -1.59 -0.364 -1.40 ## # … with 336,766 more rows, and 7 more variables: sched_arr_time <dbl>, ## # arr_delay <dbl>, flight <dbl>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl> ``` ] .pull-right[ ```r flights %>% select(-c(carrier, tailnum, origin, dest, time_hour)) %>% * mutate_all(.funs = funs((. - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE))) ## # A tibble: 336,776 x 14 ## year month day dep_time sched_dep_time dep_delay arr_time ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 NaN -1.63 -1.68 -1.70 -1.77 -0.265 -1.26 ## 2 NaN -1.63 -1.68 -1.67 -1.74 -0.215 -1.22 ## 3 NaN -1.63 -1.68 -1.65 -1.72 -0.265 -1.09 ## 4 NaN -1.63 -1.68 -1.65 -1.71 -0.339 -0.934 ## 5 NaN -1.63 -1.68 -1.63 -1.59 -0.464 -1.29 ## 6 NaN -1.63 -1.68 -1.63 -1.68 -0.414 -1.43 ## 7 NaN -1.63 -1.68 -1.63 -1.59 -0.439 -1.10 ## 8 NaN -1.63 -1.68 -1.62 -1.59 -0.389 -1.49 ## 9 NaN -1.63 -1.68 -1.62 -1.59 -0.389 -1.25 ## 10 NaN -1.63 -1.68 -1.62 -1.59 -0.364 -1.40 ## # … with 336,766 more rows, and 7 more variables: sched_arr_time <dbl>, ## # arr_delay <dbl>, flight <dbl>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl> ``` ] --- class: yourturn # Your Turn! .pull-left[ ### Challenge Using the `flights` data: 1. group by month and day 2. select all variables containing "time" and "delay" 3. compute the mean of all "time" and "delay" variables ```r # hint flights %>% group_by(____) %>% select(____) %>% summarize_all(____) ``` ] -- .pull-right[ ### Solution ```r flights %>% group_by(month, day) %>% select(contains("time"), contains("delay")) %>% summarize_all(mean, na.rm = TRUE) ## # A tibble: 365 x 10 ## # Groups: month [?] ## month day dep_time sched_dep_time arr_time sched_arr_time air_time ## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 1 1 1385. 1372. 1562. 1568. 170. ## 2 1 2 1354. 1341. 1533. 1539. 162. ## 3 1 3 1357. 1344. 1536. 1543. 157. ## 4 1 4 1348. 1341. 1519. 1541. 151. ## 5 1 5 1326. 1326. 1509. 1524. 161. ## 6 1 6 1399. 1393. 1573. 1596. 160. ## 7 1 7 1341. 1337. 1516. 1539. 145. ## 8 1 8 1335. 1334. 1534. 1536. 149. ## 9 1 9 1326. 1335. 1523. 1538. 153. ## 10 1 10 1333. 1339. 1523. 1540. 147. ## # … with 355 more rows, and 3 more variables: time_hour <dttm>, ## # dep_delay <dbl>, arr_delay <dbl> ``` ] --- # .font80[But what if we only want to transform .red[some] but not all .red[variables]?] .pull-left[ * In a prior example, we saw that an error occurred when we tried to `mutate_all()` variables. * This is because we are trying to execute a function that does not work on non-numeric variables * What we actually want is to standardize variables ___if they are numeric___ * .red[We can use the `*_if()` suffix to only execute on variables that meet a certain condition] ] .pull-right[ ```r # my own standarizing function standardize <- function(x) { (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE) } # results in an error due to non-numeric variables flights %>% mutate_all(standardize) ## Error in mutate_impl(.data, dots): Evaluation error: non-numeric argument to binary operator. # works when performed on only numeric variables flights %>% * select(-c(carrier, tailnum, origin, dest, time_hour)) %>% * mutate_all(standardize) ## # A tibble: 336,776 x 14 ## year month day dep_time sched_dep_time dep_delay arr_time ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 NaN -1.63 -1.68 -1.70 -1.77 -0.265 -1.26 ## 2 NaN -1.63 -1.68 -1.67 -1.74 -0.215 -1.22 ## 3 NaN -1.63 -1.68 -1.65 -1.72 -0.265 -1.09 ## 4 NaN -1.63 -1.68 -1.65 -1.71 -0.339 -0.934 ## 5 NaN -1.63 -1.68 -1.63 -1.59 -0.464 -1.29 ## 6 NaN -1.63 -1.68 -1.63 -1.68 -0.414 -1.43 ## 7 NaN -1.63 -1.68 -1.63 -1.59 -0.439 -1.10 ## 8 NaN -1.63 -1.68 -1.62 -1.59 -0.389 -1.49 ## 9 NaN -1.63 -1.68 -1.62 -1.59 -0.389 -1.25 ## 10 NaN -1.63 -1.68 -1.62 -1.59 -0.364 -1.40 ## # … with 336,766 more rows, and 7 more variables: sched_arr_time <dbl>, ## # arr_delay <dbl>, flight <dbl>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl> ``` ] --- # Transform .red[some] variables with .red[`*_if()`] .font150.center[ Perform some operation on all variables that meet a condition `df %>%` .red[`xxxx_if`]`(`.green[`.predicate`], .blue[`.funs`], .grey[`...`]`)` ] .center.font90[ .content-box-red-dark[.white[scoped dplyr function]] .content-box-green-dark[.white[condition to meet]] .content-box-blue-dark[.white[function call to execute]] .content-box-grey-dark[.white[additional args for function call]] ] --- # Transform .red[some] variables with .red[`*_if()`] Back to our problem of _wanting to standardize .red[only numeric variables]_: .font150.center[`df %>% mutate_if(.predicate, .funs, ...)`] ```r # standardize only numeric variables but keep ALL variables flights %>% * mutate_if(is.numeric, standardize) ## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 NaN -1.63 -1.68 -1.70 -1.77 -0.265 -1.26 ## 2 NaN -1.63 -1.68 -1.67 -1.74 -0.215 -1.22 ## 3 NaN -1.63 -1.68 -1.65 -1.72 -0.265 -1.09 ## 4 NaN -1.63 -1.68 -1.65 -1.71 -0.339 -0.934 ## 5 NaN -1.63 -1.68 -1.63 -1.59 -0.464 -1.29 ## 6 NaN -1.63 -1.68 -1.63 -1.68 -0.414 -1.43 ## 7 NaN -1.63 -1.68 -1.63 -1.59 -0.439 -1.10 ## 8 NaN -1.63 -1.68 -1.62 -1.59 -0.389 -1.49 ## 9 NaN -1.63 -1.68 -1.62 -1.59 -0.389 -1.25 ## 10 NaN -1.63 -1.68 -1.62 -1.59 -0.364 -1.40 ## # … with 336,766 more rows, and 12 more variables: sched_arr_time <dbl>, ## # arr_delay <dbl>, carrier <chr>, flight <dbl>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm> ``` --- class: yourturn # Your Turn! .pull-left[ ### Challenge Using the `flights` data: 1. convert year, month, and day variables to type character 2. group by month 3. compute the mean of all numeric variables ```r # hint flights %>% mutate(____) %>% group_by(____) %>% summarize_if(____) ``` ] -- .pull-right[ ### Solution ```r flights %>% mutate( year = as.character(year), month = as.character(month), day = as.character(day) ) %>% group_by(month) %>% summarize_if(is.numeric, mean, na.rm = TRUE) ## # A tibble: 12 x 12 ## month dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 1 1347. 1341. 10.0 1523. 1548. ## 2 10 1340. 1336. 6.24 1520. 1539. ## 3 11 1344. 1342. 5.44 1523. 1545. ## 4 12 1357. 1345. 16.6 1505. 1543. ## 5 2 1348. 1342. 10.8 1522. 1547. ## 6 3 1359. 1354. 13.2 1510. 1546. ## 7 4 1353. 1351. 13.9 1501. 1537. ## 8 5 1351. 1345. 13.0 1503. 1533. ## 9 6 1351. 1346. 20.8 1468. 1527. ## 10 7 1353. 1347. 21.7 1456. 1521. ## 11 8 1350. 1345. 12.6 1495. 1519. ## 12 9 1334. 1335. 6.72 1504. 1534. ## # … with 6 more variables: arr_delay <dbl>, flight <dbl>, air_time <dbl>, ## # distance <dbl>, hour <dbl>, minute <dbl> ``` ] --- class: yourturn # Your Turn! .pull-left[ ### Challenge Using the `flights` data: 1. convert year, month, and day variables to type character 2. group by month 3. compute the mean of all numeric variables ```r # hint flights %>% mutate(____) %>% group_by(____) %>% summarize_if(____) ``` ] .pull-right[ ### Solution ```r flights %>% mutate( * year = as.character(year), # can we * month = as.character(month), # simplify * day = as.character(day) # this? ) %>% group_by(month) %>% summarize_if(is.numeric, mean, na.rm = TRUE) ## # A tibble: 12 x 12 ## month dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 1 1347. 1341. 10.0 1523. 1548. ## 2 10 1340. 1336. 6.24 1520. 1539. ## 3 11 1344. 1342. 5.44 1523. 1545. ## 4 12 1357. 1345. 16.6 1505. 1543. ## 5 2 1348. 1342. 10.8 1522. 1547. ## 6 3 1359. 1354. 13.2 1510. 1546. ## 7 4 1353. 1351. 13.9 1501. 1537. ## 8 5 1351. 1345. 13.0 1503. 1533. ## 9 6 1351. 1346. 20.8 1468. 1527. ## 10 7 1353. 1347. 21.7 1456. 1521. ## 11 8 1350. 1345. 12.6 1495. 1519. ## 12 9 1334. 1335. 6.72 1504. 1534. ## # … with 6 more variables: arr_delay <dbl>, flight <dbl>, air_time <dbl>, ## # distance <dbl>, hour <dbl>, minute <dbl> ``` ] --- # Transform .red[specific] variables with .red[`*_at()`] .font150.center[ Perform some operation on pre-specified variables `df %>%` .red[`xxxx_at`]`(`.green[`vars(x, y, z)`], .blue[`.funs`], .grey[`...`]`)` ] .center.font90[ .content-box-red-dark[.white[scoped dplyr function]] .content-box-green-dark[.white[variables to execute on]] .content-box-blue-dark[.white[function call to execute]] .content-box-grey-dark[.white[additional args for function call]] ] --- # Transform .red[specific] variables with .red[`*_at()`] Back to our problem of _wanting to mutate .red[specific variables]_: .font150.center[`df %>% mutate_at(vars(x, y, z), .funs, ...)`] .pull-left[ .center.font120.bold[Unscoped approach] ```r flights %>% mutate( * year = as.character(year), # can we * month = as.character(month), # simplify * day = as.character(day) # this? ) %>% group_by(month) %>% summarize_if(is.numeric, mean, na.rm = TRUE) ## # A tibble: 12 x 12 ## month dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 1 1347. 1341. 10.0 1523. 1548. ## 2 10 1340. 1336. 6.24 1520. 1539. ## 3 11 1344. 1342. 5.44 1523. 1545. ## 4 12 1357. 1345. 16.6 1505. 1543. ## 5 2 1348. 1342. 10.8 1522. 1547. ## 6 3 1359. 1354. 13.2 1510. 1546. ## 7 4 1353. 1351. 13.9 1501. 1537. ## 8 5 1351. 1345. 13.0 1503. 1533. ## 9 6 1351. 1346. 20.8 1468. 1527. ## 10 7 1353. 1347. 21.7 1456. 1521. ## 11 8 1350. 1345. 12.6 1495. 1519. ## 12 9 1334. 1335. 6.72 1504. 1534. ## # … with 6 more variables: arr_delay <dbl>, flight <dbl>, air_time <dbl>, ## # distance <dbl>, hour <dbl>, minute <dbl> ``` ] .pull-right[ .center.font120.bold[Scoped `*_at()` approach] ```r flights %>% * mutate_at(vars(year, month, day), as.character) %>% group_by(month) %>% summarize_if(is.numeric, mean, na.rm = TRUE) ## # A tibble: 12 x 12 ## month dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 1 1347. 1341. 10.0 1523. 1548. ## 2 10 1340. 1336. 6.24 1520. 1539. ## 3 11 1344. 1342. 5.44 1523. 1545. ## 4 12 1357. 1345. 16.6 1505. 1543. ## 5 2 1348. 1342. 10.8 1522. 1547. ## 6 3 1359. 1354. 13.2 1510. 1546. ## 7 4 1353. 1351. 13.9 1501. 1537. ## 8 5 1351. 1345. 13.0 1503. 1533. ## 9 6 1351. 1346. 20.8 1468. 1527. ## 10 7 1353. 1347. 21.7 1456. 1521. ## 11 8 1350. 1345. 12.6 1495. 1519. ## 12 9 1334. 1335. 6.72 1504. 1534. ## # … with 6 more variables: arr_delay <dbl>, flight <dbl>, air_time <dbl>, ## # distance <dbl>, hour <dbl>, minute <dbl> ``` ] --- # Thinking out loud
<img src="https://emojis.slackmojis.com/emojis/images/1542340471/4979/thinking.gif?1542340471" style="height:1em; width:auto; "/>
.font130[ Can you think of an easy way to do the following: 1. for each destination airport, 2. compute the median of all variables that contain "delay", 3. re-order the destinations in descending order based on all delay statistics ] -- ```r flights %>% group_by(dest) %>% summarize_at(vars(contains("delay")), median, na.rm = TRUE) %>% arrange_if(is.numeric, desc) ## # A tibble: 105 x 3 ## dest dep_delay arr_delay ## <chr> <dbl> <dbl> ## 1 CAE 14 28 ## 2 SBN 14 -3.5 ## 3 JAC 13.5 15 ## 4 OKC 10 16 ## 5 TUL 8 14 ## 6 HDN 6.5 2 ## 7 ANC 3 1.5 ## 8 MTJ 3 -10.5 ## 9 SMF 2 4 ## 10 MDW 2 -1 ## # … with 95 more rows ``` --- # Thinking out loud
<img src="https://emojis.slackmojis.com/emojis/images/1542340471/4979/thinking.gif?1542340471" style="height:1em; width:auto; "/>
.font130[ Can you think of an easy way to do the following: 1. for each destination airport, 2. compute the median of all variables that contain "delay", 3. re-order the destinations in descending order based on all delay statistics ] .center.font130.red[Any `select()` helper function can be used in `vars()`!] -- .center.font110.red[and we can combine them just like we do in `select()`] ```r flights %>% group_by(dest) %>% summarize_at(vars(contains("delay"), ends_with("time")), median, na.rm = TRUE) ## # A tibble: 105 x 8 ## dest dep_delay arr_delay dep_time sched_dep_time arr_time ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ABQ 0 -5.5 2005 2001 2246. ## 2 ACK -3 -3 1141 1140 1242 ## 3 ALB 1 -4 1738 1721 1832 ## 4 ANC 3 1.5 1618 1615 1957 ## 5 ATL -2 -1 1317 1308 1535 ## 6 AUS -1 -5 1652. 1655 1829 ## 7 AVL -3 -1 1051 1053 1242 ## 8 BDL -1 -10 1328. 1327 1414. ## 9 BGR -2 -9 1842. 1825 1955 ## 10 BHM 1 -2 1947 1910 2120 ## # … with 95 more rows, and 2 more variables: sched_arr_time <dbl>, ## # air_time <dbl> ``` --- # Computing .red[multiple] functions .pull-left-30[ .font110[ * We can also include multiple functions to be executed * Use the .red.bold[`funs()`] wrapper * Additional arguments must be relevant to ___all___ functions * Applicable to `*_all()`, `*_if()`, and `*at_()` ] ] .pull-right-60[ ```r flights %>% group_by(dest) %>% summarize_at(vars(contains("delay")), funs(mean, sd), na.rm = TRUE) ## # A tibble: 105 x 5 ## dest dep_delay_mean arr_delay_mean dep_delay_sd arr_delay_sd ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 ABQ 13.7 4.38 30.5 42.0 ## 2 ACK 6.46 4.85 26.3 30.0 ## 3 ALB 23.6 14.4 48.6 50.5 ## 4 ANC 12.9 -2.5 25.6 26.4 ## 5 ATL 12.5 11.3 43.8 47.0 ## 6 AUS 13.0 6.02 36.7 43.5 ## 7 AVL 8.19 8.00 30.1 33.6 ## 8 BDL 17.7 7.05 41.0 42.1 ## 9 BGR 19.5 8.03 45.1 46.4 ## 10 BHM 29.7 16.9 56.9 56.2 ## # … with 95 more rows ``` ] --- # `filter_*()` .red[and its helpers] <br> .font120[ * Sometimes we want to filter rows based on multiple variables * Until now, we have been discussing manipulation and summarization of multiple variables, but we also ossasionally need to filter by the same condition on multiple variables. * Common examples of this include: - missing values - infinite values - removing outliers ] --- # Filtering rows that meet certain conditions .pull-left[ * The .red[`all_vars()`] function can be used to filter rows where .red[all variables meet the same logical condition]. <br><br> ```r # This will return rows where all variables containing "delay" are NA flights %>% filter_at(vars(contains("delay")), all_vars(is.na(.))) ## # A tibble: 8,255 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 NA 1630 NA NA ## 2 2013 1 1 NA 1935 NA NA ## 3 2013 1 1 NA 1500 NA NA ## 4 2013 1 1 NA 600 NA NA ## 5 2013 1 2 NA 1540 NA NA ## 6 2013 1 2 NA 1620 NA NA ## 7 2013 1 2 NA 1355 NA NA ## 8 2013 1 2 NA 1420 NA NA ## 9 2013 1 2 NA 1321 NA NA ## 10 2013 1 2 NA 1545 NA NA ## # … with 8,245 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm> ``` ] -- .pull-right[ * The .red[`any_vars()`] function can be used to filter rows where .red[at least one variable meets the logical condition]. ```r # This will return rows where any variable containing "delay" is NA flights %>% filter_at(vars(contains("delay")), any_vars(is.na(.))) ## # A tibble: 9,430 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 1525 1530 -5 1934 ## 2 2013 1 1 1528 1459 29 2002 ## 3 2013 1 1 1740 1745 -5 2158 ## 4 2013 1 1 1807 1738 29 2251 ## 5 2013 1 1 1939 1840 59 29 ## 6 2013 1 1 1952 1930 22 2358 ## 7 2013 1 1 2016 1930 46 NA ## 8 2013 1 1 NA 1630 NA NA ## 9 2013 1 1 NA 1935 NA NA ## 10 2013 1 1 NA 1500 NA NA ## # … with 9,420 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm> ``` ] --- # `group_by_*()` .red[and its helpers] Say we wanted to compute the median delay values for carriers by month and in doing so, we wanted to treat both `carrier` and `month` as factors. .pull-left[ .center.font120.bold[Option A] ```r flights %>% mutate( carrier = as.factor(carrier), month = as.factor(month) ) %>% group_by(carrier, month) %>% summarize_at(vars(contains("delay")), median, na.rm = TRUE) ## # A tibble: 185 x 4 ## # Groups: carrier [?] ## carrier month dep_delay arr_delay ## <fct> <fct> <dbl> <dbl> ## 1 9E 1 -2 -4 ## 2 9E 2 -3 -6 ## 3 9E 3 -3 -11 ## 4 9E 4 -3 -8 ## 5 9E 5 -1 -8 ## 6 9E 6 3 -2 ## 7 9E 7 3 -1 ## 8 9E 8 0 -8 ## 9 9E 9 -4 -15 ## 10 9E 10 -3 -10 ## # … with 175 more rows ``` ] -- .pull-right[ .center.font120.bold[Option B] ```r flights %>% group_by_at(vars(carrier, month), as.factor) %>% summarize_at(vars(contains("delay")), median, na.rm = TRUE) ## # A tibble: 185 x 4 ## # Groups: carrier [?] ## carrier month dep_delay arr_delay ## <fct> <fct> <dbl> <dbl> ## 1 9E 1 -2 -4 ## 2 9E 2 -3 -6 ## 3 9E 3 -3 -11 ## 4 9E 4 -3 -8 ## 5 9E 5 -1 -8 ## 6 9E 6 3 -2 ## 7 9E 7 3 -1 ## 8 9E 8 0 -8 ## 9 9E 9 -4 -15 ## 10 9E 10 -3 -10 ## # … with 175 more rows ``` ] --- class: yourturn # Last Challenge! ### Challenge Fill in the blanks and select the right .red[`filter_*()`] to filter for those flights where .red[either] departure .red[delay] (`dep_delay`) .red[or] arrival .red[delay] (`arr_delay`) exceeded the 99th percentile (hint: `quantile(x, .99)` provides the 99th percentile for variable `x`). ```r flights %>% filter_xxx(vars(contains("_____")), any_vars(___ > quantile(___, .99, na.rm = TRUE))) ``` --- class: yourturn # Last Challenge! ### Challenge Fill in the blanks and select the right .red[`filter_*()`] to filter for those flights where .red[either] departure .red[delay] (`dep_delay`) .red[or] arrival .red[delay] (`arr_delay`) exceeded the 99th percentile (hint: `quantile(x, .99)` provides the 99th percentile for variable `x`). ```r flights %>% filter_at(vars(contains("delay")), any_vars(. > quantile(., .99, na.rm = TRUE))) ## # A tibble: 3,777 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## <int> <int> <int> <int> <int> <dbl> <int> ## 1 2013 1 1 848 1835 853 1001 ## 2 2013 1 1 1815 1325 290 2120 ## 3 2013 1 1 1842 1422 260 1958 ## 4 2013 1 1 2006 1630 216 2230 ## 5 2013 1 1 2115 1700 255 2330 ## 6 2013 1 1 2205 1720 285 46 ## 7 2013 1 1 2312 2000 192 21 ## 8 2013 1 1 2343 1724 379 314 ## 9 2013 1 2 1244 900 224 1431 ## 10 2013 1 2 1332 904 268 1616 ## # … with 3,767 more rows, and 12 more variables: sched_arr_time <int>, ## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, ## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm> ``` --- # Key things to remember .pull-left-60[ * dplyr scoped variants: - .bold[`*_all()`]: execute function(s) on all variables or... - .bold[`*_if()`]: on variables that meet a certain condition or... - .bold[`*_at()`]: for pre-specified variables * argument functions within scoped variants: - .bold[`vars()`]: specify the variables to be executed on - .bold[`funs()`]: specify the functions to be executed * helper functions for `filter_*()` - .bold[`all_vars()`]: filter for rows where all variables meet the specified condition - .bold[`any_vars()`]: filter for rows where at least one variable meets the specified condition ] .pull-right-40[ <br><br> <img src="images/information-overload.jpg" width="640" style="display: block; margin: auto;" /> ] --- # Key things to remember <img src="images/cheatsheet-dplyr.png" width="55%" style="display: block; margin: auto;" /> .center[.content-box-gray[.bold[`Help >> Cheatsheets >> Data Transformation with dplyr`]]] --- # Questions? <br> <img src="images/questions.png" width="450" height="450" style="display: block; margin: auto;" />