class: clear, center, middle background-image: url(images/time-for-review.jpg) background-size: cover --- # Prerequisites .pull-left[ ### Packages ```r library(dplyr) # or library(tidyverse) ``` ] .pull-right[ ### Data ```r transactions <- data.table::fread("data/transactions.csv", data.table = FALSE) %>% as.tibble() transactions ## # A tibble: 1,857,567 x 9 ## basket_num hshd_num purchase_ product_num spend units store_r week_num ## <int> <int> <chr> <int> <dbl> <int> <chr> <int> ## 1 100369 3708 09-DEC-17 93466 3.18 2 SOUTH 101 ## 2 891779 719 20-SEP-17 85201 3.49 1 CENTRAL 90 ## 3 609562 4995 07-MAR-17 2507006 0.89 1 CENTRAL 62 ## 4 760220 44 19-JUN-17 4819172 8.99 1 SOUTH 77 ## 5 869525 3937 04-SEP-17 1055355 1 1 SOUTH 88 ## 6 922989 2356 13-OCT-17 4285485 2.87 1 WEST 93 ## 7 732544 694 31-MAY-17 8511 0.64 1 CENTRAL 74 ## 8 536185 3728 14-JAN-17 85939 1 1 SOUTH 54 ## 9 967914 887 15-NOV-17 3775301 4.69 1 CENTRAL 98 ## 10 102995 574 27-DEC-17 72765 1.29 1 EAST 104 ## # ... with 1,857,557 more rows, and 1 more variable: year <int> ``` ] --- # Leverage the cheat sheet <img src="images/cheatsheet-dplyr.png" width="1460" style="display: block; margin: auto;" /> .center[.content-box-gray[.bold[`Help >> Cheatsheets >> Data Transformation with dplyr`]]] --- # Question 1 .pull-left[ ### Challenge How much total spend did household 3937 have in week 54? ```r transactions %>% filter(_____, _____) %>% summarize(_____) ``` ] -- .pull-right[ ### Solution ```r transactions %>% filter(hshd_num == 3937, week_num == 54) %>% summarize(spend = sum(spend, na.rm = TRUE)) ## # A tibble: 1 x 1 ## spend ## <dbl> ## 1 86.2 ``` ] --- # Question 2 .pull-left[ ### Challenge Compute the total spend for baskets that household 3937 had in week 54. ```r transactions %>% filter(_____, _____) %>% group_by(_____) %>% summarize(_____) ``` ] -- .pull-right[ ### Solution ```r transactions %>% filter(hshd_num == 3937, week_num == 54) %>% group_by(basket_num) %>% summarize(spend = sum(spend, na.rm = TRUE)) ## # A tibble: 6 x 2 ## basket_num spend ## <int> <dbl> ## 1 529106 5.59 ## 2 530413 3.99 ## 3 530495 10.3 ## 4 530607 17.8 ## 5 533329 29.8 ## 6 536201 18.8 ``` ] --- # Question 3 .pull-left[ ### Challenge Compute the total units per basket for the central region. Which basket contained the most units? ```r transactions %>% filter(_____) %>% group_by(_____) %>% summarize(_____) %>% arrange(_____) ``` ] -- .pull-right[ ### Solution ```r transactions %>% filter(store_r == "CENTRAL") %>% group_by(basket_num) %>% summarize(units = sum(units, na.rm = TRUE)) %>% arrange(desc(units)) ## # A tibble: 90,410 x 2 ## basket_num units ## <int> <int> ## 1 100268 96 ## 2 101497 96 ## 3 577720 89 ## 4 100466 84 ## 5 101895 84 ## 6 101994 84 ## 7 102184 84 ## 8 101476 83 ## 9 100292 79 ## 10 705629 79 ## # ... with 90,400 more rows ``` ] --- # Question 4 .pull-left[ ### Challenge In the east region, compute each household's spend-to-units ratio for each week. Which household has the largest spend-to-units ratio? ```r transactions %>% filter(_____) %>% group_by(_____, _____) %>% summarize( _____, _____ ) %>% mutate(_____) %>% arrange(_____) ``` ] -- .pull-right[ ### Solution ```r transactions %>% filter(store_r == "EAST") %>% group_by(hshd_num, week_num) %>% summarize( spend = sum(spend, na.rm = TRUE), units = sum(units, na.rm = TRUE) ) %>% mutate(ratio = spend / units) %>% arrange(desc(ratio)) ## # A tibble: 45,865 x 5 ## # Groups: hshd_num [1,251] ## hshd_num week_num spend units ratio ## <int> <int> <dbl> <int> <dbl> ## 1 1324 74 40.3 1 40.3 ## 2 1492 70 40.0 1 40.0 ## 3 144 80 37.0 1 37.0 ## 4 144 104 37.0 1 37.0 ## 5 4912 77 37.0 1 37.0 ## 6 4397 67 35.0 1 35.0 ## 7 4039 88 34.0 1 34.0 ## 8 4429 100 97.6 3 32.5 ## 9 14 93 30.0 1 30.0 ## 10 1050 58 59.9 2 29.9 ## # ... with 45,855 more rows ``` ] --- # Question 5 .pull-left[ ### Challenge Compute total spend by basket for each purchase date. Can you find the date that has the largest average (mean) total spend? ```r transactions %>% ______ %>% ______ %>% ______ %>% ______ %>% ``` ] -- .pull-right[ ### Solution ```r transactions %>% group_by(purchase_, basket_num) %>% summarize(spend = sum(spend, na.rm = TRUE)) %>% summarize(avg_spend = mean(spend)) %>% arrange(desc(avg_spend)) ## # A tibble: 363 x 2 ## purchase_ avg_spend ## <chr> <dbl> ## 1 23-DEC-17 165. ## 2 17-DEC-17 164. ## 3 22-DEC-17 161. ## 4 30-DEC-17 158. ## 5 10-DEC-17 158. ## 6 16-DEC-17 146. ## 7 09-DEC-17 132. ## 8 15-DEC-17 132. ## 9 24-DEC-17 127. ## 10 29-DEC-17 127. ## # ... with 353 more rows ``` ] --- class: clear, center, middle background-image: url(https://raw.githubusercontent.com/bradleyboehmke/Dayton-Weather-2018/master/Dayton_Weather.png) background-size: cover <br><br><br><br><br><br><br><br><br><br><br><br> .font200.bold[Next up...[
Visualizing Data
](https://uc-r.github.io/Intro-R/day-1e-visualization.html)]