suppressMessages(library(tidyverse)) # load necessary packages, but don't display annoying output messages
suppressMessages(library(mosaic)) # Don't do this in your own R code unless we do it for you, since sometimes those messages are important!
# Path to the class survey results; the CSV sits in the same directory as this notebook.
psych_60_data_file <- "class_data.csv"


# Read in our class survey results.
# An empty cols() specification still lets read_csv() guess each column's type;
# it is passed here only to suppress the message listing those guesses.
psyc60 <- read_csv(
  file = psych_60_data_file,
  col_types = cols()
)

# Print 3 randomly chosen rows as a quick look at the data.
sample_n(tbl = psyc60, size = 3)


# Count how many students were born in each month.
# Note: the months print in alphabetical (not calendar) order, since birth_month is read in as text.
tally(~birth_month, data = psyc60)

birth_month
    April    August  December  February   January      July      June     March 
       10        10         6         4         9        10         6         7 
      May  November   October September 
        8         7         6         5


# Step 1: keep only the columns this analysis needs.
seasons_dat <- select(psyc60, subID, birth_month, favorite_season)

# Step 2: add a birth_season column by mapping each birth month to a season.
# Winter is Dec - Feb here (preferred over Winter = Jan - March).
seasons_dat <- mutate(
  seasons_dat,
  birth_season = case_when(
    birth_month %in% month.name[c(12, 1, 2)] ~ "Winter", # Dec - Feb
    birth_month %in% month.name[3:5] ~ "Spring",         # Mar - May
    birth_month %in% month.name[6:8] ~ "Summer",         # Jun - Aug
    birth_month %in% month.name[9:11] ~ "Fall"           # Sep - Nov
  )
)

# Step 3: add a birth_season_fave column that is TRUE when a student's
# favorite season is the season they were born in.
seasons_dat <- mutate(
  seasons_dat,
  birth_season_fave = birth_season == favorite_season
)

# Display a few random rows to check the new columns.
sample_n(tbl = seasons_dat, size = 5)


# Proportion of students whose favorite season is their birth season:
# count the TRUE values, then divide by the number of students.
with(seasons_dat, sum(birth_season_fave == TRUE) / length(birth_season_fave))
# Looks like about .36 of the class's favorite season is the season they were born


# Quick aside - a nifty trick:
# taking the mean() of the TRUE/FALSE column gives the same proportion as
# counting the TRUEs and dividing by the number of rows.
mean(seasons_dat$birth_season_fave) 
# Why does this work??? We'll explain it in an upcoming lab, but try to figure it out!


# The same calculation written as one chain: each %>% sends the result so far
# on to the next function.
psyc60 %>% # send psyc60 to select(), and...
  # ...keep these three columns (subID just for posterity, not really needed here)
  select(subID, birth_month, favorite_season) %>%
  # ...add both new columns in a single mutate(); the second uses the first
  mutate(
    birth_season = case_when(
      birth_month %in% month.name[c(12, 1, 2)] ~ "Winter", # Dec - Feb
      birth_month %in% month.name[3:5] ~ "Spring",         # Mar - May
      birth_month %in% month.name[6:8] ~ "Summer",         # Jun - Aug
      birth_month %in% month.name[9:11] ~ "Fall"           # Sep - Nov
    ),
    fav_birth_season_same = birth_season == favorite_season
  ) %>%
  # ...and summarize as the proportion of students whose two seasons match
  summarise(prop_fave_birth_month = mean(fav_birth_season_same))


# Bar chart of favorite seasons, filled by birth season and split into two
# panels by whether a student's favorite season is also their birth season.
psyc60 %>%
  mutate(
    # first line of mutate is new! reorder birth_month into calendar order
    birth_month = factor(birth_month, levels = month.name),
    # Use the SAME season definition as the proportion calculations earlier in
    # this notebook (Winter = Dec - Feb, etc.). Defining seasons differently
    # here (e.g. Winter = Jan - Mar) would make the panels disagree with the
    # proportion computed above.
    birth_season = case_when(
      birth_month %in% month.name[c(12, 1, 2)] ~ 'Winter', # Dec - Feb
      birth_month %in% month.name[3:5] ~ 'Spring',         # Mar - May
      birth_month %in% month.name[6:8] ~ 'Summer',         # Jun - Aug
      birth_month %in% month.name[9:11] ~ 'Fall'           # Sep - Nov
    ),
    fav_birth_season_same = birth_season == favorite_season
  ) %>%
  # mutate_at() is a special version of mutate() that does the same thing to multiple columns...
  # Here I want to reorder the seasons for both season columns (birth and favorite).
  # (In dplyr >= 1.0, mutate(across(...)) is the newer way to write this.)
  mutate_at(c('favorite_season', 'birth_season'), factor,
            levels = c('Fall', 'Winter', 'Spring', 'Summer')) %>%
  # And all the rest of this is new:
  # make the basic plot: one bar per favorite season, colored by birth season
  gf_bar(~favorite_season, fill = ~birth_season) %>%
  # add labels
  gf_labs(x = 'Favorite Season',
          y = '# Students',
          title = 'Favorite Seasons of Psyc60 Students',
          # I used double quotes " instead of single ' because needed an apostrophe within text!
          subtitle = "And whether it's the same as birth season",
          fill = 'Birth Season') %>%
  # facet on same birth & fave season
  gf_facet_grid(~fav_birth_season_same,
                # custom labels: need `` around TRUE / FALSE b/c they are special R values otherwise
                labeller = labeller(fav_birth_season_same = c(`TRUE` = 'Favorite == Birth',
                                                              `FALSE` = 'Favorite != Birth'))) %>%
  # tweaking a couple theme options
  gf_theme(axis.text.x = element_text(angle = 45, hjust = 1), # axis labels at a diagonal
           legend.position = c(.66, .67)) # relative coordinates c(0-1 x , 0-1 y)

subID	years_in_college_range	years_in_college_numeric	num_siblings	birth_month	birth_year	num_times_left_home_country	target_age	num_classes	go_with_aliens	⋯	favorite_country	favorite_color	favorite_musical_genre	num_hours_studying_stats	living_on_campus	age	lark_owl	num_languages	the_dress_colors	home_region
<chr>	<chr>	<dbl>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	⋯	<chr>	<chr>	<chr>	<chr>	<chr>	<dbl>	<chr>	<dbl>	<chr>	<chr>
sub6510167	3 - 4	3	2	May	1999	2	85	3	Yes	⋯	Spain	Green	funk	0 - 4	No	22	neither	2	black & blue	North America
sub8912365	0 - 1	0	0	December	2001	4	119	5	Yes	⋯	Japan	Blue	Piano music	8 - 12	No	19	night owl	2	white & gold	East Asia
sub4794784	0 - 1	0	1	December	2001	0	92	3	No	⋯	I have not visited another country yet	Purple	r&b	8 - 12	No	19	night owl	1	black & blue	North America

Chaining Operations Demo

Overview

Load in data & display sample

Why Chain Operations?

A more complex process...

Step-by-step: create/modify variables

Pipes:

Returning to the pipeline

The End

A tibble: 1 × 1
prop_fave_birth_month
<dbl>
0.3636364