|
- # Josua Kugler, Christian Merten
-
- # install.packages("babynames")
- library(tidyverse)
-
- ## Create some data-----------------------------------------------------------
-
- set.seed(1)
-
- baseset <- list()
- baseset$grade <- as.integer(c(5,6,7,8,9,10,11))
- baseset$grade_boost <- c(1,3,5,7,8,9,10)
- baseset$letter <- letters[1:4]
- baseset$letter_boost <- sample(1:5, 4, replace=T)
- babynames::babynames %>%
- group_by(sex, name) %>%
- summarise(n = sum(n)) %>%
- arrange(desc(n)) %>%
- mutate(rank = min_rank(-n)) %>%
- filter (rank <= 3000) ->
- ranked_names
- baseset$name <- ranked_names$name
- baseset$distance <- c(100,200,400,1000)
- baseset$distance_boost <- c(14,12,10,8)
-
- sample_observation <- function(n) {
- res <- list()
- res$name <- sample(baseset$name, n, replace=T)
- res$grade <- sample(baseset$grade, n, replace=T)
- res$letter <- sample(baseset$letter, n, replace=T)
- boost_base <-
- baseset$grade_boost[match(res$grade,baseset$grade)] +
- baseset$letter_boost[match(res$letter,baseset$letter)]
- res$time100 <- sample_time(100, baseset$distance_boost[1] + boost_base)
- res$time200 <- sample_time(200, baseset$distance_boost[2] + boost_base)
- res$time400 <- sample_time(400, baseset$distance_boost[3] + boost_base)
- res$time1000 <- sample_time(1000, baseset$distance_boost[4] + boost_base)
- as_tibble(res)
- }
-
- sample_time <- function(dist, boost) {
- (runif(length(boost))/2+2.5)/boost*dist*2
- }
-
- sports <- sample_observation(1000)
-
- requirements <- tibble(
- level = 1:11,
- min100 = seq(43,23,len=11),
- min1000 = seq(500,300,len=11)
- )
-
-
-
- ## Exercises -----------------------------------------------------------------
-
- # a)
- # sort sports by 'name' (alphabetically)
- arrange(sports, name)
-
- # b)
- # sort sports by 'grade' (11, 10, ..., 5),
- # in case of ties by 'letter' (a, b, d, e),
- # in case of ties by 'name' (A-Z)
- arrange(sports, grade, letter, name)
-
- # c)
- # count the numbers of students per class
- sports %>% group_by(grade, letter) %>% summarize(student_count = n())
-
- # d)
- # what is the mean, max and min class size
- sports %>%
- group_by(grade, letter) %>%
- summarize(student_count = n()) %>%
- ungroup() %>%
- summarize(mean_class_size = mean(student_count),
- max_class_size = max(student_count),
- min_class_size = min(student_count))
-
- # e)
- # get all students with a non-unqiue name
- sports %>% group_by(name) %>% filter(n()>1)
-
- # f)
- # get the top 10 sprinters (100m)
- sports %>% top_n(10, -time100)
-
- # g)
- # get the slowest 10 sprinters (100m)
- sports %>% top_n(10, time100)
-
- # h)
- # remove 100m, 200m, and 400m, and add velocity in km/h for 1000m
- sports %>% mutate(velocity = 60*60/time1000) %>% select(-(time100:time1000))
-
- # i)
- # rename 'grade' to 'level'
- sports %>% rename(level = grade)
-
- # j)
- # calculate average and min times for 200m in each grade
- sports %>% group_by(grade) %>% summarize(average200 = mean(time200), min200 = min(time200))
-
- # k)
- # select all time-columns
- sports %>% select(time100:time1000)
-
- # l)
- # show the fastest sprinter (100m) in each class
- # sorted by class (ie by grade and letter)
- sports %>% group_by(grade, letter) %>% top_n(1, -time100) %>% arrange(grade, letter)
|