library(tidyverse)

# --- load athlete data

# We are only interested in the following disciplines
std_runs <- c(
  "100 metres", "200 metres", "400 metres", "800 metres",
  "1,500 metres", "5,000 metres", "10,000 metres", "Marathon"
)

# Keep athletics rows only, strip the "Athletics Women's " / "Athletics Men's "
# prefix so both sexes share one event label, then keep the runs listed above.
athletes <- read_csv("athlete_events.csv") %>%
  filter(Sport == "Athletics") %>%
  select(Name, Sex, Age, Height, Weight, Year, Event, Medal) %>%
  mutate(
    Event = str_remove(Event, "Athletics Women's "),
    Event = str_remove(Event, "Athletics Men's ")
  ) %>%
  filter(Event %in% std_runs)

# --- Top Medalists

# One row per athlete name with the number of gold, silver and bronze medals,
# sorted by gold, then silver, then bronze (descending). Athletes without any
# medal are kept with zero counts.
# Medals are counted directly instead of pivoting the Medal column wider:
# the counts do not depend on every medal class being present in the data,
# and distinct() keeps an exactly repeated row from being counted twice.
medal_ranking <- athletes %>%
  distinct() %>%
  group_by(Name) %>%
  summarize(
    Gold = sum(Medal == "Gold", na.rm = TRUE),
    Silver = sum(Medal == "Silver", na.rm = TRUE),
    Bronze = sum(Medal == "Bronze", na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(Gold), desc(Silver), desc(Bronze))

# --- Age distribution among Women's Marathon participants

medal_color <- c(Bronze = "#6A3805", Silver = "#B4B4B4", Gold = "#AF9500")

d <- athletes %>%
  filter(Event == "Marathon", Sex == "F", Year > 1980) %>%
  mutate(Year = as.factor(Year))

# Box plot of all participants per year, with the medalists drawn on top.
# Only rows lacking a medal or an age are removed for the point layer; a
# medalist with a missing height or weight must still be shown.
ggplot(d, aes(x = Year, y = Age)) +
  geom_boxplot(na.rm = TRUE) +
  geom_point(data = drop_na(d, Medal, Age), mapping = aes(color = Medal)) +
  scale_color_manual(values = medal_color) +
  ggtitle("Age distribution among Women's Marathon participants")

# --- Change in height of male runners

athletes %>%
  mutate(Event = factor(Event, levels = std_runs)) %>% # use factor for ordering
  filter(Sex == "M", Year > 1900) %>%
  group_by(Event, Year) %>%
  summarize(MeanHeight = mean(Height, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = Year, y = MeanHeight, color = Event)) +
  geom_point() +
  ggtitle("Men's runs - mean across participants") +
  geom_smooth(se = FALSE)

# --- Medalist times

# Times are given as strings with inconsistent format.
# Need a custom function to convert the time strings to seconds.
#
# str2sec(s): convert a character vector of times to seconds.
#   s       character vector; each element is split on ":", "h" or "-".
#   returns a double vector of the same length as `s`.
# The pieces are read right-aligned as hours, minutes, seconds, so a value
# with two pieces is minutes and seconds, and a single piece is seconds.
# A piece that is not a number yields NA (as.double() warns about it), and
# so does a string with more than three pieces.
str2sec <- function(s) {
  pieces <- str_split(s, ":|h|-")
  vapply(pieces, function(x) {
    # More than three pieces cannot be mapped onto h/m/s; without this guard
    # the index range below would start at zero or a negative position.
    if (length(x) > 3) {
      return(NA_real_)
    }
    v <- as.double(x)
    hms <- c(0, 0, 0)
    hms[(4 - length(v)):3] <- v # right-align: missing leading fields stay 0
    hms[1] * 3600 + hms[2] * 60 + hms[3]
  }, numeric(1))
}

# We are only interested in the following disciplines
std_runs <- c(
  "100M", "200M", "400M", "800M",
  "1500M", "5000M", "10000M", "Marathon"
)

# Strip the " Men" / " Women" suffix so both genders share one event label,
# keep the runs listed above and convert the result strings to seconds.
# drop_na() without arguments removes every row with an NA in any column,
# which includes the results str2sec() could not parse.
runs <- read_csv("results.csv") %>%
  mutate(
    Event = str_remove(Event, " Men"),
    Event = str_remove(Event, " Women")
  ) %>%
  filter(Event %in% std_runs) %>%
  mutate(Result = str2sec(Result)) %>%
  drop_na()

medal_color <- c(B = "#6A3805", S = "#B4B4B4", G = "#AF9500")

# One panel per discipline in the order of std_runs, each with its own y scale.
ggplot(runs, aes(x = Year, y = Result, shape = Gender)) +
  facet_wrap(vars(factor(Event, levels = std_runs)), scales = "free_y") +
  geom_point(aes(color = Medal)) +
  scale_color_manual(values = medal_color) +
  ggtitle("Times of medal winners in different running disciplines") +
  xlab("Year") +
  ylab("Time in seconds") +
  geom_smooth(se = TRUE)