|
- library(tidyverse)
-
-
# --- Load athlete data

# Only the following running events are analysed.
std_runs <- c(
  "100 metres", "200 metres", "400 metres", "800 metres",
  "1,500 metres", "5,000 metres", "10,000 metres", "Marathon"
)

# Keep the athletics rows and the columns used below, strip the
# "Athletics Women's " / "Athletics Men's " prefix from the event name and
# retain only the events listed in std_runs.
athletes <- read_csv("athlete_events.csv") %>%
  filter(Sport == "Athletics") %>%
  select(Name, Sex, Age, Height, Weight, Year, Event, Medal) %>%
  mutate(Event = str_remove(Event, "Athletics Women's ")) %>%
  mutate(Event = str_remove(Event, "Athletics Men's ")) %>%
  filter(Event %in% std_runs)
-
-
# --- Top medalists

# Count gold, silver and bronze medals per athlete and rank the athletes by
# gold, then silver, then bronze.
#
# The medals are tallied directly from the Medal column, so the result does
# not depend on every medal type occurring in the data, and repeated rows
# cannot distort the counts.
medal_ranking <- athletes %>%
  distinct() %>% # an exact duplicate row is not a second medal
  group_by(Name) %>%
  summarize(
    Gold = sum(Medal == "Gold", na.rm = TRUE), # NA Medal = no medal
    Silver = sum(Medal == "Silver", na.rm = TRUE),
    Bronze = sum(Medal == "Bronze", na.rm = TRUE)
  ) %>%
  arrange(desc(Gold), desc(Silver), desc(Bronze))
-
-
# --- Age distribution among Women's Marathon participants

# Point colours, keyed by the values of the Medal column.
medal_color <- c(Bronze = "#6A3805", Silver = "#B4B4B4", Gold = "#AF9500")

# Women's marathon entries after 1980. Year becomes a factor so that each
# year gets its own box on a discrete x axis.
d <- athletes %>%
  filter(Event == "Marathon", Sex == "F", Year > 1980) %>%
  mutate(Year = as.factor(Year))

# Box plot of age per year with the medalists overlaid as coloured points.
# Only Medal and Age decide whether a medalist can be drawn, so rows are
# dropped on those two columns alone; dropping on every column would also
# hide medalists whose Height or Weight is missing.
ggplot(d, aes(x = Year, y = Age)) +
  geom_boxplot(na.rm = TRUE) +
  geom_point(data = drop_na(d, Medal, Age), mapping = aes(color = Medal)) +
  scale_color_manual(values = medal_color) +
  ggtitle("Age distribution among Women's Marathon participants")
-
# --- Change in height of male runners

# Mean height of the male participants per event and year (after 1900),
# drawn as points with a smoothed line on top.
athletes %>%
  filter(Sex == "M", Year > 1900) %>%
  # Factor levels follow std_runs so the events are ordered by distance
  # rather than alphabetically.
  mutate(Event = factor(Event, levels = std_runs)) %>%
  group_by(Event, Year) %>%
  # .groups = "drop" returns an ungrouped tibble and avoids the regrouping
  # message summarize() prints for multi-variable groupings.
  summarize(MeanHeight = mean(Height, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = Year, y = MeanHeight, color = Event)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ggtitle("Men's runs - mean across participants")
-
# --- Medalist times

# Convert result strings to seconds.
#
# The times are stored as strings in inconsistent formats. A string is split
# on ":", "h" or "-" into at most three fields, which are read right-aligned
# as hours, minutes and seconds: "9.58" is seconds only, "1:43.5" is minutes
# and seconds, "2h06:32" is hours, minutes and seconds.
#
# s: character vector of result strings.
# Returns a double vector with one entry per element of s. An entry is NA
# when the string is NA or empty, has an empty or non-numeric field, or has
# more than three fields.
str2sec <- function(s) {
  s <- as.character(s)
  # vapply() guarantees a double vector even for zero-length input.
  seconds <- vapply(
    strsplit(s, ":|h|-"),
    function(fields) {
      n <- length(fields)
      # "" splits into zero fields; more than three cannot be h/m/s.
      if (n < 1L || n > 3L) {
        return(NA_real_)
      }
      # Left-pad with zeros so the last field is always the seconds.
      hms <- c(numeric(3L - n), as.double(fields))
      hms[1] * 3600 + hms[2] * 60 + hms[3]
    },
    numeric(1),
    USE.NAMES = FALSE
  )
  # strsplit() drops the empty field after a trailing separator; such a
  # string is malformed and must not be read as a shorter time.
  seconds[grepl("(:|h|-)$", s)] <- NA_real_
  seconds
}
-
# Only the following events are analysed. This overwrites std_runs with the
# event labels that are matched against results.csv.
std_runs <- c(
  "100M", "200M", "400M", "800M",
  "1500M", "5000M", "10000M", "Marathon"
)

# Strip the " Men" / " Women" part from the event name, keep the events in
# std_runs, convert the result strings to seconds and finally drop every row
# that has an NA in any column (including results that failed to convert).
runs <- read_csv("results.csv") %>%
  mutate(Event = str_remove(str_remove(Event, " Men"), " Women")) %>%
  filter(Event %in% std_runs) %>%
  mutate(Result = str2sec(Result)) %>%
  drop_na()
-
# Point colours keyed by B, S and G (presumably the Medal codes used in
# results.csv -- confirm against the data).
medal_color <- c(B = "#6A3805", S = "#B4B4B4", G = "#AF9500")

# One panel per event, ordered as in std_runs, each with its own y range.
# Points are coloured by medal and shaped by gender, with a smoothed line and
# its confidence band drawn on top.
ggplot(runs, aes(x = Year, y = Result, shape = Gender)) +
  geom_point(aes(color = Medal)) +
  geom_smooth(se = TRUE) +
  facet_wrap(vars(factor(Event, levels = std_runs)), scales = "free_y") +
  scale_color_manual(values = medal_color) +
  labs(
    title = "Times of medal winners in different running disciplines",
    x = "Year",
    y = "Time in seconds"
  )
|