# Exploratory analysis of the `babynames` data set: name totals since 2000,
# names used similarly often for both sexes, name lengths, and the names with
# the strongest year-over-year increase.

library(tidyverse)
library(babynames)

# 1) Total count per (sex, name) over all years from 2000 on ----
bn20 <- babynames %>%
  filter(year >= 2000) %>%
  group_by(sex, name) %>%
  summarize(n = sum(n), .groups = "drop")

# 2) Frequent names with the most balanced female/male split ----
# bn20 is already ungrouped, so no group_by() is needed before reshaping.
# Names that occur for one sex only get NA in the other column after
# pivot_wider(); filter() drops those rows because the condition is NA.
bn20_sim <- bn20 %>%
  pivot_wider(names_from = "sex", values_from = "n") %>%
  rename(female = "F", male = "M") %>%
  filter(female > 0 & male > 0 & (female + male) > 1e5) %>%
  # Relative difference between the sexes; the smallest values come first.
  arrange(abs(female - male) / (male + female))

# 3) Total count per name length (printed, not stored) ----
bn20 %>%
  group_by(name_length = nchar(name)) %>%
  summarize(n = sum(n))

# 4) The shortest names, most frequent first (printed, not stored) ----
min_len <- min(nchar(bn20$name))

bn20 %>%
  filter(nchar(name) == min_len) %>%
  arrange(desc(n))

# 5) Average name length per year and sex ----
# The mean is unweighted: every row counts once, regardless of its `n`.
bn_avg_len <- babynames %>%
  group_by(year, sex) %>%
  summarize(avg_len = mean(nchar(name)), .groups = "drop")

bn_avg_len %>%
  ggplot() +
  geom_line(aes(x = year, y = avg_len, color = sex))

# 6) Names with the strongest year-over-year increase ----
bn <- babynames %>%
  select(-prop) %>%
  filter(year >= 2000)

# Assumption: "new" names that did not appear in bn in the previous year
# are evaluated with n_prev = 0.

# Replace every NA in `x` with 0 and return the modified vector.
replace_with_zero <- function(x) {
  x[is.na(x)] <- 0
  x
}

# Shifting `year` forward by one lets each row pick up the count of the
# preceding year as `n_prev`. The join keys are spelled out instead of being
# inferred from the shared column names.
bn_prev <- bn %>%
  filter(year > 2000) %>%
  left_join(
    mutate(bn, year = year + 1, n_prev = n, n = NULL),
    by = c("year", "sex", "name")
  ) %>%
  mutate(n_prev = replace_with_zero(n_prev))

# If the assumption above is not wanted, run this instead:
# bn_prev <- bn %>%
#   inner_join(
#     mutate(bn, year = year + 1, n_prev = n, n = NULL),
#     by = c("year", "sex", "name")
#   )

n_mean <- mean(bn$n)

# Adding n_mean to the denominator keeps the ratio finite when n_prev is 0.
# slice_max() keeps ties by default, so a (year, sex) group can return more
# than three rows.
bn_trending <- bn_prev %>%
  mutate(s_incr = (n - n_prev) / (n_prev + n_mean)) %>%
  group_by(year, sex) %>%
  slice_max(s_incr, n = 3) %>%
  ungroup()