Data science with {hyenaR}:
LESSON 14

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## For plotting
library(lubridate) ## Working with dates
library(tidyr) ## Extra data wrangling functions
library(stringr) ## Working with text
library(waldo) ## To compare objects
library(skimr) ## Inspect data


STEP 2: Load the database

load_package_database.full(
  
  # Location of our database file
  db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite"
  
)

Today’s goals

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



List of all adults of main clans with their highest lifetime rank.

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



List of all adults of main clans with their highest lifetime rank.

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



List of all adults of main clans with their highest lifetime rank.

List of all adults of main clans



## Work within the observation period...
start_date = find_pop_date.observation.first()
end_date = find_pop_date.observation.last()
create_id_starting.table(
  #Only known sex
  sex = c("female", "male"),
  #Individuals started and stopped being adults
  #in the study period (i.e. uncensored)
  lifestage = "adult",
  lifestage.overlap = "within",
  #Only use individuals born in main clans
  #Males that disperse from outside could have adult period unobserved
  clan.birth = find_clan_name.all(main.clans = TRUE),
  from = start_date,
  to = end_date
)
# A tibble: 904 × 1
   ID   
   <chr>
 1 A-008
 2 A-010
 3 A-018
 4 A-019
 5 A-040
 6 A-041
 7 A-046
 8 A-049
 9 A-054
10 A-056
# … with 894 more rows

…with their highest lifetime rank.



create_id_starting.table(
  sex = c("female", "male"),
  lifestage = "adult",
  lifestage.overlap = "within",
  clan.birth = find_clan_name.all(main.clans = TRUE),
  from = start_date,
  to = end_date
) %>% 
  # Find the period during which ranks need to be calculated
  mutate(start_adult = fetch_id_date.at.age(ID, age = 2),
         end_adult = fetch_id_date.death(ID))
# A tibble: 904 × 3
   ID    start_adult end_adult 
   <chr> <date>      <date>    
 1 A-008 1996-05-20  2008-11-30
 2 A-010 1996-09-20  2000-02-06
 3 A-018 1996-09-20  2004-06-28
 4 A-019 1996-09-16  2010-05-17
 5 A-040 1996-04-12  2006-10-26
 6 A-041 1996-07-13  2003-05-03
 7 A-046 1996-09-20  2003-11-03
 8 A-049 1996-04-13  2003-09-29
 9 A-054 1997-12-22  2002-12-03
10 A-056 1996-06-05  2008-05-16
# … with 894 more rows

…with their highest lifetime rank.



(expanded_df <- create_id_starting.table(
  sex = c("female", "male"),
  lifestage = "adult", lifestage.overlap = "within",
  clan.birth = find_clan_name.all(main.clans = TRUE),
  from = start_date, to = end_date) %>% 
  mutate(start_adult = fetch_id_date.at.age(ID, age = 2),
         end_adult = fetch_id_date.death(ID)) %>% 
  #Expand data to one row per 6 month period of an adults life
  reshape_row_date.seq(ID, from = start_adult, to = end_adult, by = "6 month"))
# A tibble: 9,798 × 2
   ID    date      
   <chr> <date>    
 1 A-008 1996-05-20
 2 A-008 1996-11-20
 3 A-008 1997-05-20
 4 A-008 1997-11-20
 5 A-008 1998-05-20
 6 A-008 1998-11-20
 7 A-008 1999-05-20
 8 A-008 1999-11-20
 9 A-008 2000-05-20
10 A-008 2000-11-20
# … with 9,788 more rows

…with their highest lifetime rank.



mainclans <- find_clan_name.all()
system.time({expanded_df_rank <- expanded_df %>% 
    # Extract current clan and per 6 months
    mutate(current_clan = fetch_id_clan.current(ID, at = date)) %>% 
    # Remove individuals that were outside the clan at any point
    group_by(ID) %>% 
    filter(all(current_clan %in% mainclans)) %>% 
    ungroup() %>% 
    #Extract ranks of remaining individuals
    mutate(rank = fetch_id_rank.std(ID, at = date))})
   user  system elapsed 
  5.655   0.595   6.053 

…with their highest lifetime rank.



expanded_df_rank
# A tibble: 9,348 × 4
   ID    date       current_clan    rank
   <chr> <date>     <chr>          <dbl>
 1 A-008 1996-05-20 A             0.0244
 2 A-008 1996-11-20 A            -0.116 
 3 A-008 1997-05-20 A            -0.277 
 4 A-008 1997-11-20 A            -0.257 
 5 A-008 1998-05-20 A            -0.189 
 6 A-008 1998-11-20 S            -0.111 
 7 A-008 1999-05-20 S             0     
 8 A-008 1999-11-20 S             0.167 
 9 A-008 2000-05-20 S             0     
10 A-008 2000-11-20 S             0.333 
# … with 9,338 more rows

…with their highest lifetime rank.

Code
plot_data <- expanded_df_rank %>% 
  mutate(sex = fetch_id_sex(ID),
         philo = case_when(fetch_id_is.native(ID, at = date) ~ "philopatric",
                           TRUE ~ "disperser")) %>% 
  group_by(ID, philo) %>% 
  mutate(years_since_start = (date - min(date))/365,
         relative_rank_change = rank - min(rank))

ggplot(data = plot_data) +
  geom_line(aes(x = date, y = rank,
                group = ID, colour = ID)) +
  scale_x_date(breaks = seq(as.Date("1995-01-01"), as.Date("2020-01-01"), "5 year"),
               date_labels = "%Y",
               limits = c(as.Date("1995-01-01"), as.Date("2021-01-01"))) +
  labs(x = "Date", y = "Standardised rank") +
  facet_wrap(facets = ~sex + philo) +
  theme_classic() +
  theme(legend.position = "none",
        axis.title = element_text(size = 15),
        axis.text = element_text(size = 10, colour = "black"))

…with their highest lifetime rank.

Code
plot_data2 <- plot_data %>% 
  filter(current_clan == "A")

ggplot(data = plot_data2) +
  geom_line(aes(x = date, y = rank,
                group = ID, colour = ID)) +
  scale_x_date(breaks = seq(as.Date("1995-01-01"), as.Date("2020-01-01"), "5 year"),
               date_labels = "%Y",
               limits = c(as.Date("1995-01-01"), as.Date("2021-01-01"))) +
  facet_wrap(facets = ~sex + philo) +
  labs(x = "Date", y = "Standardised rank",
       title = "Airstrip") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title = element_text(size = 15),
        axis.text = element_text(size = 10, colour = "black"),
        plot.title = element_text(size = 20, colour = "black", face = "bold"))

…with their highest lifetime rank.

Code
ggplot(data = plot_data2) +
  geom_line(aes(x = years_since_start, y = rank,
                group = ID, colour = ID)) +
  labs(y = "Standardised rank", x = "Years in lifestage",
       title = "Airstrip") +
  facet_wrap(facets = ~sex + philo) +
  theme_classic() +
  theme(legend.position = "none",
        axis.title = element_text(size = 15),
        axis.text = element_text(size = 10, colour = "black"),
        plot.title = element_text(size = 20, colour = "black", face = "bold"))

…with their highest lifetime rank.

Code
ggplot(data = plot_data2) +
  geom_line(aes(x = years_since_start, y = rank,
                group = ID), colour = "grey60",
            alpha = 0.5) +
  geom_smooth(aes(x = years_since_start, y = rank),
              colour = "black", se = FALSE) +
  labs(y = "Standardised rank", x = "Years in lifestage",
       title = "Airstrip") +
  facet_wrap(facets = ~sex + philo) +
  theme_classic() +
  theme(legend.position = "none",
        axis.title = element_text(size = 15),
        axis.text = element_text(size = 10, colour = "black"),
        plot.title = element_text(size = 20, colour = "black", face = "bold"))

…with their highest lifetime rank.

Code
plot_data3 <- plot_data %>% 
  filter(ID == "A-019")

ggplot(data = plot_data3) +
  geom_line(aes(x = date, y = rank,
                group = ID), size = 1,
            colour = "black") +
  geom_point(aes(x = date, y = rank),
             size = 2, shape = 21,
            colour = "black", fill = "white", stroke = 1) +
  labs(y = "Standardised rank", x = "Years in lifestage",
       title = "A-019") +
  scale_y_continuous(limits = c(NA, 1),
                     breaks = seq(-0.25, 1, 0.25)) +
  scale_x_date(breaks = seq(as.Date("1995-01-01"), as.Date("2020-01-01"), "5 year"),
               date_labels = "%Y",
               limits = c(as.Date("1995-01-01"), as.Date("2021-01-01"))) +
  theme_classic() +
  theme(legend.position = "none",
        axis.title = element_text(size = 15),
        axis.text = element_text(size = 10, colour = "black"),
        plot.title = element_text(size = 20, colour = "black", face = "bold"))

…with their highest lifetime rank.



expanded_df_rank %>% 
  #Determine the max rank per ID
  group_by(ID) %>% 
  summarise(max_rank = max(rank)) %>% 
  arrange(desc(max_rank))
# A tibble: 877 × 2
   ID    max_rank
   <chr>    <dbl>
 1 A-019        1
 2 A-114        1
 3 A-194        1
 4 A-353        1
 5 E-006        1
 6 E-089        1
 7 E-166        1
 8 E-191        1
 9 F-006        1
10 F-107        1
# … with 867 more rows

…with their highest lifetime rank.

Code
plot_data4 <- expanded_df_rank %>% 
  #Determine the max rank per ID
  group_by(ID) %>% 
  summarise(max_rank = max(rank)) %>% 
  arrange(desc(max_rank)) %>% 
  mutate(birthclan = fetch_id_clan.birth(ID),
         sex = fetch_id_sex(ID))

ggplot(data = plot_data4) +
  geom_boxplot(aes(x = birthclan, y = max_rank, fill = sex)) +
  labs(x = "Clan", y = "Maximum rank") +
  theme_classic() +
  theme(axis.title = element_text(size = 15),
        axis.text = element_text(size = 10, colour = "black"),
        plot.title = element_text(size = 20, colour = "black", face = "bold"))