Data science with {hyenaR}:
LESSON 12

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## For plotting
library(lubridate) ## Working with dates
library(tidyr) ## Extra data wrangling functions
library(stringr) ## Working with text
library(waldo) ## To compare objects


STEP 2: Load the database

load_package_database.full(
  
  # Location of our database file
  db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite"
  
)

Today’s goals

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



List of females and males of main clans with offspring that survived to adulthood (2y), in descending order, and with the information of whether the number of offspring is censored or not.



GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



List of females and males of main clans with offspring that survived to adulthood (2y), in descending order, and with the information of whether the number of offspring is censored or not.



GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



List of females and males of main clans with offspring that survived to adulthood (2y), in descending order, and with the information of whether the number of offspring is censored or not.



GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



List of females and males of main clans with offspring that survived to adulthood (2y), in descending order, and with the information of whether the number of offspring is censored or not.



GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



List of females and males of main clans with offspring that survived to adulthood (2y), in descending order, and with the information of whether the number of offspring is censored or not.



List of females and males of main clans



start_date <- find_pop_date.observation.first()
end_date   <- find_pop_date.observation.last()
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date)
# A tibble: 2,810 × 1
   ID   
   <chr>
 1 A-001
 2 A-002
 3 A-003
 4 A-004
 5 A-006
 6 A-007
 7 A-008
 8 A-009
 9 A-010
10 A-011
# … with 2,800 more rows

List of females and males of main clans



create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  #Extract sex
  mutate(sex = fetch_id_sex(ID))
# A tibble: 2,810 × 2
   ID    sex   
   <chr> <chr> 
 1 A-001 female
 2 A-002 female
 3 A-003 female
 4 A-004 female
 5 A-006 female
 6 A-007 female
 7 A-008 female
 8 A-009 female
 9 A-010 female
10 A-011 male  
# … with 2,800 more rows

List of females and males of main clans



create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  ## Count number of each sex
  group_by(sex) %>% 
  count()
# A tibble: 3 × 2
# Groups:   sex [3]
  sex        n
  <chr>  <int>
1 female  1179
2 male    1261
3 <NA>     370

List of females and males of main clans



create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  # Only include known sex individuals
  filter(!is.na(sex)) %>% 
  group_by(sex) %>% 
  count()
# A tibble: 2 × 2
# Groups:   sex [2]
  sex        n
  <chr>  <int>
1 female  1179
2 male    1261

with offspring that survived to adulthood (2y)



create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  filter(!is.na(sex))
# A tibble: 2,440 × 2
   ID    sex   
   <chr> <chr> 
 1 A-001 female
 2 A-002 female
 3 A-003 female
 4 A-004 female
 5 A-006 female
 6 A-007 female
 7 A-008 female
 8 A-009 female
 9 A-010 female
10 A-011 male  
# … with 2,430 more rows

with offspring that survived to adulthood (2y)



create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  filter(!is.na(sex)) %>% 
  ## Extract counts of offspring >= 2yo
  mutate(nroffspring = fetch_id_number.offspring(ID, age.mature = 2,
                                                 unit = "year"))
# A tibble: 2,440 × 3
   ID    sex    nroffspring
   <chr> <chr>        <int>
 1 A-001 female          12
 2 A-002 female           1
 3 A-003 female           6
 4 A-004 female           0
 5 A-006 female           5
 6 A-007 female           0
 7 A-008 female           6
 8 A-009 female           4
 9 A-010 female           2
10 A-011 male             7
# … with 2,430 more rows

in descending order



create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  filter(!is.na(sex)) %>% 
  mutate(nroffspring = fetch_id_number.offspring(ID, age.mature = 2,
                                                 unit = "year"))
# A tibble: 2,440 × 3
   ID    sex    nroffspring
   <chr> <chr>        <int>
 1 A-001 female          12
 2 A-002 female           1
 3 A-003 female           6
 4 A-004 female           0
 5 A-006 female           5
 6 A-007 female           0
 7 A-008 female           6
 8 A-009 female           4
 9 A-010 female           2
10 A-011 male             7
# … with 2,430 more rows

in descending order



create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  filter(!is.na(sex)) %>% 
  mutate(nroffspring = fetch_id_number.offspring(ID, age.mature = 2, 
                                                 unit = "year")) %>% 
  # Arrange by sex and descending number offspring
  arrange(sex, desc(nroffspring))
# A tibble: 2,440 × 3
   ID    sex    nroffspring
   <chr> <chr>        <int>
 1 A-194 female          16
 2 L-205 female          16
 3 S-105 female          16
 4 N-087 female          15
 5 F-107 female          13
 6 A-001 female          12
 7 L-004 female          11
 8 M-122 female          11
 9 F-007 female          10
10 M-099 female          10
# … with 2,430 more rows

and with the information of whether the number of offspring is censored or not.

create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  filter(!is.na(sex)) %>% 
  mutate(nroffspring = fetch_id_number.offspring(ID, age.mature = 2,
                                                 unit = "year")) %>% 
  arrange(sex, desc(nroffspring))
# A tibble: 2,440 × 3
   ID    sex    nroffspring
   <chr> <chr>        <int>
 1 A-194 female          16
 2 L-205 female          16
 3 S-105 female          16
 4 N-087 female          15
 5 F-107 female          13
 6 A-001 female          12
 7 L-004 female          11
 8 M-122 female          11
 9 F-007 female          10
10 M-099 female          10
# … with 2,430 more rows

and with the information of whether the number of offspring is censored or not.

create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  filter(!is.na(sex),
         # Check left and right censoring...
         !fetch_id_is.censored.left(ID, at = start_date),
         !fetch_id_is.censored.right(ID, at = end_date)) %>% 
  mutate(nroffspring = fetch_id_number.offspring(ID, age.mature = 2,
                                                 unit = "year")) %>% 
  arrange(sex, desc(nroffspring))
# A tibble: 1,837 × 3
   ID    sex    nroffspring
   <chr> <chr>        <int>
 1 A-194 female          16
 2 L-205 female          16
 3 S-105 female          16
 4 N-087 female          15
 5 F-107 female          13
 6 M-122 female          11
 7 M-099 female          10
 8 A-195 female           9
 9 F-097 female           9
10 T-014 female           9
# … with 1,827 more rows

BONUS: More accurate censoring…

What about individuals that were cubs at first observation? Technically, these are left censored but we can be confident they have not yet reproduced.

create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  filter(!is.na(sex),
         # Individuals are only considered left censored
         # if they were >2yo at first observation
         !fetch_id_is.censored.left(ID, at = start_date - lubridate::years(2)),
         !fetch_id_is.censored.right(ID, at = end_date)) %>% 
  mutate(nroffspring = fetch_id_number.offspring(ID, age.mature = 2,
                                                 unit = "year")) %>% 
  arrange(sex, desc(nroffspring)) -> all_IDs

all_IDs
# A tibble: 1,900 × 3
   ID    sex    nroffspring
   <chr> <chr>        <int>
 1 A-194 female          16
 2 L-205 female          16
 3 S-105 female          16
 4 N-087 female          15
 5 F-107 female          13
 6 M-122 female          11
 7 F-007 female          10
 8 M-099 female          10
 9 A-195 female           9
10 F-097 female           9
# … with 1,890 more rows

BONUS: More accurate censoring…

Are cubs censored?

# Table with parent and all their offspring
create_id_offspring.table(ID = all_IDs$ID) %>% 
  ## Check cubs aren't left censored (should be impossible)
  mutate(left_censored = fetch_id_is.censored.left(offspringID, at = find_pop_date.observation.first()),
         ## Check cubs COULD have been observed at 2yo
         right_censored = fetch_id_date.at.age(offspringID, age = 2, unit = "year") > find_pop_date.observation.last())
# A tibble: 4,341 × 6
   parentID offspringID birthdate  filiation             left_censored right_c…¹
   <chr>    <chr>       <date>     <chr>                 <lgl>         <lgl>    
 1 A-194    A-237       2008-01-29 mother_social_genetic FALSE         FALSE    
 2 A-194    A-238       2008-01-29 mother_social_genetic FALSE         FALSE    
 3 A-194    A-259       2009-06-12 mother_social_genetic FALSE         FALSE    
 4 A-194    A-260       2009-06-12 mother_social_genetic FALSE         FALSE    
 5 A-194    A-288       2010-07-10 mother_social_genetic FALSE         FALSE    
 6 A-194    A-289       2010-07-10 mother_social_genetic FALSE         FALSE    
 7 A-194    A-305       2011-06-12 mother_social_genetic FALSE         FALSE    
 8 A-194    A-306       2011-06-12 mother_social_genetic FALSE         FALSE    
 9 A-194    A-326       2012-07-15 mother_social_genetic FALSE         FALSE    
10 A-194    A-327       2012-07-15 mother_social_genetic FALSE         FALSE    
# … with 4,331 more rows, and abbreviated variable name ¹​right_censored

BONUS: More accurate censoring…

Are cubs censored?

create_id_offspring.table(ID = all_IDs$ID) %>% 
  filter(!is.na(offspringID)) %>% 
  mutate(left_censored = fetch_id_is.censored.left(offspringID, at = find_pop_date.observation.first()),
         right_censored = fetch_id_date.at.age(offspringID, age = 2, unit = "year") > find_pop_date.observation.last()) %>% 
  # Count censored individuals
  group_by(left_censored, right_censored) %>% 
  count()
# A tibble: 2 × 3
# Groups:   left_censored, right_censored [2]
  left_censored right_censored     n
  <lgl>         <lgl>          <int>
1 FALSE         FALSE           2973
2 FALSE         TRUE               4

BONUS: More accurate censoring…

Are cubs censored?

create_id_offspring.table(ID = all_IDs$ID) %>% 
  filter(!is.na(offspringID)) %>% 
  mutate(left_censored = fetch_id_is.censored.left(offspringID, at = find_pop_date.observation.first()),
         right_censored = fetch_id_date.at.age(offspringID, age = 2, unit = "year") > find_pop_date.observation.last()) %>% 
  ## EXTRACT PARENTS WITH CENSORED CUBS
  group_by(parentID) %>% 
  filter(sum(right_censored) > 0) %>% 
  summarise()
# A tibble: 3 × 1
  parentID
  <chr>   
1 L-425   
2 S-247   
3 T-157   

BONUS: More accurate censoring…

Are cubs censored?

create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), from = start_date, to = end_date) %>% 
  mutate(sex = fetch_id_sex(ID)) %>% 
  filter(!is.na(sex),
         !fetch_id_is.censored.left(ID, at = start_date),
         # Censoring with end date 2 years before last observation
         # will ensure that we only use parents that could not
         # have censored cubs
         !fetch_id_is.censored.right(ID, at = end_date - lubridate::years(2))) %>% 
  mutate(nroffspring = fetch_id_number.offspring(ID, age.mature = 2, unit = "year")) %>% 
  arrange(sex, desc(nroffspring)) -> all_IDs_trunc

all_IDs_trunc
# A tibble: 1,911 × 3
   ID    sex    nroffspring
   <chr> <chr>        <int>
 1 A-194 female          16
 2 L-205 female          16
 3 S-105 female          16
 4 N-087 female          15
 5 M-122 female          11
 6 M-099 female          10
 7 A-195 female           9
 8 F-097 female           9
 9 T-014 female           9
10 A-106 female           8
# … with 1,901 more rows

BONUS: More accurate censoring…

Are cubs censored?

create_id_offspring.table(ID = all_IDs_trunc$ID) %>% 
  filter(!is.na(offspringID)) %>% 
  mutate(left_censored = fetch_id_is.censored.left(offspringID, at = find_pop_date.observation.first()),
         right_censored = fetch_id_date.at.age(offspringID, age = 2, unit = "year") > find_pop_date.observation.last()) %>% 
  # Count censored individuals
  group_by(left_censored, right_censored) %>% 
  count()
# A tibble: 1 × 3
# Groups:   left_censored, right_censored [1]
  left_censored right_censored     n
  <lgl>         <lgl>          <int>
1 FALSE         FALSE           2555

BONUS: More accurate censoring…



…males are right-censored if they were alive by the [birthdate+2years] of the last genetically typed cub of the clan.

BONUS: More accurate censoring…

We can find the youngest genotyped cub in each clan using create_id_offspring.table to identify individuals with assigned fathers.

# Return all parent-offspring relationships
create_id_offspring.table() %>% 
  # Filter only paternity assignment
  filter(filiation == "father") %>% 
  # Arrange by clan and birthdate, so youngest cub is at the top
  mutate(birthclan = fetch_id_clan.birth(offspringID)) %>% 
  arrange(birthclan, desc(birthdate)) %>% 
  # Take the first row for clan (i.e. the youngest cub with assigned father)
  group_by(birthclan) %>% 
  slice(1)
# A tibble: 8 × 5
# Groups:   birthclan [8]
  parentID offspringID birthdate  filiation birthclan
  <chr>    <chr>       <date>     <chr>     <chr>    
1 A-263    A-510       2019-03-16 father    A        
2 M-320    E-297       2018-05-28 father    E        
3 T-143    F-286       2019-05-10 father    F        
4 A-289    L-517       2019-08-04 father    L        
5 X-113    M-483       2019-08-30 father    M        
6 X-095    N-283       2018-04-22 father    N        
7 A-306    S-358       2019-09-05 father    S        
8 E-227    T-221       2019-11-30 father    T        

BONUS: More accurate censoring…

If we want to be extra conservative, we could use the date from the clan genotyped least recently.

# Return all parent-offspring relationships
create_id_offspring.table() %>% 
  # Filter only paternity assignment
  filter(filiation == "father") %>% 
  # Arrange by clan and birthdate, so youngest cub is at the top
  mutate(birthclan = fetch_id_clan.birth(offspringID)) %>% 
  arrange(birthclan, desc(birthdate)) %>% 
  # Take the first row for clan (i.e. the youngest cub with assigned father)
  group_by(birthclan) %>% 
  slice(1) %>% 
  #Take the birthdate that is the furthest in the past
  ungroup() %>% 
  filter(birthdate == min(birthdate)) -> genotype_date

genotype_date$birthdate
[1] "2018-04-22"

BONUS: More accurate censoring…

We could use this date to select males.

# Use first and last observation to start with all observed males
# (but filter them below)
start_date <- find_pop_date.observation.first()
end_date   <- find_pop_date.observation.last()
create_id_starting.table(sex = "male",
                         clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  filter(!fetch_id_is.censored(ID,
                               # Include individuals that were cubs at start
                               censored.left = start_date - lubridate::years(2),
                               # Right censor based on genotyping date
                               censored.right = genotype_date$birthdate + lubridate::years(2))) -> maleIDs

maleIDs
# A tibble: 1,261 × 1
   ID   
   <chr>
 1 A-011
 2 A-040
 3 A-041
 4 A-042
 5 A-043
 6 A-044
 7 A-045
 8 A-046
 9 A-047
10 A-048
# … with 1,251 more rows

BONUS: More accurate censoring…

Extract data separately for females. Here we don’t worry about the genotyping date and just have to account for cub censoring.

# Use first and last observation to start with all observed males
# (but filter them below)
start_date <- find_pop_date.observation.first()
end_date   <- find_pop_date.observation.last()
create_id_starting.table(sex = "female",
                         clan = find_clan_name.all(main.clans = TRUE),
                         from = start_date,
                         to = end_date) %>% 
  filter(!fetch_id_is.censored(ID,
                               # Include individuals that were cubs at start
                               censored.left = start_date - lubridate::years(2),
                               # Only individuals with uncensored cubs
                               censored.right = end_date - lubridate::years(2))) -> femaleIDs

femaleIDs
# A tibble: 1,179 × 1
   ID   
   <chr>
 1 A-001
 2 A-002
 3 A-003
 4 A-004
 5 A-006
 6 A-007
 7 A-008
 8 A-009
 9 A-010
10 A-013
# … with 1,169 more rows

BONUS: More accurate censoring…

Combine males and females to get final dataset

allIDs <- dplyr::bind_rows(maleIDs, femaleIDs) %>% 
  mutate(sex = fetch_id_sex(ID),
         nroffspring = fetch_id_number.offspring(ID,
                                                 age.mature = 2, unit = "year")) %>% 
  arrange(sex, desc(nroffspring))

allIDs
# A tibble: 2,440 × 3
   ID    sex    nroffspring
   <chr> <chr>        <int>
 1 A-194 female          16
 2 L-205 female          16
 3 S-105 female          16
 4 N-087 female          15
 5 F-107 female          13
 6 A-001 female          12
 7 L-004 female          11
 8 M-122 female          11
 9 F-007 female          10
10 M-099 female          10
# … with 2,430 more rows

Visualize our results…

Code
plot_data <- allIDs

ggplot(data = plot_data) +
  geom_bar(aes(x = nroffspring, fill = sex),
           colour = "black", size = 0.25) +
  labs(title = "Reproductive skew for all uncensored individuals",
       x = "Lifetime reproductive success (offspring >=2yo)",
       y = "Number of individuals") +
  scale_x_continuous(breaks = seq(0, 20, 1),
                     limits = c(-0.5, 20)) +
  facet_wrap(facets = ~sex, ncol = 2) +
  theme_classic() +
  theme(legend.position = "none",
        axis.text = element_text(colour = "black"),
        plot.margin = margin(t = 10, r = 10, b = 10, l = 10))

Visualize our results…

Code
plot_data <- allIDs %>% 
  filter(nroffspring > 0)

ggplot(data = plot_data) +
  geom_bar(aes(x = nroffspring, fill = sex),
           colour = "black", size = 0.25) +
  labs(title = "Reproductive skew for individuals\nwith atleast 1 offspring",
       x = "Lifetime reproductive success (offspring >=2yo)",
       y = "Number of individuals") +
  scale_x_continuous(breaks = seq(1, 20, 1),
                     limits = c(0.5, 20)) +
  facet_wrap(facets = ~sex, ncol = 2) +
  theme_classic() +
  theme(legend.position = "none",
        axis.text = element_text(colour = "black"),
        plot.margin = margin(t = 10, r = 10, b = 10, l = 10))

Visualize our results…

Code
plot_data <- allIDs %>% 
  filter(nroffspring > 0) %>% 
  group_by(sex, nroffspring) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(n = case_when(sex == "female" ~ -n,
                       sex == "male" ~ n))

ggplot() +
  geom_col(data = plot_data,
           aes(x = nroffspring, y = n, fill = sex),
           colour = "black", size = 0.25) +
  geom_text(aes(y = c(-40, 40),
                x = 15,
                colour = c("Female", "Male")),
            label = c("Female", "Male"), size = 10) +
  labs(title = "Reproductive skew for individuals\nwith atleast 1 offspring",
       x = "Lifetime reproductive success (offspring >=2yo)",
       y = "Number of individuals") +
  scale_x_reverse(breaks = seq(20, 1, -1)) +
  scale_y_continuous(limits = c(-120, 120),
                     breaks = seq(-120, 120, 20),
                     labels = c(seq(120, 20, -20),
                                0,
                                seq(20, 120, 20))) +
  coord_flip(expand = FALSE) +
  theme_classic() +
  theme(legend.position = "none",
        axis.text = element_text(colour = "black"),
        plot.margin = margin(t = 10, r = 10, b = 10, l = 10))

Visualize our results…

Code
plot_data <- allIDs %>% 
  filter(nroffspring > 0) %>% 
  group_by(sex, nroffspring) %>% 
  count() %>%
  group_by(sex) %>% 
  mutate(n = n/sum(n)) %>% 
  mutate(n = case_when(sex == "female" ~ -n,
                       sex == "male" ~ n))


ggplot() +
  geom_col(data = plot_data,
           aes(x = nroffspring, y = n, fill = sex),
           colour = "black", size = 0.25) +
  geom_text(aes(y = c(-0.2, 0.2),
                x = 15,
                colour = c("Female", "Male")),
            label = c("Female", "Male"), size = 10) +
  labs(title = "Reproductive skew for individuals\nwith atleast 1 offspring",
       x = "Lifetime reproductive success (offspring >=2yo)",
       y = "Proportion of individuals") +
  scale_x_reverse(breaks = seq(20, 1, -1)) +
  scale_y_continuous(limits = c(-0.4, 0.4),
                     breaks = seq(-0.4, 0.4, 0.1),
                     labels = c(0.4, 0.3, 0.2, 0.1, 0, 0.1, 0.2, 0.3, 0.4)) +
  coord_flip(expand = FALSE) +
  theme_classic() +
  theme(legend.position = "none",
        axis.text = element_text(colour = "black"),
        plot.margin = margin(t = 10, r = 10, b = 10, l = 10))