Data science with {hyenaR}:
LESSON 15

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## For plotting
library(lubridate) ## Working with dates
library(tidyr) ## Extra data wrangling functions
library(stringr) ## Working with text
library(waldo) ## To compare objects
library(skimr) ## Inspect data


STEP 2: Load the database

load_package_database.full(
  
  # Location of our database file
  db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite"
  
)

Today’s goals

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



Extract monthly female reproduction and clan size for Airstrip. Build a model to study the relationship.

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



Extract monthly female reproduction and clan size for Airstrip. Build a model to study the relationship.

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



Extract monthly female reproduction and clan size for Airstrip. Build a model to study the relationship.

GOAL 1: 🧑‍🏫 Practical application of {hyenaR}



Extract monthly female reproduction and clan size for Airstrip. Build a model to study the relationship.

Extract monthly female reproduction…



start_date <- find_pop_date.observation.first()
end_date <- find_pop_date.observation.last()

## Extract all adult females in Airstrip during observation period
create_id_starting.table(sex = "female",
                         clan = "A",
                         lifestage = "adult",
                         from = start_date,
                         to = end_date)
# A tibble: 139 × 1
   ID   
   <chr>
 1 A-001
 2 A-002
 3 A-003
 4 A-004
 5 A-006
 6 A-007
 7 A-008
 8 A-009
 9 A-010
10 A-013
# … with 129 more rows

Extract monthly female reproduction…



start_date <- find_pop_date.observation.first()
end_date <- find_pop_date.observation.last()

create_id_starting.table(sex = "female", clan = "A", lifestage = "adult", from = start_date, to = end_date) %>% 
  ## Extract dates over which individual could reproduce...
  mutate(adult_date = fetch_id_date.at.age(ID = ID, age = 2),
         death_date = fetch_id_date.death(ID = ID))
# A tibble: 139 × 3
   ID    adult_date death_date
   <chr> <date>     <date>    
 1 A-001 1990-05-21 2007-06-08
 2 A-002 1994-10-21 1997-10-21
 3 A-003 1991-05-21 2005-01-27
 4 A-004 1990-07-13 1997-07-16
 5 A-006 1993-07-16 2011-02-25
 6 A-007 1990-07-14 1996-08-02
 7 A-008 1996-05-20 2008-11-30
 8 A-009 1996-01-16 2006-10-29
 9 A-010 1996-09-20 2000-02-06
10 A-013 1993-12-06 2011-10-07
# … with 129 more rows

Extract monthly female reproduction…



start_date <- find_pop_date.observation.first()
end_date <- find_pop_date.observation.last()

create_id_starting.table(sex = "female", clan = "A", lifestage = "adult", from = start_date, to = end_date) %>% 
  mutate(adult_date = fetch_id_date.at.age(ID = ID, age = 2),
         death_date = fetch_id_date.death(ID = ID)) %>% 
  ## Use case when to deal with left and right censored individuals...
  mutate(start_date = case_when(adult_date < start_date ~ start_date,
                                TRUE ~ start_date),
         end_date = case_when(is.na(death_date) ~ end_date,
                              TRUE ~ death_date))
# A tibble: 139 × 5
   ID    adult_date death_date start_date end_date  
   <chr> <date>     <date>     <date>     <date>    
 1 A-001 1990-05-21 2007-06-08 1996-04-12 2007-06-08
 2 A-002 1994-10-21 1997-10-21 1996-04-12 1997-10-21
 3 A-003 1991-05-21 2005-01-27 1996-04-12 2005-01-27
 4 A-004 1990-07-13 1997-07-16 1996-04-12 1997-07-16
 5 A-006 1993-07-16 2011-02-25 1996-04-12 2011-02-25
 6 A-007 1990-07-14 1996-08-02 1996-04-12 1996-08-02
 7 A-008 1996-05-20 2008-11-30 1996-04-12 2008-11-30
 8 A-009 1996-01-16 2006-10-29 1996-04-12 2006-10-29
 9 A-010 1996-09-20 2000-02-06 1996-04-12 2000-02-06
10 A-013 1993-12-06 2011-10-07 1996-04-12 2011-10-07
# … with 129 more rows

Extract monthly female reproduction…



start_date <- find_pop_date.observation.first()
end_date <- find_pop_date.observation.last()

create_id_starting.table(sex = "female", clan = "A", lifestage = "adult", from = start_date, to = end_date) %>% 
  mutate(adult_date = fetch_id_date.at.age(ID = ID, age = 2),
         death_date = fetch_id_date.death(ID = ID)) %>% 
  mutate(start_date = case_when(adult_date < start_date ~ start_date,
                                TRUE ~ start_date),
         end_date = case_when(is.na(death_date) ~ end_date,
                              TRUE ~ death_date)) %>% 
  ## Only use months where individual was alive the whole time
  mutate(start_date = lubridate::ceiling_date(start_date, unit = "month"),
         end_date = lubridate::floor_date(end_date, unit = "month"))
# A tibble: 139 × 5
   ID    adult_date death_date start_date end_date  
   <chr> <date>     <date>     <date>     <date>    
 1 A-001 1990-05-21 2007-06-08 1996-05-01 2007-06-01
 2 A-002 1994-10-21 1997-10-21 1996-05-01 1997-10-01
 3 A-003 1991-05-21 2005-01-27 1996-05-01 2005-01-01
 4 A-004 1990-07-13 1997-07-16 1996-05-01 1997-07-01
 5 A-006 1993-07-16 2011-02-25 1996-05-01 2011-02-01
 6 A-007 1990-07-14 1996-08-02 1996-05-01 1996-08-01
 7 A-008 1996-05-20 2008-11-30 1996-05-01 2008-11-01
 8 A-009 1996-01-16 2006-10-29 1996-05-01 2006-10-01
 9 A-010 1996-09-20 2000-02-06 1996-05-01 2000-02-01
10 A-013 1993-12-06 2011-10-07 1996-05-01 2011-10-01
# … with 129 more rows

Extract monthly female reproduction…



start_date <- find_pop_date.observation.first()
end_date <- find_pop_date.observation.last()

ID_data <- create_id_starting.table(sex = "female", clan = "A", lifestage = "adult", from = start_date, to = end_date) %>% 
  mutate(adult_date = fetch_id_date.at.age(ID = ID, age = 2),
         death_date = fetch_id_date.death(ID = ID)) %>% 
  mutate(start_date = case_when(adult_date < start_date ~ start_date,
                                TRUE ~ start_date),
         end_date = case_when(is.na(death_date) ~ end_date,
                              TRUE ~ death_date)) %>% 
  mutate(start_date = lubridate::ceiling_date(start_date, unit = "month"),
         end_date = lubridate::floor_date(end_date, unit = "month")) %>% 
  ## Create one row per month of life...
  reshape_row_date.seq(ID, from = start_date, to = end_date, by = "1 month") %>% 
  group_by(ID) %>% 
  mutate(end_date = lead(date) - 1) %>% 
  ungroup() %>% 
  filter(!is.na(end_date))

Extract monthly female reproduction…



ID_data
# A tibble: 31,174 × 3
   ID    date       end_date  
   <chr> <date>     <date>    
 1 A-001 1996-05-01 1996-05-31
 2 A-001 1996-06-01 1996-06-30
 3 A-001 1996-07-01 1996-07-31
 4 A-001 1996-08-01 1996-08-31
 5 A-001 1996-09-01 1996-09-30
 6 A-001 1996-10-01 1996-10-31
 7 A-001 1996-11-01 1996-11-30
 8 A-001 1996-12-01 1996-12-31
 9 A-001 1997-01-01 1997-01-31
10 A-001 1997-02-01 1997-02-28
# … with 31,164 more rows

Extract monthly female reproduction…



(repro_data <- ID_data %>% 
  mutate(RS = fetch_id_number.offspring(ID, from = date, to = end_date,
                                        age.mature = 6,
                                        unit = "month"),
         repro = RS > 0))
# A tibble: 31,174 × 5
   ID    date       end_date      RS repro
   <chr> <date>     <date>     <int> <lgl>
 1 A-001 1996-05-01 1996-05-31     0 FALSE
 2 A-001 1996-06-01 1996-06-30     0 FALSE
 3 A-001 1996-07-01 1996-07-31     0 FALSE
 4 A-001 1996-08-01 1996-08-31     0 FALSE
 5 A-001 1996-09-01 1996-09-30     0 FALSE
 6 A-001 1996-10-01 1996-10-31     0 FALSE
 7 A-001 1996-11-01 1996-11-30     0 FALSE
 8 A-001 1996-12-01 1996-12-31     2 TRUE 
 9 A-001 1997-01-01 1997-01-31     0 FALSE
10 A-001 1997-02-01 1997-02-28     0 FALSE
# … with 31,164 more rows

…and clan size for Airstrip



tibble(clan = "A") %>% 
  #Expand out for Airstrip over the same dates...
  reshape_row_date.seq(clan,
                       from = ceiling_date(start_date, unit = "month"),
                       to = floor_date(end_date, unit = "month"),
                       by = "month") %>% 
  mutate(end_date = lead(date) - 1) %>% 
  filter(!is.na(end_date))
# A tibble: 314 × 3
   clan  date       end_date  
   <chr> <date>     <date>    
 1 A     1996-05-01 1996-05-31
 2 A     1996-06-01 1996-06-30
 3 A     1996-07-01 1996-07-31
 4 A     1996-08-01 1996-08-31
 5 A     1996-09-01 1996-09-30
 6 A     1996-10-01 1996-10-31
 7 A     1996-11-01 1996-11-30
 8 A     1996-12-01 1996-12-31
 9 A     1997-01-01 1997-01-31
10 A     1997-02-01 1997-02-28
# … with 304 more rows

…and clan size for Airstrip



clan_data <- tibble(clan = "A") %>% 
  reshape_row_date.seq(clan, from = ceiling_date(start_date, unit = "month"), to = floor_date(end_date, unit = "month"), by = "month") %>% 
  mutate(end_date = lead(date) - 1) %>% 
  filter(!is.na(end_date)) %>% 
  mutate(clan_size = fetch_clan_number(clan = clan,
                                       from = date, to = end_date))

…and clan size for Airstrip



clan_data
# A tibble: 314 × 4
   clan  date       end_date   clan_size
   <chr> <date>     <date>         <int>
 1 A     1996-05-01 1996-05-31        42
 2 A     1996-06-01 1996-06-30        44
 3 A     1996-07-01 1996-07-31        44
 4 A     1996-08-01 1996-08-31        44
 5 A     1996-09-01 1996-09-30        41
 6 A     1996-10-01 1996-10-31        43
 7 A     1996-11-01 1996-11-30        44
 8 A     1996-12-01 1996-12-31        47
 9 A     1997-01-01 1997-01-31        49
10 A     1997-02-01 1997-02-28        49
# … with 304 more rows

…and clan size for Airstrip



final_data <- repro_data %>% 
  left_join(clan_data, by = c("date", "end_date"))

Build a model to study the relationship.



mod <- glm(repro ~ clan_size, data = final_data,
           family = "binomial")

summary(mod)

Call:
glm(formula = repro ~ clan_size, family = "binomial", data = final_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.2270  -0.1487  -0.1235  -0.1091   3.2734  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -6.01069    0.18759 -32.042  < 2e-16 ***
clan_size    0.01778    0.00219   8.119 4.69e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3280.9  on 31173  degrees of freedom
Residual deviance: 3218.5  on 31172  degrees of freedom
AIC: 3222.5

Number of Fisher Scoring iterations: 7

Build a model to study the relationship.



Code
pred_data <- data.frame(clan_size = seq(30, 140, 1)) %>% 
  mutate(pred = predict(mod, newdata = ., type = "response"))

plot_data <- final_data %>% 
  mutate(clan_size_10 = (clan_size %/% 10) * 10 + 5) %>% 
  group_by(clan_size_10) %>% 
  summarise(binom::binom.wilson(x = sum(repro), n = n()))

ggplot() +
  geom_errorbar(data = plot_data, aes(x = clan_size_10,
                                      ymin = lower, ymax = upper),
                width = 2.5) +
  geom_point(data = plot_data, aes(x = clan_size_10,
                                   y = mean),
             shape = 21, colour = "white", fill = "grey10",
             size = 2) +
  geom_line(data = pred_data, aes(x = clan_size, y = pred),
            linewidth = 1) +
  labs(y = "Probability of reproduction",
       x = "Clan size",
       title = "Airstrip clan") +
  scale_y_continuous(breaks = seq(0, 1, by = 0.01)) +
  theme_classic() +
  theme(axis.text = element_text(colour = "black", size = 10))