Data science with {hyenaR}:
LESSON 13

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## For plotting
library(lubridate) ## Working with dates
library(tidyr) ## Extra data wrangling functions
library(stringr) ## Working with text
library(waldo) ## To compare objects
library(skimr) ## Inspect data


STEP 2: Load the database

load_package_database.full(
  
  # Location of our database file
  db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite"
  
)

Today’s goals

GOAL 1: 📝HOMEWORK: Lesson 10.

TASK 1:


Create a data frame of all females.

create_id_starting.table(sex = "female")
# A tibble: 1,211 × 1
   ID   
   <chr>
 1 A-001
 2 A-002
 3 A-003
 4 A-004
 5 A-006
 6 A-007
 7 A-008
 8 A-009
 9 A-010
10 A-013
# … with 1,201 more rows

TASK 1:


Use group_by()/summarise() to show how many individuals are left or right censored.

create_id_starting.table(sex = "female") %>% 
  # Find which are left and right censored
  mutate(left_censor = fetch_id_is.censored.left(ID),
         right_censor = fetch_id_is.censored.right(ID)) %>% 
  #Group by and count
  group_by(left_censor, right_censor) %>% 
  count()
# A tibble: 3 × 3
# Groups:   left_censor, right_censor [3]
  left_censor right_censor     n
  <lgl>       <lgl>        <int>
1 FALSE       FALSE          901
2 FALSE       TRUE           213
3 TRUE        FALSE           97

TASK 1:


Filter only un-censored individuals.

create_id_starting.table(sex = "female") %>% 
  # Filter uncensored individuals
  filter(!fetch_id_is.censored(ID))
# A tibble: 901 × 1
   ID   
   <chr>
 1 A-060
 2 A-080
 3 A-081
 4 A-088
 5 A-096
 6 A-098
 7 A-101
 8 A-102
 9 A-104
10 A-105
# … with 891 more rows

TASK 2:


Extract lifetime reproductive success of females (consider only offspring that reach atleast 18 months old).

create_id_starting.table(sex = "female") %>% 
  filter(!fetch_id_is.censored(ID)) %>% 
  # Use age.mature and unit to find individuals
  mutate(LRS = fetch_id_number.offspring(ID, age.mature = 18, unit = "month"))
# A tibble: 901 × 2
   ID      LRS
   <chr> <int>
 1 A-060     0
 2 A-080     4
 3 A-081     7
 4 A-088     0
 5 A-096     0
 6 A-098     0
 7 A-101     0
 8 A-102     6
 9 A-104     0
10 A-105     1
# … with 891 more rows

TASK 2:


Extract standardised rank within females at the date of adulthood (i.e. when they were 2 years old).

create_id_starting.table(sex = "female") %>% 
  filter(!fetch_id_is.censored(ID)) %>% 
  mutate(LRS = fetch_id_number.offspring(ID, age.mature = 18, unit = "month"),
         #Extract standardised rank within sexes
         #(i.e. just comparing females to other females)
         rank = fetch_id_rank.sex.std(ID,
                                      at = fetch_id_date.at.age(ID, age = 2)))
# A tibble: 901 × 3
   ID      LRS   rank
   <chr> <int>  <dbl>
 1 A-060     0 NA    
 2 A-080     4  0.538
 3 A-081     7 -1    
 4 A-088     0 NA    
 5 A-096     0 NA    
 6 A-098     0 NA    
 7 A-101     0  0    
 8 A-102     6  0.231
 9 A-104     0  0.429
10 A-105     1 -0.2  
# … with 891 more rows

TASK 2:


Inspect the data. Are there any individuals you might exclude?

output <- create_id_starting.table(sex = "female") %>% 
  filter(!fetch_id_is.censored(ID)) %>% 
  mutate(LRS = fetch_id_number.offspring(ID, age.mature = 18, unit = "month"),
         rank = fetch_id_rank.sex.std(ID, at = fetch_id_date.at.age(ID, age = 2)))

#Check for NAs
any(is.na(output$LRS))
[1] FALSE
any(is.na(output$rank))
[1] TRUE

TASK 2:


Use {skimr} package to inspect data!

NOTE We need to look in RStudio because it doesn’t work well on the slides!

skim(output)

TASK 3:


Create a new column with rank category (top rank >0 or low ranking <=0). HINT: You might need to use ifelse.

output %>% 
  filter(!is.na(rank)) %>% 
  #Create high and low category
  mutate(rank_cat = ifelse(rank > 0, "high", "low"))
# A tibble: 456 × 4
   ID      LRS    rank rank_cat
   <chr> <int>   <dbl> <chr>   
 1 A-080     4  0.538  high    
 2 A-081     7 -1      low     
 3 A-101     0  0      low     
 4 A-102     6  0.231  high    
 5 A-104     0  0.429  high    
 6 A-105     1 -0.2    low     
 7 A-106    10 -0.333  low     
 8 A-107     4  0.0588 high    
 9 A-108     0 -0.0588 low     
10 A-110     0 -0.556  low     
# … with 446 more rows

TASK 3:


What is the mean and SE of lifetime reproductive success in each group? HINT: Remember SE = \(\frac{sd}{\sqrt N}\)

output %>% 
  filter(!is.na(rank)) %>% 
  mutate(rank_cat = ifelse(rank > 0, "high", "low")) %>% 
  #Group-by summarise
  group_by(rank_cat) %>% 
  summarise(mean = mean(LRS),
            se = sd(LRS)/sqrt(n()))
# A tibble: 2 × 3
  rank_cat  mean    se
  <chr>    <dbl> <dbl>
1 high      2.21 0.179
2 low       1.28 0.167

TASK 3:


BONUS: Look at the help for the case_when() function. Can you use this to make 3 groups instead of 2?

output %>% 
  filter(!is.na(rank)) %>% 
  #Create 3 categories!
  mutate(rank_cat = ifelse(rank > 0.33, "rankgrp1",
                           ifelse(rank <= 0.33 & rank > -0.33,
                                  "rankgrp2",
                                  "rankgrp3"))) %>% 
  #Group-by/summarise
  group_by(rank_cat) %>% 
  summarise(mean = mean(LRS),
            se = sd(LRS)/sqrt(n()))
# A tibble: 3 × 3
  rank_cat  mean    se
  <chr>    <dbl> <dbl>
1 rankgrp1  2.30 0.213
2 rankgrp2  1.81 0.255
3 rankgrp3  1.13 0.174

TASK 3:


BONUS: Look at the help for the case_when() function. Can you use this to make 3 groups instead of 2?

output %>% 
  filter(!is.na(rank)) %>% 
  #Do the same thing with case_when()
  mutate(rank_cat = case_when(rank > 0.33 ~ "rankgrp1",
                              rank <= 0.33 & rank > -0.33 ~ "rankgrp2",
                              rank <= -0.33 ~ "rankgrp3")) %>% 
  #group-by/summarise
  group_by(rank_cat) %>% 
  summarise(mean = mean(LRS),
            se = sd(LRS)/sqrt(n()))
# A tibble: 3 × 3
  rank_cat  mean    se
  <chr>    <dbl> <dbl>
1 rankgrp1  2.30 0.213
2 rankgrp2  1.81 0.255
3 rankgrp3  1.13 0.174

TASK 3:


BONUS: Look at the help for the case_when() function. Can you use this to make 3 groups instead of 2?

output %>% 
  filter(!is.na(rank)) %>% 
  mutate(rank_cat = case_when(rank > 0.33 ~ "rankgrp1",
                              rank <= 0.33 & rank > -0.33 ~ "rankgrp2",
                              #Everything else that doesn't match others
                              TRUE ~ "rankgrp3")) %>% 
  group_by(rank_cat) %>% 
  summarise(mean = mean(LRS),
            se = sd(LRS)/sqrt(n()))
# A tibble: 3 × 3
  rank_cat  mean    se
  <chr>    <dbl> <dbl>
1 rankgrp1  2.30 0.213
2 rankgrp2  1.81 0.255
3 rankgrp3  1.13 0.174

TASK 3:


BONUS: Look at the help for the case_when() function. Can you use this to make 3 groups instead of 2?

# Be careful using the TRUE group. It will include *anything* that doesn't match other criteria
output %>% 
  mutate(birthclan = fetch_id_clan.birth(ID),
         clan_grp = case_when(birthclan == "X" ~ "Outside",
                              TRUE ~ "Main")) %>% 
  filter(clan_grp == "Main") %>% 
  group_by(birthclan) %>% 
  count()
# A tibble: 11 × 2
# Groups:   birthclan [11]
   birthclan     n
   <chr>     <int>
 1 A           170
 2 B             1
 3 E            74
 4 F            84
 5 L           167
 6 M           147
 7 N            84
 8 R             2
 9 S           100
10 T            50
11 U             6

TASK 3:


BONUS: Look at the help for the case_when() function. Can you use this to make 3 groups instead of 2?

# Be careful *without* TRUE. Everything else is NA!
output %>% 
  mutate(birthclan = fetch_id_clan.birth(ID),
         clan_grp = case_when(birthclan %in% find_clan_name.all(main.clans = TRUE) ~ "Main")) %>% 
  group_by(clan_grp) %>% 
  count()
# A tibble: 2 × 2
# Groups:   clan_grp [2]
  clan_grp     n
  <chr>    <int>
1 Main       876
2 <NA>        25

BONUS: Plotting

Code
library(ggdist)
plot_data <- output %>% 
  filter(!is.na(rank)) %>% 
  mutate(rank_cat = case_when(rank > 0.33 ~ "rankgrp1",
                              rank <= 0.33 & rank > -0.33 ~ "rankgrp2",
                              #Everything else that doesn't match others
                              TRUE ~ "rankgrp3"))

plot_data_summary <- plot_data %>% 
  group_by(rank_cat) %>% 
  summarise(mean = mean(LRS),
            se = sd(LRS)/sqrt(n()))

ggplot(data = plot_data_summary) +
  geom_col(aes(x = rank_cat, y = mean)) +
  geom_errorbar(aes(x = rank_cat, ymin = mean - se, ymax = mean + se), width = 0.2) +
  scale_x_discrete(labels = c("High", "Medium", "Low"), name = "") +
  scale_y_continuous(name = "Lifetime reproductive success (mean +- SE)") +
  theme_classic()