Data science with {hyenaR}:
LESSON 3

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## Some (bonus) plotting


STEP 2: Load the database

load_package_database.full(
  
  # Location of our database file
  db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite"
  
)

Recap: What did we cover last week?

EXAMPLE 1: Find 10 most successful males

create_id_starting.table(
  sex = "male") %>% 
  mutate(RS = fetch_id_number.offspring(ID = ID),
         #Determine the rank of males based on reproductive success
         #Largest RS is ranked 1
         rank = dense_rank(-RS)) %>%
  #Filter top 10 ranked males (allows for ties)
  filter(rank %in% 1:10) %>% 
  arrange(rank)
# A tibble: 27 × 3
   ID       RS  rank
   <chr> <int> <int>
 1 F-146    29     1
 2 A-150    26     2
 3 M-046    22     3
 4 X-024    20     4
 5 L-047    19     5
 6 M-240    19     5
 7 X-017    19     5
 8 A-122    18     6
 9 A-128    18     6
10 E-094    17     7
# … with 17 more rows

EXAMPLE 2: Find clan that produces most successful males on average

create_id_starting.table(
  sex = "male") %>% 
  mutate(RS = fetch_id_number.offspring(ID = ID),
         birth.clan = fetch_id_clan.birth(ID = ID)) %>% 
  filter(birth.clan %in% find_clan_name.all(main.clans = TRUE)) %>% 
  group_by(birth.clan) %>% 
  summarise(meanRS = mean(RS),
            maxRS = max(RS)) %>% 
  arrange(desc(meanRS))
# A tibble: 8 × 3
  birth.clan meanRS maxRS
  <chr>       <dbl> <int>
1 N           1.46     14
2 M           1.42     19
3 A           1.27     26
4 L           1.13     14
5 F           1.13     29
6 E           1.01     17
7 T           0.462     8
8 S           0.357     7

EXAMPLE 3: Reproductive success of uncensored males

## Do the same thing with `create_id_starting table()`
create_id_starting.table(
  
  ## Select all males..
  sex = "male",
  
  ## Only during the observation period
  ## (rather than from first estimated birth or conception)
  from = find_pop_date.observation.first(),
  to = find_pop_date.observation.last(),
  
  ## Males that 'started being cubs' during our focal period
  ## i.e. they were born during this time
  lifestage = "cub", lifestage.overlap = "start",
  
  ## Only individuals born into main clans
  clan = find_clan_name.all(main.clans = TRUE)
  
)

EXAMPLE 3: Reproductive success of uncensored males

# Extract data 'manually' without `create_id_starting.table()` arguments
create_id_starting.table(
  sex = "male"
) %>% 
  #Extract birth date and birth clan
  mutate(birth.date = fetch_id_date.birth(ID = ID),
         birth.clan = fetch_id_clan.birth(ID = ID)) %>% 
  filter(
    #Filter individuals born after study started (i.e. non left censored)
    birth.date > find_pop_date.observation.first(),
    #Filter individuals born in main clans
    birth.clan %in% find_clan_name.all(main.clans = TRUE))

EXAMPLE 3: Reproductive success of uncensored males

create_id_starting.table(sex = "male", lifestage = "cub", clan = find_clan_name.all(main.clans = TRUE), lifestage.overlap = "start", from = find_pop_date.observation.first(), to = find_pop_date.observation.last()) %>% 
  filter(!fetch_id_is.censored.right(ID = ID)) %>% 
  mutate(RS = fetch_id_number.offspring(ID = ID, age.mature = 6, unit = "months"),
         birth.clan = fetch_id_clan.birth(ID = ID)) %>% 
  group_by(birth.clan) %>% 
  summarise(meanRS = mean(RS)) %>% 
  arrange(desc(meanRS))
# A tibble: 8 × 2
  birth.clan meanRS
  <chr>       <dbl>
1 M           1.34 
2 A           1.24 
3 N           1.13 
4 L           1.03 
5 F           0.927
6 E           0.908
7 T           0.388
8 S           0.370

Recap: Any questions after last week?

Today’s goals


GOAL 1: create_id_starting.table()


GOAL 2: fetch_id_rank()


GOAL 3: reshape_row_date.seq()

TASK: Rank of females

EXAMPLE 1: Hierarchy of Airstrip females

We want all females in Airstrip on a single date

# Extract all within `create_id_starting.table()`
create_id_starting.table(
  
  ## Select all females..
  sex = "female",
  
  ## In Airstrip...
  clan = "A",
  
  ## On this date...
  at = "2007-01-01"
  
)
# A tibble: 31 × 1
   ID   
   <chr>
 1 A-001
 2 A-006
 3 A-013
 4 A-015
 5 A-016
 6 A-081
 7 A-102
 8 A-106
 9 A-107
10 A-115
# … with 21 more rows

EXAMPLE 1: Hierarchy of Airstrip females

# Extract data 'manually' without using `create_id_starting.table()` arguments

#Return all individuals (no filtering)
create_id_starting.table() %>% 
  
  #Extract variables for filtering...
  mutate(sex = fetch_id_sex(ID = ID),
         alive = fetch_id_is.alive(ID = ID, at = "2007-01-01"),
         clan = fetch_id_clan.current(ID = ID, at = "2007-01-01")) %>% 
  
  #Filter to just return Airstrip females alive on 2007-01-01
  filter(sex == "female" & alive & clan == "A") %>% 
  
  #Remove filtering columns
  select(ID)
# A tibble: 31 × 1
   ID   
   <chr>
 1 A-001
 2 A-006
 3 A-013
 4 A-015
 5 A-016
 6 A-081
 7 A-102
 8 A-106
 9 A-107
10 A-115
# … with 21 more rows

EXAMPLE 1: Hierarchy of Airstrip females

create_id_starting.table(
  sex = "female",
  clan = "A",
  at = "2007-01-01"
) %>% 
  
  #Find rank of each female (only among females)
  mutate(rank = fetch_id_rank.sex(ID = ID, at = "2007-01-01"))
# A tibble: 31 × 2
   ID     rank
   <chr> <int>
 1 A-001    10
 2 A-006    13
 3 A-013     1
 4 A-015    15
 5 A-016     7
 6 A-081    20
 7 A-102     8
 8 A-106    11
 9 A-107    21
10 A-115    19
# … with 21 more rows

EXAMPLE 1: Hierarchy of Airstrip females

create_id_starting.table(
  sex = "female",
  clan = "A",
  at = "2007-01-01"
) %>% 
  mutate(rank = fetch_id_rank.sex(ID = ID, at = "2007-01-01")) %>% 
  
  #Arrange to show alpha on top
  arrange(rank)
# A tibble: 31 × 2
   ID     rank
   <chr> <int>
 1 A-013     1
 2 A-159     2
 3 A-119     3
 4 A-186     4
 5 A-178     5
 6 A-145     6
 7 A-016     7
 8 A-102     8
 9 A-164     9
10 A-001    10
# … with 21 more rows

TASK: Rank of females

EXAMPLE 2: Change in female rank

We want all females alive in Airstrip over a year

create_id_starting.table(
  
  ## Select all females..
  sex = "female",
  
  ## In Airstrip...
  clan = "A",
  
  ## During all of 2007...
  from = "2007-01-01", to = "2008-01-01",
  
  #Individuals were alive this whole period (alive = !dead)
  lifestage = "!dead", lifestage.overlap = "always" 
)
# A tibble: 16 × 1
   ID   
   <chr>
 1 A-006
 2 A-013
 3 A-016
 4 A-081
 5 A-102
 6 A-106
 7 A-107
 8 A-116
 9 A-119
10 A-129
11 A-132
12 A-139
13 A-140
14 A-145
15 A-164
16 A-178

EXAMPLE 2: Change in female rank

create_id_starting.table(
  
  ## Select all females..
  sex = "female",
  
  ## In Airstrip...
  clan = "A",
  
  ## During all of 2007...
  from = "2007-01-01", to = "2008-01-01",
  
  #Individuals were alive this whole period (alive = !dead)
  lifestage = "!dead", lifestage.overlap = "always" 
  
)

WARNING

This has a bug in the {drat} version, but I am fixing it!!

EXAMPLE 2: Change in female rank

# Extract data 'manually' without using `create_id_starting.table()` arguments

#Return all individuals (no filtering)
create_id_starting.table() %>% 
  
  #Extract variables for filtering...
  mutate(sex = fetch_id_sex(ID = ID),
         alive_07 = fetch_id_is.alive(ID = ID, at = "2007-01-01"),
         alive_08 = fetch_id_is.alive(ID = ID, at = "2008-01-01"),
         clan = fetch_id_clan.current(ID = ID, at = "2007-01-01")) %>% 
  
  #Filter to just return Airstrip females alive on 2007-01-01
  filter(sex == "female" & alive_07 & alive_08 & clan == "A") %>% 
  
  #Remove filtering columns
  select(ID)
# A tibble: 25 × 1
   ID   
   <chr>
 1 A-006
 2 A-013
 3 A-016
 4 A-081
 5 A-102
 6 A-106
 7 A-107
 8 A-116
 9 A-119
10 A-129
# … with 15 more rows

EXAMPLE 2: Change in female rank

create_id_starting.table(
  sex = "female",
  clan = "A",
  from = "2007-01-01", to = "2008-01-01",
  lifestage = "!dead", lifestage.overlap = "always" 
)  %>% 
  
  #Expand our data to include multiple dates for each female
  reshape_row_date.seq(ID, from = "2007-01-01", to = "2008-01-01",
                       by = "year")
# A tibble: 32 × 2
   ID    date      
   <chr> <date>    
 1 A-006 2007-01-01
 2 A-006 2008-01-01
 3 A-013 2007-01-01
 4 A-013 2008-01-01
 5 A-016 2007-01-01
 6 A-016 2008-01-01
 7 A-081 2007-01-01
 8 A-081 2008-01-01
 9 A-102 2007-01-01
10 A-102 2008-01-01
# … with 22 more rows

EXAMPLE 2: Change in female rank

create_id_starting.table(
  sex = "female",
  clan = "A",
  from = "2007-01-01", to = "2008-01-01",
  lifestage = "!dead", lifestage.overlap = "always" 
)  %>% 
  reshape_row_date.seq(ID, from = "2007-01-01", to = "2008-01-01",
                       by = "year") %>% 
  
  #Extract (standardised) rank information for each female on each date
  mutate(rank = fetch_id_rank.sex.std(ID = ID, at = date))
# A tibble: 32 × 3
   ID    date         rank
   <chr> <date>      <dbl>
 1 A-006 2007-01-01 -0.2  
 2 A-006 2008-01-01 -0.143
 3 A-013 2007-01-01  1    
 4 A-013 2008-01-01  1    
 5 A-016 2007-01-01  0.4  
 6 A-016 2008-01-01  0.429
 7 A-081 2007-01-01 -0.9  
 8 A-081 2008-01-01 -0.905
 9 A-102 2007-01-01  0.3  
10 A-102 2008-01-01  0.333
# … with 22 more rows

EXAMPLE 2: Change in female rank

create_id_starting.table(
  sex = "female",
  clan = "A",
  from = "2007-01-01", to = "2008-01-01",
  lifestage = "!dead", lifestage.overlap = "always" 
)  %>% 
  reshape_row_date.seq(ID, from = "2007-01-01", to = "2008-01-01",
                       by = "year") %>% 
  mutate(rank = fetch_id_rank.sex.std(ID = ID, at = date)) %>% 
  
  #Group by/summarise to see how individual ranks changed over the year
  group_by(ID) %>% 
  ## first() and last() are dplyr functions. Useful with group_by()
  summarise(rank_change = last(rank) - first(rank)) %>% 
  arrange(rank_change)
# A tibble: 16 × 2
   ID    rank_change
   <chr>       <dbl>
 1 A-139    -0.114  
 2 A-140    -0.110  
 3 A-119    -0.0857 
 4 A-132    -0.0333 
 5 A-116    -0.0238 
 6 A-081    -0.00476
 7 A-013     0      
 8 A-107     0      
 9 A-178     0.0190 
10 A-145     0.0238 
11 A-016     0.0286 
12 A-102     0.0333 
13 A-164     0.0381 
14 A-106     0.0476 
15 A-129     0.0524 
16 A-006     0.0571 

EXAMPLE 2: Change in female rank

## TIP: DEFINE REPEATED VALUES ONCE
start_date <- "2007-01-01"
end_date   <- "2008-01-01"

create_id_starting.table(
  sex = "female",
  clan = "A",
  from = start_date, to = end_date,
  lifestage = "!dead", lifestage.overlap = "always" 
)  %>% 
  
  #Quicker way!
  mutate(start_rank = fetch_id_rank.sex.std(ID = ID, at = start_date),
         end_rank = fetch_id_rank.sex.std(ID = ID, at = end_date),
         rank_change = end_rank - start_rank) %>% 
  arrange(rank_change)
# A tibble: 16 × 4
   ID    start_rank end_rank rank_change
   <chr>      <dbl>    <dbl>       <dbl>
 1 A-139     -0.6    -0.714     -0.114  
 2 A-140     -0.7    -0.810     -0.110  
 3 A-119      0.8     0.714     -0.0857 
 4 A-132     -0.3    -0.333     -0.0333 
 5 A-116     -0.5    -0.524     -0.0238 
 6 A-081     -0.9    -0.905     -0.00476
 7 A-013      1       1          0      
 8 A-107     -1      -1          0      
 9 A-178      0.6     0.619      0.0190 
10 A-145      0.5     0.524      0.0238 
11 A-016      0.4     0.429      0.0286 
12 A-102      0.3     0.333      0.0333 
13 A-164      0.2     0.238      0.0381 
14 A-106      0       0.0476     0.0476 
15 A-129     -0.100  -0.0476     0.0524 
16 A-006     -0.2    -0.143      0.0571 

TASK: Rank of females

BONUS: Plotting rank change over time

We want all females that were alive in Shamba for at least 1 day

create_id_starting.table(
  sex = "female",
  clan = "S",
  
  ## During this period...
  from = "2000-01-01", to = "2002-01-01",
  
  #Individuals could be alive at any point
  lifestage = "!dead", lifestage.overlap = "any" 
)
# A tibble: 10 × 1
   ID   
   <chr>
 1 A-008
 2 A-018
 3 A-019
 4 S-082
 5 S-083
 6 S-084
 7 S-086
 8 S-089
 9 S-090
10 S-092

BONUS: Plotting rank change over time

create_id_starting.table(
  sex = "female",
  clan = "S",
  from = "2000-01-01", to = "2002-01-01",
  lifestage = "!dead", lifestage.overlap = "any" 
)  %>% 
  
  #Expand our data to include multiple dates for each female
  reshape_row_date.seq(ID, from = "2000-01-01", to = "2002-01-01",
                       by = "month")
# A tibble: 250 × 2
   ID    date      
   <chr> <date>    
 1 A-008 2000-01-01
 2 A-008 2000-02-01
 3 A-008 2000-03-01
 4 A-008 2000-04-01
 5 A-008 2000-05-01
 6 A-008 2000-06-01
 7 A-008 2000-07-01
 8 A-008 2000-08-01
 9 A-008 2000-09-01
10 A-008 2000-10-01
# … with 240 more rows

BONUS: Plotting rank change over time

(create_id_starting.table(
  sex = "female",
  clan = "S",
  from = "2000-01-01", to = "2002-01-01",
  lifestage = "!dead", lifestage.overlap = "any" 
)  %>% 
  reshape_row_date.seq(ID, from = "2000-01-01", to = "2002-01-01",
                       by = "month") %>% 
  
  #Determine standardised rank at each date
  mutate(rank = fetch_id_rank.sex.std(ID = ID, at = date)) %>% 
  #Only include cases where rank was returned
  #(i.e. exclude when they were not alive)
  filter(!is.na(rank)) -> plot_data)
# A tibble: 130 × 3
   ID    date         rank
   <chr> <date>      <dbl>
 1 A-008 2000-01-01  0    
 2 A-008 2000-02-01  0    
 3 A-008 2000-03-01 -0.333
 4 A-008 2000-04-01 -0.333
 5 A-008 2000-05-01 -0.333
 6 A-008 2000-06-01 -0.333
 7 A-008 2000-07-01 -0.333
 8 A-008 2000-08-01 -0.333
 9 A-008 2000-09-01 -0.333
10 A-008 2000-10-01 -0.333
# … with 120 more rows

BONUS: Plotting rank change over time

Code
last_record <- plot_data %>% 
  group_by(ID) %>% 
  slice(n())

ggplot() +
  geom_line(data = plot_data,
            aes(x = date, y = rank, group = ID, colour = ID), size = 1) +
  geom_text(data = last_record,
            aes(x = date + 50, y = rank, label = ID, colour = ID), size = 3) +
  labs(x = "", y = "Standardized rank of females (Shamba)") +
  scale_x_date(date_labels = "%b-%y", date_breaks = "2 month") +
  coord_cartesian(clip = "off", xlim = c(as.Date(NA), as.Date("2002-01-01"))) +
  theme_classic() +
  theme(legend.position = "none",
        plot.margin = margin(r = 40, l = 20, t = 20))

HOMEWORK: Find the oldest individual in each main clan on 2000-01-01