Data science with {hyenaR}:
LESSON 4

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## For plotting
library(lubridate) ## NEW PACKAGE: Working with dates!
library(tidyr) ## NEW PACKAGE: Extra data wrangling functions!


STEP 2: Load the database

load_package_database.full(
  
  # Location of our database file
  db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite"
  
)

Recap: What did we cover last week?

EXAMPLE 1: Hierarchy of Airstrip females

create_id_starting.table(
  sex = "female",
  clan = "A",
  at = "2007-01-01"
) %>% 
  mutate(rank = fetch_id_rank.sex(ID = ID, at = "2007-01-01")) %>% 
  arrange(rank)
# A tibble: 31 × 2
   ID     rank
   <chr> <int>
 1 A-013     1
 2 A-159     2
 3 A-119     3
 4 A-186     4
 5 A-178     5
 6 A-145     6
 7 A-016     7
 8 A-102     8
 9 A-164     9
10 A-001    10
# … with 21 more rows
# ℹ Use `print(n = ...)` to see more rows

EXAMPLE 2: Change in female rank

## TIP: DEFINE REPEATED VALUES ONCE
start_date <- "2007-01-01"
end_date   <- "2008-01-01"

create_id_starting.table(
  sex = "female",
  clan = "A",
  from = start_date, to = end_date,
  lifestage = "!dead", lifestage.overlap = "always" 
)  %>% 
  mutate(start_rank = fetch_id_rank.sex.std(ID = ID, at = start_date),
         end_rank = fetch_id_rank.sex.std(ID = ID, at = end_date),
         rank_change = end_rank - start_rank) %>% 
  arrange(rank_change)
# A tibble: 16 × 4
   ID    start_rank end_rank rank_change
   <chr>      <dbl>    <dbl>       <dbl>
 1 A-139     -0.6    -0.714     -0.114  
 2 A-140     -0.7    -0.810     -0.110  
 3 A-119      0.8     0.714     -0.0857 
 4 A-132     -0.3    -0.333     -0.0333 
 5 A-116     -0.5    -0.524     -0.0238 
 6 A-081     -0.9    -0.905     -0.00476
 7 A-013      1       1          0      
 8 A-107     -1      -1          0      
 9 A-178      0.6     0.619      0.0190 
10 A-145      0.5     0.524      0.0238 
11 A-016      0.4     0.429      0.0286 
12 A-102      0.3     0.333      0.0333 
13 A-164      0.2     0.238      0.0381 
14 A-106      0       0.0476     0.0476 
15 A-129     -0.100  -0.0476     0.0524 
16 A-006     -0.2    -0.143      0.0571 

EXAMPLE 3: Plot change in rank over time

create_id_starting.table(
  sex = "female",
  clan = "S",
  from = "2000-01-01", to = "2002-01-01",
  lifestage = "!dead", lifestage.overlap = "any" 
)  %>% 
  reshape_row_date.seq(ID, from = "2000-01-01", to = "2002-01-01",
                       by = "month") %>% 
  mutate(rank = fetch_id_rank.sex.std(ID = ID, at = date)) %>% 
  filter(!is.na(rank))
# A tibble: 130 × 3
   ID    date         rank
   <chr> <date>      <dbl>
 1 A-008 2000-01-01  0    
 2 A-008 2000-02-01  0    
 3 A-008 2000-03-01 -0.333
 4 A-008 2000-04-01 -0.333
 5 A-008 2000-05-01 -0.333
 6 A-008 2000-06-01 -0.333
 7 A-008 2000-07-01 -0.333
 8 A-008 2000-08-01 -0.333
 9 A-008 2000-09-01 -0.333
10 A-008 2000-10-01 -0.333
# … with 120 more rows
# ℹ Use `print(n = ...)` to see more rows

EXAMPLE 3: Plot change in rank over time

Code
plot_data <- create_id_starting.table(
  sex = "female",
  clan = "S",
  from = "2000-01-01", to = "2002-01-01",
  lifestage = "!dead", lifestage.overlap = "any" 
)  %>% 
  reshape_row_date.seq(ID, from = "2000-01-01", to = "2002-01-01",
                       by = "month") %>% 
  mutate(rank = fetch_id_rank.sex.std(ID = ID, at = date)) %>% 
  filter(!is.na(rank))

last_record <- plot_data %>% 
  group_by(ID) %>% 
  slice(n())

ggplot() +
  geom_line(data = plot_data,
            aes(x = date, y = rank, group = ID, colour = ID), size = 1) +
  geom_text(data = last_record,
            aes(x = date + 50, y = rank, label = ID, colour = ID), size = 3) +
  labs(x = "", y = "Standardized rank of females (Shamba)") +
  scale_x_date(date_labels = "%b-%y", date_breaks = "2 month") +
  coord_cartesian(clip = "off", xlim = c(as.Date(NA), as.Date("2002-01-01"))) +
  theme_classic() +
  theme(legend.position = "none",
        plot.margin = margin(r = 40, l = 20, t = 20))

Recap: Any questions after last week?

Today’s goals


GOAL 1: HOMEWORK - Find the oldest individual in each clan


GOAL 2: BONUS HOMEWORK - Size of clans over time


GOAL 3: {lubridate}


GOAL 4: {tidyr}

HOMEWORK: Find the oldest individual in each clan

Our attempts

OLIVER

Code
date <- "2000-01-01"
create_id_starting.table(
  clan = find_clan_name.all(main.clans = TRUE),
  at = date
) %>%
  mutate(age = fetch_id_age(ID = ID, at = date),
         clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age)) %>%
  select(-age)
# A tibble: 8 × 2
# Groups:   clan [8]
  ID    clan 
  <chr> <chr>
1 A-001 A    
2 E-004 E    
3 F-001 F    
4 L-005 L    
5 M-012 M    
6 N-003 N    
7 S-040 S    
8 T-002 T    

EVE

Code
date <- "2000-01-01"
create_id_starting.table(at = date, lifestage = "adult", clan = find_clan_name.all(main.clans = TRUE)) %>% 
  mutate(clan = fetch_id_clan.current(ID = ID, at = date), age = fetch_id_age(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age))
# A tibble: 8 × 3
# Groups:   clan [8]
  ID    clan    age
  <chr> <chr> <dbl>
1 A-001 A     11.6 
2 E-004 E     13.8 
3 F-001 F     11.5 
4 L-005 L     13.7 
5 M-012 M     16.6 
6 N-003 N     12.5 
7 S-040 S      9.72
8 T-002 T     10.9 

ARJUN

Code
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), at = "2000-01-01") %>%  
  mutate(Age = fetch_id_age(ID=ID, at="2000-01-01"),
         Clan = fetch_id_clan.current(ID = ID, at="2000-01-01")) %>%
  group_by(Clan) %>%
  filter(Age == max(Age))
# A tibble: 8 × 3
# Groups:   Clan [8]
  ID      Age Clan 
  <chr> <dbl> <chr>
1 A-001 11.6  A    
2 E-004 13.8  E    
3 F-001 11.5  F    
4 L-005 13.7  L    
5 M-012 16.6  M    
6 N-003 12.5  N    
7 S-040  9.72 S    
8 T-002 10.9  T    

Our attempts

OLIVER

date <- "2000-01-01"
create_id_starting.table(
  clan = find_clan_name.all(main.clans = TRUE),
  at = date
) %>%
  mutate(age = fetch_id_age(ID = ID, at = date),
         clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age)) %>%
  select(-age)
# A tibble: 8 × 2
# Groups:   clan [8]
  ID    clan 
  <chr> <chr>
1 A-001 A    
2 E-004 E    
3 F-001 F    
4 L-005 L    
5 M-012 M    
6 N-003 N    
7 S-040 S    
8 T-002 T    

Our attempts

EVE

date <- "2000-01-01"
create_id_starting.table(
  at = date,
  lifestage = "adult",
  clan = find_clan_name.all(main.clans = TRUE)
) %>%
  mutate(clan = fetch_id_clan.current(ID = ID, at = date),
         age = fetch_id_age(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age))
# A tibble: 8 × 3
# Groups:   clan [8]
  ID    clan    age
  <chr> <chr> <dbl>
1 A-001 A     11.6 
2 E-004 E     13.8 
3 F-001 F     11.5 
4 L-005 L     13.7 
5 M-012 M     16.6 
6 N-003 N     12.5 
7 S-040 S      9.72
8 T-002 T     10.9 

Our attempts

ARJUN

create_id_starting.table(
  clan = find_clan_name.all(main.clans = TRUE),
  at = "2000-01-01"
) %>%  
  mutate(Age = fetch_id_age(ID=ID, at="2000-01-01"),
         Clan = fetch_id_clan.current(ID = ID, at="2000-01-01")) %>%
  group_by(Clan) %>%
  filter(Age == max(Age))
# A tibble: 8 × 3
# Groups:   Clan [8]
  ID      Age Clan 
  <chr> <dbl> <chr>
1 A-001 11.6  A    
2 E-004 13.8  E    
3 F-001 11.5  F    
4 L-005 13.7  L    
5 M-012 16.6  M    
6 N-003 12.5  N    
7 S-040  9.72 S    
8 T-002 10.9  T    

My solution (v1.0)

date <- "2000-01-01"
create_id_starting.table(
  clan = find_clan_name.all(main.clans = TRUE),
  at = date,
  lifestage = "adult"
) %>%
  ## TIP: I prefer lower-case names to minimise typos
  ## but consistency is key!
  mutate(age = fetch_id_age(ID = ID, at = date),
         clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age)) %>% 
  ## TIP: UNGROUP DATA ONCE YOU'RE FINISHED
  ungroup()
# A tibble: 8 × 3
  ID      age clan 
  <chr> <dbl> <chr>
1 A-001 11.6  A    
2 E-004 13.8  E    
3 F-001 11.5  F    
4 L-005 13.7  L    
5 M-012 16.6  M    
6 N-003 12.5  N    
7 S-040  9.72 S    
8 T-002 10.9  T    

My solution (v1.0)

Grouped data can have unexpected behaviour!

GROUPED

date <- "2000-01-01"
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), at = date, lifestage = "adult") %>%
  mutate(age = fetch_id_age(ID = ID, at = date), clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age))
# A tibble: 8 × 3
# Groups:   clan [8]
  ID      age clan 
  <chr> <dbl> <chr>
1 A-001 11.6  A    
2 E-004 13.8  E    
3 F-001 11.5  F    
4 L-005 13.7  L    
5 M-012 16.6  M    
6 N-003 12.5  N    
7 S-040  9.72 S    
8 T-002 10.9  T    

UNGROUPED

date <- "2000-01-01"
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), at = date, lifestage = "adult") %>%
  mutate(age = fetch_id_age(ID = ID, at = date), clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age)) %>% 
  ungroup()
# A tibble: 8 × 3
  ID      age clan 
  <chr> <dbl> <chr>
1 A-001 11.6  A    
2 E-004 13.8  E    
3 F-001 11.5  F    
4 L-005 13.7  L    
5 M-012 16.6  M    
6 N-003 12.5  N    
7 S-040  9.72 S    
8 T-002 10.9  T    

My solution (v1.0)

Grouped data can have unexpected behaviour!

GROUPED

date <- "2000-01-01"
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), at = date, lifestage = "adult") %>%
  mutate(age = fetch_id_age(ID = ID, at = date), clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age)) %>% 
  summarise(sd_maxage = sd(age))
# A tibble: 8 × 2
  clan  sd_maxage
  <chr>     <dbl>
1 A            NA
2 E            NA
3 F            NA
4 L            NA
5 M            NA
6 N            NA
7 S            NA
8 T            NA

UNGROUPED

date <- "2000-01-01"
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), at = date, lifestage = "adult") %>%
  mutate(age = fetch_id_age(ID = ID, at = date), clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age)) %>% 
  ungroup() %>% 
  summarise(sd_maxage = sd(age))
# A tibble: 1 × 1
  sd_maxage
      <dbl>
1      2.14

My solution (v2.0)

date <- "2000-01-01"
create_id_starting.table(
  clan = find_clan_name.all(main.clans = TRUE),
  at = date,
  lifestage = "adult"
) %>%
  mutate(age = fetch_id_age(ID = ID, at = date),
         clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  ## TIP: WE CAN ALSO USE SUMMARISE TO UNGROUP
  summarise(ID = ID[age == max(age)],
            age = max(age),
            #Ungroup afterwards (this is default)
            .groups = "drop")
# A tibble: 8 × 3
  clan  ID      age
  <chr> <chr> <dbl>
1 A     A-001 11.6 
2 E     E-004 13.8 
3 F     F-001 11.5 
4 L     L-005 13.7 
5 M     M-012 16.6 
6 N     N-003 12.5 
7 S     S-040  9.72
8 T     T-002 10.9 

My solution (v2.0)

date <- "2000-01-01"
create_id_starting.table(
  clan = find_clan_name.all(main.clans = TRUE),
  at = date,
  lifestage = "adult"
) %>%
  mutate(age = fetch_id_age(ID = ID, at = date),
         clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  ## TIP: WE CAN ALSO USE SUMMARISE TO UNGROUP
  summarise(oldest_adult = ID[age == max(age)],
            maxage = max(age),
            ## Using summarise allows us to add more info if needed
            youngest_adult = ID[age == min(age)],
            minage = min(age),
            .groups = "drop")
# A tibble: 8 × 5
  clan  oldest_adult maxage youngest_adult minage
  <chr> <chr>         <dbl> <chr>           <dbl>
1 A     A-001         11.6  A-114            2.36
2 E     E-004         13.8  E-006            2.50
3 F     F-001         11.5  F-080            2.14
4 L     L-005         13.7  L-108            2.00
5 M     M-012         16.6  M-115            2.34
6 N     N-003         12.5  N-089            2.01
7 S     S-040          9.72 S-080            3.54
8 T     T-002         10.9  T-014            2.31

BONUS HOMEWORK: Size of clans over time

Our attempts

OLIVER

as_tibble(x = seq(as.Date("1996-07-01"), length.out = 10, by = "6 month")) %>%
  mutate(A = fetch_clan_number.anysex.all(clan = "A", at = value),
         E = fetch_clan_number.anysex.all(clan = "E", at = value),
         F = fetch_clan_number.anysex.all(clan = "F", at = value),
         L = fetch_clan_number.anysex.all(clan = "L", at = value),
         M = fetch_clan_number.anysex.all(clan = "M", at = value),
         N = fetch_clan_number.anysex.all(clan = "N", at = value),
         S = fetch_clan_number.anysex.all(clan = "S", at = value),
         T = fetch_clan_number.anysex.all(clan = "T", at = value))
# A tibble: 10 × 9
   value          A     E     F     L     M     N     S     T
   <date>     <int> <int> <int> <int> <int> <int> <int> <int>
 1 1996-07-01    44    13    16    38    42    17    11    13
 2 1997-01-01    47    13    12    34    31    15     7    13
 3 1997-07-01    45    16    12    43    49    17     4    10
 4 1998-01-01    36    18    14    42    41    17     4    12
 5 1998-07-01    40    18    15    44    47    17     5    13
 6 1999-01-01    37    22    16    48    43    15    11    15
 7 1999-07-01    38    23    18    50    50    17    11    15
 8 2000-01-01    45    23    24    50    52    20    13    16
 9 2000-07-01    46    23    21    56    54    21    15    20
10 2001-01-01    48    29    25    57    56    24    20    21

My solution

tibble(clan = find_clan_name.all(main.clans = TRUE)) %>%
  reshape_row_date.seq(clan, from = "1997-07-01", length.out = 10, by = "6 months")
# A tibble: 80 × 2
   clan  date      
   <chr> <date>    
 1 A     1997-07-01
 2 A     1998-01-01
 3 A     1998-07-01
 4 A     1999-01-01
 5 A     1999-07-01
 6 A     2000-01-01
 7 A     2000-07-01
 8 A     2001-01-01
 9 A     2001-07-01
10 A     2002-01-01
# … with 70 more rows
# ℹ Use `print(n = ...)` to see more rows

My solution

tibble(clan = find_clan_name.all(main.clans = TRUE)) %>%
  reshape_row_date.seq(clan, from = "1997-07-01", length.out = 10, by = "6 months") %>%
  mutate(
    #Use {lubridate} package so that we can manipulate dates better
    #I prefer to calculate pop/clan size over an interval...
    to = date + months(6) - lubridate::days(1),
    clan_size = fetch_clan_number.anysex.all(clan = clan, from = date, to = to)
  )
# A tibble: 80 × 4
   clan  date       to         clan_size
   <chr> <date>     <date>         <int>
 1 A     1997-07-01 1997-12-31        49
 2 A     1998-01-01 1998-06-30        41
 3 A     1998-07-01 1998-12-31        44
 4 A     1999-01-01 1999-06-30        40
 5 A     1999-07-01 1999-12-31        46
 6 A     2000-01-01 2000-06-30        51
 7 A     2000-07-01 2000-12-31        51
 8 A     2001-01-01 2001-06-30        57
 9 A     2001-07-01 2001-12-31        69
10 A     2002-01-01 2002-06-30        65
# … with 70 more rows
# ℹ Use `print(n = ...)` to see more rows

My solution

tibble(clan = find_clan_name.all(main.clans = TRUE)) %>%
  reshape_row_date.seq(clan, from = "1997-07-01", length.out = 10, by = "6 months") %>%
  mutate(
    to = date + months(6) - lubridate::days(1),
    clan_size = fetch_clan_number.anysex.all(clan = clan, from = date, to = to)
  ) %>% 
  #Use {tidyr} to make this the same as Oliver's data
  tidyr::pivot_wider(names_from = clan, values_from = clan_size)
# A tibble: 10 × 10
   date       to             A     E     F     L     M     N     S     T
   <date>     <date>     <int> <int> <int> <int> <int> <int> <int> <int>
 1 1997-07-01 1997-12-31    49    18    14    48    52    20     4    12
 2 1998-01-01 1998-06-30    41    18    15    49    50    19     5    13
 3 1998-07-01 1998-12-31    44    24    18    51    53    21    13    16
 4 1999-01-01 1999-06-30    40    24    20    57    58    17    11    15
 5 1999-07-01 1999-12-31    46    24    25    54    54    22    13    17
 6 2000-01-01 2000-06-30    51    27    26    58    63    23    15    20
 7 2000-07-01 2000-12-31    51    32    27    63    64    24    20    25
 8 2001-01-01 2001-06-30    57    30    26    69    65    27    25    22
 9 2001-07-01 2001-12-31    69    44    30    83    69    30    33    23
10 2002-01-01 2002-06-30    65    44    30    86    67    25    28    24

Pivotting functions

Wide vs. long formats

WIDE FORMAT

  • Each row should have one unique value (e.g. date)
  • Each variable is its own column
  • Common for storing/recording data
  • Has many columns but fewer rows
  • Less suited for {ggplot2} and {tidyverse} functions

Wide vs. long formats

WIDE FORMAT

Code
tibble(clan = find_clan_name.all(main.clans = TRUE)) %>%
  reshape_row_date.seq(clan, from = "1997-07-01", length.out = 10, by = "6 months") %>%
  mutate(
    to = date + months(6) - lubridate::days(1),
    clan_size = fetch_clan_number.anysex.all(clan = clan, from = date, to = to)
  ) %>% 
  #Use {tidyr} to make this the same as Oliver's data
  tidyr::pivot_wider(names_from = clan, values_from = clan_size) %>% 
  head(1)
# A tibble: 1 × 10
  date       to             A     E     F     L     M     N     S     T
  <date>     <date>     <int> <int> <int> <int> <int> <int> <int> <int>
1 1997-07-01 1997-12-31    49    18    14    48    52    20     4    12

Wide vs. long formats

LONG FORMAT

  • Each row should have one unique combination of values (e.g. date, clan)
  • Each measurement is its own row
  • Has many rows but fewer columns
  • Better suited for {ggplot2} and {tidyverse} functions

Wide vs. long formats

LONG FORMAT

Code
tibble(clan = find_clan_name.all(main.clans = TRUE)) %>%
  reshape_row_date.seq(clan, from = "1997-07-01", length.out = 10, by = "6 months") %>%
  mutate(
    to = date + months(6) - lubridate::days(1),
    clan_size = fetch_clan_number.anysex.all(clan = clan, from = date, to = to)
  ) %>% 
  filter(date == "1997-07-01") %>% 
  head(8)
# A tibble: 8 × 4
  clan  date       to         clan_size
  <chr> <date>     <date>         <int>
1 A     1997-07-01 1997-12-31        49
2 E     1997-07-01 1997-12-31        18
3 F     1997-07-01 1997-12-31        14
4 L     1997-07-01 1997-12-31        48
5 M     1997-07-01 1997-12-31        52
6 N     1997-07-01 1997-12-31        20
7 S     1997-07-01 1997-12-31         4
8 T     1997-07-01 1997-12-31        12

Wide vs. long formats

LONG FORMAT

as_tibble(x = seq(as.Date("1996-07-01"), length.out = 10, by = "6 month")) %>%
  mutate(A = fetch_clan_number.anysex.all(clan = "A", at = value), E = fetch_clan_number.anysex.all(clan = "E", at = value),
         F = fetch_clan_number.anysex.all(clan = "F", at = value), L = fetch_clan_number.anysex.all(clan = "L", at = value),
         M = fetch_clan_number.anysex.all(clan = "M", at = value), N = fetch_clan_number.anysex.all(clan = "N", at = value),
         S = fetch_clan_number.anysex.all(clan = "S", at = value), T = fetch_clan_number.anysex.all(clan = "T", at = value)) %>% 
  # Use {tidyr} to convert Oliver's data to long format
  tidyr::pivot_longer(cols = A:T,
                      names_to = "clan",
                      values_to = "clan_size")
# A tibble: 80 × 3
   value      clan  clan_size
   <date>     <chr>     <int>
 1 1996-07-01 A            44
 2 1996-07-01 E            13
 3 1996-07-01 F            16
 4 1996-07-01 L            38
 5 1996-07-01 M            42
 6 1996-07-01 N            17
 7 1996-07-01 S            11
 8 1996-07-01 T            13
 9 1997-01-01 A            47
10 1997-01-01 E            13
# … with 70 more rows
# ℹ Use `print(n = ...)` to see more rows

HOMEWORK: What to do while I’m away!


TASK 1: Find all uncensored individuals that were born in main clans and reached maturity.


TASK 2: Calculate the mean and standard deviation of their lifespan


TASK 3: How does the mean and standard deviation differ between sexes?