Data science with {hyenaR}:
LESSON 5

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## For plotting
library(lubridate) ## Working with dates!
library(tidyr) ## Extra data wrangling functions!


STEP 2: Load the database

load_package_database.full(
  
  # Location of our database file
  db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite"
  
)

Recap: What did we cover last week?

Grouped v. ungrouped data

GROUPED

date <- "2000-01-01"
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), at = date, lifestage = "adult") %>%
  mutate(age = fetch_id_age(ID = ID, at = date), clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age))
# A tibble: 8 × 3
# Groups:   clan [8]
  ID      age clan 
  <chr> <dbl> <chr>
1 A-001 11.6  A    
2 E-004 13.8  E    
3 F-001 11.5  F    
4 L-005 13.7  L    
5 M-012 16.6  M    
6 N-003 12.5  N    
7 S-040  9.72 S    
8 T-002 10.9  T    

UNGROUPED

date <- "2000-01-01"
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), at = date, lifestage = "adult") %>%
  mutate(age = fetch_id_age(ID = ID, at = date), clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age)) %>% 
  ungroup()
# A tibble: 8 × 3
  ID      age clan 
  <chr> <dbl> <chr>
1 A-001 11.6  A    
2 E-004 13.8  E    
3 F-001 11.5  F    
4 L-005 13.7  L    
5 M-012 16.6  M    
6 N-003 12.5  N    
7 S-040  9.72 S    
8 T-002 10.9  T    

Grouped v. ungrouped data

GROUPED

date <- "2000-01-01"
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), at = date, lifestage = "adult") %>%
  mutate(age = fetch_id_age(ID = ID, at = date), clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age)) %>% 
  summarise(sd_maxage = sd(age))
# A tibble: 8 × 2
  clan  sd_maxage
  <chr>     <dbl>
1 A            NA
2 E            NA
3 F            NA
4 L            NA
5 M            NA
6 N            NA
7 S            NA
8 T            NA

UNGROUPED

date <- "2000-01-01"
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE), at = date, lifestage = "adult") %>%
  mutate(age = fetch_id_age(ID = ID, at = date), clan = fetch_id_clan.current(ID = ID, at = date)) %>%
  group_by(clan) %>%
  filter(age == max(age)) %>% 
  ungroup() %>% 
  summarise(sd_maxage = sd(age))
# A tibble: 1 × 1
  sd_maxage
      <dbl>
1      2.14

Wide vs. long formats

WIDE FORMAT

  • Each row should have one unique value (e.g. date)
  • Each variable is its own column
  • Common for storing/recording data
  • Has many columns but fewer rows
  • Less suited for {ggplot2} and {tidyverse} functions

Wide vs. long formats

WIDE FORMAT

# A tibble: 1 × 10
  date       to             A     E     F     L     M     N     S     T
  <date>     <date>     <int> <int> <int> <int> <int> <int> <int> <int>
1 1997-07-01 1997-12-31    49    18    14    48    52    20     4    12

Wide vs. long formats

LONG FORMAT

  • Each row should have one unique combination of values (e.g. date, clan)
  • Each measurement is its own row
  • Has many rows but fewer columns
  • Better suited for {ggplot2} and {tidyverse} functions

Wide vs. long formats

LONG FORMAT

# A tibble: 8 × 4
  clan  date       to         clan_size
  <chr> <date>     <date>         <int>
1 A     1997-07-01 1997-12-31        49
2 E     1997-07-01 1997-12-31        18
3 F     1997-07-01 1997-12-31        14
4 L     1997-07-01 1997-12-31        48
5 M     1997-07-01 1997-12-31        52
6 N     1997-07-01 1997-12-31        20
7 S     1997-07-01 1997-12-31         4
8 T     1997-07-01 1997-12-31        12

Convert wide to long

# Start with data in wide format
wide_format
# A tibble: 1 × 10
  date       to             A     E     F     L     M     N     S     T
  <date>     <date>     <int> <int> <int> <int> <int> <int> <int> <int>
1 1997-07-01 1997-12-31    49    18    14    48    52    20     4    12

Convert wide to long

# Start with data in wide format
wide_format %>% 
  tidyr::pivot_longer(cols = A:T,
                      names_to = "clan",
                      values_to = "clan_size")
# A tibble: 1 × 10
  date       to             A     E     F     L     M     N     S     T
  <date>     <date>     <int> <int> <int> <int> <int> <int> <int> <int>
1 1997-07-01 1997-12-31    49    18    14    48    52    20     4    12

Convert wide to long

# Start with data in wide format
wide_format %>% 
  tidyr::pivot_longer(cols = A:T,
                      names_to = "clan",
                      values_to = "clan_size")
# A tibble: 8 × 4
  date       to         clan  clan_size
  <date>     <date>     <chr>     <int>
1 1997-07-01 1997-12-31 A            49
2 1997-07-01 1997-12-31 E            18
3 1997-07-01 1997-12-31 F            14
4 1997-07-01 1997-12-31 L            48
5 1997-07-01 1997-12-31 M            52
6 1997-07-01 1997-12-31 N            20
7 1997-07-01 1997-12-31 S             4
8 1997-07-01 1997-12-31 T            12

Convert long to wide

# Start with data in long format
long_format
# A tibble: 8 × 4
  clan  date       to         clan_size
  <chr> <date>     <date>         <int>
1 A     1997-07-01 1997-12-31        49
2 E     1997-07-01 1997-12-31        18
3 F     1997-07-01 1997-12-31        14
4 L     1997-07-01 1997-12-31        48
5 M     1997-07-01 1997-12-31        52
6 N     1997-07-01 1997-12-31        20
7 S     1997-07-01 1997-12-31         4
8 T     1997-07-01 1997-12-31        12

Convert long to wide

# Start with data in long format
long_format %>% 
  tidyr::pivot_wider(names_from = clan,
                     values_from = clan_size)
# A tibble: 8 × 4
  clan  date       to         clan_size
  <chr> <date>     <date>         <int>
1 A     1997-07-01 1997-12-31        49
2 E     1997-07-01 1997-12-31        18
3 F     1997-07-01 1997-12-31        14
4 L     1997-07-01 1997-12-31        48
5 M     1997-07-01 1997-12-31        52
6 N     1997-07-01 1997-12-31        20
7 S     1997-07-01 1997-12-31         4
8 T     1997-07-01 1997-12-31        12

Convert long to wide

# Start with data in long format
long_format %>% 
  tidyr::pivot_wider(names_from = clan,
                     values_from = clan_size)
# A tibble: 1 × 10
  date       to             A     E     F     L     M     N     S     T
  <date>     <date>     <int> <int> <int> <int> <int> <int> <int> <int>
1 1997-07-01 1997-12-31    49    18    14    48    52    20     4    12

Today’s goals


GOAL 1: HOMEWORK


GOAL 2: Last sighting and death dates


GOAL 3: Outstanding questions

HOMEWORK (TASK 1):
Find all uncensored individuals that were born in main clans and reached maturity.

Our attempts

OLIVER

Code
create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE)) %>%
  mutate(sex = fetch_id_sex(ID = ID),
         age = fetch_id_age(ID = ID, at = fetch_id_date.observation.last(ID = ID))) %>%
  filter(!fetch_id_is.censored(ID = ID),
         age >= 2)  %>% 
  select(ID, age)
# A tibble: 853 × 2
   ID      age
   <chr> <dbl>
 1 A-080  9.26
 2 A-081 13.1 
 3 A-087 11.3 
 4 A-089  3.73
 5 A-092 11.2 
 6 A-093 16.2 
 7 A-100  6.26
 8 A-101  5.60
 9 A-102 14.6 
10 A-103  5.40
# … with 843 more rows
# ℹ Use `print(n = ...)` to see more rows

ARJUN

Code
create_id_starting.table() %>% 
  #Extract birth date and birth clan
  mutate(birth.date = fetch_id_date.birth(ID = ID),
         birth.clan = fetch_id_clan.birth(ID = ID),
         date.dead = fetch_id_date.death(ID=ID),
         age.dead = fetch_id_age(ID = ID, at = date.dead),
         sex = fetch_id_sex(ID = ID)) %>% 
  filter(
    #Filter left censored
    birth.date > find_pop_date.observation.first(),
    #Filter individuals born in main clans
    birth.clan %in% find_clan_name.all(main.clans = TRUE), 
    #filter individuals who lived to age 2+ 
    age.dead >= 2) %>%
  #filter right censored
  filter(!fetch_id_is.censored.right(ID = ID)) %>%
  ungroup() %>% 
  select(ID, age.dead)
# A tibble: 855 × 2
   ID    age.dead
   <chr>    <dbl>
 1 A-080     9.26
 2 A-081    13.1 
 3 A-087    11.3 
 4 A-089     3.73
 5 A-092    11.2 
 6 A-093    16.2 
 7 A-100     6.26
 8 A-101     5.60
 9 A-102    14.6 
10 A-103     5.40
# … with 845 more rows
# ℹ Use `print(n = ...)` to see more rows

Our attempts

OLIVER

#If we just specify clan it is the same as giving clan.overlap = "birth" (desirable?)
(create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE)) %>%
   mutate(sex = fetch_id_sex(ID = ID),
          ## NOTE: Last observation and death are not the same! (more on this later)
          age = fetch_id_age(ID = ID, at = fetch_id_date.observation.last(ID = ID))) %>%
   #Filter for uncensored and mature individuals
   filter(!fetch_id_is.censored(ID = ID),
          age >= 2) -> my_data)
# A tibble: 853 × 3
   ID    sex      age
   <chr> <chr>  <dbl>
 1 A-080 female  9.26
 2 A-081 female 13.1 
 3 A-087 male   11.3 
 4 A-089 male    3.73
 5 A-092 male   11.2 
 6 A-093 male   16.2 
 7 A-100 male    6.26
 8 A-101 female  5.60
 9 A-102 female 14.6 
10 A-103 male    5.40
# … with 843 more rows
# ℹ Use `print(n = ...)` to see more rows

HOMEWORK (TASK 1): Alternatives

Our attempts

OLIVER ALTERNATIVE #1

# Same starting code
(create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE)) %>%
   mutate(sex = fetch_id_sex(ID = ID),
          age = fetch_id_age(ID = ID, at = fetch_id_date.observation.last(ID = ID))) %>%
   filter(!fetch_id_is.censored(ID = ID),
          #Alternative which applies age > 2 (should make consistent)
          fetch_id_is.adult(ID = ID, at = fetch_id_date.observation.last(ID = ID))) -> my_data2)
# A tibble: 853 × 3
   ID    sex      age
   <chr> <chr>  <dbl>
 1 A-080 female  9.26
 2 A-081 female 13.1 
 3 A-087 male   11.3 
 4 A-089 male    3.73
 5 A-092 male   11.2 
 6 A-093 male   16.2 
 7 A-100 male    6.26
 8 A-101 female  5.60
 9 A-102 female 14.6 
10 A-103 male    5.40
# … with 843 more rows
# ℹ Use `print(n = ...)` to see more rows

Our attempts

OLIVER ALTERNATIVE #2

# Same starting code
(create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE)) %>%
   mutate(sex = fetch_id_sex(ID = ID),
          age = fetch_id_age(ID = ID, at = fetch_id_date.observation.last(ID = ID))) %>%
   filter(!fetch_id_is.censored(ID = ID),
          # Death and last sighting are not the same
          fetch_id_is.adult(ID = ID, at = fetch_id_date.death(ID = ID))) -> my_data3)
# A tibble: 855 × 3
   ID    sex      age
   <chr> <chr>  <dbl>
 1 A-080 female  9.26
 2 A-081 female 13.1 
 3 A-087 male   11.3 
 4 A-089 male    3.73
 5 A-092 male   11.2 
 6 A-093 male   16.2 
 7 A-100 male    6.26
 8 A-101 female  5.60
 9 A-102 female 14.6 
10 A-103 male    5.40
# … with 845 more rows
# ℹ Use `print(n = ...)` to see more rows

Our attempts

OLIVER ALTERNATIVE #3

# lifestage = "adult" returns individuals that were adults at some point in main clans
# Therefore, they are not necessarily BORN in the main clans
(create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                          lifestage = "adult") %>%
   mutate(sex = fetch_id_sex(ID = ID),
          age = fetch_id_age(ID = ID, at = fetch_id_date.observation.last(ID = ID))) %>%
   filter(!fetch_id_is.censored(ID = ID)) -> my_data4)
# A tibble: 880 × 3
   ID    sex      age
   <chr> <chr>  <dbl>
 1 A-080 female  9.26
 2 A-081 female 13.1 
 3 A-087 male   11.3 
 4 A-089 male    3.73
 5 A-092 male   11.2 
 6 A-093 male   16.2 
 7 A-100 male    6.26
 8 A-101 female  5.60
 9 A-102 female 14.6 
10 A-103 male    5.40
# … with 870 more rows
# ℹ Use `print(n = ...)` to see more rows

Our attempts

ARJUN

# Do it 'manually'
(create_id_starting.table() %>% 
   #Extract birth date and birth clan
   mutate(birth.date = fetch_id_date.birth(ID = ID),
          birth.clan = fetch_id_clan.birth(ID = ID),
          date.dead = fetch_id_date.death(ID=ID),
          age.dead = fetch_id_age(ID = ID, at = date.dead),
          sex = fetch_id_sex(ID = ID)) %>% 
   filter(
     #Filter left censored
     birth.date > find_pop_date.observation.first(),
     #Filter individuals born in main clans
     birth.clan %in% find_clan_name.all(main.clans = TRUE), 
     #filter individuals who lived to age 2+ 
     age.dead >= 2) %>%
   #filter right censored
   filter(!fetch_id_is.censored.right(ID = ID)) %>%
   ungroup() -> mytibble)
# A tibble: 855 × 6
   ID    birth.date birth.clan date.dead  age.dead sex   
   <chr> <date>     <chr>      <date>        <dbl> <chr> 
 1 A-080 1996-06-11 A          2005-09-16     9.26 female
 2 A-081 1996-06-21 A          2009-08-11    13.1  female
 3 A-087 1996-10-28 A          2008-02-23    11.3  male  
 4 A-089 1996-12-12 A          2000-09-04     3.73 male  
 5 A-092 1996-10-29 A          2008-01-14    11.2  male  
 6 A-093 1996-12-26 A          2013-02-20    16.2  male  
 7 A-100 1997-08-15 A          2003-11-18     6.26 male  
 8 A-101 1998-01-18 A          2003-08-27     5.60 female
 9 A-102 1998-06-15 A          2013-01-25    14.6  female
10 A-103 1998-06-15 A          2003-11-08     5.40 male  
# … with 845 more rows
# ℹ Use `print(n = ...)` to see more rows

My solution #1

(create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE)) %>% 
   filter(!fetch_id_is.censored(ID = ID)) %>% 
   mutate(lifespan = fetch_id_duration.lifespan(ID = ID, unit = "year"),
          sex = fetch_id_sex(ID = ID)) %>% 
   filter(lifespan >= 2) -> liam_df)
# A tibble: 855 × 3
   ID    lifespan sex   
   <chr>    <dbl> <chr> 
 1 A-080     9.26 female
 2 A-081    13.1  female
 3 A-087    11.3  male  
 4 A-089     3.73 male  
 5 A-092    11.2  male  
 6 A-093    16.2  male  
 7 A-100     6.26 male  
 8 A-101     5.60 female
 9 A-102    14.6  female
10 A-103     5.40 male  
# … with 845 more rows
# ℹ Use `print(n = ...)` to see more rows

My solution #2

# Use separate clan.overlap and lifestage args
(create_id_starting.table(clan = find_clan_name.all(main.clans = TRUE),
                          clan.overlap = "birth",
                          lifestage = "adult") %>% 
   filter(!fetch_id_is.censored(ID = ID)) %>% 
   mutate(lifespan = fetch_id_duration.lifespan(ID = ID, unit = "year"),
          sex = fetch_id_sex(ID = ID)) -> liam_df2)
# A tibble: 853 × 3
   ID    lifespan sex   
   <chr>    <dbl> <chr> 
 1 A-080     9.26 female
 2 A-081    13.1  female
 3 A-087    11.3  male  
 4 A-089     3.73 male  
 5 A-092    11.2  male  
 6 A-093    16.2  male  
 7 A-100     6.26 male  
 8 A-101     5.60 female
 9 A-102    14.6  female
10 A-103     5.40 male  
# … with 843 more rows
# ℹ Use `print(n = ...)` to see more rows

HOMEWORK (TASK 2):
Calculate the mean and standard deviation of their lifespan.

Our attempts

OLIVER

Code
my_data %>%
  summarise(n = length(age),
            mean = mean(age),
            sd = sd(age))
# A tibble: 1 × 3
      n  mean    sd
  <int> <dbl> <dbl>
1   853  6.98  4.02

ARJUN

Code
mytibble %>% 
  summarize(meanage = mean(age.dead), sdage = sd(age.dead))
# A tibble: 1 × 2
  meanage sdage
    <dbl> <dbl>
1    6.97  4.01

LIAM

Code
liam_df %>% 
  summarize(mean = mean(lifespan), sd = sd(lifespan))
# A tibble: 1 × 2
   mean    sd
  <dbl> <dbl>
1  6.97  4.01

HOMEWORK (TASK 3):
How does the mean and standard deviation differ between sexes?

Our attempts

OLIVER

Code
my_data %>%
  group_by(sex) %>%
  summarise(mean = mean(age),
            sd = sd(age)) %>%
  ungroup()
# A tibble: 3 × 3
  sex     mean    sd
  <chr>  <dbl> <dbl>
1 female  7.35  4.06
2 male    6.60  3.94
3 <NA>    2.53 NA   

ARJUN

Code
mytibble %>%
  group_by(sex) %>% 
  summarize(mean = mean(age.dead),
            sd = sd(age.dead))
# A tibble: 3 × 3
  sex     mean    sd
  <chr>  <dbl> <dbl>
1 female  7.34  4.06
2 male    6.60  3.94
3 <NA>    2.54 NA   

LIAM

Code
liam_df %>% 
  group_by(sex) %>% 
  summarize(mean = mean(lifespan), sd = sd(lifespan))
# A tibble: 3 × 3
  sex     mean    sd
  <chr>  <dbl> <dbl>
1 female  7.34  4.06
2 male    6.60  3.94
3 <NA>    2.54 NA   

Why are last sighting and death date not the same?

  • fetch_id_date.observation.last() gives the date at which individual was last known to be alive (but see issue #545)
  • fetch_id_date.death() gives the date after an individual’s last sighting (or date when death was confirmed)
  • fetch_id_duration.lifespan() uses an individual’s death date.

What is the difference between “2” and 2 when filtering?

tibble(x = c(1, 2, 3)) %>% 
  #Coerces character into numeric
  filter(x > "1")
# A tibble: 2 × 1
      x
  <dbl>
1     2
2     3
tibble(x = c(1, 2, 3)) %>% 
  #Does the same for logical!!
  filter(x > TRUE)
# A tibble: 2 × 1
      x
  <dbl>
1     2
2     3

What is the difference between “2” and 2 when filtering?

#Logical comparison between character strings can be very confusing!!
#What do we expect here?
"abc" > "abb"
[1] TRUE
#Logical comparison between character strings can be very confusing!!
#What do we expect here?
"abc" > "abd"
[1] FALSE

Why are some calls to create_id_starting.table() slower?

#Easy calls just filter from raw data

#All indivduals in hyena.csv
system.time({create_id_starting.table()})
   user  system elapsed 
  0.035   0.000   0.035 
#All indivduals born in clan "A"
system.time({create_id_starting.table(clan = "A")})
   user  system elapsed 
  0.028   0.000   0.028 

Why are some calls to create_id_starting.table() slower?

#More complex calls can take longer (is manual process sometimes better?)

#Needs to compare life history table and dates
system.time({create_id_starting.table(lifestage = "adult",
                         from = "1997-01-01", to = "1998-01-01")})
   user  system elapsed 
  0.272   0.007   0.279 

HOMEWORK: What is our next task?


TASK 1: Filter sightings of all three jackal species from the carnivores table (bj, gj, sj).


TASK 2: Determine total sightings of each species in each year.


TASK 3: Convert table to wide format so that we have one row per year and one column per species