Data science with {hyenaR}:
LESSON 7

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## For plotting
library(lubridate) ## Working with dates
library(tidyr) ## Extra data wrangling functions


STEP 2: Load the database

# Load the database; db.path is the location of our database file
load_package_database.full(
  db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite"
)

Today’s goals

GOAL 1: The overlap argument


GOAL 2: Dealing with NAs (tidyr::replace_na())

The overlap argument

The overlap argument

# The value of lifestage.overlap is the topic of the slides that follow
create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "?"
)

The overlap argument

Start

Individual started the specified life stage during the focal period.

[ |————]

# Individuals that started the philopatric life stage during 1998
create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "start"
)
# A tibble: 11 × 1
   ID   
   <chr>
 1 A-080
 2 A-081
 3 E-080
 4 F-006
 5 F-008
 6 F-048
 7 L-084
 8 L-087
 9 M-097
10 S-080
11 X-004

The overlap argument

End

Individual ended the specified life stage during the focal period.

[———| ]

# Individuals that ended the philopatric life stage during 1998
create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "end"
)
# A tibble: 17 × 1
   ID   
   <chr>
 1 A-008
 2 A-017
 3 A-018
 4 A-019
 5 C-001
 6 C-002
 7 E-005
 8 F-005
 9 L-003
10 M-016
11 M-043
12 N-002
13 N-006
14 N-008
15 S-001
16 X-004
17 X-016

The overlap argument

Within

Individual started and ended the specified life stage during the focal period.

[ |——| ]

# Individuals that both started and ended the philopatric life stage in 1998
create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "within"
)
# A tibble: 1 × 1
  ID   
  <chr>
1 X-004

The overlap argument

Within

This is the same as keeping only the individuals that match both ‘start’ and ‘end’.

# Query the same life stage and focal period under three overlap rules.
# The first result is stored as `within_ids` (not `within`) so that the
# variable does not mask base::within().
within_ids <- create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "within"
)
starters <- create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "start"
)
enders <- create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "end"
)

# Compare within to individuals that match BOTH start and end
waldo::compare(within_ids$ID, starters$ID[starters$ID %in% enders$ID])
✔ No differences

The overlap argument

Always

Individual was in the focal life stage during the whole period.

[————]

# Individuals that were philopatric during the whole of 1998
create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "always"
)
# A tibble: 52 × 1
   ID   
   <chr>
 1 A-001
 2 A-003
 3 A-006
 4 A-009
 5 A-010
 6 A-013
 7 A-015
 8 A-016
 9 A-020
10 A-040
# … with 42 more rows

The overlap argument

Any (default)

Individual was in the focal life stage at some point within the focal period.

[ ?–? ]

# Individuals that were philopatric at some point within 1998
create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "any"
)
# A tibble: 79 × 1
   ID   
   <chr>
 1 A-001
 2 A-003
 3 A-006
 4 A-008
 5 A-009
 6 A-010
 7 A-013
 8 A-015
 9 A-016
10 A-017
# … with 69 more rows

The overlap argument

Any (default)

This is the same as combining outputs from each of ‘start’, ‘end’, and ‘always’.

# Query the same life stage and focal period under four overlap rules.
# The first result is stored as `any_ids` (not `any`) so that the variable
# does not mask base::any().
any_ids <- create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "any"
)
starters <- create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "start"
)
enders <- create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "end"
)
always <- create_id_starting.table(
  lifestage = "philopatric",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "always"
)

# Compare any to individuals that match at least one of start, end, or always.
# Both sides are sorted so the comparison ignores row order; unique() drops
# IDs that appear in more than one of the three tables.
waldo::compare(
  sort(any_ids$ID),
  sort(unique(c(starters$ID, enders$ID, always$ID)))
)
✔ No differences

The overlap argument

clan.overlap can be used the same way

# Individuals whose time in Airstrip (clan "A") ended during 1998,
# i.e. individuals that dispersed
create_id_starting.table(
  clan = "A",
  from = "1998-01-01",
  to = "1998-12-31",
  clan.overlap = "end"
)
# A tibble: 6 × 1
  ID   
  <chr>
1 A-008
2 A-018
3 A-019
4 A-041
5 A-046
6 A-084

The overlap argument

clan.overlap can be used the same way

# A tibble: 12 × 5
   ID    clan  life_stage  starting_date ending_date
   <chr> <chr> <chr>       <date>        <date>     
 1 A-008 A     philopatric 1996-05-20    1998-11-12 
 2 A-008 S     selector_2  1998-11-13    2008-11-29 
 3 A-018 A     philopatric 1996-09-20    1998-10-02 
 4 A-018 S     selector_2  1998-10-03    2004-06-27 
 5 A-019 A     philopatric 1996-09-16    1998-09-17 
 6 A-019 S     selector_2  1998-09-18    2010-05-16 
 7 A-041 A     natal       1996-07-13    1998-03-30 
 8 A-041 L     disperser   1998-03-31    2003-05-02 
 9 A-046 A     natal       1996-09-20    1998-12-05 
10 A-046 L     disperser   1998-12-06    2003-11-02 
11 A-084 A     natal       1997-10-07    1998-08-18 
12 A-084 L     disperser   1998-08-19    2003-10-19 

The overlap argument

clan.overlap and lifestage.overlap

When both arguments are supplied, individuals are first matched on lifestage.overlap and then checked against clan.overlap. This combination is most appropriate when the query covers multiple life stages.

The overlap argument

EXAMPLE:

Find the individuals that were alive for all of 1998.

# Individuals that were in a non-dead life stage for the whole of 1998
all_alive <- create_id_starting.table(
  lifestage = "!dead",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "always"
)
# Print the result
all_alive
# A tibble: 111 × 1
   ID   
   <chr>
 1 A-001
 2 A-003
 3 A-006
 4 A-009
 5 A-010
 6 A-011
 7 A-013
 8 A-015
 9 A-016
10 A-020
# … with 101 more rows

The overlap argument

EXAMPLE:

Find the individuals that were alive for all of 1998 and were always in Airstrip (i.e. no dispersers).

# Individuals that were alive for the whole of 1998 and were in clan "A"
# throughout the same period
all_alive_A <- create_id_starting.table(
  lifestage = "!dead",
  from = "1998-01-01",
  to = "1998-12-31",
  lifestage.overlap = "always",
  clan = "A",
  clan.overlap = "always"
)
# Print the result
all_alive_A
# A tibble: 21 × 1
   ID   
   <chr>
 1 A-001
 2 A-003
 3 A-006
 4 A-009
 5 A-010
 6 A-011
 7 A-013
 8 A-015
 9 A-016
10 A-020
# … with 11 more rows

The overlap argument

EXAMPLE:

# A tibble: 10 × 5
   ID    clan  life_stage starting_date ending_date
   <chr> <chr> <chr>      <date>        <date>     
 1 A-049 A     cub        1994-04-14    1995-04-13 
 2 A-049 A     subadult   1995-04-14    1996-04-12 
 3 A-049 A     natal      1996-04-13    1997-06-17 
 4 A-049 L     disperser  1997-06-18    2003-09-28 
 5 A-049 L     dead       2003-09-29    Inf        
 6 A-052 A     cub        1993-08-07    1994-08-06 
 7 A-052 A     subadult   1994-08-07    1995-08-06 
 8 A-052 A     natal      1995-08-07    1997-04-06 
 9 A-052 E     disperser  1997-04-07    2000-03-19 
10 A-052 E     dead       2000-03-20    Inf        

Dealing with NAs

Identifying NAs

# Data to start with
hyenas <- extract_database_table("hyenas") %>%
  # Keep individuals with a birthdate after 2010-01-01
  filter(birthdate > "2010-01-01") %>%
  # Keep the columns from ID through father
  select(ID:father)

# Show the first five rows
hyenas %>%
  head(n = 5)
# A tibble: 5 × 7
  ID    sex    birthclan birthdate  mothergenetic mothersocial father
  <chr> <chr>  <chr>     <date>     <chr>         <chr>        <chr> 
1 A-271 male   A         2010-01-24 A-164         A-164        <NA>  
2 A-272 male   A         2010-01-24 A-164         A-164        <NA>  
3 A-273 female A         2010-01-29 A-206         A-206        F-112 
4 A-274 male   A         2010-01-29 A-206         A-206        F-112 
5 A-275 male   A         2010-02-18 A-178         A-178        L-128 

Identifying NAs

NAs in 1 column

# Rows where the genetic mother is missing
filter(hyenas, is.na(mothergenetic))
# A tibble: 111 × 7
   ID    sex    birthclan birthdate  mothergenetic mothersocial father
   <chr> <chr>  <chr>     <date>     <chr>         <chr>        <chr> 
 1 A-320 <NA>   A         2012-01-27 <NA>          <NA>         <NA>  
 2 A-325 male   A         2012-06-11 <NA>          <NA>         <NA>  
 3 A-389 <NA>   A         2014-10-28 <NA>          <NA>         <NA>  
 4 A-410 male   A         2015-08-22 <NA>          <NA>         <NA>  
 5 A-421 female A         2016-01-23 <NA>          <NA>         <NA>  
 6 A-440 <NA>   A         2016-11-15 <NA>          <NA>         <NA>  
 7 A-482 female A         2017-08-28 <NA>          <NA>         <NA>  
 8 A-546 <NA>   A         2019-11-15 <NA>          <NA>         <NA>  
 9 E-224 female E         2012-03-12 <NA>          <NA>         <NA>  
10 E-228 <NA>   E         2012-10-25 <NA>          <NA>         <NA>  
# … with 101 more rows

Identifying NAs

NAs in >1 column (both columns are NA)

# Rows where the genetic mother AND the father are both missing
filter(hyenas, is.na(mothergenetic) & is.na(father))
# A tibble: 111 × 7
   ID    sex    birthclan birthdate  mothergenetic mothersocial father
   <chr> <chr>  <chr>     <date>     <chr>         <chr>        <chr> 
 1 A-320 <NA>   A         2012-01-27 <NA>          <NA>         <NA>  
 2 A-325 male   A         2012-06-11 <NA>          <NA>         <NA>  
 3 A-389 <NA>   A         2014-10-28 <NA>          <NA>         <NA>  
 4 A-410 male   A         2015-08-22 <NA>          <NA>         <NA>  
 5 A-421 female A         2016-01-23 <NA>          <NA>         <NA>  
 6 A-440 <NA>   A         2016-11-15 <NA>          <NA>         <NA>  
 7 A-482 female A         2017-08-28 <NA>          <NA>         <NA>  
 8 A-546 <NA>   A         2019-11-15 <NA>          <NA>         <NA>  
 9 E-224 female E         2012-03-12 <NA>          <NA>         <NA>  
10 E-228 <NA>   E         2012-10-25 <NA>          <NA>         <NA>  
# … with 101 more rows

Identifying NAs

NAs in either column

# Rows where the genetic mother OR the father is missing
filter(hyenas, is.na(mothergenetic) | is.na(father))
# A tibble: 702 × 7
   ID    sex    birthclan birthdate  mothergenetic mothersocial father
   <chr> <chr>  <chr>     <date>     <chr>         <chr>        <chr> 
 1 A-271 male   A         2010-01-24 A-164         A-164        <NA>  
 2 A-272 male   A         2010-01-24 A-164         A-164        <NA>  
 3 A-294 male   A         2010-10-01 A-006         A-006        <NA>  
 4 A-308 male   A         2011-10-18 A-206         A-206        <NA>  
 5 A-309 male   A         2011-10-18 A-206         A-206        <NA>  
 6 A-311 male   A         2011-11-20 A-139         A-139        <NA>  
 7 A-312 male   A         2011-11-20 A-139         A-139        <NA>  
 8 A-314 male   A         2011-12-20 A-140         A-140        <NA>  
 9 A-317 female A         2012-01-20 A-233         A-233        <NA>  
10 A-320 <NA>   A         2012-01-27 <NA>          <NA>         <NA>  
# … with 692 more rows

Identifying NAs

NAs in either column

# Rows where is.na() is TRUE for at least one of the listed columns
filter(
  hyenas,
  if_any(.cols = c(mothergenetic, father), .fns = is.na)
)
# A tibble: 702 × 7
   ID    sex    birthclan birthdate  mothergenetic mothersocial father
   <chr> <chr>  <chr>     <date>     <chr>         <chr>        <chr> 
 1 A-271 male   A         2010-01-24 A-164         A-164        <NA>  
 2 A-272 male   A         2010-01-24 A-164         A-164        <NA>  
 3 A-294 male   A         2010-10-01 A-006         A-006        <NA>  
 4 A-308 male   A         2011-10-18 A-206         A-206        <NA>  
 5 A-309 male   A         2011-10-18 A-206         A-206        <NA>  
 6 A-311 male   A         2011-11-20 A-139         A-139        <NA>  
 7 A-312 male   A         2011-11-20 A-139         A-139        <NA>  
 8 A-314 male   A         2011-12-20 A-140         A-140        <NA>  
 9 A-317 female A         2012-01-20 A-233         A-233        <NA>  
10 A-320 <NA>   A         2012-01-27 <NA>          <NA>         <NA>  
# … with 692 more rows

Remove NAs

tidyr::replace_na() for 1 column

extract_database_table("carnivores") %>%
  # groupsize2 is groupsize with its NAs replaced by 1
  mutate(
    groupsize2 = tidyr::replace_na(groupsize, replace = 1)
  ) %>%
  # Show only the rows that were NA in the original column
  filter(is.na(groupsize)) %>%
  select(date_time, species, groupsize, groupsize2)
# A tibble: 3 × 4
  date_time           species groupsize groupsize2
  <dttm>              <chr>       <int>      <int>
1 1998-01-21 07:00:00 bj             NA          1
2 1998-01-21 07:00:00 gj             NA          1
3 NA                  li             NA          1

Remove NAs

tidyr::replace_na() for many columns

extract_database_table("carnivores") %>%
  # Keep rows with an NA in at least one of the three columns
  filter(
    if_any(.cols = c(groupsize, latitude, longitude), .fns = is.na)
  ) %>%
  # Give a replacement value per column as a named list
  tidyr::replace_na(
    list(
      groupsize = 1,
      latitude = -3.16,
      longitude = 35.6
    )
  ) %>%
  select(groupsize, latitude, longitude)
# A tibble: 48 × 3
   groupsize latitude longitude
       <int>    <dbl>     <dbl>
 1         3    -3.16      35.6
 2         3    -3.16      35.6
 3         3    -3.16      35.6
 4         1    -3.16      35.6
 5         1    -3.16      35.6
 6         1    -3.16      35.6
 7         2    -3.16      35.6
 8         2    -3.16      35.6
 9         1    -3.16      35.6
10         1    -3.16      35.6
# … with 38 more rows

Add NAs

dplyr::na_if()

extract_database_table("carnivores") %>%
  # Replace an uncertain category with NA: species2 is species with
  # "gj?" turned into NA
  mutate(
    species2 = dplyr::na_if(species, "gj?")
  ) %>%
  # Show only the rows that were affected
  filter(species == "gj?") %>%
  select(date_time, species, species2)
# A tibble: 1 × 3
  date_time           species species2
  <dttm>              <chr>   <chr>   
1 2002-12-10 15:52:00 gj?     <NA>    

HOMEWORK: Nothing this week!