Data science with {hyenaR}:
LESSON 11

RECAP: Getting started

Use {drat} to access the current (stable) version of {hyenaR}.


# {drat} lets us install packages from sources other than CRAN
library(drat)

# Register 'hyenaproject' as an additional package source
addRepo("hyenaproject")

# Install hyenaR itself
install.packages("hyenaR")

Accessing the data


# Attach hyenaR so its download helper is available
library(hyenaR)

# `csv.output.folder` is where to download to
download_package_csv(csv.output.folder = "example_git/source_data")

Build the database


# Build the database file from the .csv files
build_package_database.full(
  db.name = "Fisidata_2022_08_10",              # name of the database file
  input.folder = "example_git/source_data",     # where the .csv files are stored
  db.output.folder = "example_git/source_data"  # where the database will be stored
)

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## For plotting
library(lubridate) ## Working with dates
library(tidyr) ## Extra data wrangling functions
library(stringr) ## Working with text
library(waldo) ## To compare objects


STEP 2: Load the database

# Load the database; `db.path` is the location of our database file
load_package_database.full(
  db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite"
)

Today’s goals



GOAL 1: ♻️ Revisit the summarise() function


GOAL 2: 🧑‍🏫 Different ways to count data


GOAL 3: ✅ A simple example with censored data

GOAL 1: ♻️ Revisit the summarise() function

summarise() can apply any function to our data


What is the longest a female has ever lived?

# Start from a data frame of all females...
create_id_starting.table(sex = "female") %>%
  # ...and add a `lifespan` column holding the lifespan of each female
  mutate(
    lifespan = fetch_id_duration.lifespan(ID)
  )
# A tibble: 1,211 × 2
   ID    lifespan
   <chr>    <dbl>
 1 A-001    19.0 
 2 A-002     5.00
 3 A-003    15.7 
 4 A-004     9.01
 5 A-006    19.6 
 6 A-007     8.05
 7 A-008    14.5 
 8 A-009    12.8 
 9 A-010     5.38
10 A-013    19.8 
# … with 1,201 more rows

summarise() can apply any function to our data


What is the longest a female has ever lived?

# All females, each with her lifespan
create_id_starting.table(sex = "female") %>%
  mutate(
    lifespan = fetch_id_duration.lifespan(ID)
  ) %>%
  # Collapse to a single row: the maximum lifespan, ignoring missing values
  summarise(
    max_age = max(lifespan, na.rm = TRUE)
  )
# A tibble: 1 × 1
  max_age
    <dbl>
1    20.1

summarise() can apply any function to our data


Find the max lifespan in months…

# All females, each with her lifespan
create_id_starting.table(sex = "female") %>%
  mutate(
    lifespan = fetch_id_duration.lifespan(ID)
  ) %>%
  # Maximum lifespan, multiplied by 12 to express it in months
  summarise(
    lifespan_months = 12 * max(lifespan, na.rm = TRUE)
  )
# A tibble: 1 × 1
  lifespan_months
            <dbl>
1            241.

summarise() can apply multiple functions to our data at the same time


Summarise both the max and mean lifespan…

# All females, each with her lifespan
create_id_starting.table(sex = "female") %>%
  mutate(
    lifespan = fetch_id_duration.lifespan(ID)
  ) %>%
  # One summarise() call can return several summaries at once:
  # here BOTH the maximum and the mean lifespan (missing values ignored)
  summarise(
    max_age = max(lifespan, na.rm = TRUE),
    mean_age = mean(lifespan, na.rm = TRUE)
  )
# A tibble: 1 × 2
  max_age mean_age
    <dbl>    <dbl>
1    20.1     4.78

group_by()/summarise() applies functions to subsets of the data

Summarise for females born in each clan…

Code
# All females, with lifespan and birth clan
create_id_starting.table(sex = "female") %>%
  mutate(
    lifespan = fetch_id_duration.lifespan(ID),
    # Birth clan of each female
    clan = fetch_id_clan.birth(ID)
  ) %>%
  # Grouping makes summarise() work on each clan's rows separately
  group_by(clan) %>%
  # Three summaries per clan:
  summarise(
    # 1. number of females
    n = n(),
    # 2. maximum age
    max_age = max(lifespan, na.rm = TRUE),
    # 3. mean age
    mean_age = mean(lifespan, na.rm = TRUE)
  )
# A tibble: 13 × 4
   clan      n max_age mean_age
   <chr> <int>   <dbl>    <dbl>
 1 A       241   19.8      4.80
 2 B         1    5.00     5.00
 3 C         3   12        8.39
 4 E       101   16.7      5.95
 5 F       118   18.0      4.76
 6 L       208   20.1      4.83
 7 M       199   19.1      4.71
 8 N       107   16.9      4.40
 9 R         4    5.89     3.85
10 S       139   14.6      3.81
11 T        66   18.5      5.45
12 U         7    5.56     3.44
13 X        17   10.0      5.21

ANY function in R can be used inside summarise()

🤓 Try for yourself


Start with this data…

# Exercise starting point: all females with lifespan and birth clan
create_id_starting.table(sex = "female") %>%
  mutate(
    lifespan = fetch_id_duration.lifespan(ID),
    # Birth clan of each female
    clan = fetch_id_clan.birth(ID)
  )

…and find the mean lifespan in each clan.

Code
# Solution: mean lifespan of females in each birth clan
create_id_starting.table(sex = "female") %>%
  mutate(
    lifespan = fetch_id_duration.lifespan(ID),
    # Birth clan of each female
    clan = fetch_id_clan.birth(ID)
  ) %>%
  # One group per clan, then one mean per group (missing values ignored)
  group_by(clan) %>%
  summarise(
    mean_lifespan = mean(lifespan, na.rm = TRUE)
  )

🤓 Try for yourself


Start with this data…

# Exercise starting point: all females with number of offspring and birth clan
create_id_starting.table(sex = "female") %>%
  mutate(
    reproductive_success = fetch_id_number.offspring(ID),
    clan = fetch_id_clan.birth(ID)
  )

…and find the sum and the mean of reproductive success, as well as the number of individuals (using n()), in each clan.

Code
# Solution: total and mean reproductive success, plus group size, per clan
create_id_starting.table(sex = "female") %>%
  mutate(
    reproductive_success = fetch_id_number.offspring(ID),
    clan = fetch_id_clan.birth(ID)
  ) %>%
  group_by(clan) %>%
  summarise(
    # Total reproductive success in the clan
    sum = sum(reproductive_success),
    # Mean reproductive success in the clan
    mean = mean(reproductive_success),
    # Number of individuals (rows) in the clan
    n = n()
  )

🤓 Try for yourself


Start with this data…

# Exercise starting point: all females with number of offspring and birth clan
create_id_starting.table(sex = "female") %>%
  mutate(
    reproductive_success = fetch_id_number.offspring(ID),
    clan = fetch_id_clan.birth(ID)
  )

…and find the number of offspring per capita in each clan.

Code
# Solution: offspring per capita in each clan
create_id_starting.table(sex = "female") %>%
  mutate(
    reproductive_success = fetch_id_number.offspring(ID),
    clan = fetch_id_clan.birth(ID)
  ) %>%
  group_by(clan) %>%
  # Per capita = total offspring divided by number of individuals in the group
  summarise(
    offspring_percap = sum(reproductive_success) / n()
  )

GOAL 2: 🧑‍🏫 Different ways to count data

🧑‍🏫 Different ways to count data


If we have a vector (e.g. of IDs) we can use length(). This returns a single number.

# find_pop_id() gives back a VECTOR of female IDs
IDs <- find_pop_id(sex = "female")

# length() counts the elements of that vector
length(IDs)
[1] 1211

🧑‍🏫 Different ways to count data


If we have a data frame we can use nrow(). This returns a single number.

# Create a DATA FRAME with an ID column
ID_dataframe <- create_id_starting.table(sex = "female")

# nrow() counts the rows of that data frame
nrow(ID_dataframe)
[1] 1211

🧑‍🏫 Different ways to count data


If we have a data frame we can use count(). This returns a new data frame.

# Create a DATA FRAME with an ID column...
create_id_starting.table(sex = "female") %>%
  # ...and let count() return a new data frame with a column `n`
  count()
# A tibble: 1 × 1
      n
  <int>
1  1211

🧑‍🏫 Different ways to count data



Why might we want to return a data frame?!

  • We can continue working with dplyr functions.

  • A data frame is needed for statistical models (e.g. lm()) and plotting (e.g. ggplot()).

  • A data frame can easily be written to a .csv file with e.g. write.csv().

🧑‍🏫 Different ways to count data


If we have a data frame and want to count rows and compute other summaries at the same time, we can use summarise() together with n(). This returns a new data frame.

# Create a DATA FRAME with an ID and a lifespan column
create_id_starting.table(sex = "female") %>%
  mutate(
    lifespan = fetch_id_duration.lifespan(ID)
  ) %>%
  # In one step: count the rows AND find the longest lifespan
  summarise(
    number_females = n(),
    oldest = max(lifespan, na.rm = TRUE)
  )
# A tibble: 1 × 2
  number_females oldest
           <int>  <dbl>
1           1211   20.1

GOAL 3: ✅ A simple example with censored data

RECAP: What is censored data?

  • left-censored: Individual was born before study period.

  • right-censored: Individual died after study period.

  • uncensored: Individual born and died during study period.

How do we find censored individuals?

# Create a DATA FRAME with all female IDs and flag censoring
create_id_starting.table(sex = "female") %>%
  mutate(
    # LEFT CENSORED: individual is already alive at 1997-01-01
    left_censored = fetch_id_is.censored.left(ID, at = "1997-01-01"),
    # RIGHT CENSORED: individual is still alive at 1997-12-31
    right_censored = fetch_id_is.censored.right(ID, at = "1997-12-31")
  )
# A tibble: 1,211 × 3
   ID    left_censored right_censored
   <chr> <lgl>         <lgl>         
 1 A-001 TRUE          TRUE          
 2 A-002 TRUE          FALSE         
 3 A-003 TRUE          TRUE          
 4 A-004 TRUE          FALSE         
 5 A-006 TRUE          TRUE          
 6 A-007 FALSE         FALSE         
 7 A-008 TRUE          TRUE          
 8 A-009 TRUE          TRUE          
 9 A-010 TRUE          TRUE          
10 A-013 TRUE          TRUE          
# … with 1,201 more rows

How many censored individuals are there?

# Create a DATA FRAME with all female IDs and flag censoring
create_id_starting.table(sex = "female") %>%
  mutate(
    # LEFT CENSORED: individual is already alive at 1997-01-01
    left_censored = fetch_id_is.censored.left(ID, at = "1997-01-01"),
    # RIGHT CENSORED: individual is still alive at 1997-12-31
    right_censored = fetch_id_is.censored.right(ID, at = "1997-12-31")
  ) %>%
  # One group for every combination of the two censoring flags
  group_by(left_censored, right_censored) %>%
  # REMEMBER: to return a data frame we need to use count() or summarise()
  count()
# A tibble: 4 × 3
# Groups:   left_censored, right_censored [4]
  left_censored right_censored     n
  <lgl>         <lgl>          <int>
1 FALSE         FALSE           1096
2 FALSE         TRUE              24
3 TRUE          FALSE             15
4 TRUE          TRUE              76

How do we remove censored individuals?

# Create a DATA FRAME with all female IDs and flag censoring
create_id_starting.table(sex = "female") %>%
  mutate(
    # LEFT CENSORED: individual is already alive at 1997-01-01
    left_censored = fetch_id_is.censored.left(ID, at = "1997-01-01"),
    # RIGHT CENSORED: individual is still alive at 1997-12-31
    right_censored = fetch_id_is.censored.right(ID, at = "1997-12-31")
  ) %>%
  # Keep only uncensored individuals: filter() combines
  # comma-separated conditions with AND
  filter(!left_censored, !right_censored) %>%
  # How many individuals remain?
  count()
# A tibble: 1 × 1
      n
  <int>
1  1096

How do we remove censored individuals?

# Shortest route: test for censoring inside filter() and keep the rows
# where the test is FALSE
create_id_starting.table(sex = "female") %>%
  filter(
    !fetch_id_is.censored(ID, from = "1997-01-01", to = "1997-12-31")
  ) %>%
  count()
# A tibble: 1 × 1
      n
  <int>
1  1096

🤓 Need more practice?

Try out the introduction on RStudio Cloud!