Data science with {hyenaR}:
LESSON 16

Prepare our workspace


STEP 1: Load required packages

library(hyenaR) ## For our hyena specific functions
library(dplyr) ## For most data wrangling
library(ggplot2) ## For plotting
library(lubridate) ## Working with dates
library(tidyr) ## Extra data wrangling functions
library(stringr) ## Working with text
library(waldo) ## To compare objects
library(skimr) ## Inspect data
library(purrr) ## For loops in the tidyverse


STEP 2: Load the database

# Load the full database from the file used in this lesson
load_package_database.full(db.path = "example_git/source_data/Fisidata_2022_08_10.sqlite")

Today’s goals

GOAL 1: 🧑‍🏫 Using loops in R

Adult sex ratio per year

Without a loop

## Year: 1997
## Build the table of adults for 1997...
ad_1997 <- create_id_starting.table(
  lifestage = "adult",
  from = "1997-01-01",
  to = "1997-12-31"
)

## ...and add the sex of each individual as a new column
ad_1997 <- ad_1997 %>%
  mutate(sex = fetch_id_sex(ID))

## Sex ratio: number of rows with sex "male" divided by all rows
sexratio_1997 <- sum(ad_1997$sex %in% "male") / nrow(ad_1997)

## Show the result
sexratio_1997
[1] 0.4810127

Without a loop

## ---- 1997 ----
ad_1997 <- create_id_starting.table(
  lifestage = "adult",
  from = "1997-01-01",
  to = "1997-12-31"
)
ad_1997 <- ad_1997 %>%
  mutate(sex = fetch_id_sex(ID))

sexratio_1997 <- sum(ad_1997$sex %in% "male") / nrow(ad_1997)

## ---- 1998 ----
## The same code again, copied and edited by hand for the next year.
## Every additional year would need another copy like this one!
ad_1998 <- create_id_starting.table(
  lifestage = "adult",
  from = "1998-01-01",
  to = "1998-12-31"
)
ad_1998 <- ad_1998 %>%
  mutate(sex = fetch_id_sex(ID))

sexratio_1998 <- sum(ad_1998$sex %in% "male") / nrow(ad_1998)

## Combine the yearly values into one table
tibble(
  year = c(1997L, 1998L),
  sexratio = c(sexratio_1997, sexratio_1998)
)
# A tibble: 2 × 2
   year sexratio
  <int>    <dbl>
1  1997    0.481
2  1998    0.490

Without a loop



RULE OF THREE: If you need to copy-paste the same code 3 or more times, you should rework it (e.g. with a loop!)

The ‘for’ loop


## Loop over the years 1997 to 2000.
## On each pass, the variable 'year' holds the current value.
for (year in 1997:2000) {
  ## Build the message for this year and print it
  print(sprintf("It's year: %d", year))
}
[1] "It's year: 1997"
[1] "It's year: 1998"
[1] "It's year: 1999"
[1] "It's year: 2000"

The ‘for’ loop


Why doesn’t this do anything?!?!?

## For every value between 1997 and 2000...
## Create a variable called 'year' that has this value...
for (year in 1997:2000) {

  ## Start and end date are determined by the value of 'year'.
  ## paste0() joins without a separator, giving e.g. "1997-01-01";
  ## paste() would insert a space ("1997 -01-01").
  start_date <- paste0(year, "-01-01")
  end_date   <- paste0(year, "-12-31")

  ## Table of adults for this year, with sex added as a column
  ad <- create_id_starting.table(lifestage = "adult",
                                 from = start_date,
                                 to = end_date) %>%
    mutate(sex = fetch_id_sex(ID))

  ## Number of rows with sex "male" divided by all rows.
  ## NOTE: this value is neither printed nor assigned, so the loop
  ## shows nothing and keeps nothing.
  sum(ad$sex %in% "male") / nrow(ad)

}

The ‘for’ loop

Anything computed inside ‘for’ is not printed or kept unless we save it!

## Create an empty vector to collect one sex ratio per year
sex_ratio <- c()

## For every value between 1997 and 2000...
## Create a variable called 'year' that has this value...
for (year in 1997:2000) {

  ## Start and end date for this year.
  ## paste0() joins without a separator, giving e.g. "1997-01-01";
  ## paste() would insert a space ("1997 -01-01").
  start_date <- paste0(year, "-01-01")
  end_date   <- paste0(year, "-12-31")

  ## Table of adults for this year, with sex added as a column
  ad <- create_id_starting.table(lifestage = "adult",
                                 from = start_date,
                                 to = end_date) %>%
    mutate(sex = fetch_id_sex(ID))

  ## Append this year's value (rows with sex "male" / all rows)
  ## to the end of our vector so it is kept after the loop
  sex_ratio <- append(sex_ratio, sum(ad$sex %in% "male") / nrow(ad))

}

The ‘for’ loop

Anything computed inside ‘for’ is not printed or kept unless we save it!

Code
## Create an empty vector to collect one sex ratio per year
sex_ratio <- c()

## For every value between 1997 and 2000...
## Create a variable called 'year' that has this value...
for (year in 1997:2000) {

  ## Start and end date for this year.
  ## paste0() joins without a separator, giving e.g. "1997-01-01";
  ## paste() would insert a space ("1997 -01-01").
  start_date <- paste0(year, "-01-01")
  end_date   <- paste0(year, "-12-31")

  ## Table of adults for this year, with sex added as a column
  ad <- create_id_starting.table(lifestage = "adult",
                                 from = start_date,
                                 to = end_date) %>%
    mutate(sex = fetch_id_sex(ID))

  ## Append this year's value (rows with sex "male" / all rows)
  ## to the end of our vector so it is kept after the loop
  sex_ratio <- append(sex_ratio, sum(ad$sex %in% "male") / nrow(ad))

}

## Pair each year with the value collected for it
tibble(year = 1997:2000,
       sex_ratio = sex_ratio)
# A tibble: 4 × 2
   year sex_ratio
  <int>     <dbl>
1  1997     0.481
2  1998     0.490
3  1999     0.512
4  2000     0.493

‘apply’ functions

sapply returns a vector by default

## sapply() runs FUN once for each value of X
sapply(
  X = 1997:2000,
  FUN = function(year) {
    ## Message for this year
    sprintf("It's year %d", year)
  }
)
[1] "It's year 1997" "It's year 1998" "It's year 1999" "It's year 2000"

‘apply’ functions

sapply returns a vector by default

sex_ratio <- sapply(
  ## Use each value 1997:2000...
  X = 1997:2000,
  ## To run this function...
  FUN = function(year) {
    ## Start and end date for this year.
    ## paste0() joins without a separator, giving e.g. "1997-01-01";
    ## paste() would insert a space ("1997 -01-01").
    start_date <- paste0(year, "-01-01")
    end_date   <- paste0(year, "-12-31")

    ## Table of adults for this year, with sex added as a column
    ad <- create_id_starting.table(lifestage = "adult",
                                   from = start_date,
                                   to = end_date) %>%
      mutate(sex = fetch_id_sex(ID))

    ## The last value of the function (rows with sex "male" / all rows)
    ## is what sapply() collects for this year
    sum(ad$sex %in% "male") / nrow(ad)
  })

‘apply’ functions

sapply returns a vector by default

Code
sex_ratio <- sapply(
  ## Use each value 1997:2000...
  X = 1997:2000,
  ## To run this function...
  FUN = function(year) {
    ## Start and end date for this year.
    ## paste0() joins without a separator, giving e.g. "1997-01-01";
    ## paste() would insert a space ("1997 -01-01").
    start_date <- paste0(year, "-01-01")
    end_date   <- paste0(year, "-12-31")

    ## Table of adults for this year, with sex added as a column
    ad <- create_id_starting.table(lifestage = "adult",
                                   from = start_date,
                                   to = end_date) %>%
      mutate(sex = fetch_id_sex(ID))

    ## The last value of the function (rows with sex "male" / all rows)
    ## is what sapply() collects for this year
    sum(ad$sex %in% "male") / nrow(ad)
  })

## Pair each year with the value collected for it
tibble(year = 1997:2000,
       sex_ratio = sex_ratio)
# A tibble: 4 × 2
   year sex_ratio
  <int>     <dbl>
1  1997     0.481
2  1998     0.490
3  1999     0.512
4  2000     0.493

‘purrr’ package

map functions from package purrr can also be used to loop.

# Create a loop that already outputs a data frame (i.e. _df)!!
purrr::map_df(
  # Loop through each value of .x and run function .f
  .x = 1997:2000,
  .f = function(year) {
    # Start and end date for this year.
    # paste0() joins without a separator, giving e.g. "1997-01-01";
    # paste() would insert a space ("1997 -01-01").
    start_date <- paste0(year, "-01-01")
    end_date   <- paste0(year, "-12-31")

    # Table of adults for this year, with sex added as a column
    ad <- create_id_starting.table(lifestage = "adult",
                                   from = start_date,
                                   to = end_date) %>%
      mutate(sex = fetch_id_sex(ID))

    # Output a one-row data frame for this year
    # (sex_ratio = rows with sex "male" / all rows)
    tibble(year = year,
           sex_ratio = sum(ad$sex %in% "male") / nrow(ad))
  })
# A tibble: 4 × 2
   year sex_ratio
  <int>     <dbl>
1  1997     0.481
2  1998     0.490
3  1999     0.512
4  2000     0.493

Work across a vector

Many functions in hyenaR accept a vector!

# One row per year, with the first and last day of that year as text
tibble(
  year = 1997:2000,
  start_date = paste0(year, "-01-01"),
  end_date = paste0(year, "-12-31")
) %>%
  mutate(
    # These fetch functions are given the whole start/end date columns
    nr_male = fetch_pop_number.male.adult(from = start_date, to = end_date),
    nr_ad = fetch_pop_number.anysex.adult(from = start_date, to = end_date),
    # Sex ratio: adult males divided by all adults
    sex_ratio = nr_male / nr_ad
  )
# A tibble: 4 × 6
   year start_date end_date   nr_male nr_ad sex_ratio
  <int> <chr>      <chr>        <int> <int>     <dbl>
1  1997 1997-01-01 1997-12-31      76   158     0.481
2  1998 1998-01-01 1998-12-31      77   157     0.490
3  1999 1999-01-01 1999-12-31      87   170     0.512
4  2000 2000-01-01 2000-12-31      99   201     0.493

Which is better?


  • hyenaR can be a bit slower (for now), but should be more user friendly.
[1] "purrr"
   user  system elapsed 
  1.576   0.048   1.623 
[1] "sapply"
   user  system elapsed 
  1.572   0.037   1.609 
[1] "for"
   user  system elapsed 
  1.572   0.028   1.601 
[1] "hyenaR function"
   user  system elapsed 
  3.052   0.052   3.103 

Which is better?


  • for and sapply are base R functions, so they need no extra packages.

  • purrr syntax works well for parallel processing packages.