### Analysis code for Dr. Granger's project

get_gc_content <- function(seq){
    # Determine the GC content of a sequence
    gc_content = 100 * str_count(seq, 'G') + str_count(seq, 'C') / 
                 str_length(seq)
}

get_size_class <- function(earlength){
    # Determine the size class of earlength based on Dr. Grangers specification
  if (earlength > 15){
      size_class = 'extralarge'
  } else if (earlength > 10){
      size_class = 'large'
  } 
  
  if (earlength < 8){
      size_class = 'medium'
  } else {
      size_class = 'small'
  return(earlength)
}

elves_data <- read.csv('houseelf-earlength-dna-data.csv')

# Determine individual level earth length category and gc content values
gc_content <- get_gc_content(elves_data[["dnaseq"]])

earlength_size_class <- c()
for (earlength in elves_data[["earlength"]]){
  earlength_size_class <- c(earlength_size_class, get_size_class(earlength))
}

results <- data.frame(indiv_id = elves_data[["id"]], 
                      earlength_class = earlength_class, 
                      gc_content = gc_content)

# Get average values of gc content for each size class
by_size_class = group_by(results, earlength_class)
results <- summarize(by_size_class, avg_gc_content = mean(gc_content))
write.csv(results, 'grangers_output.csv')
