1  Evaluation data: input/features

load packages
library(tidyverse) 

# data acquisition ----
#devtools::install_github("bergant/airtabler")
library(airtabler)

# data cleaning & shaping ----

# data analysis ----
# library(lme4)
# library(lmtest) # Testing Linear Regression Models

# markdown et al. ----
library(knitr)
library(bookdown)
library(quarto)
library(formattable) # Create 'Formattable' Data Structures

# others ----
library(here) # A Simpler Way to Find Your Files
#devtools::install_github("metamelb-repliCATS/aggreCAT")
library(aggreCAT)

# Make sure select is always the dplyr version
select <- dplyr::select 

# source DistributionWAggMOD
source(here("code", "DistAggModified.R"))

# options
options(knitr.duplicate.label = "allow")

Below, the evaluation data is input from an Airtable, which itself was largely hand-entered from evaluators’ reports. As the PubPub integration is built out (target: end of Sept. 2023), we will be able to include the ratings and predictions as structured data objects. We then plan to pull this data directly from PubPub (presumably via its API) into the present analysis. This should improve automation and limit the potential for data-entry errors.

input from airtable
#base_id <- "appbPYEw9nURln7Qg"
base_id <- "applDG6ifmUmeEJ7j" #new ID to cover "UJ - research & core members" base


# Set your Airtable API key
# (AIRTABLE_API_KEY should be set in your .Renviron file)
Sys.setenv(AIRTABLE_API_KEY = Sys.getenv("AIRTABLE_API_KEY"))

# Read data from a specific view

evals <- air_get(base = base_id, "output_eval") 

all_pub_records <- data.frame()
pub_records <- air_select(base = base_id, table = "crucial_rsx")

# Append the first batch of records to the data frame
all_pub_records <- bind_rows(all_pub_records, pub_records)

# While the most recent fetch returned 100 records (the per-request maximum), fetch more
while(nrow(pub_records) == 100) {
  # Get the pagination offset returned with the previous batch
  offset <- get_offset(pub_records)
  
  # Fetch the next batch of up to 100 records, starting from that offset
  pub_records <- air_select(base = base_id, table = "crucial_rsx", offset =  offset)
  
  # Append the records to the df
  all_pub_records <- bind_rows(all_pub_records, pub_records)
}

# housekeeping
rm(pub_records)
just the useful and publishable data, clean a bit
# clean evals names to snakecase
colnames(evals) <- snakecase::to_snake_case(colnames(evals))

evals_pub <- evals %>% 
  dplyr::rename(stage_of_process = stage_of_process_todo_from_crucial_research_2) %>% 
  mutate(stage_of_process = unlist(stage_of_process)) %>% 
  dplyr::filter(stage_of_process == "published") %>% 
    select(id, 
           crucial_research, 
           paper_abbrev, 
           evaluator_name, 
           category, 
           source_main, 
           author_agreement, 
           overall, 
           lb_overall, 
           ub_overall, 
           conf_index_overall, 
           advancing_knowledge_and_practice, 
           lb_advancing_knowledge_and_practice, 
           ub_advancing_knowledge_and_practice, 
           conf_index_advancing_knowledge_and_practice,
           methods_justification_reasonableness_validity_robustness,
           lb_methods_justification_reasonableness_validity_robustness,
           ub_methods_justification_reasonableness_validity_robustness,
           conf_index_methods_justification_reasonableness_validity_robustness, 
           logic_communication, lb_logic_communication, ub_logic_communication, 
           conf_index_logic_communication,
           engaging_with_real_world_impact_quantification_practice_realism_and_relevance,
           lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance,
           ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance,
           conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance,
           relevance_to_global_priorities, 
           lb_relevance_to_global_priorities, 
           ub_relevance_to_global_priorities, 
           conf_index_relevance_to_global_priorities, 
           journal_quality_predict, 
           lb_journal_quality_predict, 
           ub_journal_quality_predict,
           conf_index_journal_quality_predict, 
           open_collaborative_replicable, 
           conf_index_open_collaborative_replicable, 
           lb_open_collaborative_replicable, 
           ub_open_collaborative_replicable, 
           merits_journal, 
           lb_merits_journal, 
           ub_merits_journal, 
           conf_index_merits_journal)

# shorten names (before you expand into columns)
new_names <- c(
  "eval_name" = "evaluator_name",
  "cat" = "category",
  "crucial_rsx" = "crucial_research",
  "conf_overall" = "conf_index_overall",
  "adv_knowledge" = "advancing_knowledge_and_practice",
  "lb_adv_knowledge" = "lb_advancing_knowledge_and_practice",
  "ub_adv_knowledge" = "ub_advancing_knowledge_and_practice",
  "conf_adv_knowledge" = "conf_index_advancing_knowledge_and_practice",
  "methods" = "methods_justification_reasonableness_validity_robustness",
  "lb_methods" = "lb_methods_justification_reasonableness_validity_robustness",
  "ub_methods" = "ub_methods_justification_reasonableness_validity_robustness",
  "conf_methods" = "conf_index_methods_justification_reasonableness_validity_robustness",
  "logic_comms" = "logic_communication",
  "lb_logic_comms" = "lb_logic_communication",
  "ub_logic_comms" = "ub_logic_communication",
  "conf_logic_comms" = "conf_index_logic_communication",
  "real_world" = "engaging_with_real_world_impact_quantification_practice_realism_and_relevance",
  "lb_real_world" = "lb_engaging_with_real_world_impact_quantification_practice_realism_and_relevance",
  "ub_real_world" = "ub_engaging_with_real_world_impact_quantification_practice_realism_and_relevance",
  "conf_real_world" = "conf_index_engaging_with_real_world_impact_quantification_practice_realism_and_relevance",
  "gp_relevance" = "relevance_to_global_priorities",
  "lb_gp_relevance" = "lb_relevance_to_global_priorities",
  "ub_gp_relevance" = "ub_relevance_to_global_priorities",
  "conf_gp_relevance" = "conf_index_relevance_to_global_priorities",
  "journal_predict" = "journal_quality_predict",
  "lb_journal_predict" = "lb_journal_quality_predict",
  "ub_journal_predict" = "ub_journal_quality_predict",
  "conf_journal_predict" = "conf_index_journal_quality_predict",
  "open_sci" = "open_collaborative_replicable",
  "conf_open_sci" = "conf_index_open_collaborative_replicable",
  "lb_open_sci" = "lb_open_collaborative_replicable",
  "ub_open_sci" = "ub_open_collaborative_replicable",
  "conf_merits_journal" = "conf_index_merits_journal"
)

evals_pub <- evals_pub %>%
  rename(!!!new_names)

#  Create a list of labels with the old, longer names
labels <- str_replace_all(new_names, "_", " ") %>% str_to_title()

# Assign labels to the dataframe / tibble
# (maybe this can be done as an attribute, not currently working)
# for(i in seq_along(labels)) {
#    col_name <- new_names[names(new_names)[i]]
#    label <- labels[i]
#    attr(evals_pub[[col_name]], "label") <- label
#  }
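# A possible fix for the commented-out loop above (a sketch; the apparent issue is
# that it indexes by the old long names, which no longer exist after the rename --
# indexing by the new short names should work):
# for (i in seq_along(new_names)) {
#   attr(evals_pub[[names(new_names)[i]]], "label") <- labels[i]
# }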


# expand categories into columns, unlist everything
evals_pub <- evals_pub %>%
  tidyr::unnest_wider(cat, names_sep = "_") %>% # give each category its own column (cat_1, cat_2, ...)
  mutate(across(everything(), unlist))  # flatten remaining list-columns (TODO: check why some of these are lists in the first place)
  

# clean the Anonymous names: single-word names and names containing "Anonymous"
# are replaced with a unique Anonymous_<row> identifier; full names are kept as-is
evals_pub$eval_name <- ifelse(
  grepl("^\\b\\w+\\b$|\\bAnonymous\\b", evals_pub$eval_name),
  paste0("Anonymous_", seq_along(evals_pub$eval_name)),
  evals_pub$eval_name
)

#housekeeping
rm(evals)

#Todo -- check the unlist is not propagating the entry
#Note: category,  topic_subfield, and source have multiple meaningful categories. These will need care  
reshape evaluations into long format (one row per rating type)
evals_pub_long <- evals_pub %>% 
  pivot_longer(cols = -c(id, crucial_rsx, paper_abbrev, eval_name, 
                         cat_1,cat_2, cat_3,source_main,author_agreement),
               names_pattern = "(lb_|ub_|conf_)?(.+)",
               names_to = c("value_type", "rating_type")) %>% # one line per rating type
  mutate(value_type = if_else(value_type == "", "est_", value_type)) %>% #add main rating id
  pivot_wider(names_from = value_type, 
              values_from = value)

Reconcile uncertainty ratings and CIs

Where people gave only confidence-level ‘dots’, we impute CIs (confidence/credible intervals). We follow the correspondence described here. (Where evaluators gave explicit CIs, we use those directly.)1

5 = Extremely confident: 90% confidence interval spans +/- 4 points or less

For 0-100 ratings, we code the LB as \(\max(R - 4, 0)\) and the UB as \(\min(R + 4, 100)\), where R is the stated (middle) rating. This truncates the interval at the bounds of the 0-100 scale, so the implied interval has a maximum width of 8 points, attained only when R is between 4 and 96.

4 = Very confident: 90% confidence interval spans +/- 8 points or less

For 0-100 ratings, we code the LB as \(\max(R - 8, 0)\) and the UB as \(\min(R + 8, 100)\), where R is the stated (middle) rating.

3 = Somewhat confident: 90% confidence interval spans +/- 15 points or less

2 = Not very confident: 90% confidence interval spans +/- 25 points or less

We apply comparable scaling for the 2 and 3 ratings as for the 4 and 5 ratings.

1 = Not confident: 90% confidence interval spans more than +/- 25 points

Here we code the LB as \(\max(R - 37.5, 0)\) and the UB as \(\min(R + 37.5, 100)\).

This is just a first-pass. There might be a more information-theoretic way of doing this. On the other hand, we might be switching the evaluations to use a different tool soon, perhaps getting rid of the 1-5 confidence ratings altogether.
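
As a purely illustrative check of this correspondence (not part of the pipeline below): a 0-100 rating of 93 with a stated confidence of 4 implies an imputed 90% CI of [85, 100], after truncation at the top of the scale.

# illustrative only: impute a 90% CI from a 0-100 rating and a 1-5 confidence 'dot'
conf_to_width <- c("5" = 4, "4" = 8, "3" = 15, "2" = 25, "1" = 37.5)

rating <- 93
conf <- 4
width <- conf_to_width[[as.character(conf)]]
c(lb = max(rating - width, 0), ub = min(rating + width, 100))
#>  lb  ub 
#>  85 100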

reconcile explicit bounds and stated confidence level
# Define the baseline widths for each confidence rating
# JB: it would be good to have some more backing of whether this
# is a valid way to translate these confidence levels into CIs

# baseline_widths <- c(4, 8, 15, 25, 37.5)

# Lists of categories
rating_cats <- c("overall", "adv_knowledge", "methods", "logic_comms", "real_world", "gp_relevance", "open_sci")

#... 'predictions' are currently 1-5 (0-5?)
pred_cats <- c("journal_predict", "merits_journal")

#DR: Note that I use these objects in future chapters, but they are not connected to the data frame. Either one saves and reinputs the whole environment (messy, replicability issue), or you save this somewhere and re-input it or connect it to the data frame that gets saved (not sure how to do it), or you hard-code reinput it in the next chapter. 

#I do the latter for now, but I'm not happy about it, because the idea is 'input definitions in a single place to use later'

#JB: I don't really understand why not just hard code it in the next chapter. These are very short strings. Do you expect them to change often? If so, we can derive them from a dataframe somewhere in the future or save as a separate object. 
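
# One possible approach (a sketch, not currently run): save these small definition
# objects once here and re-read them in later chapters, e.g.:
# saveRDS(list(rating_cats = rating_cats, pred_cats = pred_cats),
#         here("data", "rating_categories.rds"))
# ...and then, in a later chapter:
# cats <- readRDS(here("data", "rating_categories.rds"))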


# JB: Rewritten functions for adding imputed ub and lb

# calc_bounds: calculate the imputed lower or upper bound for a single rating
# rating: the central (stated) rating
# conf: a confidence (1-5) score
# type: which bound to return ("lower" or "upper")
# scale: the rating scale (100 for 0-100 ratings, 5 for 1-5 predictions)
# This function is not vectorized
calc_bounds <- function(rating, conf, type, scale) {
  
  if(scale == 5){ #for the 'journal tier prediction case'
    baseline_width = case_match(conf, 
                                5 ~ .2, #4*5/100
                                4 ~ .4, #8*5/100
                                3 ~ .75, #15*5/100
                                2 ~ 1.25, #25*5/100
                                1 ~ 1.875, #37.5*5/100
                                .default = NA_real_) 

    upper = min(rating + baseline_width, 5)
    lower = max(rating - baseline_width, 1)
  }#/if(scale == 5)
  
  if(scale == 100){ #for the 'ratings case'
    
    baseline_width = case_match(conf, 
                                5 ~ 4, 
                                4 ~ 8, 
                                3 ~ 15, 
                                2 ~ 25, 
                                1 ~ 37.5,
                                .default = NA_real_)
    
    upper = min(rating + baseline_width, 100)
    lower = max(rating - baseline_width, 0)
  } #/if(scale == 100)
  
  if(type == "lower") return(lower)
  if(type == "upper") return(upper)
}


# calculate or find correct lower or upper bound
# based on rating type, and lb, ub, and conf values
impute_bounds <- function(var_name, est, lb, ub, conf, bound_type) {
  
  # get scale of each variable
  scale = if_else(var_name %in% c("journal_predict", "merits_journal"), # if variable is a prediction
                  5, 100) #scale is 5, else scale is 100
  # if calculating lower bound variable
  if(bound_type == "lower") { #we are calculating a lower bound imputation
    # 
    calculated_bound = map_dbl(.x = est, .f = calc_bounds, conf = conf, type = bound_type, scale = scale)
    
    imp_bound = if_else(is.na(lb), calculated_bound, lb)
  }
  
  # if calculating upper bound variable
  if(bound_type == "upper") { #we are calculating an upper bound imputation
    # 
    calculated_bound = map_dbl(.x = est, .f = calc_bounds, conf = conf, type = bound_type, scale = scale)
    imp_bound = if_else(is.na(ub), calculated_bound, ub)
  }
  
  return(imp_bound)
}
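
# Quick sanity check (illustrative, not run): a 0-100 rating of 93 with confidence 4
# and no explicit bounds should impute lb = 85 and ub = 100
# impute_bounds(var_name = "overall", est = 93, lb = NA_real_, ub = NA_real_,
#               conf = 4, bound_type = "lower") # 85
# impute_bounds(var_name = "overall", est = 93, lb = NA_real_, ub = NA_real_,
#               conf = 4, bound_type = "upper") # 100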

# apply functions to evals_pub_long
# where each row is one type of rating
# so each evaluation is 9 rows long
evals_pub_long <- evals_pub_long %>% 
  rowwise() %>% # apply function to each row
  mutate(lb_imp_ = impute_bounds(var_name = rating_type,
                                   est = est_,
                                   lb = lb_, ub = ub_, conf = conf_,
                                   bound_type = "lower")) %>% 
  mutate(ub_imp_ = impute_bounds(var_name = rating_type,
                                   est = est_,
                                   lb = lb_, ub = ub_, conf = conf_,
                                   bound_type = "upper"))

# Reshape evals_pub_long into evals_pub to add imputed bounds
evals_pub <- evals_pub_long %>% 
  pivot_wider(names_from = rating_type, # take the dataframe back to old format
              values_from = c(est_, ub_, lb_, conf_, lb_imp_, ub_imp_),
              names_sep = "") %>% 
  dplyr::rename_with(.cols = matches("^[ul]b_imp"),
                     .fn = gsub,
                     pattern = "(ub_imp|lb_imp)_(.+)", 
                     replacement = "\\2_\\1") %>% 
  dplyr::rename_with(.cols = starts_with("est_"),
                     .fn = gsub,
                     pattern = "est_(.+)",
                     replacement = "\\1")

# Clean evals_pub_long names (remove _ at end)
evals_pub_long <- evals_pub_long %>% 
  rename_with(.cols = ends_with("_"),
              .fn = str_remove,
              pattern = "_$")

We cannot publicly share the ‘papers under consideration’, but we can share some of the statistics on these papers. Let’s generate an ID (or, later, a salted hash) for each such paper, and keep only the shareable features of interest.

keep shareable variables from all papers
all_papers_p <- all_pub_records %>% 
  dplyr::select(
    id,
    category,
    cfdc_DR,
    'confidence -- user entered',
    cfdc_assessor,
    avg_cfdc,
    cause_cat_1_text,
    cause_cat_2_text,
    topic_subfield_text,
    eval_manager_text,
    'publication status',
    'Contacted author?',
    'stage of process/todo',
    'source_main',
    'author permission?',
    'Direct Kotahi Prize Submission?',
    'createdTime'
  )
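
The salted-hash ID mentioned above is not implemented yet (the Airtable record id is kept for now). A minimal sketch of one way to do it, assuming the digest package is available and a hypothetical PAPER_ID_SALT environment variable holds a private salt:

# not run: generate a salted-hash ID per paper (digest package assumed)
# salt <- Sys.getenv("PAPER_ID_SALT")  # hypothetical: a private salt kept out of the repo
# all_papers_p <- all_papers_p %>%
#   mutate(paper_hash = purrr::map_chr(id, ~ digest::digest(paste0(.x, salt), algo = "sha256")))
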
Create and add aggregated ratings information to evals_pub_long
# paper_ratings: even longer dataframe (ie tidy)
# renamed to conform to aggreCAT nomenclature

paper_ratings <- evals_pub_long %>% 
  select(id, eval_name, paper_abbrev, rating_type, est, lb_imp, ub_imp) %>% 
  filter(rating_type %in% rating_cats) %>%
  rename(paper_id = paper_abbrev,
         user_name = eval_name,
         three_point_lower = lb_imp,
         three_point_upper = ub_imp,
         three_point_best = est) %>%
  mutate(round = "round_1") %>% 
  pivot_longer(cols = starts_with("three_point"),
               names_to = "element",
               values_to = "value")

# calculate aggregated upper and lower bounds using modified
# aggreCAT function DistributionWAggMOD
paper_agg_ratings <- paper_ratings %>% 
  group_by(rating_type, user_name, paper_id) %>% 
  filter(sum(is.na(value))==0) %>% #remove ratings (best, ub, lb) that have NA values
  group_by(rating_type) %>% 
  nest() %>% 
  mutate(results = map(.x = data, .f = DistributionWAggMOD, 
                       round_2_filter = FALSE, percent_toggle = T)) %>% 
  unnest(results) %>% 
  select(-data) %>% 
  select(paper_id, rating_type, everything()) %>%   
  arrange(paper_id) %>% 
  mutate(across(.cols = c("agg_est", "agg_90ci_lb", "agg_90ci_ub"), .fns = ~.x*100)) %>% 
  rename(agg_method = method)

rm(paper_ratings)

evals_pub_long <- evals_pub_long %>% 
  left_join(paper_agg_ratings, by = c("paper_abbrev" = "paper_id", "rating_type"="rating_type"))
create a dataset to be used as the input to the shiny app
evals_pub_long %>% 
  mutate(rating_type = factor(rating_type, 
                              levels = c(rating_cats, pred_cats),
                              labels = c("Overall assessment",
                                         "Advances our knowledge & practice",  
                                         "Methods: justification, reasonableness, validity, robustness", 
                                         "Logic and communication", 
                                         "Engages with real-world, impact quantification",
                                         "Relevance to global priorities",
                                         "Open, collaborative, replicable science and methods",
                                         "Predicted Journal", "Merits Journal"))) %>% 
  write_rds(file = here("shinyapp/DataExplorer", "shiny_explorer.rds"))
save data for others’ use
all_papers_p %>% saveRDS(file = here("data", "all_papers_p.Rdata"))
all_papers_p %>% write_csv(file = here("data", "all_papers_p.csv"))

evals_pub %>% saveRDS(file = here("data", "evals.Rdata"))
evals_pub %>% write_csv(file = here("data", "evals.csv"))

evals_pub_long %>% write_rds(file = here("data", "evals_long.rds"))
evals_pub_long %>% write_csv(file = here("data", "evals_long.csv"))

#evals_pub <- readRDS(file = here("data", "evals.Rdata"))

  1. Note this is only a first-pass; a more sophisticated approach may be warranted in future.