This vignette continues the demonstration begun in Benchmarking DAG-based Testing: Part I of how to use the AEGIS API to create simulation benchmarks. The previous part generated the benchmark data; this part runs testing methods on those benchmarks. In particular, we investigate DAGGER (a sequential testing algorithm) in the R programming environment.
library(methods)
library(jsonlite)
library(dplyr)
library(ggplot2)
library(reshape2)
library(igraph)
The analysis pipeline is stored in benchmark_process_all_graphs.R, which requires two arguments:
- simulation_dir: the directory where the simulated data is stored (from Part I), and
- results_dir: the directory where the results will be stored.
The function process_one_graph() is the core function used in the R script; a sketch of the driver loop follows the setup code below.
script_dir = "/PATH/TO/PROVIDED/R/SCRIPTS"
simulation_dir = "/PATH/TO/SIMULATION/OUTPUT/DIRECTORY"
results_dir = "/PATH/TO/RESULT/DIRECTORY"
source(file.path(script_dir,"DAGGER.R"))
source(file.path(script_dir,"plot_graph.R"))
source(file.path(script_dir,"plot_results_one_graph.R"))
source(file.path(script_dir,"construct_graph_metrics.R"))
Here we break down the core components of the function process_one_graph() used above.
The metadata required are read in below. The intermediate files from AEGIS's simulation are stored in JSON format, which is readable by most commonly used programming languages.
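To make the format concrete, here is a toy round trip with jsonlite; the contents are illustrative only, not actual AEGIS output. As the reading code below implies, children is a list of length n whose i-th element holds the 0-based indices of node i's children.
# illustrative toy DAG with 4 nodes; each element lists a node's
# children as 0-based indices, mirroring the assumed file format
toy_children = list(c(1, 2), c(3), c(3), integer(0))
toy_file = tempfile(fileext = ".json")
write_json(toy_children, path = toy_file)
read_json(toy_file, simplifyVector = TRUE) # recovers the list of child vectors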
graph = 1 # the first graph to plot (example)
data_dir = file.path(simulation_dir, sprintf("graph_%d", graph-1))
children_filename = sprintf("%s/meta_gochild_dag.json", data_dir)
groups_filename = sprintf("%s/meta_gogene_map.json", data_dir)
ground_truth_self_filename = sprintf("%s/meta_self_nonnull_nodes.json", data_dir)
ground_truth_comp_filename = sprintf("%s/meta_comp_nonnull_nodes.json", data_dir)
children = read_json(children_filename, simplifyVector = TRUE)
groups = read_json(groups_filename, simplifyVector = TRUE)
non_nulls_self_idx = read_json(ground_truth_self_filename, simplifyVector = TRUE)+1 # 0-based -> 1-based index
non_nulls_comp_idx = read_json(ground_truth_comp_filename, simplifyVector = TRUE)+1 # 0-based -> 1-based index
params_filename = sprintf("%s/meta_restore_params.json", data_dir)
all_params = read_json(params_filename, simplifyVector = TRUE)
q = all_params$test_params$method_alpha
pvals_types = all_params$test_params$method_test
methods = c("BH", "DAGGER")
n = length(children)
num_methods = length(methods)
num_pvals_types = length(pvals_types)
num_regimes = all_params$oneway_params$n_regimes
num_reps = all_params$oneway_params$n_reps
num_trials = num_regimes * num_reps
message(sprintf("Number of trials to process: %s", num_trials))
The number of trials is the number of regimes times the number of repetitions per regime.
Note that in our pipeline each trial is stored in a separate folder, which makes parallel processing straightforward (see the sketch after the loop below).
P_all = array(0, c(n, num_trials, num_pvals_types))
regimes = numeric(num_trials)
replicates = numeric(num_trials)
for(trial_id in 1:num_trials){
  params_filename = sprintf("%s/trial_%d/params.json",
                            data_dir, trial_id-1) # 0-based -> 1-based index
  params = read_json(params_filename, simplifyVector = TRUE)
  regimes[trial_id] = params$regime
  replicates[trial_id] = params$replicate
  for(pvals_type_idx in 1:num_pvals_types){
    pvals_type = pvals_types[pvals_type_idx]
    pvals_filename = sprintf("%s/trial_%d/node_pvals_%s.json",
                             data_dir, trial_id-1, pvals_type)
    P_all[,trial_id, pvals_type_idx] = read_json(pvals_filename, simplifyVector = TRUE)
  }
}
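Because each trial folder is self-contained, the read loop above parallelizes directly. A sketch using the base parallel package; the core count and the final reshaping are our own choices, not part of the provided scripts, and mclapply forks, so it runs serially on Windows:
library(parallel)
# one worker per trial folder; each returns an n x num_pvals_types matrix
P_list = mclapply(1:num_trials, function(trial_id) {
  sapply(pvals_types, function(pvals_type) {
    read_json(sprintf("%s/trial_%d/node_pvals_%s.json",
                      data_dir, trial_id - 1, pvals_type),
              simplifyVector = TRUE)
  })
}, mc.cores = 2)
# reassemble into the same n x num_trials x num_pvals_types layout as P_all
P_all_parallel = aperm(simplify2array(P_list), c(1, 3, 2))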
This preprocessing is specific to DAGGER, but other methods can also make use of our DAG data structure, stored in the variable children (see the igraph sketch below).
For bookkeeping, we also compute some graph metrics, which we use for plotting later.
# shift to 1-based indexing
children = lapply(children, function(v) v + 1) # lapply keeps a list even when all nodes have the same number of children
# preprocess graph for DAGGER
G = preprocess_graph_for_DAGGER(children, groups)
# get graph metrics
graph_metrics = construct_graph_metrics(G, non_nulls_self_idx, non_nulls_comp_idx)
graph_metrics$Graph = graph-1
The DAGGER-related functions are provided in DAGGER.R.
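As an aside, since igraph is already loaded, the (now 1-based) children list converts readily to an igraph object for ad hoc inspection; the edge-list construction below is our own sketch, not part of the provided scripts:
# build a parent -> child edge list from the 1-based children list
edges = do.call(rbind, lapply(seq_along(children), function(i) {
  if (length(children[[i]]) > 0) cbind(i, children[[i]]) else NULL
}))
dag = graph_from_edgelist(edges, directed = TRUE)
is_dag(dag) # sanity check: should be TRUE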
# create non-null indicator vector
non_nulls_self = logical(n)
non_nulls_self[non_nulls_self_idx] = TRUE
non_nulls_comp = logical(n)
non_nulls_comp[non_nulls_comp_idx] = TRUE
# initialize results data frame
results = data.frame(matrix(0, num_trials*num_pvals_types*num_methods, 8,
                            dimnames = list(NULL,
                                            c("Trial", "Replicate", "Regime", "Method",
                                              "P.value.Type", "Graph", "FDR", "Power"))))
# initialize nested lists of rejections, indexed by p-value type, method, and trial
rejections = vector("list", num_pvals_types)
rejections = setNames(rejections, pvals_types) # assign the result: setNames() does not modify in place
for(pvals_type_idx in 1:num_pvals_types){
  rejections[[pvals_type_idx]] = vector("list", num_methods)
  rejections[[pvals_type_idx]] = setNames(rejections[[pvals_type_idx]], methods)
  for(method_idx in 1:num_methods){
    rejections[[pvals_type_idx]][[method_idx]] = vector("list", num_trials)
  }
}
# carry out multiplicity adjustment with BH and DAGGER
idx = 1
for(trial_id in 1:num_trials){
  for(pvals_type_idx in 1:num_pvals_types){
    pvals_type = pvals_types[pvals_type_idx]
    P = P_all[,trial_id, pvals_type_idx]
    # match each p-value type to its ground-truth non-null set (self vs. competitive)
    if(pvals_type %in% c("simes", "binomial")){
      non_nulls = non_nulls_self
    }
    if(pvals_type %in% c("hypergeometric.ga", "hypergeometric.gs")){
      non_nulls = non_nulls_comp
    }
    for(method_idx in 1:num_methods){
      method = methods[method_idx]
      switch(method,
             "BH" = {
               R = p.adjust(P, method = "fdr") <= q
             },
             "DAGGER" = {
               R = DAGGER(P, G, q)
             }
      )
      rejections[[pvals_type_idx]][[method_idx]][[trial_id]] = which(R)-1 # back to 0-based indices
      results$Trial[idx] = trial_id
      results$Replicate[idx] = replicates[trial_id]
      results$Regime[idx] = regimes[trial_id]
      results$Method[idx] = method
      results$P.value.Type[idx] = pvals_type
      results$Graph[idx] = graph-1
      results$FDR[idx] = sum(R & !non_nulls)/max(1,sum(R))
      results$Power[idx] = sum(R & non_nulls)/sum(non_nulls)
      idx = idx + 1
    }
  }
}
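As a quick sanity check on the FDR and power formulas above, consider a toy configuration with five hypotheses (the values are illustrative only):
# hypotheses 1-3 are non-null; we reject hypotheses 2, 3, and 4
R_toy = c(FALSE, TRUE, TRUE, TRUE, FALSE)
non_nulls_toy = c(TRUE, TRUE, TRUE, FALSE, FALSE)
sum(R_toy & !non_nulls_toy) / max(1, sum(R_toy)) # false discovery proportion: 1/3
sum(R_toy & non_nulls_toy) / sum(non_nulls_toy)  # power: 2/3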
Here is one way to visualize one of the graphs in R.
options(repr.plot.width=6, repr.plot.height=6)
plot_graph(G, non_nulls_self_idx)
# plot the results summary for this graph
options(repr.plot.width=6, repr.plot.height=2)
plot_one_graph_result(graph, simulation_dir, results_dir)
# save the per-graph results and graph metrics for downstream aggregation
results_filename = sprintf("%s/results_graph_%d.Rda", results_dir, graph-1)
graph_metrics_filename = sprintf("%s/graph_metrics_graph_%d.Rda", results_dir, graph-1)
save(file = results_filename, results)
save(file = graph_metrics_filename, graph_metrics)
# write the rejection sets (0-based node indices) to JSON,
# one file per (p-value type, method) pair
for(pvals_type_idx in 1:num_pvals_types){
  for(method_idx in 1:num_methods){
    pvals_type = pvals_types[pvals_type_idx]
    method = methods[method_idx]
    rejections_filename = sprintf("%s/rejections_%s_%s_graph_%d.json",
                                  results_dir, pvals_type, method, graph-1)
    write_json(rejections[[pvals_type_idx]][[method_idx]], path = rejections_filename)
  }
}
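Once several graphs have been processed, the per-graph result files can be reloaded and row-bound for cross-graph summaries; a sketch, where num_graphs is an assumption:
num_graphs = 10 # assumption: number of processed graphs
all_results = do.call(rbind, lapply(seq_len(num_graphs) - 1, function(g) {
  load(sprintf("%s/results_graph_%d.Rda", results_dir, g)) # loads `results`
  results
}))
str(all_results)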