This vignette demonstrates how to use the AEGIS API to generate simulation benchmark data. The procedure repeatedly creates a random context graph and records how different methods perform under each graph realization. For more interesting interpretations, we recommend sweeping over different regimes of sample sizes (or effect sizes) and then using the flat-file outputs from AEGIS for further processing.
In the second part of this vignette (Benchmarking DAG-based Testing: Part II), we show how to test functions implemented in R (instead of Python), a use case that requires a different programming environment. Both parts of the vignette can be customized to particular needs in the appropriate programming environments.
The main script that automates this procedure is provided in benchmark_generate_all_graphs.py, which requires two arguments: cache_dir, the directory where the DAG object stores local data during GODAGraph initialization, and sim_dir, the directory where the benchmark results will be stored. To run it, simply launch:
%run benchmark_generate_all_graphs.py {cache_dir} {sim_dir}
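(The %run magic assumes an IPython/Jupyter session; from a plain shell, the equivalent is python benchmark_generate_all_graphs.py {cache_dir} {sim_dir}.)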
Now, we break down the code in benchmark_generate_all_graphs.py step by step.
import logging
import numpy as np
import os
from server.dagraph import GODAGraph
You need to specify MAIN_FOLDER, which should contain a local/ directory with the cached .pkl files. As a shortcut, you can download them directly from http://stanford.edu/~jjzhu/fileshare/aegis/ (e.g., unzip local_20180719.tar.gz locally); a scripted alternative is sketched after the configuration below.
logger = logging.getLogger(__name__)
logging.basicConfig(format='[%(asctime)s %(name)s %(levelname)s] %(message)s',
datefmt='%I:%M:%S', level=logging.INFO)
MAIN_FOLDER = "/YOUR/LOCAL/FOLDER"
cache_dir = os.path.join(MAIN_FOLDER, "local")
sim_dir = os.path.join(MAIN_FOLDER, "benchmarks", "large_20180628")
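If you prefer to script the download instead, here is a minimal sketch; it assumes the archive name shown above is still available at that URL and that it unpacks into a local/ directory under MAIN_FOLDER:
import tarfile
import urllib.request
# Fetch and unpack the cached .pkl files (assumes the archive layout
# described above; adjust the file name if a newer snapshot is posted).
url = "http://stanford.edu/~jjzhu/fileshare/aegis/local_20180719.tar.gz"
archive = os.path.join(MAIN_FOLDER, "local_20180719.tar.gz")
urllib.request.urlretrieve(url, archive)
with tarfile.open(archive, "r:gz") as tar:
    tar.extractall(MAIN_FOLDER)  # should create MAIN_FOLDER/local/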
The following parameters set up the entire benchmark. For sanity checks, reduce the parameters n_graphs, n_reps, and n_regimes: creating 50 graphs rooted at nodes of 500-1500 genes (with 5 regimes and 10 repetitions each) should take less than a minute, whereas a full-scale, complicated simulation is typically doable within hours (one way to apply such reduced settings is sketched after the parameter setup below). Note that the current implementation creates a number of small files for bookkeeping, so you may need to clean them all up after the analysis is done (or use some of our helper functions to do so; a plain-Python alternative appears at the end of this vignette).
def setup_random_graph_example():
    ontology_params = {
"ontology": "biological_process",
"species": "human"
}
context_params = {
'anchor_rule': 'root',
'refine_graph': True,
'min_node_size': '1',
'max_node_size': '17439'
}
graph_params = {
"n_graphs": 200,
"min_root_size": 500,
"max_root_size": 10000,
"min_graph_size": 100,
"max_signal_genes": 5,
"signal_type": "single_leaf",
}
sim_params = {
"n_regimes": 5,
"n_reps" : 100,
"min_n" : 10,
"max_n": 130,
"eff_size": 0.5,
"sweep_sample_size": True
}
# output summary
logger.info("Generating {} graphs:".format(graph_params["n_graphs"]))
logger.info(" random root nodes in range {}-{}".format(
graph_params["min_root_size"], graph_params["max_root_size"]))
logger.info(" random signal anchor ({} with max {} genes sampled)".format(
graph_params["signal_type"], graph_params["max_signal_genes"]))
logger.info(" minimum graph size {}".format(graph_params["min_graph_size"]))
logger.info("Universal simulation parameters:")
logger.info(" signal gene effect size: {}".format(sim_params["eff_size"]))
logger.info(" sample size range {}-{} ({} regimes, {} repetitions)".format(
sim_params["min_n"], sim_params["max_n"],
sim_params["n_regimes"], sim_params["n_reps"]))
return {
"ontology_params": ontology_parmams,
"graph_params": graph_params,
"context_params": context_params,
"sim_params": sim_params,
}
all_params = setup_random_graph_example()
g_params = all_params["graph_params"]
c_params = all_params["context_params"]
s_params = all_params["sim_params"]
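To reproduce the quick sanity check described earlier (50 graphs, roots of 500-1500 genes, 5 regimes, 10 repetitions each), one way is to shrink the parameters here before constructing the DAG:
# Reduced sanity-check settings (see the note above); the full-scale
# defaults from setup_random_graph_example() are overridden in place.
g_params["n_graphs"] = 50
g_params["min_root_size"] = 500
g_params["max_root_size"] = 1500
s_params["n_regimes"] = 5
s_params["n_reps"] = 10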
dag = GODAGraph(cache_dir,
ontology=all_params["ontology_params"]["ontology"],
species=all_params["ontology_params"]["species"],
name="random_contexts")
dag.setup_full_dag(use_cache=True)
general_info = dag.output_general_info()
go_anns = general_info['search_dict']
go_sizes = general_info["go_gene_cnt"]
cand_roots = []
for term in go_sizes:
n_genes = go_sizes[term]
if ((n_genes <= g_params["max_root_size"]) &
(n_genes >= g_params["min_root_size"])):
cand_roots.append(term)
logger.info("Number of candidates: {}".format(len(cand_roots)))
The following function takes a set of leaves, selects one of them (the first encountered) to carry the signal, sampling at most max_signal_genes of its genes, and runs the simulation pipeline on a single graph.
def run_single_graph_simulation(dag, leaves, g_params, s_params, outdir):
logger.info("Starting simulation...")
stat = dag.main_statistician
logger.info("Number of leaves: {}".format(len(leaves)))
leaf_genes = {}
for leaf in leaves:
leaf_genes[leaf] = stat.go_gene_map[stat.name_id_map[leaf]]
g_list = []
for leaf in leaf_genes: # draw the first leaf we see
lgenes = leaf_genes[leaf]
        if len(lgenes) > g_params["max_signal_genes"]: # subsample signal genes at random
lgenes = np.array(lgenes)
lgenes = lgenes[np.random.choice(len(lgenes),
g_params["max_signal_genes"],
replace=False)]
lgenes = lgenes.tolist()
g_list += lgenes
break
# determine the nulls and nonnulls
stat.determine_non_null(g_list)
stat.method_test = ["simes", "hypergeometric.ga"]
stat.method_madj = ["Bonferroni", "BH"]
logger.info("Number of non-nulls: self ({}); comp ({})".format(
len(stat.nonnull_nodes["self_nonnull"]),
len(stat.nonnull_nodes["comp_nonnull"])
))
stat.setup_simulation_oneway(s_params)
# run the simulation
logger.info("Saving results to {}".format(outdir))
dag.launch_simulation_pipeline(outdir)
Once everything is specified, launch the simulation here. This process could take a while if the parameters are large.
np.random.seed(11111) # fix random seed for reproducibility
selected_roots = {}
iter_max = 10000
iter_i = 0
graph_i = 0
while len(selected_roots) < g_params["n_graphs"]:
iter_i += 1
if (iter_i > iter_max):
logger.warning("Reached maximum allowable iteration")
break
# draw a new random root from the candidates
rand_root = cand_roots[np.random.choice(len(cand_roots))]
if (rand_root in selected_roots):
logger.warning("Graph already processed!")
continue
logger.info("Processing root {} ({} genes, {})".format(
rand_root, go_sizes[rand_root], go_anns[rand_root]))
# create a context graph
_, context_info = dag.setup_context_graph(
c_params["anchor_rule"],
[rand_root],
min_w = int(c_params["min_node_size"]), # int
max_w = int(c_params["max_node_size"]), # int
refine_graph = c_params["refine_graph"], # boolean
store_context=True
)
# re-draw root if the context graph is too small
graph_size = len(dag.context_graph.sorted_nodes)
if graph_size < g_params["min_graph_size"]:
logger.warning("Graph too small!")
continue
# re-draw root if the root itself is redundant
if set(context_info["roots"]) != set([rand_root]):
logger.warning("Anchor is redundant!")
continue
# start the simulation on a single graph
selected_roots[rand_root] = graph_size
outdir = os.path.join(sim_dir, "graph_{}".format(graph_i))
run_single_graph_simulation(dag,
context_info["leaves"],
g_params,
s_params,
outdir)
graph_i += 1
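Finally, as noted earlier, the run leaves behind many small bookkeeping files. If you do not use the helper functions, a minimal plain-Python cleanup sketch is the following; it is destructive, so double-check sim_dir before running it:
import shutil
# Remove the per-graph output directories (graph_0, graph_1, ...) under
# sim_dir once the flat-file results have been collected elsewhere.
for entry in os.listdir(sim_dir):
    path = os.path.join(sim_dir, entry)
    if entry.startswith("graph_") and os.path.isdir(path):
        shutil.rmtree(path)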