Merge pull request #4 from sapiris/master

mmaiers-nmdp · web-flow · commit b23d407a2189 · 2022-08-09T09:31:02.000-05:00
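update for pip: add UNK_priors option, split graph_instance out of impute_instance, read graph output paths from config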
diff --git a/grim/conf/minimal-configuration.json b/grim/conf/minimal-configuration.json
@@ -11,6 +11,7 @@
     "gamma": 1e-7,
     "delta": 0.4999999
   },
+  "UNK_priors": "SR",
   "FULL_LOCI": "ABCQR",
    "loci_map": {
     "A": 1,
@@ -19,6 +20,7 @@
     "DQB1": 4,
     "DRB1": 5
   },
+
   "factor_missing_data": 0.0001,
   "Plan_B_Matrix": [
                 [[1, 2, 3, 4, 5]],
diff --git a/grim/grim.py b/grim/grim.py
@@ -46,10 +46,13 @@ def impute(conf_file = ""):
         project_dir_in_file =  os.path.dirname(os.path.realpath(__file__)) + '/'
     runfile.run_impute(conf_file, project_dir_graph, project_dir_in_file)
 
-def impute_instance(config):
+def impute_instance(config, graph, count_by_prob= None):
+    imputation = Imputation(graph, config, count_by_prob)
+    return imputation
+
+def graph_instance(config):
     graph = Graph(config)
     graph.build_graph(config["node_file"], config["top_links_file"], config["edges_file"])
-    imputation = Imputation(graph, config)
-    return imputation
+    return graph
 
 
diff --git a/grim/imputation/graph_generation/generate_neo4j_multi_hpf.py b/grim/imputation/graph_generation/generate_neo4j_multi_hpf.py
@@ -11,6 +11,7 @@
 
 import sys
 import os
+
 #sys.path.insert(0, os.path.join(".."))
 
 
@@ -168,10 +169,10 @@ def labels_for_grap(conf, full_loci,csvdir):
 
     nodes_plan_b = list(set(nodes_plan_b))
     all_combo_list = list(dict.fromkeys(nodes_plan_a + nodes_plan_b + top_nodes_plan_b))
-    with open(csvdir + '/nodes_for_plan_a.txt', 'w') as f:
+    with open(csvdir + 'nodes_for_plan_a.txt', 'w') as f:
         for item in nodes_plan_a:
             f.write("%s\n" % item)
-    with open(csvdir + '/nodes_for_plan_b.txt', 'w') as f:
+    with open(csvdir + 'nodes_for_plan_b.txt', 'w') as f:
         for item in (nodes_plan_b + top_nodes_plan_b):
             f.write("%s\n" % item)
     #pickle.dump(nodes_plan_a, open(csvdir + '/nodes_for_plan_a.pkl', "wb"))
@@ -197,8 +198,8 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop
     # Configure
     ##############################################################################
     # set output directory and create it if it doesn't exist
-    csvdir = "output/csv"
-    pathlib.Path(csvdir).mkdir(parents=True, exist_ok=True)
+    #csvdir = "output/csv"
+
 
 
     # Input file
@@ -218,6 +219,11 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop
     with open(configuration_file) as f:
         conf = json.load(f)
 
+    csvdir = conf.get("graph_files_path")
+    pathlib.Path(csvdir).mkdir(parents=True, exist_ok=True)
+    if csvdir[-1] != '/':
+        csvdir += '/'
+
     pops = conf.get("populations")
     if em_pop:
         pops = em_pop
@@ -227,16 +233,19 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop
     if freq_file == "default":
         freq_file = os.path.dirname(os.path.realpath(__file__)) +  '/output/hpf.csv'
     dict_count_of_pop = {}
-    #if em:
-    for pop in pops:
-            dict_count_of_pop[pop] = freq_trim
-    """else:
-        project_dir = "../../"
-        pop_ratio_dir = project_dir + conf.get("pops_count_file", 'imputation/graph_generation/output/pop_ratio.txt')
+
+    pop_ratio_dir = conf.get("pops_count_file", os.path.dirname(os.path.realpath(__file__)) + '/imputation/graph_generation/output/pop_ratio.txt')
+    path = pathlib.Path(pop_ratio_dir)
+
+
+    if em or not path.is_file():
+        for pop in pops:
+                dict_count_of_pop[pop] = freq_trim
+    else:
         with open(pop_ratio_dir) as f_count:
             for line in f_count:
                 pop, count_pop, ratio = line.strip().split(',')
-                dict_count_of_pop[pop] = freq_trim / float(count_pop)"""
+                dict_count_of_pop[pop] = freq_trim / float(count_pop)
 
 
     # Display the configurations we are using
@@ -360,7 +369,7 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop
     # #### Build Nodes file
 
     header = ['haplotypeId:ID(HAPLOTYPE)', 'name', 'loci:LABEL', 'frequency:DOUBLE[]']
-    node_file = csvdir + '/nodes.csv'
+    node_file = csvdir + conf.get("node_csv_file")
     with open(node_file, mode='w') as csvfile:
         csv_writer = csv.writer(csvfile)
         csv_writer.writerow(header)
@@ -373,7 +382,7 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop
     # #### Build Edges File
 
     edgeheader = [':START_ID(HAPLOTYPE)', ':END_ID(HAPLOTYPE)', 'CP:DOUBLE[]', ':TYPE']
-    edge_file = csvdir + '/edges.csv'
+    edge_file = csvdir + conf.get("edges_csv_file")
     with open(edge_file, mode='w') as csvfile:
         csv_writer = csv.writer(csvfile)
         csv_writer.writerow(edgeheader)
@@ -396,7 +405,7 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop
     # #### Generate Top Links file
 
     topheader = [':START_ID(HAPLOTYPE)', ':END_ID(HAPLOTYPE)', ':TYPE']
-    top_links_file = csvdir + '/top_links.csv'
+    top_links_file = csvdir + conf.get("top_links_csv_file")
     with open(top_links_file, mode='w') as csvfile:
         csv_writer = csv.writer(csvfile)
         csv_writer.writerow(topheader)
@@ -412,7 +421,7 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop
     # #### Generate Info Node file
 
     infonode_header = ['INFO_NODE_ID:ID(INFO_NODE)', 'populations:STRING[]', 'INFO_NODE:LABEL']
-    top_links_file = csvdir + '/info_node.csv'
+    top_links_file = csvdir + conf.get("info_node_csv_file")
     with open(top_links_file, mode='w') as csvfile:
         csv_writer = csv.writer(csvfile)
         csv_writer.writerow(infonode_header)
diff --git a/grim/imputation/imputegl/impute.py b/grim/imputation/imputegl/impute.py
@@ -110,6 +110,7 @@ def __init__(self, net=None,config=None,  count_by_prob=None, verbose=False):
             self.populations = config["pops"]
             self.netGraph = net
             self.priorMatrix = np.ones((len(self.populations), len(self.populations)))
+            self.unk_priors = config["UNK_priors"]
 
             # For plan b
             #self.full_loci = config["full_loci"]
@@ -1419,7 +1420,11 @@ def call_comp_phase_prob(self, epsilon, n, phases, chr, MUUG_output, planb):
         # no plan b
         for level in range(2):
             if level == 1:
-                self.priorMatrix = np.ones((len(self.populations), len(self.populations)))  ####
+                if self.unk_priors == "MR":
+                    self.priorMatrix = np.ones((len(self.populations), len(self.populations)))
+                else:
+                    self.priorMatrix = np.identity(len(self.populations))
+                #self.priorMatrix = np.ones((len(self.populations), len(self.populations)))  ####
             if planb and len(res['Haps']) == 0:
                 self.plan = 'b'
                 epsilon = 1e-14
@@ -1615,7 +1620,10 @@ def update_prob_by_priority(self, res, race1, race2, priority):
 
     def impute_one(self, subject_id, gl, binary, race1, race2, priority, epsilon, n, MUUG_output, haps_output, planb, em):#em
         clean_gl = clean_up_gl(gl)
-        self.priorMatrix =  np.ones((len(self.populations), len(self.populations)))
+        if self.unk_priors == "MR":
+            self.priorMatrix =  np.ones((len(self.populations), len(self.populations)))
+        else:
+            self.priorMatrix = np.identity(len(self.populations))
         to_calc_prior_matrix = False
         if race1 or race2:
             race1 = race1.split(';')
diff --git a/grim/validation/runfile.py b/grim/validation/runfile.py
@@ -67,7 +67,8 @@ def run_impute(conf_file = "../conf/minimal-configuration.json", project_dir_gra
         "max_haplotypes_number_in_phase": json_conf.get("max_haplotypes_number_in_phase",100 ),
         "bin_imputation_input_file": project_dir_in_file + json_conf.get("bin_imputation_in_file", "None"),
         "nodes_for_plan_A": json_conf.get("Plan_A_Matrix", []),
-        "save_mode": json_conf.get("save_space_mode", False)
+        "save_mode": json_conf.get("save_space_mode", False),
+        "UNK_priors" : json_conf.get("UNK_priors", "MR")
 
     }
 
@@ -76,6 +77,7 @@ def run_impute(conf_file = "../conf/minimal-configuration.json", project_dir_gra
     print("Performing imputation based on:")
     print("\tPopulation: {}".format(config["pops"]))
     print("\tPriority: {}".format(config["priority"]))
+    print("\tUNK priors: {}".format(config["UNK_priors"]))
     print("\tEpsilon: {}".format(config["epsilon"]))
     print("\tPlan B: {}".format(config["planb"]))
     print("\tNumber of Results: {}".format(config["number_of_results"]))