Merge pull request #72 from CostaLab/develop

Develop
CostaLab · May 29, 2018 · 79d5b2c · 79d5b2c
2 parents 2685abb + 2566d3d
commit 79d5b2c
Show file tree

Hide file tree

Showing 63 changed files with 5,998 additions and 3,296 deletions.
diff --git a/data/motifs/createFpr.py b/data/motifs/createFpr.py
@@ -1,8 +1,8 @@
-
 # Import
 import sys
 from glob import glob
 from os.path import basename
+
 from MOODS import tools, parsers
 
 # Input
@@ -14,10 +14,10 @@
 pseudocounts = 1.0
 
 outFile = open(outFileName, "w")
-outFile.write("\t".join(["MOTIF"]+[str(e) for e in fprList])+"\n")
+outFile.write("\t".join(["MOTIF"] + [str(e) for e in fprList]) + "\n")
 
 # Iterating on all PWMs
-for pwmFileName in sorted(glob(inFolder+"*.pwm")):
+for pwmFileName in sorted(glob(inFolder + "*.pwm")):
     # Creating PSSM
     name = ".".join(basename(pwmFileName).split(".")[:-1])
 
@@ -33,6 +33,6 @@
         resVec.append(str(tools.threshold_from_p(pssm, bg, fpr)))
 
     # Writing results
-    outFile.write("\t".join(resVec)+"\n")
+    outFile.write("\t".join(resVec) + "\n")
 
 outFile.close()
diff --git a/data/motifs/createMtf.py b/data/motifs/createMtf.py
@@ -1,23 +1,23 @@
-
 ###################################################################################################
 ##### Annotation File Standard (tab-separated):
 # MATRIX_ID: The matrices' ID. It may change format for different repositories. (STRING)
 # PWM_NAME: Name of the PWM inside the respective repository. (STRING - without the .pwm)
 # SOURCE: The source repository of such matrix. (STRING)
 # VERSION: The version of such matrix (1 for 'primary motif', 2 for 'secondary motif', etc). (INT)
 # GENE_NAMES: Name of genes associated with such TF matrix. (LIST)
-# GROUP: <To be used in future - Will represent clusters of motifs>. (STRING)
+# GROUP: Name of the factor's "family", "class" or "cluster", depending on the repository. (STRING)
+# UNIPROT: UniProt accession for the transcription factor. (STRING)
 ###################################################################################################
 # * Mandatory fields: MATRIX_ID, SOURCE, VERSION, GENE_NAMES.
 # * Fields with multiple entries should be separated by ';' (no spaces).
 # * Fields with missing/non-existing/doesn't matter data should be filled with '.'
 # * Co-binding should be represented by '+' (no spaces).
+# * GROUP can be any string and may contain whitespace and punctuation (but no tabs).
 ###################################################################################################
 
 # Import
-import os
-import sys
 import glob
+import csv
 
 # Parameters
 dataLocation = "./"
@@ -28,27 +28,35 @@
 ###################################################################################################
 
 # Fetching file names
-# TODO: check if this still works for hocomoco v10
 source = "hocomoco"
-inputLocation = dataLocation+source+"/"
+inputLocation = dataLocation + source + "/"
 resultMatrix = []
-for inputFileName in glob.glob(inputLocation+"*.pwm"):
+hocomoco_anno = {}
+with open("hocomoco_anno.csv", "rb") as f:
+    csvf = csv.reader(f)
+    for l in csvf:
+        hocomoco_anno[l[0]] = l[1:]
+for inputFileName in glob.glob(inputLocation + "*.pwm"):
     ll = inputFileName.split("/")[-1].split(".")[0].split("_")
-    matrix_id = ll[1]
+    matrix_id = ll[0]
     pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
     version = "1"
-    gene_names = ll[0]
-    resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
+    gene_names = hocomoco_anno[pwm_name][0]
+    group = hocomoco_anno[pwm_name][1]
+    if not group:
+        group = "."
+    uniprot = hocomoco_anno[pwm_name][2]
+    data_source = hocomoco_anno[pwm_name][3]
+    resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, group, uniprot, data_source])
 
 # Sorting results by ID
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
+resultMatrix = sorted(resultMatrix, key=lambda x: x[0])
 
 # Writing to output file
-outputFileName = dataLocation+source+".mtf"
-outputFile = open(outputFileName,"w")
+outputFileName = dataLocation + source + ".mtf"
+outputFile = open(outputFileName, "w")
 for resultVec in resultMatrix:
-    outputFile.write("\t".join(resultVec)+"\n")
+    outputFile.write("\t".join(resultVec) + "\n")
 outputFile.close()
 
 ###################################################################################################
@@ -57,26 +65,38 @@
 
 # Fetching file names
 source = "jaspar_vertebrates"
-inputLocation = dataLocation+source+"/"
+inputLocation = dataLocation + source + "/"
 resultMatrix = []
-for inputFileName in glob.glob(inputLocation+"*.pwm"):
+jaspar_anno = {}
+with open("jaspar_anno.csv", "rb") as f:
+    csvf = csv.reader(f)
+    for l in csvf:
+        if not l:
+            continue
+        jaspar_anno[l[0]] = l[1:]
+for inputFileName in glob.glob(inputLocation + "*.pwm"):
     ll = inputFileName.split("/")[-1].split(".")
     matrix_id = ll[0]
     pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
     version = "1"
-    if(len(ll) > 4): version = ll[4]
-    gene_names = ll[2].replace("::","+")
-    resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
+    if len(ll) > 4:
+        version = ll[4]
+    gene_names = jaspar_anno[pwm_name][0]
+    group = jaspar_anno[pwm_name][1]
+    if not group:
+        group = "."
+    uniprot = jaspar_anno[pwm_name][2]
+    data_source = jaspar_anno[pwm_name][3]
+    resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, group, uniprot, data_source])
 
 # Sorting results by ID
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
+resultMatrix = sorted(resultMatrix, key=lambda x: x[0])
 
 # Writing to output file
-outputFileName = dataLocation+source+".mtf"
-outputFile = open(outputFileName,"w")
+outputFileName = dataLocation + source + ".mtf"
+outputFile = open(outputFileName, "w")
 for resultVec in resultMatrix:
-    outputFile.write("\t".join(resultVec)+"\n")
+    outputFile.write("\t".join(resultVec) + "\n")
 outputFile.close()
 
 ###################################################################################################
@@ -85,25 +105,24 @@
 
 # Fetching file names
 source = "transfac_public"
-inputLocation = dataLocation+source+"/"
+inputLocation = dataLocation + source + "/"
 resultMatrix = []
-for inputFileName in glob.glob(inputLocation+"*.pwm"):
+for inputFileName in glob.glob(inputLocation + "*.pwm"):
     ll = inputFileName.split("/")[-1].split(".")[0].split("_")
     matrix_id = ll[0]
     pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
     version = "1"
     gene_names = ll[1]
-    resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
+    resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, ".", ".", "."])  # group, uniprot, data_source: not available
 
 # Sorting results by ID
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
+resultMatrix = sorted(resultMatrix, key=lambda x: x[0])
 
 # Writing to output file
-outputFileName = dataLocation+source+".mtf"
-outputFile = open(outputFileName,"w")
+outputFileName = dataLocation + source + ".mtf"
+outputFile = open(outputFileName, "w")
 for resultVec in resultMatrix:
-    outputFile.write("\t".join(resultVec)+"\n")
+    outputFile.write("\t".join(resultVec) + "\n")
 outputFile.close()
 
 ###################################################################################################
@@ -112,25 +131,24 @@
 
 # Fetching file names
 source = "uniprobe_primary"
-inputLocation = dataLocation+source+"/"
+inputLocation = dataLocation + source + "/"
 resultMatrix = []
-for inputFileName in glob.glob(inputLocation+"*.pwm"):
+for inputFileName in glob.glob(inputLocation + "*.pwm"):
     ll = inputFileName.split("/")[-1].split(".")[0].split("_")
     matrix_id = ll[0]
     pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
     version = ll[1]
     gene_names = ll[2]
-    resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
+    resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, ".", ".", "."])  # group, uniprot, data_source: not available
 
 # Sorting results by ID
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
+resultMatrix = sorted(resultMatrix, key=lambda x: x[0])
 
 # Writing to output file
-outputFileName = dataLocation+source+".mtf"
-outputFile = open(outputFileName,"w")
+outputFileName = dataLocation + source + ".mtf"
+outputFile = open(outputFileName, "w")
 for resultVec in resultMatrix:
-    outputFile.write("\t".join(resultVec)+"\n")
+    outputFile.write("\t".join(resultVec) + "\n")
 outputFile.close()
 
 ###################################################################################################
@@ -139,25 +157,22 @@
 
 # Fetching file names
 source = "uniprobe_secondary"
-inputLocation = dataLocation+source+"/"
+inputLocation = dataLocation + source + "/"
 resultMatrix = []
-for inputFileName in glob.glob(inputLocation+"*.pwm"):
+for inputFileName in glob.glob(inputLocation + "*.pwm"):
     ll = inputFileName.split("/")[-1].split(".")[0].split("_")
     matrix_id = ll[0]
     pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
     version = ll[1]
     gene_names = ll[2]
-    resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
+    resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, ".", ".", "."])  # group, uniprot, data_source: not available
 
 # Sorting results by ID and version
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
-resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
+resultMatrix = sorted(resultMatrix, key=lambda x: x[0])
 
 # Writing to output file
-outputFileName = dataLocation+source+".mtf"
-outputFile = open(outputFileName,"w")
+outputFileName = dataLocation + source + ".mtf"
+outputFile = open(outputFileName, "w")
 for resultVec in resultMatrix:
-    outputFile.write("\t".join(resultVec)+"\n")
+    outputFile.write("\t".join(resultVec) + "\n")
 outputFile.close()
-
-