Skip to content

Commit

Permalink
Merge pull request #72 from CostaLab/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
lzj1769 authored May 29, 2018
2 parents 2685abb + 2566d3d commit 79d5b2c
Show file tree
Hide file tree
Showing 63 changed files with 5,998 additions and 3,296 deletions.
8 changes: 4 additions & 4 deletions data/motifs/createFpr.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@

# Import
import sys
from glob import glob
from os.path import basename

from MOODS import tools, parsers

# Input
Expand All @@ -14,10 +14,10 @@
pseudocounts = 1.0

outFile = open(outFileName, "w")
outFile.write("\t".join(["MOTIF"]+[str(e) for e in fprList])+"\n")
outFile.write("\t".join(["MOTIF"] + [str(e) for e in fprList]) + "\n")

# Iterating on all PWMs
for pwmFileName in sorted(glob(inFolder+"*.pwm")):
for pwmFileName in sorted(glob(inFolder + "*.pwm")):
# Creating PSSM
name = ".".join(basename(pwmFileName).split(".")[:-1])

Expand All @@ -33,6 +33,6 @@
resVec.append(str(tools.threshold_from_p(pssm, bg, fpr)))

# Writing results
outFile.write("\t".join(resVec)+"\n")
outFile.write("\t".join(resVec) + "\n")

outFile.close()
117 changes: 66 additions & 51 deletions data/motifs/createMtf.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@

###################################################################################################
##### Annotation File Standard (tab-separated):
# MATRIX_ID: The matrix's ID. Its format may differ between repositories. (STRING)
# PWM_NAME: Name of the PWM inside the respective repository. (STRING - without the .pwm)
# SOURCE: The source repository of the matrix. (STRING)
# VERSION: The version of the matrix (1 for 'primary motif', 2 for 'secondary motif', etc). (INT)
# GENE_NAMES: Names of the genes associated with the TF matrix. (LIST)
# GROUP: <To be used in future - Will represent clusters of motifs>. (STRING)
# GROUP: Name of factor "family" or "class" or "cluster", depending on repository. (STRING)
# UniProt: UniProt accession for the transcription factor. (STRING)
###################################################################################################
# * Mandatory fields: MATRIX_ID, SOURCE, VERSION, GENE_NAMES.
# * Fields with multiple entries should be separated by ';' (no spaces).
# * Fields whose data is missing, non-existent, or irrelevant should be filled with '.'
# * Co-binding should be represented by '+' (no spaces).
# * Group can be any string, and it may also contain whitespaces and punctuation (no tab).
###################################################################################################

# Import
import os
import sys
import glob
import csv

# Parameters
dataLocation = "./"
Expand All @@ -28,27 +28,35 @@
###################################################################################################

# Fetching file names
# TODO: check if this still works for hocomoco v10
source = "hocomoco"
inputLocation = dataLocation+source+"/"
inputLocation = dataLocation + source + "/"
resultMatrix = []
for inputFileName in glob.glob(inputLocation+"*.pwm"):
hocomoco_anno = {}
with open("hocomoco_anno.csv", "rb") as f:
csvf = csv.reader(f)
for l in csvf:
hocomoco_anno[l[0]] = l[1:]
for inputFileName in glob.glob(inputLocation + "*.pwm"):
ll = inputFileName.split("/")[-1].split(".")[0].split("_")
matrix_id = ll[1]
matrix_id = ll[0]
pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
version = "1"
gene_names = ll[0]
resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
gene_names = hocomoco_anno[pwm_name][0]
group = hocomoco_anno[pwm_name][1]
if not group:
group = "."
uniprot = hocomoco_anno[pwm_name][2]
data_source = hocomoco_anno[pwm_name][3]
resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, group, uniprot, data_source])

# Sorting results by ID
resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
resultMatrix = sorted(resultMatrix, key=lambda x: x[0])

# Writing to output file
outputFileName = dataLocation+source+".mtf"
outputFile = open(outputFileName,"w")
outputFileName = dataLocation + source + ".mtf"
outputFile = open(outputFileName, "w")
for resultVec in resultMatrix:
outputFile.write("\t".join(resultVec)+"\n")
outputFile.write("\t".join(resultVec) + "\n")
outputFile.close()

###################################################################################################
Expand All @@ -57,26 +65,38 @@

# Fetching file names
source = "jaspar_vertebrates"
inputLocation = dataLocation+source+"/"
inputLocation = dataLocation + source + "/"
resultMatrix = []
for inputFileName in glob.glob(inputLocation+"*.pwm"):
jaspar_anno = {}
with open("jaspar_anno.csv", "rb") as f:
csvf = csv.reader(f)
for l in csvf:
if not l:
continue
jaspar_anno[l[0]] = l[1:]
for inputFileName in glob.glob(inputLocation + "*.pwm"):
ll = inputFileName.split("/")[-1].split(".")
matrix_id = ll[0]
pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
version = "1"
if(len(ll) > 4): version = ll[4]
gene_names = ll[2].replace("::","+")
resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
if len(ll) > 4:
version = ll[4]
gene_names = jaspar_anno[pwm_name][0]
group = jaspar_anno[pwm_name][1]
if not group:
group = "."
uniprot = jaspar_anno[pwm_name][2]
data_source = jaspar_anno[pwm_name][3]
resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, group, uniprot, data_source])

# Sorting results by ID
resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
resultMatrix = sorted(resultMatrix, key=lambda x: x[0])

# Writing to output file
outputFileName = dataLocation+source+".mtf"
outputFile = open(outputFileName,"w")
outputFileName = dataLocation + source + ".mtf"
outputFile = open(outputFileName, "w")
for resultVec in resultMatrix:
outputFile.write("\t".join(resultVec)+"\n")
outputFile.write("\t".join(resultVec) + "\n")
outputFile.close()

###################################################################################################
Expand All @@ -85,25 +105,24 @@

# Fetching file names
source = "transfac_public"
inputLocation = dataLocation+source+"/"
inputLocation = dataLocation + source + "/"
resultMatrix = []
for inputFileName in glob.glob(inputLocation+"*.pwm"):
for inputFileName in glob.glob(inputLocation + "*.pwm"):
ll = inputFileName.split("/")[-1].split(".")[0].split("_")
matrix_id = ll[0]
pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
version = "1"
gene_names = ll[1]
resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, ".", ".", ".", "."])

# Sorting results by ID
resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
resultMatrix = sorted(resultMatrix, key=lambda x: x[0])

# Writing to output file
outputFileName = dataLocation+source+".mtf"
outputFile = open(outputFileName,"w")
outputFileName = dataLocation + source + ".mtf"
outputFile = open(outputFileName, "w")
for resultVec in resultMatrix:
outputFile.write("\t".join(resultVec)+"\n")
outputFile.write("\t".join(resultVec) + "\n")
outputFile.close()

###################################################################################################
Expand All @@ -112,25 +131,24 @@

# Fetching file names
source = "uniprobe_primary"
inputLocation = dataLocation+source+"/"
inputLocation = dataLocation + source + "/"
resultMatrix = []
for inputFileName in glob.glob(inputLocation+"*.pwm"):
for inputFileName in glob.glob(inputLocation + "*.pwm"):
ll = inputFileName.split("/")[-1].split(".")[0].split("_")
matrix_id = ll[0]
pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
version = ll[1]
gene_names = ll[2]
resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, ".", ".", ".", "."])

# Sorting results by ID
resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
resultMatrix = sorted(resultMatrix, key=lambda x: x[0])

# Writing to output file
outputFileName = dataLocation+source+".mtf"
outputFile = open(outputFileName,"w")
outputFileName = dataLocation + source + ".mtf"
outputFile = open(outputFileName, "w")
for resultVec in resultMatrix:
outputFile.write("\t".join(resultVec)+"\n")
outputFile.write("\t".join(resultVec) + "\n")
outputFile.close()

###################################################################################################
Expand All @@ -139,25 +157,22 @@

# Fetching file names
source = "uniprobe_secondary"
inputLocation = dataLocation+source+"/"
inputLocation = dataLocation + source + "/"
resultMatrix = []
for inputFileName in glob.glob(inputLocation+"*.pwm"):
for inputFileName in glob.glob(inputLocation + "*.pwm"):
ll = inputFileName.split("/")[-1].split(".")[0].split("_")
matrix_id = ll[0]
pwm_name = ".".join(inputFileName.split("/")[-1].split(".")[:-1])
version = ll[1]
gene_names = ll[2]
resultMatrix.append([matrix_id,pwm_name,source,version,gene_names,group])
resultMatrix.append([matrix_id, pwm_name, source, version, gene_names, ".", ".", ".", "."])

# Sorting results by ID
resultMatrix = sorted(resultMatrix ,key=lambda x: x[2])
resultMatrix = sorted(resultMatrix ,key=lambda x: x[0])
resultMatrix = sorted(resultMatrix, key=lambda x: x[0])

# Writing to output file
outputFileName = dataLocation+source+".mtf"
outputFile = open(outputFileName,"w")
outputFileName = dataLocation + source + ".mtf"
outputFile = open(outputFileName, "w")
for resultVec in resultMatrix:
outputFile.write("\t".join(resultVec)+"\n")
outputFile.write("\t".join(resultVec) + "\n")
outputFile.close()


Loading

0 comments on commit 79d5b2c

Please sign in to comment.