Skip to content

Commit

Permalink
V1.16 (#26)
Browse files Browse the repository at this point in the history
* install smart from user download, update flps2

* option to install different pfam version

* fixed parser for flps2

* added option to update json file
  • Loading branch information
trvinh authored Jan 19, 2023
1 parent 816871e commit c644921
Show file tree
Hide file tree
Showing 7 changed files with 425 additions and 71 deletions.
129 changes: 129 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ export PATH=$HOME/.local/bin:$PATH
# Usage

## Download and install annotation tools
Before using FAS, some annotation tools and databases need to be installed. FAS' standard databases/annotation tools are: [PFAM](https://pfam.xfam.org/), [SMART](http://smart.embl-heidelberg.de/), [fLPS](http://biology.mcgill.ca/faculty/harrison/flps.html), [SEG](http://www.biology.wustl.edu/gcg/seg.html), [COILS](https://embnet.vital-it.ch/software/COILS_form.html), [THMHH 2.0c](http://www.cbs.dtu.dk/services/TMHMM/) and [SignalP 4.1g](http://www.cbs.dtu.dk/services/SignalP/). To get these tools and make a configuration file for FAS, please use the `setupFAS` function:
Before using FAS, some annotation tools and databases need to be installed. FAS' standard databases/annotation tools are: [PFAM](https://pfam.xfam.org/), [SMART](http://smart.embl-heidelberg.de/), [fLPS](http://biology.mcgill.ca/faculty/harrison/flps.html), [SEG](https://mendel.imp.ac.at/METHODS/seg.server.html), [COILS](https://mybiosoftware.com/coils-2-2-prediction-coiled-coil-regions-proteins.html), [THMHH 2.0c](http://www.cbs.dtu.dk/services/TMHMM/) and [SignalP 4.1g](http://www.cbs.dtu.dk/services/SignalP/). To get these tools and make a configuration file for FAS, please use the `setupFAS` function:
```
fas.setup -t /directory/where/you/want/to/save/annotation/tools
```
Expand Down
77 changes: 67 additions & 10 deletions greedyFAS/annoFAS/annoModules.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,24 @@ def get_property(prop, project):
return result.group(1)


def getFasToolPath():
cmd = 'fas.setup -t ~/ -c'
try:
flpsOut = subprocess.run([cmd], shell=True, capture_output=True, check=True)
except:
sys.exit('Error running\n%s' % cmd)
lines = flpsOut.stdout.decode().split('\n')
if 'FAS is ready to run' in lines[0]:
return(lines[0].replace('Annotation tools can be found at ','').replace('. FAS is ready to run!',''))
else:
sys.exit('FAS has not been setup!')


def printMsg(silent, msg):
if silent == False:
print(msg)


# functions for doing annotation with single tool
def doFlps(args):
(seqFile, toolPath, threshold) = args
Expand All @@ -92,12 +110,17 @@ def doFlps(args):
annoOut[tmp[0]] = {}
annoOut[tmp[0]]['length'] = len(inSeq[tmp[0]])
annoOut[tmp[0]]['flps'] = {}
if not 'flps_' + tmp[1] + '_' + tmp[7] in annoOut[tmp[0]]['flps']:
annoOut[tmp[0]]['flps']['flps_' + tmp[1] + '_' + tmp[7]] = {}
annoOut[tmp[0]]['flps']['flps_' + tmp[1] + '_' + tmp[7]]['evalue'] = float(threshold)
annoOut[tmp[0]]['flps']['flps_' + tmp[1] + '_' + tmp[7]]['instance'] = []
annoOut[tmp[0]]['flps']['flps_' + tmp[1] + '_' + tmp[7]]['instance'].append((int(tmp[3]), int(tmp[4]),
float(tmp[6])))
flps_id = 'flps_' + tmp[1] + '_' + tmp[7]
if len(tmp) > 7:
flps_id = 'flps_' + tmp[2] + '_' + tmp[8]
if not flps_id in annoOut[tmp[0]]['flps']:
annoOut[tmp[0]]['flps'][flps_id] = {}
annoOut[tmp[0]]['flps'][flps_id]['evalue'] = float(threshold)
annoOut[tmp[0]]['flps'][flps_id]['instance'] = []
if len(tmp) > 7:
annoOut[tmp[0]]['flps'][flps_id]['instance'].append((int(tmp[4]), int(tmp[5]), float(tmp[7])))
else:
annoOut[tmp[0]]['flps'][flps_id]['instance'].append((int(tmp[3]), int(tmp[4]), float(tmp[6])))
annotatedSeq[tmp[0]] = 1

for id in inSeq:
Expand Down Expand Up @@ -513,28 +536,47 @@ def getVersions(tools, toolPath, cutoffs):
pfamVersion = subprocess.run([pfamCmd], shell=True, capture_output=True, check=True)
versionDict['pfam']['version'] = pfamVersion.stdout.decode().strip()
versionDict['pfam']['evalue'] = (eFeature, eInstance)
except:
except subprocess.CalledProcessError as e:
print(e.output.decode(sys.stdout.encoding))
print('Error getting Pfam version! %s' % (pfamVersion))
if 'smart' in tools:
versionDict['smart']['version'] = 'NA'
versionDict['smart']['evalue'] = (eFeature, eInstance)
smartCmd = 'head %s/SMART/version.txt' % toolPath
try:
smartVersion = subprocess.run([smartCmd], shell=True, capture_output=True, check=True)
if (len(smartVersion.stdout.decode().strip()) > 1):
versionDict['smart']['version'] = smartVersion.stdout.decode().strip()
except subprocess.CalledProcessError as e:
print(e.output.decode(sys.stdout.encoding))
print('SMART version cannot be identified! %s' % (smartVersion))
if 'flps' in tools:
versionDict['flps']['version'] = 'NA'
versionDict['flps']['version'] = '1.0'
versionDict['flps']['evalue'] = eFlps
flpsCmd = 'grep README %s/fLPS/README.first | grep version | cut -d " " -f 4 | sed "s/)//"' % toolPath
try:
fLPSVersion = subprocess.run([flpsCmd], shell=True, capture_output=True, check=True)
if (len(fLPSVersion.stdout.decode().strip()) > 1):
versionDict['flps']['version'] = fLPSVersion.stdout.decode().strip()
except subprocess.CalledProcessError as e:
print(e.output.decode(sys.stdout.encoding))
print('Error getting fLPS version! %s' % (fLPSVersion))
if 'signalp' in tools:
signalpCmd = 'head %s/SignalP/signalp-*.readme | grep "INSTALLATION INSTRUCTIONS" | cut -f1 | cut -d " " -f2' % toolPath
try:
signalpVersion = subprocess.run([signalpCmd], shell=True, capture_output=True, check=True)
versionDict['signalp']['version'] = signalpVersion.stdout.decode().strip()
versionDict['signalp']['org'] = (signalpOrg)
except:
except subprocess.CalledProcessError as e:
print(e.output.decode(sys.stdout.encoding))
print('Error getting SignalP version! %s' % (signalpVersion))
if 'tmhmm' in tools:
tmhmmCmd = 'head -n1 %s/TMHMM/README' % toolPath
try:
tmhmmVersion = subprocess.run([tmhmmCmd], shell=True, capture_output=True, check=True)
versionDict['tmhmm']['version'] = tmhmmVersion.stdout.decode().strip()
except:
except subprocess.CalledProcessError as e:
print(e.output.decode(sys.stdout.encoding))
print('Error getting TMHMM version! %s' % (tmhmmVersion))
return(versionDict)

Expand All @@ -558,3 +600,18 @@ def extractAnno(seqFile, existingAnno):
annoDict = {}
annoDict['feature'] = dict((prot, existingDict['feature'][prot]) for prot in list(inSeq.keys()))
return annoDict

# update old json file to add inteprotIDs and tool versions
def updateAnnoFile(jsonFile):
with open(jsonFile) as f:
annoDict = json.load(f)
if not "inteprotID" in annoDict:
toolPath = getFasToolPath()
cutoffs = (0.001, 0.01, 0.0000001, 'euk')
taxon = jsonFile.split('/')[-1].replace('.json', '')
annoPath = jsonFile.replace('%s.json' % taxon, '')
shutil.move('%s' % jsonFile, '%s.old' % jsonFile)
annoDict['inteprotID'] = getPfamAcc(toolPath, annoDict['feature'])
annoDict['version'] = getVersions(getAnnoTools('', toolPath), toolPath, cutoffs)
save2json(annoDict, taxon, annoPath)
return('%s.old' % jsonFile)
77 changes: 58 additions & 19 deletions greedyFAS/annoFAS/checkAnno.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,6 @@
from pkg_resources import get_distribution


def getFasToolPath():
cmd = 'fas.setup -t ~/ -c'
try:
flpsOut = subprocess.run([cmd], shell=True, capture_output=True, check=True)
except:
sys.exit('Error running\n%s' % cmd)
lines = flpsOut.stdout.decode().split('\n')
if 'FAS is ready to run' in lines[0]:
return(lines[0].replace('Annotation tools can be found at ','').replace('. FAS is ready to run!',''))
else:
sys.exit('FAS has not been setup!')

def checkCompleteAnno(seqFile, jsonFile):
allSeq = []
allSeqDict = SeqIO.to_dict((SeqIO.parse(open(seqFile), 'fasta')))
Expand All @@ -61,8 +49,37 @@ def checkCompleteAnno(seqFile, jsonFile):
else:
return()


def checkOutdatedAnno(jsonFile):
with open(jsonFile) as jf:
annoDict = json.load(jf)
if not 'inteprotID' in annoDict:
return('inteprotID')
elif not 'version' in annoDict:
return('version')
elif 'version' in annoDict:
tools = list(annoDict['version'].keys())
toolPath = annoModules.getFasToolPath()
cutoffs = (0.001, 0.01, 0.0000001, 'euk')
currentVer = annoModules.getVersions(tools, toolPath, cutoffs)
outdatedTools = []
for tool in tools:
if 'version' in currentVer[tool]:
if 'version' in annoDict['version'][tool]:
if not annoDict['version'][tool]['version'] == currentVer[tool]['version']:
outdatedTools.append('%s (%s vs %s)' % (tool, annoDict['version'][tool]['version'], currentVer[tool]['version']))
else:
return('version')
if len(outdatedTools) > 0:
return(', '.join(outdatedTools))
else:
return('updated')
else:
return('updated')


def doAnnoForMissing(taxon, missingAnno, jsonFile, outPath, cpus, silent, annoToolFile):
toolPath = getFasToolPath()
toolPath = annoModules.getFasToolPath()
# do annotation for missing proteins
faFile = '%s/%s_tmp.fa' % (outPath, taxon)
with open(faFile, 'w') as f:
Expand Down Expand Up @@ -90,6 +107,7 @@ def doAnnoForMissing(taxon, missingAnno, jsonFile, outPath, cpus, silent, annoTo
os.remove('%s/%s_tmp.json' % (outPath, taxon))
shutil.rmtree('%s/tmp' % outPath)


def main():
version = get_distribution('greedyFAS').version
parser = argparse.ArgumentParser(description='You are running FAS version ' + str(version) + '.',
Expand All @@ -103,6 +121,8 @@ def main():
required=True)
required.add_argument('-o', '--outPath', help='Output directory', action='store', default='', required=True)
optional.add_argument('-n', '--noAnno', help='Do not annotate missing proteins', action='store_true')
optional.add_argument('-u', '--update', help='Update data structure of annotation file (not the annotations themselves)', action='store_true')
optional.add_argument('--keep', help='Keep old annotaion file', action='store_true')
optional.add_argument('--silent', help='Turn off terminal output', action='store_true')
optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1',
action='store', default=0, type=int)
Expand All @@ -123,23 +143,42 @@ def main():
if cpus == 0:
cpus = mp.cpu_count()-1
noAnno = args.noAnno
update = args.update
keep = args.keep
silent = args.silent
annoToolFile = args.annoToolFile
annoModules.checkFileExist(annoToolFile)

taxon = annoFile.split('/')[-1].replace('.json', '')

### check for missing annotation
missingAnno = checkCompleteAnno(seqFile, annoFile)
if len(missingAnno) > 0:
if noAnno == False:
doAnnoForMissing(taxon, missingAnno, annoFile, outPath, cpus, silent, annoToolFile)
if not silent:
print('%s missing proteins of %s has been annotated!' % (len(missingAnno), taxon))
annoModules.printMsg(silent, '%s missing proteins of %s has been annotated!' % (len(missingAnno), taxon))
else:
print('WARNING: Annotation for %s proteins of %s are missing!\n%s' % (len(missingAnno), taxon, missingAnno))
else:
annoModules.printMsg(silent, 'Annotations found for all %s sequences!' % (taxon))


### check for outdated annotation file
outdatedCheck = checkOutdatedAnno(annoFile)
if outdatedCheck == 'updated':
annoModules.printMsg(silent, 'Annotation file updated!')
elif outdatedCheck == 'inteprotID' or outdatedCheck == 'version':
if update == True:
oldAnnoFile = annoModules.updateAnnoFile(annoFile)
annoModules.printMsg(silent, '%s updated!' % annoFile)
if keep == False:
os.remove(oldAnnoFile)
else:
if not silent:
print('Annotation for %s proteins of %s are missing!\n%s' % (len(missingAnno), taxon, missingAnno))
print('WARNING: %s is not updated! Please consider update it with --update option' % annoFile)
else:
if not silent:
print('Annotations found for all %s sequences!' % (taxon))
print('WARNING: Annotations from the tools below are outdated. Please consider to re-annotate them!\n%s' % outdatedCheck)



if __name__ == '__main__':
main()
Loading

0 comments on commit c644921

Please sign in to comment.