-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtaxosize.py
101 lines (83 loc) · 2.35 KB
/
taxosize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#
#Reports the total size of the NCBI RefSeq assembly files to download for a taxonomy
#
import os
import io
import sys
# Module-level list that addToInsideDirs() appends to
# (presumably directory names -- confirm against callers).
insideDirList = []

# Running byte total; the script body adds each getsize() result to it.
totalSize = 0


def addToInsideDirs(newDir):
    """Append *newDir* to the module-level ``insideDirList``.

    Args:
        newDir: Value to record; it is stored as-is.

    Returns:
        None. The list is mutated in place.
    """
    insideDirList.append(newDir)
def getsize(conn=None):
    """Return the total size in bytes of every file under the current FTP directory.

    The server's current working directory is listed with MLSD. Each
    sub-directory is entered, summed recursively and left again; every other
    entry contributes the value of its ``size`` fact.

    Facts are looked up by name rather than by their position in the raw
    MLSD line, so the result does not depend on the order or number of facts
    the server sends, nor on file names containing ``;``.

    Args:
        conn: Object exposing ``mlsd()`` and ``cwd(name)`` the way
            ``ftplib.FTP`` does. Defaults to the module-level ``ftp``
            connection, so the original no-argument call keeps working.

    Returns:
        int: Cumulative size in bytes; 0 when the directory holds no files.

    Raises:
        ValueError: If a ``size`` fact is present but is not an integer.
        Whatever ``conn.mlsd()`` / ``conn.cwd()`` raise is propagated as-is.
    """
    if conn is None:
        # Fall back to the connection opened by the script body.
        conn = ftp

    # Read the whole listing before recursing: the recursion below changes
    # the server-side working directory.
    entries = list(conn.mlsd())

    runSize = 0
    for name, facts in entries:
        kind = facts.get("type", "").lower()

        # Skip the self/parent pseudo-entries, however the server labels them.
        if name in (".", "..") or kind in ("cdir", "pdir"):
            continue

        if kind == "dir":
            conn.cwd(name)
            runSize += getsize(conn)
            conn.cwd("..")
        else:
            # Entries without a size fact (e.g. some links) count as 0 bytes
            # instead of aborting the whole walk.
            runSize += int(facts.get("size", 0))
    return runSize
#Script body: look up the RefSeq FTP folders of the requested taxonomy with
#NCBI edirect, walk each folder over FTP and print the combined file size.
#Exits with status 1 on a usage error or on any failure.
try:
    import shlex

    #getting arguments from command line and validating
    if len(sys.argv) != 2:
        print("usage: python taxosize taxonomy")
        print("example: python taxosize human")
        sys.exit(1)
    taxonomy = sys.argv[1]
    print("Getting size of: " + taxonomy)

    #import the size formatter before any network work so that a missing
    #package is reported immediately instead of after the whole FTP walk
    from hurry.filesize import size

    #running query with esearch on NCBI assembly DB to get file refseq folders.
    #The query embeds the user-supplied taxonomy, so it is shell-quoted to keep
    #quotes or metacharacters in the argument from being run by the shell.
    query = shlex.quote(taxonomy + " AND latest[SB]")
    taxoFileNames = os.popen("/opt/edirect/esearch -db assembly -query " + query + " | efetch -format docsum | xtract -pattern DocumentSummary -element FtpPath_RefSeq").read()

    #open ftp connection; the with-block closes it even when a listing fails.
    #ftp must remain a module-level name because getsize() reads it.
    from ftplib import FTP
    with FTP('ftp.ncbi.nih.gov') as ftp:
        ftp.login()

        #parse each refseq folder to get file sizes
        for fileDir in taxoFileNames.splitlines():
            #Remove ftp portion of the path
            filePath = fileDir.replace("ftp://ftp.ncbi.nlm.nih.gov/", "").strip()

            #records without a RefSeq path produce blank lines; skip them
            #rather than sending an empty CWD that would abort the run
            if not filePath:
                continue

            #an absolute path makes every folder independent of wherever the
            #previous walk left the server-side working directory
            ftp.cwd("/" + filePath)
            totalSize += getsize()

    #format total file size
    strSize = size(totalSize)
    print("Total files size: " + strSize)
except Exception as err:
    print("Error while running taxosize...")
    #show what actually failed instead of hiding it behind the usage text
    print("Reason: " + type(err).__name__ + ": " + str(err))
    print("usage: python taxosize taxonomy")
    print("example: python taxosize human")
    sys.exit(1)