# Just a place to dump various code and info
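# Imports assumed by the code below. The standard library and third-party
# modules are real; the remaining names used later (auth, datasets, atlases,
# errors, the mongo handle `database`, `_exclude_list` and `Atlas`) are
# internal to the Stemformatics codebase and are not imported here, since
# their module paths would only be guesses.
import os
import csv
import uuid
from datetime import datetime

import pandas
import werkzeug.datastructures
from flask_restful import Resource, reqparse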
class AtlasProjection(Resource):

    def check_format(self, expression, samples, sample_column):
        """Check the format of bulk data files uploaded for projection."""
        # The number of columns in the expression matrix and the number of rows
        # in the sample matrix must be the same
        if len(expression.columns) != len(samples):
            return {'error': 'The number of columns in the expression matrix and rows in the sample matrix need to match. Check the file formats.'}
        # Every column heading in the expression matrix must be present among
        # the row IDs of the sample matrix
        for column in expression.columns:
            if column not in samples.index:
                return {'error': 'The column headings in the expression matrix need to be present in the row IDs in the sample matrix. Check the file formats.'}
        # The expression matrix must use Ensembl gene IDs as row IDs
        for index in expression.index:
            if index[0:4] != 'ENSG':
                return {'error': 'The expression matrix must use Ensembl IDs as row IDs. Check the file format.'}
        # If a sample column was given, it must exist in the sample matrix
        if sample_column not in samples.columns and sample_column != "":
            return {'error': 'The sample column must be a column that exists in the sample matrix. Check the sample column.'}
        return {'error': ""}
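
    # A minimal sketch of how check_format reports problems; the frames below
    # are hypothetical, not real atlas data. Two columns in the expression
    # matrix but only one row in the sample table trips the first check:
    #
    #   expression = pandas.DataFrame({'s1': [1], 's2': [2]}, index=['ENSG00000123456'])
    #   samples = pandas.DataFrame({'cell_type': ['HSC']}, index=['s1'])
    #   AtlasProjection().check_format(expression, samples, '')
    #   # -> {'error': 'The number of columns in the expression matrix and rows ...'}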
    def post(self, atlasType, dataSource):
        """Project data onto the atlas of atlasType. dataSource is one of
        ['stemformatics','user','user-single'].
        """
        try:
            # Project single-cell data - user upload
            if dataSource.lower() == "user-single":
                parser = reqparse.RequestParser()
                parser.add_argument('email', type=str, required=False)
                parser.add_argument('data', type=werkzeug.datastructures.FileStorage, location='files')
                args = parser.parse_args()
                email = args.get('email')
                df = pandas.read_csv(args.get('data'), sep='\t', index_col=0)
                if len(df) == 0:
                    return {'error': 'The data matrix came back as zero length. Check its format.'}
                # Create a unique ID for this upload ('uid' rather than 'id',
                # which shadows the builtin)
                uid = uuid.uuid4()
                # Check the ID is not already in ids.csv
                try:
                    path = "/mnt/stemformatics-data/user_projection_data/ids.csv"
                    ids_df = pandas.read_csv(path, index_col=0)
                    # Compare as strings - the index read from csv holds strings,
                    # so a raw UUID object would never match
                    while str(uid) in ids_df.index:
                        uid = uuid.uuid4()
                    # Create a directory named after the ID
                    new_path = "/mnt/stemformatics-data/user_projection_data/{}".format(uid)
                    if not os.path.exists(new_path):
                        os.makedirs(new_path)
                    # Save df there as the input file
                    df.to_csv(new_path + '/input.tsv', sep='\t')
                    # Append details of this upload to ids.csv
                    now = datetime.now()
                    try:
                        data = [uid, email, atlasType, now.strftime('%d/%m/%Y %H:%M:%S'), '', '']
                        # The with block closes the file on exit; no explicit close() is needed
                        with open(path, 'a', newline='') as fd:
                            csv.writer(fd).writerow(data)
                    except FileNotFoundError:
                        print('File not found.')
                    except IOError:
                        print('File IO error.')
                    except Exception as e:
                        print(e)
                        return
                except IOError:
                    print("Error: file does not exist.")
                    return
            # Stemformatics or bulk data options
            else:
                parser = reqparse.RequestParser()
                parser.add_argument('name', type=str, required=False)  # selected Stemformatics dataset name (eg. Helft_2017_28723558)
                parser.add_argument('test_name', type=str, required=False, default="test-data")  # user projected dataset name
                parser.add_argument('test_sample_column', type=str, required=False)  # user supplied sample column to use for mapping
                # The following return a FileStorage object under each key
                parser.add_argument('test_expression', type=werkzeug.datastructures.FileStorage, location='files')
                parser.add_argument('test_samples', type=werkzeug.datastructures.FileStorage, location='files')
                args = parser.parse_args()
                if dataSource.lower() == "stemformatics":
                    name = args.get('name').split("_")[0]  # only take the author name for this
                    # Find the dataset with this name in Stemformatics
                    publicOnly = auth.AuthUser().username() is None  # public datasets only if there is no authenticated user
                    dsId = datasets.datasetIdFromName(args.get('name'), publicOnly=publicOnly)
                    ds = datasets.Dataset(dsId)
                    df = ds.expressionMatrix(key="genes" if ds.metadata()['platform_type'] == 'Microarray' else "raw")
                    samples = ds.samples()
                    # Select the column to use for mapping - the most informative
                    # column may not be cell_type, so fall back through candidates
                    column = 'cell_type'
                    for col in ['cell_type', 'sample_type', 'final_cell_type']:
                        if len(samples[col].unique()) >= 2:  # use this one
                            column = col
                            break
                    samples = samples.fillna('[not assigned]')
                else:
                    name = args.get('test_name')
                    df = pandas.read_csv(args.get('test_expression'), sep='\t', index_col=0)
                    samples = pandas.read_csv(args.get('test_samples'), sep='\t', index_col=0)
                    # Validate the format of the uploaded data files
                    check = self.check_format(df, samples, args.get('test_sample_column'))
                    if check["error"] != "":
                        return {"error": check["error"]}
                    # Some validation on user supplied data
                    if len(df) == 0:
                        return {'error': 'The expression matrix came back as zero length. Check its format.'}
                    elif len(samples) == 0:
                        return {'error': 'The sample table came back as zero length. Check its format and ensure its row index matches the columns of the expression matrix.'}
                    samples = samples.loc[df.columns]
                    column = args.get('test_sample_column')
                    if column not in samples.columns:
                        column = samples.columns[0]
                # Create an atlas data instance and perform the projection
                atlas = atlases.Atlas(atlasType)
                result = atlas.projection(name, df, includeCombinedCoords=False)
                if result["error"] != "":  # returning an empty data frame may raise when parsing as json, so just return the error string
                    return {"error": result["error"]}
                # Prepare the dictionary to return - each object must be JSON
                # serializable, so don't return data frames directly
                result["coords"] = result["coords"].to_dict(orient="records")
                result["samples"] = samples.reset_index().fillna('').to_dict(orient="records")
                result["sampleIds"] = ["%s_%s" % (name, item) for item in samples.index]
                result["column"] = column
                if "combinedCoords" in result:
                    result["combinedCoords"] = result["combinedCoords"].to_dict(orient="split")
                if "capybara" in result:
                    for col in result["capybara"]:
                        result["capybara"][col] = result["capybara"][col].to_dict(orient="split")
                return result
        except Exception:  # avoid a bare except, which would also catch SystemExit
            raise errors.DatasetProjectionFailedError
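
# A hedged usage sketch, not a confirmed route: assuming this resource is
# registered at /atlas-projection/<atlasType>/<dataSource>, a bulk projection
# could be requested like this (host, route and file names are hypothetical).
#
#   import requests
#   files = {'test_expression': open('expression.tsv', 'rb'),
#            'test_samples': open('samples.tsv', 'rb')}
#   data = {'test_name': 'my-data', 'test_sample_column': 'cell_type'}
#   r = requests.post('http://localhost:5000/atlas-projection/myeloid/user',
#                     files=files, data=data)
#   print(r.json().get('error'))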
def datasetMetadataFromQuery(**kwargs):
    """Return a DataFrame of dataset metadata matching a query. Rows are indexed
    by dataset id, while columns are attributes of dataset metadata. Use this
    instead of Dataset instances when fetching large numbers of datasets.
    If ids_only=True, only a list of dataset ids is returned, instead of a DataFrame.
    Note that query_string will also search the samples collection if
    include_samples_query=True. Call datasetMetadataFromQuery() with no
    arguments to fetch all datasets.
    """
    limit = kwargs.get("limit")
    ids_only = kwargs.get("ids_only", False)
    public_only = kwargs.get("public_only", True)
    include_samples_query = kwargs.get("include_samples_query", False)
    dataset_id = kwargs.get("dataset_id")  # list of dataset ids specified in the query
    name = kwargs.get("name")
    query_string = kwargs.get("query_string")
    platform_type = kwargs.get("platform_type")
    projects = kwargs.get("projects")
    organism = kwargs.get("organism")
    status = kwargs.get("status")

    # params for the find function (ie. fetch all records matching params),
    # and attributes for what to return
    params = {'dataset_id': {"$nin": _exclude_list}}
    attributes = {"dataset_id": 1, "_id": 0} if ids_only else {"_id": 0}
    if public_only:
        params['private'] = False

    datasetIds = []  # additional dataset ids to search for, based on sample search
    searched = False  # need to know if datasetIds is still empty after performing searches
    if query_string:
        if include_samples_query:
            # Perform a text search in both datasets and samples and take the union
            sampleSearch = database["samples"].find({'$text': {'$search': query_string}}, {'dataset_id': 1})
            datasetIds = [item['dataset_id'] for item in sampleSearch]
            datasetsSearch = database["datasets"].find({'$text': {'$search': query_string}}, {'dataset_id': 1})
            datasetIds = list(set(datasetIds).union(set([item['dataset_id'] for item in datasetsSearch])))
            if len(datasetIds) == 0:
                params['dataset_id']["$in"] = []
        else:  # otherwise search the datasets collection only
            params['$text'] = {"$search": query_string}
        searched = True
    if organism and organism != 'all':  # restrict datasets to samples with this organism
        sampleSearch = database["samples"].find({'organism': organism}, {'dataset_id': 1})
        organismIds = set([item['dataset_id'] for item in sampleSearch])
        # Only intersect when a previous search populated datasetIds; otherwise an
        # organism-only query would always intersect with the empty set and return nothing
        datasetIds = list(set(datasetIds).intersection(organismIds)) if datasetIds else list(organismIds)
        searched = True
    if dataset_id is not None and len(datasetIds) > 0:  # find common dataset ids
        datasetIds = list(set(datasetIds).intersection(set([int(item) for item in dataset_id])))
    elif dataset_id is not None and len(datasetIds) == 0:  # just those specified by the parameter
        datasetIds = [int(item) for item in dataset_id]
    if len(datasetIds) > 0:
        params['dataset_id']["$in"] = datasetIds
    elif searched:  # after searching we found no matching datasets
        return [] if ids_only else pandas.DataFrame()
    if platform_type:
        # Assume a list of platform types was supplied if there's a comma
        params['platform_type'] = {"$in": platform_type.split(',')} if ',' in platform_type else platform_type
    if projects:
        if projects == 'atlas':  # any atlas project
            params["projects"] = {"$in": ["%s_atlas" % atlasType for atlasType in Atlas.all_atlas_types]}
        else:
            params["projects"] = {"$in": [projects]}
    if status:
        params["status"] = status
    if name:
        params["name"] = name

    cursor = database["datasets"].find(params, attributes)
    if limit:
        cursor = cursor.limit(limit)
    if ids_only:
        return [item["dataset_id"] for item in cursor]
    # Avoid the deprecated Cursor.count() by materialising the frame first
    df = pandas.DataFrame(cursor)
    return df.set_index("dataset_id") if len(df) > 0 else df
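
# Example calls, assuming a live MongoDB connection behind `database`;
# the argument values here are illustrative only.
#
#   df = datasetMetadataFromQuery(organism='homo sapiens', platform_type='RNASeq')
#   ids = datasetMetadataFromQuery(query_string='macrophage',
#                                  include_samples_query=True, ids_only=True)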