Skip to content

Commit 5ae4bed

Browse files
antgonza and wasade authored
add backend to specify analysis metadata categories (#3176)
* add backend to specify analysis metadata categories * Update qiita_db/analysis.py Co-authored-by: Daniel McDonald <[email protected]> Co-authored-by: Daniel McDonald <[email protected]>
1 parent cd1dbeb commit 5ae4bed

File tree

2 files changed

+83
-4
lines changed

2 files changed

+83
-4
lines changed

qiita_db/analysis.py

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from itertools import product
2020
from os.path import join, exists
2121
from os import mkdir
22+
from collections import defaultdict
2223

2324
from biom import load_table
2425
from biom.util import biom_open
@@ -442,6 +443,37 @@ def mapping_file(self):
442443
else:
443444
return None
444445

446+
@property
def metadata_categories(self):
    """Metadata categories available to this analysis, grouped by study

    The (study, artifact) pairs that contribute samples to the analysis
    are retrieved from the database. For each study the categories of its
    sample template are collected once, and the categories of every prep
    template attached to each of its artifacts are accumulated.

    Returns
    -------
    dict of dict
        Keyed by study_id; each value is a dict with the keys 'sample' and
        'prep', holding the corresponding sets of metadata categories
    """
    sample_cls = qdb.metadata_template.sample_template.SampleTemplate
    prep_cls = qdb.metadata_template.prep_template.PrepTemplate
    trn = qdb.sql_connection.TRN
    with trn:
        sql = """SELECT DISTINCT study_id, artifact_id
                 FROM qiita.analysis_sample
                 LEFT JOIN qiita.study_artifact USING (artifact_id)
                 WHERE analysis_id = %s"""
        trn.add(sql, [self._id])

        by_study = defaultdict(dict)
        for study_id, artifact_id in trn.execute_fetchindex():
            # the sample template is shared by every artifact of a study,
            # so it is only read the first time the study shows up
            if study_id not in by_study:
                by_study[study_id]['sample'] = set(
                    sample_cls(study_id).categories)
                by_study[study_id]['prep'] = set()

            # prep categories accumulate across all artifacts of the study
            prep_categories = by_study[study_id]['prep']
            artifact = qdb.artifact.Artifact(artifact_id)
            for prep in artifact.prep_templates:
                prep_categories.update(prep_cls(prep.id).categories)

    return by_study
476+
445477
@property
446478
def tgz(self):
447479
"""Returns the tgz file of the analysis
@@ -795,7 +827,7 @@ def remove_samples(self, artifacts=None, samples=None):
795827
qdb.sql_connection.TRN.add(sql, args, many=True)
796828
qdb.sql_connection.TRN.execute()
797829

798-
def build_files(self, merge_duplicated_sample_ids):
830+
def build_files(self, merge_duplicated_sample_ids, categories=None):
799831
"""Builds biom and mapping files needed for analysis
800832
801833
Parameters
@@ -804,6 +836,8 @@ def build_files(self, merge_duplicated_sample_ids):
804836
If the duplicated sample ids in the selected studies should be
805837
merged or prepended with the artifact ids. If false prepends
806838
the artifact id
839+
categories : set of str, optional
840+
If not None, use _only_ these categories for the metaanalysis
807841
808842
Notes
809843
-----
@@ -858,7 +892,8 @@ def build_files(self, merge_duplicated_sample_ids):
858892
# We need to negate merge_duplicated_sample_ids because in
859893
# _build_mapping_file is actually rename: merge yes == rename no
860894
rename_dup_samples = not merge_duplicated_sample_ids
861-
self._build_mapping_file(samples, rename_dup_samples)
895+
self._build_mapping_file(
896+
samples, rename_dup_samples, categories=categories)
862897

863898
if post_processing_cmds:
864899
biom_files = self._build_biom_tables(
@@ -1034,7 +1069,8 @@ def _build_biom_tables(self,
10341069
# the user.
10351070
return biom_files
10361071

1037-
def _build_mapping_file(self, samples, rename_dup_samples=False):
1072+
def _build_mapping_file(self, samples, rename_dup_samples=False,
1073+
categories=None):
10381074
"""Builds the combined mapping file for all samples
10391075
Code modified slightly from qiime.util.MetadataMap.__add__"""
10401076
with qdb.sql_connection.TRN:
@@ -1045,9 +1081,14 @@ def _build_mapping_file(self, samples, rename_dup_samples=False):
10451081
artifact = qdb.artifact.Artifact(aid)
10461082
si = artifact.study.sample_template
10471083
if si not in sample_infos:
1048-
sample_infos[si] = si.to_dataframe()
1084+
si_df = si.to_dataframe()
1085+
if categories is not None:
1086+
si_df = si_df[categories & set(si_df.columns)]
1087+
sample_infos[si] = si_df
10491088
pt = artifact.prep_templates[0]
10501089
pt_df = pt.to_dataframe()
1090+
if categories is not None:
1091+
pt_df = pt_df[categories & set(pt_df.columns)]
10511092

10521093
qm = pt_df.join(sample_infos[si], lsuffix="_prep")
10531094

qiita_db/test/test_analysis.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,30 @@ def test_retrieve_mapping_file(self):
298298
qdb.util.get_filepath_information(obs)['fullpath'], exp)
299299
self.assertTrue(exists(exp))
300300

301+
def test_metadata_categories(self):
    # categories expected under the 'sample' key of the single study
    sample_categories = {
        'altitude', 'anonymized_name', 'assigned_from_geo',
        'collection_timestamp', 'common_name', 'country', 'depth',
        'description', 'description_duplicate', 'dna_extracted',
        'elevation', 'env_biome', 'env_feature', 'env_package',
        'host_subject_id', 'host_taxid', 'latitude', 'longitude', 'ph',
        'physical_specimen_location', 'physical_specimen_remaining',
        'samp_salinity', 'sample_type', 'scientific_name',
        'season_environment', 'taxon_id', 'temp', 'texture', 'tot_nitro',
        'tot_org_carb', 'water_content_soil'}
    # categories expected under the 'prep' key of the single study
    prep_categories = {
        'barcode', 'center_name', 'center_project_name', 'emp_status',
        'experiment_center', 'experiment_design_description',
        'experiment_title', 'illumina_technology', 'instrument_model',
        'library_construction_protocol', 'pcr_primers', 'platform',
        'primer', 'run_center', 'run_date', 'run_prefix', 'samp_size',
        'sample_center', 'sequencing_meth', 'study_center', 'target_gene',
        'target_subfragment'}
    exp = {1: {'sample': sample_categories, 'prep': prep_categories}}

    obs = self.analysis.metadata_categories
    self.assertDictEqual(obs, exp)
324+
301325
def test_retrieve_tgz(self):
302326
# generating here as the tgz is only generated once the analysis runs
303327
# to completion (un)successfully
@@ -395,6 +419,20 @@ def test_build_mapping_file(self):
395419

396420
assert_frame_equal(obs, exp, check_like=True)
397421

422+
# testing categories
423+
analysis._build_mapping_file(
424+
samples, categories=set(
425+
['env_package', 'experiment_design_description']))
426+
obs = qdb.util.get_filepath_information(
427+
analysis.mapping_file)['fullpath']
428+
obs = qdb.metadata_template.util.load_template_to_dataframe(
429+
obs, index='#SampleID').columns
430+
exp = ['experiment_design_description', 'env_package',
431+
'qiita_artifact_id', 'qiita_prep_deprecated',
432+
'qiita_study_title', 'qiita_study_alias', 'qiita_owner',
433+
'qiita_principal_investigator']
434+
self.assertCountEqual(obs, exp)
435+
398436
def test_build_mapping_file_duplicated_samples_no_merge(self):
399437
analysis = self._create_analyses_with_samples()
400438
samples = {4: ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'],

0 commit comments

Comments
 (0)