Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update sample-name validation #88

Merged
merged 3 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 105 additions & 47 deletions qp_klp/Step.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,9 +711,9 @@ def _get_tube_ids_from_qiita(self, qclient):
# sample-names are used as keys and tube-ids are their values.
qsam, tids = self.get_samples_in_qiita(qclient, qiita_id)

if tids is None:
sample_names_by_qiita_id[str(qiita_id)] = qsam
else:
sample_names_by_qiita_id[str(qiita_id)] = qsam

if tids is not None:
# fix values in tids to be a string instead of a list of one.
# also, remove the qiita_id prepending each sample-name.
tids = {k.replace(f'{qiita_id}.', ''): tids[k][0] for k in
Expand All @@ -733,59 +733,132 @@ def _get_tube_ids_from_qiita(self, qclient):

def _compare_samples_against_qiita(self, qclient):
projects = self.pipeline.get_project_info(short_names=True)
self._get_tube_ids_from_qiita(qclient)

results = []
for project in projects:
project_name = project['project_name']
msgs = []
self._get_tube_ids_from_qiita(qclient)
p_name = project['project_name']
qiita_id = str(project['qiita_id'])
contains_replicates = project['contains_replicates']

# get list of samples as presented by the sample-sheet or mapping
# file and confirm that they are all registered in Qiita.
samples = set(self.pipeline.get_sample_names(project_name))
if contains_replicates:
# don't match against sample-names with a trailing well-id
# if project contains replicates.
msgs.append("This sample-sheet contains replicates. sample-"
"names will be sourced from orig_name column.")
samples = set(self.pipeline.get_orig_names_from_sheet(p_name))
else:
samples = set(self.pipeline.get_sample_names(p_name))

# do not include BLANKs. If they are unregistered, we will add
# them downstream.
samples = {smpl for smpl in samples
if not smpl.startswith('BLANK')}

# just get a list of the tube-ids themselves, not what they map
# to.
if qiita_id in self.tube_id_map:
# if map is not empty
tids = [self.tube_id_map[qiita_id][sample] for sample in
self.tube_id_map[qiita_id]]
msgs.append(f"The total number of samples found in {p_name} that "
f"aren't BLANK is: {len(samples)}")

results_sn = self._process_sample_names(p_name, qiita_id,
samples)

not_in_qiita = samples - set(tids)
msgs.append("Number of values in sheet that aren't sample-names in"
" Qiita: %s" % len(results_sn[0]))

if not_in_qiita:
# strip any leading zeroes from the sample-ids. Note that
# if a sample-id has more than one leading zero, all of
# them will be removed.
not_in_qiita = set([x.lstrip('0') for x in samples]) - \
set(tids)
use_tids = False

examples = tids[:5]
used_tids = True
if len(results_sn[0]) == 0:
msgs.append(f"All values in sheet matched sample-names "
f"registered with {p_name}")
else:
# assume project is in samples_in_qiita
not_in_qiita = samples - set(self.samples_in_qiita[qiita_id])
examples = list(samples)[:5]
used_tids = False
# not all values were matched to sample-names.
# check for possible match w/tube-ids, if defined in project.
results_tid = self._process_tube_ids(p_name, qiita_id,
samples)
if results_tid:
msgs.append("Number of values in sheet that aren't "
"tube-ids in Qiita: %s" % len(results_tid[0]))

if len(results_tid[0]) == 0:
# all values were matched to tube-ids.
use_tids = True
msgs.append(f"All values in sheet matched tube-ids "
f"registered with {p_name}")
else:
# we have sample-names and tube-ids and neither is
# a perfect match.
if len(results_tid[0]) < len(results_sn[0]):
# more tube-ids matched than sample-names.
use_tids = True
msgs.append(f"More values in sheet matched tube-"
f"ids than sample-names with {p_name}")
elif len(results_tid[0]) == len(results_sn[0]):
msgs.append("Sample-names and tube-ids were "
"equally non-represented in the "
"sample-sheet")
else:
msgs.append(f"More values in sheet matched sample-"
f"names than tube-ids with {p_name}")
else:
msgs.append("there are no tube-ids registered with "
f"{p_name}")

# convert to strings before returning
examples = [str(example) for example in examples]
if use_tids:
not_in_qiita = results_tid[0]
examples = results_tid[1]
total_in_qiita = results_tid[2]
else:
not_in_qiita = results_sn[0]
examples = results_sn[1]
total_in_qiita = results_sn[2]

# return an entry for all projects, even when samples_not_in_qiita
# is an empty list, as the information is still valuable.

results.append({'samples_not_in_qiita': not_in_qiita,
'examples_in_qiita': examples,
'project_name': project_name,
'tids': used_tids})
'project_name': p_name,
'total_in_qiita': total_in_qiita,
'used_tids': use_tids,
'messages': msgs})

return results

def _process_sample_names(self, project_name, qiita_id, samples):
    """Compare sheet values against the sample-names registered in Qiita.

    :param project_name: Name of the project. Not used by this method.
    :param qiita_id: Key used to look up self.samples_in_qiita.
    :param samples: Set of values taken from the sample-sheet.
    :return: A tuple of (the values in samples that are not registered
        sample-names, up to five of the given values as strings, the number
        of distinct sample-names registered under qiita_id).
    """
    # build the registered set once; it is needed for both the set
    # difference and the count.
    registered = set(self.samples_in_qiita[qiita_id])

    not_in_qiita = samples - registered

    # convert to strings and sort before slicing so the same five examples
    # are returned on every run; the iteration order of a set of strings
    # is not stable between interpreter sessions.
    examples = sorted(str(sample) for sample in samples)[:5]

    return not_in_qiita, examples, len(registered)

def _process_tube_ids(self, project_name, qiita_id, samples):
    """Compare sheet values against the tube-ids registered in Qiita.

    :param project_name: Name of the project. Not used by this method.
    :param qiita_id: Key used to look up self.tube_id_map.
    :param samples: Set of values taken from the sample-sheet.
    :return: None if qiita_id has no entry in self.tube_id_map. Otherwise a
        tuple of (the values that are not registered tube-ids, up to five
        registered tube-ids as strings, the number of distinct tube-ids
        registered under qiita_id).
    """
    if qiita_id not in self.tube_id_map:
        # no tube-ids are available for this project.
        return None

    # just get a list of the tube-ids themselves, not what they map to.
    tids = list(self.tube_id_map[qiita_id].values())
    registered = set(tids)

    not_in_qiita = samples - registered

    if not_in_qiita:
        # retry only the values that failed the exact match, with any
        # leading zeroes stripped. Note that if a value has more than one
        # leading zero, all of them will be removed. Values that already
        # matched exactly are left alone, so a registered zero-padded
        # tube-id is not reported as missing once it has been stripped.
        not_in_qiita = {x.lstrip('0') for x in not_in_qiita} - registered

    # convert examples to strings before returning
    examples = [str(tid) for tid in tids[:5]]

    return not_in_qiita, examples, len(registered)

@classmethod
def _replace_with_tube_ids(cls, prep_file_path, tube_id_map):
# passing tube_id_map as a parameter allows for easier testing.
Expand Down Expand Up @@ -899,23 +972,8 @@ def precheck(self, qclient):

if missing_counts:
msgs = []
for comparison in results:
not_in_qiita = list(comparison['samples_not_in_qiita'])
not_in_qiita_count = len(not_in_qiita)
examples_in_qiita = ', '.join(comparison['examples_in_qiita'])
p_name = comparison['project_name']
uses_tids = comparison['tids']

msgs.append(
f"<br/><b>Project '{p_name}'</b> has {not_in_qiita_count} "
f"samples not registered in Qiita: {not_in_qiita[:5]}")

msgs.append(f"Some registered samples in Project '{p_name}'"
f" include: {examples_in_qiita}")

if uses_tids:
msgs.append(f"Project '{p_name}' is using tube-ids. You "
"may be using sample names in your file.")
for result in results:
msgs += result['messages']

if msgs:
raise PipelineError('\n'.join(msgs))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
experiment_design_description well_description library_construction_protocol platform run_center run_date run_prefix sequencing_meth center_name center_project_name instrument_model runid lane sample_project sample_well sample_name index i7_index_id sample_plate index2 i5_index_id raw_reads_r1r2 quality_filtered_reads_r1r2 non_host_reads fraction_passing_quality_filter fraction_non_human old_sample_name
test description 13059.SP331130A04 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 G17 SP331130A04 GGAAGGAT iTru7_110_08 SAMPLE CACAAGTC iTru5_01_E SP331130A-4
test description 13059.AP481403B02 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 O8 AP481403B02 TCCGTATG iTru7_111_08 SAMPLE ACAGCTCA iTru5_03_B AP481403B-2
test description 13059.LP127829A02 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 D24 LP127829A02 CCGGAATT iTru7_112_06 SAMPLE TTCGTACC iTru5_05_A LP127829A-2
test description 13059.BLANK3.3B Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 H18 BLANK3.3B GGAAGGAT iTru7_110_08 SAMPLE CTACAGTG iTru5_02_C BLANK3.3B
test description 13059.EP529635B02 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 B19 EP529635B02 CGATAGAG iTru7_111_01 SAMPLE AGCGTGTT iTru5_02_F EP529635B-2
test description 13059.EP542578B04 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 O22 EP542578B04 AGTCTCAC iTru7_112_04 SAMPLE GTTCATGG iTru5_04_C EP542578B-4
test description 13059.EP446602B01 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 L3 EP446602B01 AACCGTTC iTru7_110_02 SAMPLE TTGCCACT iTru5_01_B EP446602B-1
test description 13059.EP121011B01 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 B24 EP121011B01 AGTTGGCT iTru7_112_05 SAMPLE TGGCACTA iTru5_04_H EP121011B-1
test description 13059.EP636802A01 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 I2 EP636802A01 AAGTCGAG iTru7_109_05 SAMPLE TGGCATGT iTru5_05_G EP636802A-1
test description 13059.SP573843A04 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 F9 SP573843A04 CTTCGTTC iTru7_111_11 SAMPLE CGTTATGC iTru5_03_G SP573843A-4
2 changes: 1 addition & 1 deletion qp_klp/tests/data/good-sample-prep.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ test description 13059.SP331130A04 Knight Lab Kapa HyperPlus Illumina IGM 2017-0
test description 13059.AP481403B02 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 O8 AP481403B-2 TCCGTATG iTru7_111_08 SAMPLE ACAGCTCA iTru5_03_B
test description 13059.LP127829A02 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 D24 LP127829A-2 CCGGAATT iTru7_112_06 SAMPLE TTCGTACC iTru5_05_A
test description 13059.BLANK3.3B Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 H18 BLANK3.3B GGAAGGAT iTru7_110_08 SAMPLE CTACAGTG iTru5_02_C
test description 13059.EP529635B02 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 B19 EP529635B02 CGATAGAG iTru7_111_01 SAMPLE AGCGTGTT iTru5_02_F
test description 13059.EP529635B02 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 B19 EP529635B-2 CGATAGAG iTru7_111_01 SAMPLE AGCGTGTT iTru5_02_F
test description 13059.EP542578B04 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 O22 EP542578B-4 AGTCTCAC iTru7_112_04 SAMPLE GTTCATGG iTru5_04_C
test description 13059.EP446602B01 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 L3 EP446602B-1 AACCGTTC iTru7_110_02 SAMPLE TTGCCACT iTru5_01_B
test description 13059.EP121011B01 Knight Lab Kapa HyperPlus Illumina IGM 2017-09-02 SAMPLE sequencing by synthesis UCSD SAMPLE Illumina HiSeq 4000 211021_A00000_0000_SAMPLE 1 NYU_BMS_Melanoma_13059 B24 EP121011B-1 AGTTGGCT iTru7_112_05 SAMPLE TGGCACTA iTru5_04_H
Expand Down
8 changes: 4 additions & 4 deletions qp_klp/tests/data/good-sample-sheet.csv
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ Lane,Sample_ID,Sample_Name,Sample_Plate,well_id_384,I7_Index_ID,index,I5_Index_I
1,SP464350A04,SP464350A04,NYU_BMS_Melanoma_13059_P1,A5,iTru7_109_01,CTCGTCTT,iTru5_05_B,AAGCATCG,NYU_BMS_Melanoma_13059,pool2,SP464350A04
1,C9,C9,NYU_BMS_Melanoma_13059_P1,C5,iTru7_109_02,CGAACTGT,iTru5_06_B,TACTCCAG,NYU_BMS_Melanoma_13059,pool1,C9
1,ep256643b01,ep256643b01,NYU_BMS_Melanoma_13059_P1,E5,iTru7_109_03,CATTCGGT,iTru5_07_B,GATACCTG,NYU_BMS_Melanoma_13059,pool2,ep256643b01
1,EP121011B01,EP121011B01,NYU_BMS_Melanoma_13059_P1,G5,iTru7_109_04,TCGGTTAC,iTru5_08_B,ACCTCTTC,NYU_BMS_Melanoma_13059,pool1,EP121011B01
1,EP121011B01,EP121011B-1,NYU_BMS_Melanoma_13059_P1,G5,iTru7_109_04,TCGGTTAC,iTru5_08_B,ACCTCTTC,NYU_BMS_Melanoma_13059,pool1,EP121011B-1
1,AP616837B04,AP616837B04,NYU_BMS_Melanoma_13059_P1,I5,iTru7_109_05,AAGTCGAG,iTru5_09_B,ACGGACTT,NYU_BMS_Melanoma_13059,pool2,AP616837B04
1,SP506933A04,SP506933A04,NYU_BMS_Melanoma_13059_P1,K5,iTru7_109_06,TATCGGTC,iTru5_10_B,CATGTGTG,NYU_BMS_Melanoma_13059,pool1,SP506933A04
1,EP159695B01,EP159695B01,NYU_BMS_Melanoma_13059_P1,M5,iTru7_109_07,TATTCGCC,iTru5_11_B,TGCCTCAA,NYU_BMS_Melanoma_13059,pool2,EP159695B01
Expand Down Expand Up @@ -492,7 +492,7 @@ Lane,Sample_ID,Sample_Name,Sample_Plate,well_id_384,I7_Index_ID,index,I5_Index_I
1,EP320438B01,EP320438B01,NYU_BMS_Melanoma_13059_P1,M17,iTru7_113_07,AAGTGTCG,iTru5_11_F,ACAACAGC,NYU_BMS_Melanoma_13059,pool2,EP320438B01
1,SP612495A04,SP612495A04,NYU_BMS_Melanoma_13059_P1,O17,iTru7_113_08,GAACGCTT,iTru5_12_F,TGTGGCTT,NYU_BMS_Melanoma_13059,pool1,SP612495A04
1,EP446604B03,EP446604B03,NYU_BMS_Melanoma_13059_P1,A19,iTru7_113_09,TCAAGGAC,iTru5_01_G,GTTCCATG,NYU_BMS_Melanoma_13059,pool2,EP446604B03
1,EP446602B01,EP446602B01,NYU_BMS_Melanoma_13059_P1,C19,iTru7_113_10,TCAACTGG,iTru5_02_G,TGGATGGT,NYU_BMS_Melanoma_13059,pool1,EP446602B01
1,EP446602B01,EP446602B-1,NYU_BMS_Melanoma_13059_P1,C19,iTru7_113_10,TCAACTGG,iTru5_02_G,TGGATGGT,NYU_BMS_Melanoma_13059,pool1,EP446602B-1
1,EP182243B02,EP182243B02,NYU_BMS_Melanoma_13059_P1,E19,iTru7_113_11,GGTTGATG,iTru5_03_G,GCATAACG,NYU_BMS_Melanoma_13059,pool2,EP182243B02
1,EP333541B04,EP333541B04,NYU_BMS_Melanoma_13059_P1,G19,iTru7_113_12,AAGGACAC,iTru5_04_G,TCGAACCT,NYU_BMS_Melanoma_13059,pool1,EP333541B04
1,EP238034B01,EP238034B01,NYU_BMS_Melanoma_13059_P1,I19,iTru7_114_01,TTGATCCG,iTru5_05_G,ACATGCCA,NYU_BMS_Melanoma_13059,pool2,EP238034B01
Expand Down Expand Up @@ -673,7 +673,7 @@ Lane,Sample_ID,Sample_Name,Sample_Plate,well_id_384,I7_Index_ID,index,I5_Index_I
1,EP554501B04,EP554501B04,NYU_BMS_Melanoma_13059_P3,H15,iTru7_304_08,GGTCAGAT,iTru5_112_E,AGTGCATC,NYU_BMS_Melanoma_13059,pool1,EP554501B04
1,EP542577B04,EP542577B04,NYU_BMS_Melanoma_13059_P3,J15,iTru7_304_09,TCGTGGAT,iTru5_101_F,TTGGACTG,NYU_BMS_Melanoma_13059,pool2,EP542577B04
1,EP487995B04,EP487995B04,NYU_BMS_Melanoma_13059_P3,L15,iTru7_304_10,CGTGTGTA,iTru5_102_F,GTCGATTG,NYU_BMS_Melanoma_13059,pool1,EP487995B04
1,EP542578B04,EP542578B04,NYU_BMS_Melanoma_13059_P3,N15,iTru7_304_11,GTGTCTGA,iTru5_103_F,GGCATTCT,NYU_BMS_Melanoma_13059,pool2,EP542578B04
1,EP542578B04,EP542578B-4,NYU_BMS_Melanoma_13059_P3,N15,iTru7_304_11,GTGTCTGA,iTru5_103_F,GGCATTCT,NYU_BMS_Melanoma_13059,pool2,EP542578B-4
1,EP573310B01,EP573310B01,NYU_BMS_Melanoma_13059_P3,P15,iTru7_304_12,GAATCGTG,iTru5_104_F,TGGTATCC,NYU_BMS_Melanoma_13059,pool1,EP573310B01
1,EP244366B01,EP244366B01,NYU_BMS_Melanoma_13059_P3,B17,iTru7_305_01,GCGATAGT,iTru5_105_F,GGCAAGTT,NYU_BMS_Melanoma_13059,pool2,EP244366B01
1,EP533389B03,EP533389B03,NYU_BMS_Melanoma_13059_P3,D17,iTru7_305_02,GGCTATTG,iTru5_106_F,GTCTGAGT,NYU_BMS_Melanoma_13059,pool1,EP533389B03
Expand Down Expand Up @@ -778,7 +778,7 @@ Lane,Sample_ID,Sample_Name,Sample_Plate,well_id_384,I7_Index_ID,index,I5_Index_I
1,SP388683A02,SP388683A02,NYU_BMS_Melanoma_13059_P4,J18,iTru7_105_05,TCTCTTCC,iTru5_121_F,GATGTCGA,NYU_BMS_Melanoma_13059,pool2,SP388683A02
1,SP232309A01,SP232309A01,NYU_BMS_Melanoma_13059_P4,L18,iTru7_105_06,AGTGTTGG,iTru5_122_F,GAAGTGCT,NYU_BMS_Melanoma_13059,pool1,SP232309A01
1,EP899038A04,EP899038A04,NYU_BMS_Melanoma_13059_P4,N18,iTru7_105_07,TGGCATGT,iTru5_123_F,TCACTCGA,NYU_BMS_Melanoma_13059,pool2,EP899038A04
1,EP636802A01,EP636802A01,NYU_BMS_Melanoma_13059_P4,P18,iTru7_105_08,AGAAGCGT,iTru5_124_F,ACGCAGTA,NYU_BMS_Melanoma_13059,pool1,EP636802A01
1,EP636802A01,EP636802A-1,NYU_BMS_Melanoma_13059_P4,P18,iTru7_105_08,AGAAGCGT,iTru5_124_F,ACGCAGTA,NYU_BMS_Melanoma_13059,pool1,EP636802A-1
1,AP046327B02,AP046327B02,NYU_BMS_Melanoma_13059_P4,B20,iTru7_105_09,AGCGGAAT,iTru5_113_G,ATCTCCTG,NYU_BMS_Melanoma_13059,pool2,AP046327B02
1,EP905975A04,EP905975A04,NYU_BMS_Melanoma_13059_P4,D20,iTru7_105_10,TAACCGGT,iTru5_114_G,ATGTGGAC,NYU_BMS_Melanoma_13059,pool1,EP905975A04
1,SP410796A02,SP410796A02,NYU_BMS_Melanoma_13059_P4,F20,iTru7_105_11,CATGGAAC,iTru5_115_G,CAAGCCAA,NYU_BMS_Melanoma_13059,pool2,SP410796A02
Expand Down
Loading
Loading