From c6402962eca36db033f625fe8fd5050c00f272f9 Mon Sep 17 00:00:00 2001 From: wshuai294 Date: Thu, 28 Mar 2024 14:40:18 +0800 Subject: [PATCH] fix the bug in read name recognition --- script/assign_reads_to_genes.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/script/assign_reads_to_genes.py b/script/assign_reads_to_genes.py index 10ee4e4..e125f8c 100644 --- a/script/assign_reads_to_genes.py +++ b/script/assign_reads_to_genes.py @@ -148,10 +148,11 @@ def assign_fastq(file, gene, index, assign_dict): for line in f: line = line.strip() if i % 4 == 0: - if re.search('/1',line) or re.search('/2',line): - read_name = line.split()[0][1:-2] + raw_read_name = line.split()[0] + if re.search('/1',raw_read_name) or re.search('/2',raw_read_name): + read_name = raw_read_name[1:-2] else: - read_name = line.split()[0][1:] + read_name = raw_read_name[1:] if read_name in assign_dict.keys() and assign_dict[read_name] == gene: flag = True num = 1 @@ -187,15 +188,24 @@ def main(): t1 = time.time() print ("read bam cost %s"%(t1 - t0)) + count_read_for_each_gene = {} # assign genes for each read for read_name in read_dict: assigned_locus = read_dict[read_name].assign() if assigned_locus == 'REMOVE': continue assign_dict[read_name] = assigned_locus - + if assigned_locus not in count_read_for_each_gene: + count_read_for_each_gene[assigned_locus] = 0 + count_read_for_each_gene[assigned_locus] += 1 + # generate gene-specific fastq for gene in ['A', 'B', 'C', 'DPA1', 'DPB1', 'DQA1', 'DQB1', 'DRB1']: + if gene in count_read_for_each_gene: + read_num = count_read_for_each_gene[gene] + else: + read_num = 0 + print (f"read no. for {gene} is {read_num}.") assign_fastq(options.fq1, gene, 1, assign_dict) assign_fastq(options.fq2, gene, 2, assign_dict) t2 = time.time()