|
| 1 | +#!/usr/bin/env ruby |
| 2 | +# == NAME |
| 3 | +# report.rb |
| 4 | +# |
| 5 | +# == USAGE |
| 6 | +# ./this_script.rb [ -h | --help ] |
| 7 | +#[ -i | --infile ] |[ -o | --outfile ] | |
| 8 | +# == DESCRIPTION |
| 9 | +# A script to parse raw results from the ikmb/hla pipeline |
| 10 | +# |
| 11 | +# == OPTIONS |
| 12 | +# -h,--help Show help |
| 13 | +# -o,--outfile=OUTFILE : output file |
| 14 | + |
| 15 | +# |
| 16 | +# == EXPERT OPTIONS |
| 17 | +# |
| 18 | +# == AUTHOR |
| 19 | +# Marc Hoeppner, [email protected] |
| 20 | + |
| 21 | +require 'optparse' |
| 22 | +require 'ostruct' |
| 23 | +require 'json' |
| 24 | +require 'date' |
| 25 | + |
| 26 | +### Define modules and classes here |
| 27 | + |
| 28 | +def trim_allele(call,precision) |
| 29 | + |
| 30 | + answer = nil |
| 31 | + |
| 32 | + call = call.split("*")[-1] if call.include?("*") |
| 33 | + |
| 34 | + e = call.split(":") |
| 35 | + |
| 36 | + if e.length > precision |
| 37 | + answer = e[0..precision-1].join(":") |
| 38 | + else |
| 39 | + answer = call |
| 40 | + end |
| 41 | + |
| 42 | + return answer |
| 43 | +end |
| 44 | + |
| 45 | +### Get the script arguments and open relevant files |
| 46 | +options = OpenStruct.new() |
| 47 | +opts = OptionParser.new() |
| 48 | +opts.banner = "A script description here" |
| 49 | +opts.separator "" |
| 50 | +opts.on("-s","--sample", "=SAMPLE","Sample name") {|argument| options.sample = argument } |
| 51 | +opts.on("-c","--coverage", "=COVERAGE","Coverage of exons") {|argument| options.coverage = argument } |
| 52 | +opts.on("-p","--precision", "=PRECISION","Precision of calls") {|argument| options.precision = argument } |
| 53 | +opts.on("-v","--version", "=VERSION","PipelineVersion") {|argument| options.version = argument } |
| 54 | +opts.on("-o","--outfile", "=OUTFILE","Output file") {|argument| options.outfile = argument } |
| 55 | +opts.on("-h","--help","Display the usage information") { |
| 56 | + puts opts |
| 57 | + exit |
| 58 | +} |
| 59 | + |
| 60 | +opts.parse! |
| 61 | + |
| 62 | +options.precision ? precision = options.precision.to_i : precision = 3 |
| 63 | + |
| 64 | +date = Date.today.strftime("%d.%m.%Y") |
| 65 | + |
| 66 | +sample = options.sample |
| 67 | + |
| 68 | +alleles = { "A" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 69 | + "B" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 70 | + "C" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 71 | + "DQB1" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 72 | + "DRB1" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 73 | + "DRB4" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 74 | + "DRB5" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 75 | + "DQA1" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 76 | + "DRB3" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 77 | + "DPA1" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] }, |
| 78 | + "DPB1" => { "xHLA" => [], "Hisat" => [], "Optitype" => [], "HLAscan" => [], "HLA-HD" => [] } |
| 79 | +} |
| 80 | + |
| 81 | +################### |
| 82 | +# Coverage of exons |
| 83 | +################### |
| 84 | + |
| 85 | +coverage = {} |
| 86 | + |
| 87 | +if options.coverage |
| 88 | + bedcov = File.open(options.coverage) |
| 89 | + |
| 90 | + while (line = bedcov.gets) |
| 91 | + |
| 92 | + # chr6 29942554 29942626 HLA-A.ENST00000376809.1 100 + 9425 |
| 93 | + |
| 94 | + seq,from,to,exon,score,strand,cov = line.strip.split("\t") |
| 95 | + gene = exon.split(".")[0] |
| 96 | + |
| 97 | + from = from.to_i |
| 98 | + to = to.to_i |
| 99 | + len = to-from |
| 100 | + mean_cov = (cov.to_f/len.to_f).round() |
| 101 | + |
| 102 | + this_target = { "exon" => exon, "seq" => seq, "start" => from, "stop" => to, "strand" => strand, "mean_cov" => mean_cov } |
| 103 | + coverage.has_key?(gene) ? coverage[gene] << this_target : coverage[gene] = [ this_target ] |
| 104 | + |
| 105 | + end |
| 106 | + |
| 107 | + bedcov.close |
| 108 | + |
| 109 | +end |
| 110 | + |
| 111 | +# Options |
| 112 | +############ |
| 113 | + |
| 114 | +## Only show hisat results supported at this level |
| 115 | +hisat_cutoff = 0.3 |
| 116 | + |
| 117 | +# Find result files |
| 118 | +files = Dir["*"] |
| 119 | +xhla = files.find{|f| f.upcase.include?("XHLA") } |
| 120 | +hisat = files.find{|f| f.upcase.include?("HISAT") } |
| 121 | +optitype = files.find{|f| f.upcase.include?("OPTI") } |
| 122 | +hlascan = files.select {|f| f.upcase.include?("HLASCAN") } |
| 123 | +hlahd = files.find {|f| f.upcase.include?("HLAHD") } |
| 124 | + |
| 125 | +# The header of the result table, only including those tools we actually have data for. |
| 126 | +rheader = [ "HLA Genes" ] |
| 127 | + |
| 128 | +######################## |
| 129 | +# HLA-HD data processing |
| 130 | +######################## |
| 131 | + |
| 132 | +if hlahd |
| 133 | + |
| 134 | + IO.readlines(hlahd).each do |line| |
| 135 | + |
| 136 | + line.strip! |
| 137 | + # A HLA-A*24:02:01 HLA-A*01:01:01 |
| 138 | + |
| 139 | + gene,a,b = line.split("\t") |
| 140 | + if alleles.has_key?(gene) |
| 141 | + these_alleles = [] |
| 142 | + these_alleles << trim_allele(a,precision) unless a.include?("Not typed") or a == "-" |
| 143 | + these_alleles << trim_allele(b,precision) unless b.include?("Not typed") or b == "-" |
| 144 | + |
| 145 | + alleles[gene]["HLA-HD"] = these_alleles |
| 146 | + end |
| 147 | + |
| 148 | + end |
| 149 | + rheader << "HLA-HD" |
| 150 | +end |
| 151 | + |
| 152 | +######################## |
| 153 | +### xHLA data processing |
| 154 | +######################## |
| 155 | + |
| 156 | +if xhla |
| 157 | + |
| 158 | + json = JSON.parse( IO.readlines(xhla).join ) |
| 159 | + |
| 160 | + this_alleles = json["hla"]["alleles"] |
| 161 | + |
| 162 | + alleles.keys.each do |k| |
| 163 | + |
| 164 | + this_alleles.select {|al| al.match(/^#{k}.*/) }.each {|a| alleles[k]["xHLA"] << trim_allele(a,precision) } |
| 165 | + end |
| 166 | + |
| 167 | + rheader << "xHLA" |
| 168 | +end |
| 169 | + |
| 170 | +########################### |
| 171 | +### HLAscan data processing |
| 172 | +########################### |
| 173 | + |
| 174 | +unless hlascan.empty? |
| 175 | + |
| 176 | + hlascan.each do |h| |
| 177 | + |
| 178 | + lines = IO.readlines(h) |
| 179 | + gene_line = lines.find {|l| l.include?("HLA gene") } |
| 180 | + |
| 181 | + next unless gene_line |
| 182 | + |
| 183 | + gene = gene_line.split(" ")[-1].split("-")[-1] |
| 184 | + |
| 185 | + next unless alleles.has_key?(gene) |
| 186 | + |
| 187 | + allele_1 = "" |
| 188 | + allele_2 = "" |
| 189 | + |
| 190 | + first = lines.find {|l| l.include?("Type 1") } |
| 191 | + second = lines.find {|l| l.include?("Type 2") } |
| 192 | + |
| 193 | + these_alleles = [] |
| 194 | + if first |
| 195 | + allele_1 = "#{gene}*#{first.split(/\s+/)[2]}" |
| 196 | + these_alleles << allele_1 unless allele_1.length == 0 |
| 197 | + end |
| 198 | + if second |
| 199 | + allele_2 = "#{gene}*#{second.split(/\s+/)[2]}" |
| 200 | + these_alleles << allele_2 unless allele_1.length == 0 |
| 201 | + end |
| 202 | + |
| 203 | + these_alleles.each {|a| alleles[gene]["HLAscan"] << trim_allele(a,precision) } |
| 204 | + |
| 205 | + end |
| 206 | + |
| 207 | + rheader << "HLAscan" |
| 208 | + |
| 209 | +end |
| 210 | + |
| 211 | +############################ |
| 212 | +### Optitype data processing |
| 213 | +############################ |
| 214 | + |
| 215 | +if optitype |
| 216 | + |
| 217 | + lines = IO.readlines(optitype)[0..1] |
| 218 | + header = lines.shift.strip.split(/\t/) |
| 219 | + |
| 220 | + # 0 A*26:01 A*30:01 B*13:02 B*38:01 C*06:02 C*12:03 4526.0 4281.585999999999 |
| 221 | + |
| 222 | + e = lines.shift.strip.split(/\t/) |
| 223 | + |
| 224 | + header.each_with_index do |h,i| |
| 225 | + if h.match(/^A.*/) |
| 226 | + alleles["A"]["Optitype"] << trim_allele(e[i+1],precision) |
| 227 | + elsif h.match(/^B.*/) |
| 228 | + alleles["B"]["Optitype"] << trim_allele(e[i+1],precision) |
| 229 | + elsif h.match(/^C.*/) |
| 230 | + alleles["C"]["Optitype"] << trim_allele(e[i+1],precision) |
| 231 | + end |
| 232 | + end |
| 233 | + |
| 234 | + rheader << "Optitype" |
| 235 | + |
| 236 | +end |
| 237 | + |
| 238 | + |
| 239 | +############################# |
| 240 | +#### Hisat Genotype |
| 241 | +############################# |
| 242 | + |
| 243 | +if hisat |
| 244 | + |
| 245 | + # 1 ranked B*35:08:01 (abundance: 50.20%) |
| 246 | + |
| 247 | + lines = IO.readlines(hisat) |
| 248 | + header = lines.shift.split("\t") |
| 249 | + |
| 250 | + info = lines.shift.split("\t") |
| 251 | + |
| 252 | + header.each_with_index do |h,i| |
| 253 | + |
| 254 | + if h.include?("Allele splitting: ") |
| 255 | + gene = h.split(" ")[-1] |
| 256 | + tmp = info[i] |
| 257 | + tmp.split(",").each do |t| |
| 258 | + # C*08 - Trimmed (score: 0.9090),C*08:02 - Trimmed (score: 0.3636) |
| 259 | + # We show percentages for ambiguous calls, but only >= 20% fraction |
| 260 | + c = trim_allele(t.split(" ")[0].strip,precision) |
| 261 | + abundance = t.split(" ")[-1].gsub(")","") |
| 262 | + f = abundance.split("%")[0].to_f |
| 263 | + next if f < hisat_cutoff |
| 264 | + alleles[gene]["Hisat"] << "#{c} (#{abundance})" |
| 265 | + end |
| 266 | + end |
| 267 | + end |
| 268 | + |
| 269 | + rheader << "Hisat" |
| 270 | +end |
| 271 | + |
| 272 | +f = File.new("#{sample}.json","w+") |
| 273 | +data = { "sample" => sample, "coverage" => coverage, "calls" => alleles, "pipeline_version" => options.version, "date" => date } |
| 274 | +f.puts data.to_json |
| 275 | +f.close |
0 commit comments