Skip to content

Commit

Permalink
mapping quality
Browse files Browse the repository at this point in the history
  • Loading branch information
gaoyan07 committed Dec 1, 2015
1 parent 5c20fce commit a2985cd
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 37 deletions.
8 changes: 4 additions & 4 deletions bwt_aln.c
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ void bwt_aln_res(int ref_id, uint8_t is_rev, bntseq_t *bns, uint8_t *pac, uint8_
ksw_global(qlen, query+(left->read_pos-1+left_eta_len), tlen, target+left->ref_pos-ref_start, 5, bwasw_sc_mat, AP.gapo, AP.gape, abs(qlen-tlen)+3, &mid_cigar_n, &mid_cigar);

cigar32_t *cigar=NULL; int cigar_n, cigar_m;
//if (left->read_pos > 1) { // left extend
if (left->read_pos > 1 || left_eta_len > 0) { // left extend
// invert query and target,
qlen = left->read_pos - 1 + left_eta_len;
_q = (uint8_t*)malloc(qlen * sizeof(uint8_t));
Expand All @@ -265,13 +265,13 @@ void bwt_aln_res(int ref_id, uint8_t is_rev, bntseq_t *bns, uint8_t *pac, uint8_
} else _push_cigar1(&(cur_res->cigar), &(cur_res->cigar_len), &(cur_res->c_m), (qlen<<4)|CSOFT_CLIP);

free(_q); free(_t);
//}
}
cur_res->offset = left->ref_pos;
// push mid-cigar
_push_cigar(&(cur_res->cigar), &(cur_res->cigar_len), &(cur_res->c_m), mid_cigar, mid_cigar_n);
free(mid_cigar);

//if (right->read_pos < reg_len) { // right extend
if (right->read_pos < reg_len || right_eta_len > 0) { // right extend
qlen = reg_len - right->read_pos + right_eta_len;
tlen = ref_start+ref_len-1-right->ref_pos;
ksw_extend_core(qlen, query+right->read_pos+left_eta_len, tlen, target+ref_len-tlen, 5, bwasw_sc_mat, abs(qlen-tlen)+3, AP.bwt_seed_len*bwasw_sc_mat[0], AP, &_qle, &_tle, &cigar, &cigar_n, &cigar_m);
Expand All @@ -282,7 +282,7 @@ void bwt_aln_res(int ref_id, uint8_t is_rev, bntseq_t *bns, uint8_t *pac, uint8_
free(cigar);
}
_push_cigar1(&(cur_res->cigar), &(cur_res->cigar_len), &(cur_res->c_m), ((reg_len-right->read_pos+right_eta_len)<<4)|CSOFT_CLIP);
//}
}

// push tail 'S' (after right_extra)
if (is_rev)_push_cigar1(&(cur_res->cigar), &(cur_res->cigar_len), &(cur_res->c_m), ((extra_beg-1) << 4) | CSOFT_CLIP);
Expand Down
4 changes: 1 addition & 3 deletions frag_check.c
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ void split_mapping(bntseq_t *bns, uint8_t *pac,
//fprintf(stdout, "s1: %lld, exp_s2: %lld act_s2: %lld %d\n", (long long)pos, (long long)exp, (long long)act, dis);
#ifdef __DEBUG__
if (abs(dis) > AP.match_dis)
fprintf(stdout, "%d\t%lld\t%d\t%s\n", at1.chr, (long long)pos, abs(dis), dis>0?"DEL":"INS");
fprintf(stderr, "%d\t%lld\t%d\t%s\n", at1.chr, (long long)pos, abs(dis), dis>0?"DEL":"INS");
#endif
if (dis > AP.match_dis) { //DEL
s_tlen = s_qlen + dis;
Expand Down Expand Up @@ -940,7 +940,6 @@ void copy_res(res_t *f, res_t *t)
t->readend = f->readend;

t->score = f->score;
t->mapq = f->mapq;
t->NM = f->NM;
}

Expand Down Expand Up @@ -991,7 +990,6 @@ void lsat_res_aux(line_aln_res *la, bntseq_t *bns, uint8_t *pac, uint8_t *read_b
// calculate NM and AS
r->NM = n_mm + n_e;
r->score = n_m * AP.match - n_mm * AP.mis - n_o * AP.gapo - n_e * AP.gape;
r->mapq = 100; //XXX
if (r->score < 0) { // XXX
for (i = m+1; i <= la->cur_res_n; ++i) {
copy_res(la->res+i, la->res+i-1);
Expand Down
3 changes: 2 additions & 1 deletion frag_check.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ typedef struct {
uint64_t refend; int readend; // for merge_cigar

int score; // alignment score
uint8_t mapq; // mapping quality
int NM; // edit distance
//cigar32_t *MD; // mis-match postion string XXX
//int m_m, m_n;
Expand All @@ -67,13 +66,15 @@ typedef struct {
int res_m, cur_res_n;
res_t *res;
int tol_score, tol_NM; // tol score of all res, including split penalty
uint8_t mapQ;
} line_aln_res;

typedef struct {
int l_m, l_n;
line_aln_res *la;

int read_len;
float cov_f;
} aln_res;

extern const int8_t sc_mat[25];
Expand Down
36 changes: 20 additions & 16 deletions gem_parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,12 @@ void _push_cigar1(cigar32_t **cigar, int *cigar_n, int *cigar_m, cigar32_t _ciga
}
#endif

void md2m(char *c, int *m, int *mm)
void md2m(char *c, int len, int *m, int *mm)
{
*m=0; *mm=0;
if (*c == 0) return;
int i; char *s;

int len = strlen(c);
for (i = 0; i < len; ++i) {
if (c[i] >= 'A' && c[i] <= 'T')
{
Expand Down Expand Up @@ -80,28 +79,31 @@ void md2cigar(char *md, map_t *map)
int md_len = strlen(md);
int i=0;
char *c = (char*)malloc((md_len+1) * sizeof(char)); int c_i;
char *indel = (char*)malloc((md_len+1) * sizeof(char));
char id; int indel_n=0;
int m=0, mm=0;
int m, mm;
map->cigar->cigar_n = 0;
map->NM = 0;
while (i < md_len) {
if (md[i] == '>') { // indel
sscanf(md+i, ">%d%[+-]", &indel_n, &id);
sscanf(md+i, ">%[^+-]%[+-]", indel, &id);
indel_n = atoi(indel);
_push_cigar1(&(map->cigar->cigar), &(map->cigar->cigar_n), &(map->cigar->cigar_m), (indel_n) << 4 | (id=='+'?CDEL:CINS));
map->NM += (indel_n+mm);
map->NM += indel_n;
i += get_d(indel_n) + 2;
} else {
c_i = i;
for (; md[i] && md[i]!='>'; ++i) {
c[i-c_i] = md[i];
}
c[i-c_i] = '\0';
md2m(c, &m, &mm);
m = 0, mm = 0;
c[i-c_i] = '\0';
md2m(c, i-c_i, &m, &mm);
_push_cigar1(&(map->cigar->cigar), &(map->cigar->cigar_n), &(map->cigar->cigar_m), (m+mm) << 4 | CMATCH);
map->NM += mm;
}
}
free(c);
free(c); free(indel);
}

map_msg *map_init_msg(int n)
Expand Down Expand Up @@ -176,7 +178,7 @@ int main(int argc, char* argv[])
char name[100]; int msg_len;
char *t, *s; int t_i, _t_i;
char chr[10], strand, md[200];
long long offset;
long long offset; char os[100];
int i;

while (fgetline(mapf, &gem_line, &line_len) != EOF) {
Expand Down Expand Up @@ -204,7 +206,8 @@ int main(int argc, char* argv[])
}
}
_t_i = t_i;
sscanf(t, "%[^:]:%c:%lld:%[^:]", chr, &strand, &offset, md);
sscanf(t, "%[^:]:%c:%[^:]:%[^:]", chr, &strand, os, md);
offset = atoll(os);
t_i = _t_i;
md2cigar(md, &(m_msg->map[m_msg->map_n]));
m_msg->map[m_msg->map_n].offset = offset;
Expand Down Expand Up @@ -241,7 +244,7 @@ int gem_map_read(FILE *mapf, map_msg *m_msg, int max_n)
char name[100]; int msg_len;
char *t; int t_i, _t_i;
char chr[10], strand, md[200];
long long offset;
long long offset; char os[100];
int i;

fgetline(mapf, &gem_line, &line_len, &gem_line_m);
Expand All @@ -250,7 +253,7 @@ int gem_map_read(FILE *mapf, map_msg *m_msg, int max_n)
sscanf(gem_line, "%[^\t]\t%*[^\t]\t%*[^\t]\t%[^\n]\n", name, aln_msg);
if (strcmp(aln_msg, "-") == 0) {
#ifdef __DEBUG__
fprintf(stdout, "-\n");
fprintf(stderr, "-\n");
#endif
goto RET;
}
Expand All @@ -275,7 +278,8 @@ int gem_map_read(FILE *mapf, map_msg *m_msg, int max_n)
}
}
_t_i = t_i;
sscanf(t, "%[^:]:%c:%lld:%[^:]", chr, &strand, &offset, md);
sscanf(t, "%[^:]:%c:%[^:]:%[^:]", chr, &strand, os, md);
offset = atoll(os);
t_i = _t_i;
md2cigar(md, &(m_msg->map[m_msg->map_n]));
if (strand == '-') _invert_cigar(&(m_msg->map[m_msg->map_n].cigar->cigar), m_msg->map[m_msg->map_n].cigar->cigar_n);
Expand All @@ -284,9 +288,9 @@ int gem_map_read(FILE *mapf, map_msg *m_msg, int max_n)
strcpy(m_msg->map[m_msg->map_n].chr, chr);

#ifdef __DEBUG__
fprintf(stdout, "%s, %c, %lld, ", chr, strand, offset);
printcigar(stdout, m_msg->map[m_msg->map_n].cigar->cigar, m_msg->map[m_msg->map_n].cigar->cigar_n);
fprintf(stdout, "\n");
fprintf(stderr, "%s, %c, %lld, ", chr, strand, offset);
printcigar(stderr, m_msg->map[m_msg->map_n].cigar->cigar, m_msg->map[m_msg->map_n].cigar->cigar_n);
fprintf(stderr, "\n");
#endif
if (strcmp(chr, "chrM")!=0)
++(m_msg->map_n);
Expand Down
67 changes: 59 additions & 8 deletions lsat_aln.c
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,20 @@ int res_comp(const void*a,const void*b)
else return (*(qua_node*)b).b-(*(qua_node*)a).b;
}

float get_cov_f(aln_res *res, aln_reg *reg)
{
float cov_f; int i, cov=0;

reg->reg_n = 0;
for (i = 0; i < 3; ++i) get_reg(res+i, reg);
aln_sort_reg(reg); aln_merg_reg(reg, 0);

for (i = 0; i < reg->reg_n; ++i)
cov += (reg->reg[i].end-reg->reg[i].beg+1);
cov_f = (cov+0.0)/reg->read_len;
return cov_f;
}

// rearrange the aln_res (merge and filter by the score and cover-region)
void rearr_aln_res(aln_res *res, int n)
{
Expand Down Expand Up @@ -528,6 +542,9 @@ void rearr_aln_res(aln_res *res, int n)
head[head_n++] = 0;
// NOT merg
(res+qua[0].x)->la[qua[0].y].merg_msg = (line_node){1, 0}; // NOT merge and ONLY best
for (i = 0; i < qua_n; ++i) (res+qua[i].x)->la[qua[i].y].mapQ = 255;

int mapq_max = MAPQ_MAX * res->cov_f;

int cov_qua_i;
for (i = 1; i < qua_n; ++i) {
Expand All @@ -545,13 +562,25 @@ void rearr_aln_res(aln_res *res, int n)
(res+qua[cov_qua_i].x)->la[qua[cov_qua_i].y].XA_n++;
}
(res+qua[cov_qua_i].x)->la[qua[cov_qua_i].y].merg_msg.y = 1; // {1,1}, Merge, has alternative
// i.merg_flag = MERG_BODY
// set mapQ for primary
uint8_t tmpQ = mapq_max * (qua[cov_qua_i].a - qua[i].a)/qua[cov_qua_i].a;
if (tmpQ < ((res+qua[cov_qua_i].x)->la[qua[cov_qua_i].y].mapQ))
(res+qua[cov_qua_i].x)->la[qua[cov_qua_i].y].mapQ = tmpQ;

(res+qua[i].x)->la[qua[i].y].merg_msg = (line_node){2, 0}; // Merge, body
} else {
// DUMP
(res+qua[i].x)->la[qua[i].y].merg_msg = (line_node){0, -1}; // DUMP
}
}

// re-set mapQ, based on head_n
for (i = 0; i < qua_n; ++i) {
if ((res+qua[i].x)->la[qua[i].y].merg_msg.x != 1) continue;
if ((res+qua[i].x)->la[qua[i].y].mapQ == 255) {
(res+qua[i].x)->la[qua[i].y].mapQ = mapq_max / head_n;
} else (res+qua[i].x)->la[qua[i].y].mapQ /= head_n;
}
free(qua); aln_free_reg(reg);
}

Expand Down Expand Up @@ -818,11 +847,12 @@ int lsat_main_aln(thread_aux_t *aux)
bwt_aln_remain(a_reg, p->a_res+2, bwt, bns, pac, p->seq, &(p->rseq), *APP, *AP);
get_reg(p->a_res+2, a_reg);

aln_free_reg(a_reg);

p->a_res->cov_f = get_cov_f(p->a_res, a_reg);
// rearrange a_res(0,1,2)
rearr_aln_res(p->a_res, 3);

aln_free_reg(a_reg);

#ifdef __DEBUG__
COUNT++; fprintf(stderr, "%16d reads have been aligned.\n", COUNT);
#endif
Expand Down Expand Up @@ -975,14 +1005,19 @@ void aln_res_output(FILE *outp, aln_res *res, int res_n, char *name, uint8_t *se
all++;
//if (p->la[i].res[j].dump) continue;
// QNAME/FLAG/RNAME/POS/MAPQ //XXX FLAG: primary/secondary
ksprintf(&sam_str, "%s\t%d\t%s\t%lld\t%d\t", name, p->la[i].res[j].nsrand?0:16, bns->anns[p->la[i].res[j].chr-1].name, (long long)p->la[i].res[j].offset, p->la[i].res[j].mapq);
ksprintf(&sam_str, "%s\t%d\t%s\t%lld\t%d\t", name, p->la[i].res[j].nsrand?0:16, bns->anns[p->la[i].res[j].chr-1].name, (long long)p->la[i].res[j].offset, p->la[i].mapQ);
// CIGAR
for (l = 0; l < p->la[i].res[j].cigar_len; ++l)
ksprintf(&sam_str, "%d%c", p->la[i].res[j].cigar[l]>>4, CIGAR_STR[p->la[i].res[j].cigar[l]&0xf]);
if (prim_flag == 0) {
for (l = 0; l < p->la[i].res[j].cigar_len; ++l)
ksprintf(&sam_str, "%d%c", p->la[i].res[j].cigar[l]>>4, CIGAR_STR[p->la[i].res[j].cigar[l]&0xf]);
} else {
for (l = 0; l < p->la[i].res[j].cigar_len; ++l)
ksprintf(&sam_str, "%d%c", p->la[i].res[j].cigar[l]>>4, CIGAR_STR_HC[p->la[i].res[j].cigar[l]&0xf]);
}
// mate infomation
kputs("\t*\t0\t0", &sam_str);
if (prim_flag == 0) {
// SEQ and QUAL // for secondary res, do NOT print SEQ and QUAL
// SEQ and QUAL
int si;
if (p->la[i].res[j].nsrand == 1) {
kputc('\t', &sam_str);
Expand All @@ -998,7 +1033,23 @@ void aln_res_output(FILE *outp, aln_res *res, int res_n, char *name, uint8_t *se
else kputc('*', &sam_str);
}
prim_flag = 1;
} else kputs("\t*\t*", &sam_str);
} else { // HARD_CLIPINT
// SEQ and QUAL
int si;
if (p->la[i].res[j].nsrand == 1) {
kputc('\t', &sam_str);
for (si = p->la[i].res[j].reg_beg-1; si < p->la[i].res[j].reg_end; ++si) kputc("ACGTN"[seq[si]], &sam_str);
kputc('\t', &sam_str);
if (qual) for (si = p->la[i].res[j].reg_beg-1; si < p->la[i].res[j].reg_end; ++si) kputc(qual[si], &sam_str);
else kputc('*', &sam_str);
} else { // reverse strand
kputc('\t', &sam_str);
for (si = p->la[i].res[j].reg_end-1; si >= p->la[i].res[j].reg_beg-1; --si) kputc("TGCAN"[seq[si]], &sam_str);
kputc('\t', &sam_str);
if (qual) for (si = p->la[i].res[j].reg_end-1; si >= p->la[i].res[j].reg_beg-1; --si) kputc(qual[si], &sam_str);
else kputc('*', &sam_str);
}
}
// optional_tags
ksprintf(&sam_str, "\tNM:i:%d\tAS:i:%d", p->la[i].res[j].NM, p->la[i].res[j].score);
check_cigar(p->la[i].res[j].cigar, p->la[i].res[j].cigar_len, name, res->read_len);
Expand Down
3 changes: 3 additions & 0 deletions lsat_aln.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,12 @@

#define SOAP2_DP_DIR "./soap2-dp"

#define MAPQ_MAX 254

/* CIGAR operations */
/* from samtools-0.1.19 */
#define CIGAR_STR "MIDNSHP=XB"
#define CIGAR_STR_HC "MIDNHHP=XB"
/* M: match or mismatch */
#define CMATCH 0
/* I: insertion to the reference */
Expand Down
9 changes: 4 additions & 5 deletions lsat_dp_con.c
Original file line number Diff line number Diff line change
Expand Up @@ -1171,11 +1171,10 @@ int frag_dp_path(aln_msg *a_msg, frag_msg **f_msg,

#ifdef __DEBUG__
for (i = 0; i < line_n; ++i) {
printf("%d(%d,%d score: %d):\t", i+1, L_MF(line, line_end, i), L_MH(line, line_end,i), L_LS(line, line_end, i));
fprintf(stderr, "%d(%d,%d score: %d):\t", i+1, L_MF(line, line_end, i), L_MH(line, line_end,i), L_LS(line, line_end, i));
for (l = 0; l < line_end[i]; ++l)
printf("(%d, %d)\t", a_msg[line[i][l].x].read_id-1, line[i][l].y);
//printf("(%d, %d)\t", line[i][l].x, line[i][l].y);
printf("\n");
fprintf(stderr, "(%d, %d)\t", a_msg[line[i][l].x].read_id-1, line[i][l].y);
fprintf(stderr, "\n");
}
#endif

Expand Down Expand Up @@ -1249,7 +1248,7 @@ int frag_dp_path(aln_msg *a_msg, frag_msg **f_msg,
((*f_msg)+l)->frag_left_bound = left_bound;
((*f_msg)+l)->line_score = L_LS(line, line_end, l);
#ifdef __DEBUG__
fprintf(stdout, "#%d: left %d, right %d\n", l+1, left_bound, right_bound);
fprintf(stderr, "#%d: left %d, right %d\n", l+1, left_bound, right_bound);
#endif
}
*l_n = l;
Expand Down

0 comments on commit a2985cd

Please sign in to comment.