Skip to content

Commit d30dea2

Browse files
committed
Merge branch 'mp15-sample_r2' into develop
2 parents b3bfc34 + 1874cff commit d30dea2

File tree

2 files changed

+84
-14
lines changed

2 files changed

+84
-14
lines changed

test/stats.chk

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,12 @@ SN 2 number of samples: 3
6969
GCsAF 2 49.000000 2 1 0 1 2 3 0.375000 2
7070
NRDs 2 85.714286 33.333333 66.666667 100.000000
7171
NRDi 2 0.000000 0.000000 0.000000 0.000000
72-
GCsS 2 A 100.000 2 0 0 1 0 0
73-
GCsS 2 B 66.667 0 1 0 0 2 0
74-
GCsS 2 C 100.000 0 0 0 0 0 3
75-
GCiS 2 A 0.000 0 0 0 0 0 0
76-
GCiS 2 B 0.000 0 0 0 0 0 0
77-
GCiS 2 C 0.000 0 0 0 0 0 0
72+
GCsS 2 A 100.000 2 0 0 1 0 0 0
73+
GCsS 2 B 66.667 0 1 0 0 2 0 0
74+
GCsS 2 C 100.000 0 0 0 0 0 3 0
75+
GCiS 2 A 0.000 0 0 0 0 0 0 0
76+
GCiS 2 B 0.000 0 0 0 0 0 0 0
77+
GCiS 2 C 0.000 0 0 0 0 0 0 0
7878
PSC 0 A 0 0 0 0 0 0 0.0 0
7979
PSC 0 B 0 0 0 0 0 0 0.0 0
8080
PSC 0 C 0 0 0 0 0 0 0.0 0

vcfstats.c

Lines changed: 78 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ THE SOFTWARE. */
4343
#define QUAL_STATS 1
4444
#define IRC_STATS 1
4545
#define IRC_RLEN 10
46+
#define NA_STRING "0"
4647

4748
typedef struct
4849
{
@@ -61,6 +62,17 @@ typedef struct
6162
}
6263
idist_t;
6364

65+
typedef struct
66+
{
67+
double x;
68+
double x2;
69+
double y;
70+
double y2;
71+
double xy;
72+
double n;
73+
}
74+
smpl_r_t;
75+
6476
typedef struct
6577
{
6678
int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records;
@@ -133,6 +145,9 @@ typedef struct
133145
char **argv, *exons_fname, *regions_list, *samples_list, *targets_list;
134146
int argc, verbose_sites, first_allele_only, samples_is_file;
135147
int split_by_id, nstats;
148+
// Per Sample r working data arrays of size equal to number of samples
149+
smpl_r_t* smpl_r_snps;
150+
smpl_r_t* smpl_r_indels;
136151
}
137152
args_t;
138153

@@ -397,6 +412,8 @@ static void init_stats(args_t *args)
397412
args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
398413
args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
399414
args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
415+
args->smpl_r_snps = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
416+
args->smpl_r_indels = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
400417
}
401418
for (i=0; i<args->nstats; i++)
402419
{
@@ -517,12 +534,14 @@ static void destroy_stats(args_t *args)
517534
for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
518535
free(args->usr);
519536
free(args->tmp_frm);
520-
if (args->tmp_iaf) free(args->tmp_iaf);
537+
free(args->tmp_iaf);
521538
if (args->exons) bcf_sr_regions_destroy(args->exons);
522-
if (args->af_gts_snps) free(args->af_gts_snps);
523-
if (args->af_gts_indels) free(args->af_gts_indels);
524-
if (args->smpl_gts_snps) free(args->smpl_gts_snps);
525-
if (args->smpl_gts_indels) free(args->smpl_gts_indels);
539+
free(args->af_gts_snps);
540+
free(args->af_gts_indels);
541+
free(args->smpl_gts_snps);
542+
free(args->smpl_gts_indels);
543+
free(args->smpl_r_snps);
544+
free(args->smpl_r_indels);
526545
if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx);
527546
}
528547

@@ -868,8 +887,28 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
868887
gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels;
869888
gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels;
870889

890+
//
891+
// Calculates r squared
892+
// x is mean dosage of x at given site
893+
// x2 is mean squared dosage of x at given site
894+
// y is mean dosage of x at given site
895+
// y2 is mean squared dosage of x at given site
896+
// xy is mean dosage of x*y at given site
897+
// r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) )
898+
// r2n is number of sites considered
899+
// output as r2sum/r2n for each AF bin
871900
int r2n = 0;
872901
float x = 0, y = 0, xy = 0, x2 = 0, y2 = 0;
902+
// Select smpl_r
903+
smpl_r_t *smpl_r = NULL;
904+
if (line_type&VCF_SNP)
905+
{
906+
smpl_r = args->smpl_r_snps;
907+
}
908+
else if (line_type&VCF_INDEL)
909+
{
910+
smpl_r = args->smpl_r_indels;
911+
}
873912
for (is=0; is<files->n_smpl; is++)
874913
{
875914
// Simplified comparison: only 0/0, 0/1, 1/1 is looked at as the identity of
@@ -903,7 +942,18 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
903942
smpl_stats[is].mm[idx]++;
904943
}
905944

945+
// Now do it across samples
946+
947+
if (smpl_r) {
948+
smpl_r[is].xy += dsg0*dsg1;
949+
smpl_r[is].x += dsg0;
950+
smpl_r[is].x2 += dsg0*dsg0;
951+
smpl_r[is].y += dsg1;
952+
smpl_r[is].y2 += dsg1*dsg1;
953+
++(smpl_r[is].n);
954+
}
906955
}
956+
907957
if ( r2n )
908958
{
909959
x /= r2n; y /= r2n; x2 /= r2n; y2 /= r2n; xy /= r2n;
@@ -1229,23 +1279,43 @@ static void print_stats(args_t *args)
12291279
for (x=0; x<2; x++)
12301280
{
12311281
gtcmp_t *stats;
1282+
smpl_r_t *smpl_r_array;
12321283
if ( x==0 )
12331284
{
1234-
printf("# GCcS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n");
1285+
printf("# GCcS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
12351286
stats = args->smpl_gts_snps;
1287+
smpl_r_array = args->smpl_r_snps;
12361288
}
12371289
else
12381290
{
1239-
printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n");
1291+
printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
12401292
stats = args->smpl_gts_indels;
1293+
smpl_r_array = args->smpl_r_indels;
12411294
}
12421295
for (i=0; i<args->files->n_smpl; i++)
12431296
{
12441297
uint64_t m = stats[i].m[T2S(GT_HET_RA)] + stats[i].m[T2S(GT_HOM_AA)];
12451298
uint64_t mm = stats[i].mm[T2S(GT_HOM_RR)] + stats[i].mm[T2S(GT_HET_RA)] + stats[i].mm[T2S(GT_HOM_AA)];
1299+
// Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar
1300+
smpl_r_t *smpl_r = smpl_r_array + i;
1301+
double r = NAN;
1302+
if (smpl_r->n) {
1303+
double sum_crossprod = smpl_r->xy-(smpl_r->x*smpl_r->y)/smpl_r->n;//per 17.3 machine formula
1304+
double x2_xx = smpl_r->x2-(smpl_r->x*smpl_r->x)/smpl_r->n;
1305+
double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n;
1306+
r = (sum_crossprod)/sqrt(x2_xx*y2_yy);
1307+
}
12461308
printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
12471309
printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
1248-
printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"\n", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
1310+
printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
1311+
if (isnan(r))
1312+
{
1313+
printf("\t"NA_STRING"\n");
1314+
}
1315+
else
1316+
{
1317+
printf("\t%f\n", r*r);
1318+
}
12491319
}
12501320
}
12511321
}

0 commit comments

Comments
 (0)