@@ -43,6 +43,7 @@ THE SOFTWARE. */
4343#define QUAL_STATS 1
4444#define IRC_STATS 1
4545#define IRC_RLEN 10
46+ #define NA_STRING "0"
4647
4748typedef struct
4849{
@@ -61,6 +62,17 @@ typedef struct
6162}
6263idist_t ;
6364
65+ typedef struct
66+ {
67+ double x ;
68+ double x2 ;
69+ double y ;
70+ double y2 ;
71+ double xy ;
72+ double n ;
73+ }
74+ smpl_r_t ;
75+
6476typedef struct
6577{
6678 int n_snps , n_indels , n_mnps , n_others , n_mals , n_snp_mals , n_records ;
@@ -133,6 +145,9 @@ typedef struct
133145 char * * argv , * exons_fname , * regions_list , * samples_list , * targets_list ;
134146 int argc , verbose_sites , first_allele_only , samples_is_file ;
135147 int split_by_id , nstats ;
148+ // Per Sample r working data arrays of size equal to number of samples
149+ smpl_r_t * smpl_r_snps ;
150+ smpl_r_t * smpl_r_indels ;
136151}
137152args_t ;
138153
@@ -397,6 +412,8 @@ static void init_stats(args_t *args)
397412 args -> af_gts_indels = (gtcmp_t * ) calloc (args -> m_af ,sizeof (gtcmp_t ));
398413 args -> smpl_gts_snps = (gtcmp_t * ) calloc (args -> files -> n_smpl ,sizeof (gtcmp_t ));
399414 args -> smpl_gts_indels = (gtcmp_t * ) calloc (args -> files -> n_smpl ,sizeof (gtcmp_t ));
415+ args -> smpl_r_snps = (smpl_r_t * ) calloc (args -> files -> n_smpl , sizeof (smpl_r_t ));
416+ args -> smpl_r_indels = (smpl_r_t * ) calloc (args -> files -> n_smpl , sizeof (smpl_r_t ));
400417 }
401418 for (i = 0 ; i < args -> nstats ; i ++ )
402419 {
@@ -517,12 +534,14 @@ static void destroy_stats(args_t *args)
517534 for (j = 0 ; j < args -> nusr ; j ++ ) free (args -> usr [j ].tag );
518535 free (args -> usr );
519536 free (args -> tmp_frm );
520- if ( args -> tmp_iaf ) free (args -> tmp_iaf );
537+ free (args -> tmp_iaf );
521538 if (args -> exons ) bcf_sr_regions_destroy (args -> exons );
522- if (args -> af_gts_snps ) free (args -> af_gts_snps );
523- if (args -> af_gts_indels ) free (args -> af_gts_indels );
524- if (args -> smpl_gts_snps ) free (args -> smpl_gts_snps );
525- if (args -> smpl_gts_indels ) free (args -> smpl_gts_indels );
539+ free (args -> af_gts_snps );
540+ free (args -> af_gts_indels );
541+ free (args -> smpl_gts_snps );
542+ free (args -> smpl_gts_indels );
543+ free (args -> smpl_r_snps );
544+ free (args -> smpl_r_indels );
526545 if (args -> indel_ctx ) indel_ctx_destroy (args -> indel_ctx );
527546}
528547
@@ -868,8 +887,28 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
868887 gtcmp_t * af_stats = line_type & VCF_SNP ? args -> af_gts_snps : args -> af_gts_indels ;
869888 gtcmp_t * smpl_stats = line_type & VCF_SNP ? args -> smpl_gts_snps : args -> smpl_gts_indels ;
870889
890+ //
891+ // Calculates r squared
892+ // x is mean dosage of x at given site
893+ // x2 is mean squared dosage of x at given site
894+ // y is mean dosage of x at given site
895+ // y2 is mean squared dosage of x at given site
896+ // xy is mean dosage of x*y at given site
897+ // r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) )
898+ // r2n is number of sites considered
899+ // output as r2sum/r2n for each AF bin
871900 int r2n = 0 ;
872901 float x = 0 , y = 0 , xy = 0 , x2 = 0 , y2 = 0 ;
902+ // Select smpl_r
903+ smpl_r_t * smpl_r = NULL ;
904+ if (line_type & VCF_SNP )
905+ {
906+ smpl_r = args -> smpl_r_snps ;
907+ }
908+ else if (line_type & VCF_INDEL )
909+ {
910+ smpl_r = args -> smpl_r_indels ;
911+ }
873912 for (is = 0 ; is < files -> n_smpl ; is ++ )
874913 {
875914 // Simplified comparison: only 0/0, 0/1, 1/1 is looked at as the identity of
@@ -903,7 +942,18 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
903942 smpl_stats [is ].mm [idx ]++ ;
904943 }
905944
945+ // Now do it across samples
946+
947+ if (smpl_r ) {
948+ smpl_r [is ].xy += dsg0 * dsg1 ;
949+ smpl_r [is ].x += dsg0 ;
950+ smpl_r [is ].x2 += dsg0 * dsg0 ;
951+ smpl_r [is ].y += dsg1 ;
952+ smpl_r [is ].y2 += dsg1 * dsg1 ;
953+ ++ (smpl_r [is ].n );
954+ }
906955 }
956+
907957 if ( r2n )
908958 {
909959 x /= r2n ; y /= r2n ; x2 /= r2n ; y2 /= r2n ; xy /= r2n ;
@@ -1229,23 +1279,43 @@ static void print_stats(args_t *args)
12291279 for (x = 0 ; x < 2 ; x ++ )
12301280 {
12311281 gtcmp_t * stats ;
1282+ smpl_r_t * smpl_r_array ;
12321283 if ( x == 0 )
12331284 {
1234- printf ("# GCcS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n" );
1285+ printf ("# GCcS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\ n" );
12351286 stats = args -> smpl_gts_snps ;
1287+ smpl_r_array = args -> smpl_r_snps ;
12361288 }
12371289 else
12381290 {
1239- printf ("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n" );
1291+ printf ("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\ n" );
12401292 stats = args -> smpl_gts_indels ;
1293+ smpl_r_array = args -> smpl_r_indels ;
12411294 }
12421295 for (i = 0 ; i < args -> files -> n_smpl ; i ++ )
12431296 {
12441297 uint64_t m = stats [i ].m [T2S (GT_HET_RA )] + stats [i ].m [T2S (GT_HOM_AA )];
12451298 uint64_t mm = stats [i ].mm [T2S (GT_HOM_RR )] + stats [i ].mm [T2S (GT_HET_RA )] + stats [i ].mm [T2S (GT_HOM_AA )];
1299+ // Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar
1300+ smpl_r_t * smpl_r = smpl_r_array + i ;
1301+ double r = NAN ;
1302+ if (smpl_r -> n ) {
1303+ double sum_crossprod = smpl_r -> xy - (smpl_r -> x * smpl_r -> y )/smpl_r -> n ;//per 17.3 machine formula
1304+ double x2_xx = smpl_r -> x2 - (smpl_r -> x * smpl_r -> x )/smpl_r -> n ;
1305+ double y2_yy = smpl_r -> y2 - (smpl_r -> y * smpl_r -> y )/smpl_r -> n ;
1306+ r = (sum_crossprod )/sqrt (x2_xx * y2_yy );
1307+ }
12461308 printf ("GC%cS\t2\t%s\t%.3f" , x == 0 ? 's' : 'i' , args -> files -> samples [i ], m + mm ? mm * 100.0 /(m + mm ) : 0 );
12471309 printf ("\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "" , stats [i ].m [T2S (GT_HOM_RR )],stats [i ].m [T2S (GT_HET_RA )],stats [i ].m [T2S (GT_HOM_AA )]);
1248- printf ("\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n" , stats [i ].mm [T2S (GT_HOM_RR )],stats [i ].mm [T2S (GT_HET_RA )],stats [i ].mm [T2S (GT_HOM_AA )]);
1310+ printf ("\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "" , stats [i ].mm [T2S (GT_HOM_RR )],stats [i ].mm [T2S (GT_HET_RA )],stats [i ].mm [T2S (GT_HOM_AA )]);
1311+ if (isnan (r ))
1312+ {
1313+ printf ("\t" NA_STRING "\n" );
1314+ }
1315+ else
1316+ {
1317+ printf ("\t%f\n" , r * r );
1318+ }
12491319 }
12501320 }
12511321 }
0 commit comments