forked from MariaNattestad/pca-on-genotypes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_to_matrix.py
51 lines (42 loc) · 1.42 KB
/
vcf_to_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from pysam import VariantFile
import numpy as np
from sklearn import decomposition
import pandas as pd
vcf_filename = "ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
panel_filename = "phase1_integrated_calls.20101123.ALL.panel"
genotypes = []
samples = []
variant_ids = []
with VariantFile(vcf_filename) as vcf_reader:
counter = 0
for record in vcf_reader:
counter += 1
if counter % 100 == 0:
alleles = [record.samples[x].allele_indices for x in record.samples]
samples = [sample for sample in record.samples]
genotypes.append(alleles)
variant_ids.append(record.id)
if counter % 4943 == 0:
print(counter)
print(f'{round(100 * counter / 494328)}%')
# if counter >= 10000:
# break
with open(panel_filename) as panel_file:
labels = {} # {sample_id: population_code}
for line in panel_file:
line = line.strip().split('\t')
labels[line[0]] = line[1]
print(variant_ids)
genotypes = np.array(genotypes)
print(genotypes.shape)
matrix = np.count_nonzero(genotypes, axis=2)
matrix = matrix.T
print(matrix.shape)
pca = decomposition.PCA(n_components=2)
pca.fit(matrix)
print(pca.singular_values_)
to_plot = pca.transform(matrix)
print(to_plot.shape)
df = pd.DataFrame(matrix, columns=variant_ids, index=samples)
df['Population code'] = df.index.map(labels)
df.to_csv("matrix.csv")