forked from Nadolina/Rapid-curation-2.0
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchromosome_assignment.py
121 lines (98 loc) · 3.29 KB
/
chromosome_assignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
## For the modification of scaffold names to reflect chromosomal assignment.
import csv
import sys
import re
from Bio import SeqIO, Seq
from Bio.SeqRecord import SeqRecord
#FUNCTIONS ----
def sex_chr_asn(sex_chr,chr_name):
if 'unloc' not in record.id:
if sex_chr!=record.id:
exit
else:
inter_chr_dict[record.id]=((record.id).replace(sex_chr,chr_name))
record.id=((record.id).replace(sex_chr,chr_name))
if 'unloc' in record.id:
if re.sub("_unloc_\d+$","",record.id) == sex_chr:
inter_chr_dict[record.id]=((record.id)).replace(sex_chr,chr_name)
record.id=((record.id).replace(sex_chr,chr_name))
else:
exit
#MAIN ----
outdir=sys.argv[1]
hap_agp=outdir + "/hap.unlocs.no_hapdups.agp"
hap_sort=outdir + "/hap.sorted.fa"
agp_lines=[]
with open(hap_agp) as file:
agp = csv.reader(file,delimiter='\t')
for line in agp:
agp_lines.append(line)
file.close()
x=0
unlocs_haps={}
chr_list=[]
X_chr=""
Y_chr=""
W_chr=""
Z_chr=""
sex_chr=[]
while x < (len(agp_lines)):
line = agp_lines[x]
if "#" in line[0]:
x+=1
continue
elif line[9]=="Painted" or line[8]=="proximity_ligation":
if line[10]=="X":
X_chr=line[0]
sex_chr.append(line[0])
elif line[10]=="Y":
Y_chr=line[0]
sex_chr.append(line[0])
elif line[10]=="W":
W_chr=line[0]
sex_chr.append(line[0])
elif line[10]=="Z":
Z_chr=line[0]
sex_chr.append(line[0])
elif line[10]=="Unloc":
orig_name=re.sub('_unloc_[0-9]+$','',line[0])
unlocs_haps[line[0]]=orig_name
else:
chr_list.append(line[0])
x+=1
chr_list_filter = set([chr for chr in chr_list if chr != X_chr and chr != Y_chr and chr != W_chr and chr != Z_chr])
scaff_num=1
new_records=[]
inter_chr_dict={}
with open(hap_sort) as original:
records = SeqIO.parse(original, 'fasta')
for record in records:
if record.id in chr_list_filter:
inter_chr_dict[record.id]=("SUPER_"+str(scaff_num))
record.id=("SUPER_"+str(scaff_num))
scaff_num += 1
elif X_chr in record.id and X_chr != "":
sex_chr_asn(X_chr,"SUPER_X")
elif Y_chr in record.id and Y_chr != "":
sex_chr_asn(Y_chr,"SUPER_Y")
elif W_chr in record.id and W_chr !="":
sex_chr_asn(W_chr,"SUPER_W")
elif Z_chr in record.id and Z_chr !="":
sex_chr_asn(Z_chr,"SUPER_Z")
elif record.id in unlocs_haps:
orig_name=unlocs_haps[record.id]
super_name=inter_chr_dict[orig_name]
inter_chr_dict[record.id]=re.sub(orig_name,super_name,record.id)
record.id=(re.sub(orig_name,super_name,record.id))
new_records.append(SeqRecord(record.seq,id=record.id, description=""))
print (inter_chr_dict)
with open((outdir+"/inter_chr.tsv"),'w') as file:
for key in inter_chr_dict.keys():
file.write("%s\t%s\n"%(key,inter_chr_dict[key]))
file.close()
# with open((outdir+"/chromosomes.csv")) as file2:
# for value in inter_chr_dict.values():
# print (value)
handle=open((outdir+"/hap.chr_level.fa"),"w")
SeqIO.write(new_records,handle,"fasta")
handle.close()