-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhomology.py
106 lines (98 loc) · 5.3 KB
/
homology.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pandas as pd
import utils
def get_eggnog_groups(filepath, proteins):
    """
    Collect the EggNOG groups that contain at least one of the given proteins.

    The first line of the file is treated as a header and skipped. Every other
    line is decoded as UTF-8 and split on tabs: column 1 is read as the group
    identifier and column 4 as a comma-separated list of member proteins.

    :param str filepath: path to the gzipped EggNOG groups file
    :param list proteins: list of Ensembl protein identifiers
    :return: dictionary with all the valid EggNOG groups.
        Key -> group, value -> list of the requested proteins found in the group
        (the order of this list is not defined)
    """
    # Build the lookup set once, not once per line of the groups file.
    wanted = set(proteins)
    valid_groups = {}

    lines = iter(utils.read_gzipped_file(filepath))
    next(lines, None)  # skip the header line (no-op on an empty file)

    for line in lines:
        data = line.decode("utf-8").rstrip().split('\t')
        group = data[1]
        members = wanted.intersection(data[4].split(','))
        if members:
            valid_groups[group] = list(members)

    return valid_groups
def _with_taxids(group_proteins, hosts):
    """
    Annotate each protein of a group with its taxid and whether that taxid is a host.

    The taxid is the text before the first '.' of the protein identifier.

    :param list group_proteins: protein identifiers of one group
    :param dict hosts: host configuration keyed by integer taxid
    :return: list of (protein, taxid string, is_host) tuples
    """
    annotated = []
    for protein in group_proteins:
        taxid = protein.split('.')[0]
        annotated.append((protein, taxid, int(taxid) in hosts))
    return annotated


def get_links(filepath, valid_groups, proteins, ouput_filepath, config_file, min_score=0.7):
    """
    Obtain the transferred interactions at the EggNOG group level from STRING.

    Every pair of linked groups that are both in ``valid_groups`` and whose
    experimental or databases evidence reaches ``min_score`` is expanded into
    protein pairs. A pair is kept when the two proteins have different taxids,
    one of them belongs to a configured host and the other to a configured
    parasite. The parasite protein is the source and the host protein the target.
    Each unordered protein pair is reported once.

    The resulting table is handed to ``utils.save_to_parquet`` together with
    ``ouput_filepath``. Columns of the table:
    ["taxid1", "taxid1_label", "source_color", "source_shape", "source", "source_name",
     "taxid2", "taxid2_label", "target_color", "target_shape", "target", "target_name",
     "experimental_evidence_score", "databases_evidence_score", "weight",
     "group1", "group2", "edge_type"]

    :param str filepath: path to the gzipped STRING file with the groups links
        (space separated, first line is a header)
    :param dict valid_groups: dictionary with all the valid groups
        (group -> list of proteins)
    :param dict proteins: mapping from ENSP to protein name
    :param str ouput_filepath: path to output file (the misspelled name is kept
        so that existing keyword callers keep working)
    :param str config_file: path to the configuration file
    :param float min_score: minimum experimental or databases evidence
        (on the 0-1 scale) a group link needs to be transferred
    """
    cols = ["taxid1", "taxid1_label", "source_color", "source_shape", "source", "source_name",
            "taxid2", "taxid2_label", "target_color", "target_shape", "target", "target_name",
            "experimental_evidence_score", "databases_evidence_score", "weight",
            "group1", "group2", "edge_type"]
    links = []
    seen = set()

    cog_links = iter(utils.read_gzipped_file(filepath))
    hosts = utils.read_config(filepath=config_file, field='hosts')
    parasites = utils.read_config(filepath=config_file, field='parasites')

    next(cog_links, None)  # skip the header line (no-op on an empty file)
    for line in cog_links:
        data = line.decode("utf-8").rstrip().split(' ')
        group1, group2 = data[0], data[1]
        # Cheap filter first: most lines link groups we are not interested in.
        if group1 not in valid_groups or group2 not in valid_groups:
            continue

        # Columns 6 and 7 are integer scores; divide by 1000 to get the 0-1 scale.
        experimental_evidence = round(int(data[6]) / 1000, 3)
        databases_evidence = round(int(data[7]) / 1000, 3)
        if not (experimental_evidence >= min_score or databases_evidence >= min_score):
            continue
        average_score = round((experimental_evidence + databases_evidence) / 2, 3)

        # Parse taxids and host membership once per member, not once per pair.
        members1 = _with_taxids(valid_groups[group1], hosts)
        members2 = _with_taxids(valid_groups[group2], hosts)

        for protein1, taxid1, is_host1 in members1:
            for protein2, taxid2, is_host2 in members2:
                if taxid1 == taxid2 or not (is_host1 or is_host2):
                    continue
                if (protein1, protein2) in seen:
                    continue

                # The host side is always the target of the link.
                if is_host1:
                    target_taxid, target_group, target_protein = taxid1, group1, protein1
                    source_taxid, source_group, source_protein = taxid2, group2, protein2
                else:
                    target_taxid, target_group, target_protein = taxid2, group2, protein2
                    source_taxid, source_group, source_protein = taxid1, group1, protein1

                # The row needs the parasite label and color of the source, so a
                # pair whose other member is not a configured parasite (e.g. two
                # different hosts) cannot be reported.
                if int(source_taxid) not in parasites:
                    continue

                source_info = parasites[int(source_taxid)]
                target_info = hosts[int(target_taxid)]
                links.append([source_taxid, source_info['label'],
                              source_info['color'], 'diamond', source_protein,
                              proteins[source_protein],
                              target_taxid, target_info['label'],
                              target_info['color'], 'dot', target_protein,
                              proteins[target_protein],
                              str(experimental_evidence), str(databases_evidence), str(average_score),
                              source_group, target_group, "inter-species"])
                # Register both orientations so the pair is reported only once.
                seen.add((protein1, protein2))
                seen.add((protein2, protein1))

    links_df = pd.DataFrame(links, columns=cols)
    utils.save_to_parquet(links_df, ouput_filepath)