-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathregpot_single_gene_hg38_mm10.py
262 lines (192 loc) · 7.92 KB
/
regpot_single_gene_hg38_mm10.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#!/usr/bin/env python
import argparse
import itertools
import sqlite3
from pybedtools import BedTool
import pandas as pd
# essential data locations
gene_database = 'regpot_single_gene.db'
hg38_remap2022 = 'remap2022/remap2022_nr_macs2_hg38_v1_0.bed'
mm10_remap2022 = 'remap2022/remap2022_nr_macs2_mm10_v1_0.bed'
def get_gene_info(gene_symbol, species, conn):
c = conn.cursor()
# Determine the table name
table_name = species + '_ncbiRefSeq'
# Query the database
c.execute(f"SELECT chrom, strand, txStart, txEnd, name2 FROM {table_name} WHERE name2=?", (gene_symbol,))
# Fetch the results
results = c.fetchone()
# Calculate TSS
chrom, strand, txStart, txEnd, name2 = results
TSS = txStart if strand == '+' else txEnd
# Return the results
return chrom, strand, txStart, txEnd, name2, TSS, species
def drop_table(table_name, conn):
# Connect to the SQLite database
cursor = conn.cursor()
# Execute the DROP TABLE statement
cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
# Commit the changes and close the connection
conn.commit()
#conn.close()
def create_and_insert_into_gene_info(gene_info, conn):
# Create a new table
c = conn.cursor()
c.execute("""
CREATE TABLE IF NOT EXISTS gene_info (
chrom TEXT,
strand TEXT,
txStart INTEGER,
txEnd INTEGER,
name2 TEXT,
TSS INTEGER,
species TEXT
)
""")
# Insert the gene info into the table
c.execute("""
INSERT INTO gene_info (chrom, strand, txStart, txEnd, name2, TSS, species)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", gene_info)
# Commit the changes
conn.commit()
def get_overlapping_regions(gene_name, half_decay, conn):
# 1. Extract the TSS and species of the gene from the gene_info table
c = conn.cursor()
c.execute("SELECT chrom, strand, TSS, species FROM gene_info WHERE name2=?", (gene_name,))
chrom, strand, TSS, species = c.fetchone()
# 2. Create a region from the TSS by extending it to the left and right with the half_decay distance
half_decay_distance = int(half_decay[:-2]) * 1000 # Convert from kb to base pairs
region_start = TSS - half_decay_distance
region_end = TSS + half_decay_distance
# Create a region in the specified format
region = {'chrom': chrom, 'strand': strand, 'start': region_start, 'end': region_end, 'TSS': TSS}
# 3. Determine the bed file to use based on the species
bed_file = hg38_remap2022 if species == 'hg38' else mm10_remap2022
# Create a BedTool object for the region and the bed file
region_bed = BedTool(f"{region['chrom']} {region['start']} {region['end']}", from_string=True)
bed = BedTool(bed_file)
# Get the overlapping regions
overlapping_regions = bed.intersect(region_bed)
return overlapping_regions
def create_table_from_overlapping_regions(overlapping_regions, species, conn):
# Create a new table
c = conn.cursor()
c.execute("""
CREATE TABLE IF NOT EXISTS overlapping_regions (
chrom TEXT,
start INTEGER,
end INTEGER,
name TEXT,
peak INTEGER,
TF TEXT,
species TEXT
)
""")
# Parse the overlapping_regions and insert the data into the table
for region in overlapping_regions:
chrom = region.chrom
start = region.start
end = region.end
name = region.name
peak = region[6] # Access the thickStart field
TF = name.split(':')[0]
c.execute("""
INSERT INTO overlapping_regions (chrom, start, end, name, peak, TF, species)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (chrom, start, end, name, peak, TF, species))
# Commit the changes
conn.commit()
def list_TFs(table_name, species, conn):
# Connect to the SQLite database
cursor = conn.cursor()
# Execute the query
cursor.execute(f"SELECT DISTINCT TF FROM {table_name} WHERE species = '{species}'")
# Fetch the results
results = cursor.fetchall()
# Extract the TFs from the results and return them
TFs = [row[0] for row in results]
return TFs
def get_peak_TSS_distances(species, TF, conn):
# Get a cursor
c = conn.cursor()
# Perform the JOIN operation and calculate the distances
c.execute("""
SELECT ABS(overlapping_regions.peak - gene_info.TSS) AS distance
FROM overlapping_regions
JOIN gene_info ON overlapping_regions.chrom = gene_info.chrom AND overlapping_regions.species = gene_info.species
WHERE overlapping_regions.species = ? AND overlapping_regions.TF = ?
""", (species, TF))
# Fetch all the distances
distances = [row[0] for row in c.fetchall()]
return distances
def calculate_fraction(distances):
# Check if distances is empty
if not distances:
return 0
small_distances = [d for d in distances if d <= 1000]
# Calculate the fraction
fraction = len(small_distances) / len(distances)
return fraction
def calculate_rp_score(d0, distances):
rp_score = 0
for distance in distances:
if distance / d0 > 100: # Adjust the threshold as needed
weight = 0
else:
weight = 1 / (2 ** (distance / d0))
rp_score += weight
return rp_score
def calculate_regulatory_potential(TF, species, conn):
# Get the distances
distances = get_peak_TSS_distances(species, TF, conn)
# Calculate the fraction
fraction = calculate_fraction(distances)
# Set d0 based on the fraction
d0 = 1000 if fraction > 0.2 else 10000
# Calculate the regulatory potential score
rp_score = calculate_rp_score(d0, distances)
return fraction, rp_score
def create_rp_score_dataframe(TFs, gene_symbol, species, conn):
# Initialize a list to store the data
data = []
# Calculate the rp_score for each TF and species
for TF in TFs:
rp_score = calculate_regulatory_potential(TF, species, conn)[1]
#data.append([gene_symbol, TF, rp_score, species])
data.append([TF, gene_symbol, species, rp_score])
# Create a DataFrame from the data
df = pd.DataFrame(data, columns=['TF', 'Gene', 'species', 'RP_score'])
return df
def main(gene_symbol, species, half_decay):
print(f"Gene symbol: {gene_symbol}")
print(f"Species: {species}")
print(f"Half-decay distance: {half_decay}")
# Connect to the SQLite database
conn = sqlite3.connect(gene_database)
gene_info = get_gene_info(gene_symbol, species, conn)
print(gene_info)
# remove any existing gene_info table
drop_table('gene_info', conn)
# Create the gene_info table and insert the gene info
create_and_insert_into_gene_info(gene_info, conn)
overlapping_regions = get_overlapping_regions(gene_symbol, half_decay, conn)
# remove any existing overlapping_regions table
drop_table('overlapping_regions', conn)
create_table_from_overlapping_regions(overlapping_regions, species, conn)
TFs = list_TFs('overlapping_regions', species, conn)
print(f"Transcription factors: {TFs}")
rp_score_all_TF_df = create_rp_score_dataframe(TFs, gene_symbol, species, conn)
#print(rp_score_all_TF_df)
with open('rp_score_all_TF.csv', 'a') as f:
rp_score_all_TF_df.to_csv(f, header=False, index=False)
# Close the connection
conn.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Process a single gene - human or mouse.",
usage="python regpot_gene_hg38_mm10.py <gene_symbol> <species> <half_decay>")
parser.add_argument("gene_symbol", help="The gene symbol")
parser.add_argument("species", help="species of the gene, either 'hg38' or 'mm10'")
parser.add_argument("half_decay", choices=["1kb", "10kb", "100kb"], help="The half-decay distance")
args = parser.parse_args()
main(args.gene_symbol, args.species, args.half_decay)