""""
Option Parser for Black Scholes data for S&P500 companies
Author: Juan Diego Herrera
"""
# Set up arguments
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--batches", type=int, default=5, help="number of batches to extract data from")
parser.add_argument("--bs", type=int, default=10, help="number of companies whose data will be extracted in a batch")
parser.add_argument("--rf", type=float, default=0.0088, help="current risk-free rate")
parser.add_argument("--wait", type=float, default=5, help="time (seconds) to wait between page requests")
parser.add_argument("--waitb", type=float, default=100, help="time (seconds) to wait between batches to avoid server denial")
parser.add_argument("--verbose", type=int, default=1, help="flag (1/0) to print progress")
parser.add_argument("--startIdx", type=int, default=0, help="company index to start scraping from")
args = parser.parse_args()
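# Example invocation (illustrative only; the flag values below are arbitrary):
#   python option_scraperBS.py --batches 2 --bs 5 --rf 0.01 --wait 5 --waitb 120 --startIdx 0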
# Imports
from bs4 import BeautifulSoup
import time
import requests
import pandas as pd
from os import path

# User-Agent header to avoid Yahoo's automation (bot) detection
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                         "AppleWebKit/605.1.15 (KHTML, like Gecko) "
                         "Version/15.4 Safari/605.1.15"}
def getTickers():
    """Returns the tickers for all the S&P 500 companies using the Wikipedia page

    Outputs:
        tickers - list of tickers for every company in the S&P 500
    """
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find("table")  # tickers are contained in the first table
    tickers = []
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        if cols:
            tickers.append(cols[0].text.strip())
    return tickers
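# Illustrative use of getTickers (the slice is only an example):
#   tickers = getTickers()
#   print(tickers[:5])  # first five S&P 500 tickers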
def getStockVol(ticker):
    """Returns a stock's 30-day implied volatility from AlphaQuery

    Inputs:
        ticker - a string representing a stock's ticker
    Outputs:
        volatility - 30-day mean implied volatility for the stock
    """
    url = "https://www.alphaquery.com/stock/" + ticker + "/volatility-option-statistics/30-day/iv-mean"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find("table")
    rows = table.find_all('tr')
    volatility = float(rows[5].find_all('td')[1].text.strip())  # specific AlphaQuery table entry containing the IV
    return volatility
def getStockData(ticker):
    """Returns a stock's price, dividend yield, and implied volatility

    Inputs:
        ticker - a string representing a stock's ticker
    Outputs:
        stock_price - stock's price (ask)
        div_yield - stock's dividend yield (as a decimal)
        volatility - stock's 30-day implied volatility
    """
    url = "https://finance.yahoo.com/quote/" + ticker  # change url based on ticker
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all("table")  # summary tables on the Yahoo Finance quote page

    # Get the ask price from the 4th row of the first table, e.g. "123.45 x 800"
    rows = tables[0].find_all('tr')
    stock_price = rows[3].find_all('td')[1].text.strip()
    x = stock_price.find('x')  # strip the lot size that follows the price
    stock_price = float(stock_price[0:x].replace(",", ""))  # remove commas to cast to float

    # Get the dividend yield from the 6th row of the second table, e.g. "2.08 (1.23%)"
    rows = tables[1].find_all('tr')
    div_yield = rows[5].find_all('td')[1].text.strip()
    x = div_yield.find('(')
    if "N" not in div_yield[x+1:-2]:  # only set the dividend if it is not 'N/A'
        div_yield = float(div_yield[x+1:-2]) / 100
    else:
        div_yield = 0

    # Get volatility
    ticker = ticker.replace("-", ".")  # BRK.B exception: AlphaQuery uses the dotted ticker
    volatility = getStockVol(ticker)
    return stock_price, div_yield, volatility
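# Illustrative use of getStockData (the ticker is a hypothetical example):
#   price, q, sigma = getStockData("AAPL")  # ask price, dividend yield (decimal), 30-day IV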
def getDates(url):
    """Returns all valid option expiration dates for a given Yahoo Finance options url

    Inputs:
        url - Yahoo Finance options url
    Outputs:
        dates - list of expiration dates (UNIX timestamps) for the underlying ticker
    """
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    dates = []
    selector = soup.find("select")  # the expiration-date dropdown
    for item in list(selector.children):
        dates.append(int(item['value']))
    return dates
def getOptionData(url):
    """Returns a list of strike and call prices for a given Yahoo Finance option url

    Inputs:
        url - string representing a Yahoo Finance option url
    Outputs:
        strikes - list of all strike prices
        prices - list of all call prices
    """
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find("table")
    if table is None:  # avoid crashes on an empty options page
        return None, None
    strikes = []
    prices = []
    for row in table.find_all('tr'):  # iterate through every table entry
        cols = row.find_all('td')
        if cols:
            strikes.append(float(cols[2].text.strip().replace(",", "")))  # 3rd column is the strike; remove commas to cast
            # Use the ask, bid, or last price depending on availability
            ask = cols[5].text.strip().replace(",", "")
            bid = cols[4].text.strip().replace(",", "")
            last = cols[3].text.strip().replace(",", "")
            ask = float(ask) if ask != "-" else 0
            bid = float(bid) if bid != "-" else 0
            last = float(last) if last != "-" else 0
            if ask != 0:
                prices.append(ask)
            elif bid != 0:
                prices.append(bid)
            else:
                prices.append(last)
    return strikes, prices
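# Illustrative use of getOptionData (the url shape matches the scraper below; the date value is hypothetical):
#   strikes, prices = getOptionData("https://finance.yahoo.com/quote/AAPL/options?date=1700000000")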
def scrapeData(startIndex, bs, rf, wait, verbose=True):
    """Writes the option values for a batch of stock tickers into a csv.
    During execution it may print urls and company tickers.

    Inputs:
        startIndex - company index to start scraping from (0 - 499)
        bs - integer representing the batch size to scrape
        rf - risk-free rate
        wait - wait period (seconds) between page requests
        verbose - boolean that determines whether progress is printed
    """
    if startIndex < 0 or startIndex > 499:
        raise Exception("Invalid start index!")
    cols = ['Stock Price', 'Strike Price', 'Maturity', 'Dividends', 'Volatility', 'Risk-free', 'Call Price']
    results = pd.DataFrame(columns=cols)
    RISK_FREE = rf
    tickers = getTickers()  # list of company tickers
    unixToday = int(time.time())  # today's date, used to calculate maturities
    frames = []  # list of dataframes - concatenated into a single df and exported to csv

    # Iterate through every ticker in the batch
    for i, ticker in enumerate(tickers[startIndex:startIndex + bs]):
        ticker = ticker.replace(".", "-")  # BRK.B exception: Yahoo uses the dashed ticker
        frame = pd.DataFrame(columns=cols)  # fresh frame
        if verbose:
            print(ticker, (i + startIndex))
        # Get stock data
        stock_price, div_yield, volatility = getStockData(ticker)
        # Start option extraction
        url = "https://finance.yahoo.com/quote/" + ticker + "/options"
        if verbose:
            print(url)
        dates = getDates(url)
        time.sleep(wait)
        # First entry receives special treatment in case maturity is today
        strikes, prices = getOptionData(url)
        if strikes is None:  # skip this ticker if its options page is empty
            continue
        maturity = (dates[0] - unixToday) / (60 * 60 * 24 * 365.25)  # convert UNIX time difference to a fraction of a year
        if maturity <= 0:
            maturity = 1e-5  # trivial maturity for options that expire today
        # Insert data into a dataframe
        frame['Strike Price'] = strikes
        frame['Call Price'] = prices
        frame['Stock Price'] = stock_price
        frame['Dividends'] = div_yield
        frame['Volatility'] = volatility
        frame['Risk-free'] = RISK_FREE
        frame['Maturity'] = maturity
        frames.append(frame)  # first expiration added to the list of frames
        # Loop through the rest of the option expirations
        for date in dates[1:]:
            frame = pd.DataFrame(columns=cols)
            url = "https://finance.yahoo.com/quote/" + ticker + "/options?date=" + str(date)  # url for a specific expiration
            if verbose:
                print(url)
            maturity = (date - unixToday) / (60 * 60 * 24 * 365.25)  # convert UNIX time difference to a fraction of a year
            strikes, call_prices = getOptionData(url)
            time.sleep(wait)
            # Add data to the dataframe
            frame['Strike Price'] = strikes
            frame['Call Price'] = call_prices
            frame['Stock Price'] = stock_price
            frame['Dividends'] = div_yield
            frame['Volatility'] = volatility
            frame['Risk-free'] = RISK_FREE
            frame['Maturity'] = maturity
            frames.append(frame)
        frames.append(results)
        results = pd.concat(frames)
        frames = []
        if verbose:
            print()
            print('----------------------------------------------------')
            print()
    # End of batch: append the accumulated results to the csv
    results.to_csv('SNP.csv', mode='a', index=False, header=False)
# Main code
cols = ['Stock Price', 'Strike Price', 'Maturity', 'Dividends', 'Volatility', 'Risk-free', 'Call Price']
results = pd.DataFrame(columns=cols)
if not path.exists('SNP.csv'):  # only create a new file (with the header row) if it does not exist
    results.to_csv('SNP.csv', mode='a', index=False)

num_batches = args.batches
bs = args.bs
rf = args.rf
wait_period = args.wait
verbose = args.verbose
startIdx = args.startIdx
waitB = args.waitb

for i in range(num_batches):
    if (startIdx + (i * bs)) < (499 - bs):  # only scrape if the batch will not run past the ticker list
        scrapeData(startIdx + i * bs, bs, rf, wait_period, verbose)
        if verbose:
            print("Waiting to avoid server denial")
            print()
        time.sleep(waitB)
    else:
        break
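

# The columns written to SNP.csv are exactly the inputs of the Black-Scholes call price
# with a continuous dividend yield. The sketch below is illustrative only: it is not
# called anywhere in this script, and the function name bs_call_price is an arbitrary
# choice made here for the example.
from math import log, sqrt, exp, erf

def bs_call_price(S, K, T, q, sigma, r):
    """Black-Scholes price of a European call with continuous dividend yield q."""
    N = lambda x: 0.5 * (1 + erf(x / sqrt(2)))  # standard normal CDF
    d1 = (log(S / K) + (r - q + 0.5 * sigma ** 2) * T) / (sigma * sqrt(T))
    d2 = d1 - sigma * sqrt(T)
    return S * exp(-q * T) * N(d1) - K * exp(-r * T) * N(d2)

# Example of consuming the scraped data (column names match the header written above):
#   df = pd.read_csv('SNP.csv')
#   model = bs_call_price(df['Stock Price'][0], df['Strike Price'][0], df['Maturity'][0],
#                         df['Dividends'][0], df['Volatility'][0], df['Risk-free'][0])
#   # model can then be compared against the scraped df['Call Price'][0]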