Commit

Add files via upload
RichardHallgren authored Mar 30, 2020
1 parent f767157 commit 81663be
Showing 6 changed files with 1,027 additions and 0 deletions.
61 changes: 61 additions & 0 deletions Points.csv
@@ -0,0 +1,61 @@
Rank;Points
1;200
2;160
3;140
4;125
5;110
6;95
7;90
8;85
9;80
10;75
11;70
12;65
13;60
14;55
15;50
16;45
17;44
18;43
19;42
20;41
21;40
22;39
23;38
24;37
25;36
26;35
27;34
28;33
29;32
30;31
31;30
32;29
33;28
34;27
35;26
36;25
37;24
38;23
39;22
40;21
41;20
42;19
43;18
44;17
45;16
46;15
47;14
48;13
49;12
50;11
51;10
52;9
53;8
54;7
55;6
56;5
57;4
58;3
59;2
60;1
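
Points.csv maps each finishing rank to the points it awards: a steeper scale through the top sixteen places, then one point less per place from rank 17 (44 points) down to rank 60 (1 point). Below is a minimal sketch of how this table could be joined onto the results written by the notebook further down; the merge and the season-total grouping are assumed usage, not part of this commit.

import pandas as pd

#Points.csv is semicolon-separated with integer Rank and Points columns
points = pd.read_csv('Points.csv', sep=';')

#race_dfs_output.xlsx is written by scrape_web_data() in the notebook below;
#its Rank column is scraped as text, so coerce it to numeric before merging (assumed usage)
results = pd.read_excel('race_dfs_output.xlsx')
results['Rank'] = pd.to_numeric(results['Rank'], errors='coerce')

#Attach points to each result; ranks outside the table score zero
scored = results.merge(points, on='Rank', how='left').fillna({'Points': 0})

#Example aggregation: total points per rider and season
season_totals = scored.groupby(['Year', 'Name'])['Points'].sum().sort_values(ascending=False)
print(season_totals.head(10))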
112 changes: 112 additions & 0 deletions Scraped_web_data_ETL.ipynb
@@ -0,0 +1,112 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#This function scrapes race results data from the web and saves it to an Excel file (race_dfs_output.xlsx).\n",
"\n",
"def scrape_web_data():\n",
"\n",
" import requests\n",
" from bs4 import BeautifulSoup\n",
" import html5lib\n",
" import lxml\n",
" import urllib.request\n",
" from selenium import webdriver\n",
" import time\n",
" from selenium.webdriver.common.by import By\n",
" from selenium.webdriver.support.ui import WebDriverWait\n",
" from selenium.webdriver.support import expected_conditions as EC\n",
"\n",
" #list of urls to scrape\n",
" url_list_df = pd.read_excel('race_list.xlsx')\n",
" url_list = url_list_df['URL'].to_list()\n",
" date_list = url_list_df['Date'].to_list()\n",
" race_dfs = pd.DataFrame()\n",
"\n",
" \n",
" for link, date in zip(url_list, date_list):\n",
" driver = webdriver.Safari()\n",
"\n",
"\n",
" stripped_link = link.replace('worldcup.eliotjackson.com/results/', '')\n",
" split_link = stripped_link.split('/')\n",
" year = split_link[0]\n",
" venue = split_link[1]\n",
" \n",
" url = 'http://' + link\n",
" print(url)\n",
"\n",
" driver.get(url)\n",
" \n",
" #Allow driver to load the entire webpage\n",
" time.sleep(5)\n",
"\n",
"\n",
" res = driver.execute_script(\"return document.documentElement.outerHTML\")\n",
"\n",
" driver.quit()\n",
"\n",
" soup = BeautifulSoup(res, 'lxml')\n",
" data = []\n",
" table = soup.find('div', attrs={'class':'react-bs-container-body'})\n",
" table_body = table.find('tbody')\n",
" \n",
" #Create a table from the web page's source code\n",
" rows = table_body.find_all('tr')\n",
" for row in rows:\n",
" cols = row.find_all('td')\n",
" cols = [ele.text.strip() for ele in cols]\n",
" data.append([ele for ele in cols if ele]) # Get rid of empty values\n",
" \n",
" #Filter out relevant data from table, position and rider name\n",
" newdata = data[0::2]\n",
" pos_list = []\n",
" name_list = []\n",
" for item in newdata:\n",
" pos_list.append(item[0])\n",
" name_list.append(item[2])\n",
"\n",
" race_results = list(zip(pos_list, name_list))\n",
" race_df = pd.DataFrame(race_results, columns=['Rank', 'Name'])\n",
" race_df['Year'] = year\n",
" race_df['Venue'] = venue\n",
" race_df['Date'] = date\n",
" race_df['FirstName'], race_df['LastName'] = race_df['Name'].str.split(' ', 1).str\n",
" race_df['Name'] = race_df.FirstName + ' ' + race_df.LastName.str.upper()\n",
" race_df.drop(['FirstName', 'LastName'], axis=1, inplace=True)\n",
" \n",
" race_dfs = race_dfs.append(race_df, sort=False)\n",
" \n",
" print(race_dfs)\n",
" race_dfs.to_excel(\"race_dfs_output.xlsx\", engine='xlsxwriter')\n",
" return\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
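
For reference, scrape_web_data() as defined above reads a race_list.xlsx from the working directory with a URL column (links of the form worldcup.eliotjackson.com/results/<year>/<venue>) and a Date column, drives Safari through Selenium, and writes the combined table to race_dfs_output.xlsx. A minimal usage sketch under those assumptions; the URL and date below are illustrative placeholders only.

import pandas as pd

#Build a race_list.xlsx in the shape the function expects; the row below is a
#hypothetical placeholder following the URL pattern the function parses
race_list = pd.DataFrame({
    'URL': ['worldcup.eliotjackson.com/results/2019/leogang'],
    'Date': ['2019-06-09'],
})
race_list.to_excel('race_list.xlsx', index=False)

#Requires Safari's WebDriver to be enabled (safaridriver --enable on macOS) and
#pandas, beautifulsoup4, lxml, selenium, and xlsxwriter installed
scrape_web_data()  #writes race_dfs_output.xlsx with Rank, Name, Year, Venue, Date columns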