Skip to content

Commit 52eaeb4

Browse files
authored
Merge pull request #208 from janevin/main
Clean up Spark Dataset API and make the notebook work successfully.
2 parents 1ba71cf + 6dc41b6 commit 52eaeb4

File tree

1 file changed

+177
-49
lines changed

1 file changed

+177
-49
lines changed

bootcamp/materials/3-spark-fundamentals/notebooks/DatasetApi.ipynb

Lines changed: 177 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
{
1919
"data": {
2020
"text/plain": [
21-
"Spark Web UI available at http://0a64d2ba5c88:4042\n",
22-
"SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1733519375641)\n",
21+
"Spark Web UI available at http://120fa5ac9a2d:4041\n",
22+
"SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1734316680820)\n",
2323
"SparkSession available as 'spark'\n"
2424
]
2525
},
@@ -29,7 +29,7 @@
2929
{
3030
"data": {
3131
"text/plain": [
32-
"res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@591302c9\n"
32+
"res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@442b0b59\n"
3333
]
3434
},
3535
"execution_count": 1,
@@ -42,11 +42,9 @@
4242
]
4343
},
4444
{
45-
"cell_type": "code",
46-
"execution_count": null,
47-
"id": "d8d4270e-b96d-4437-808a-994b0bb996b5",
45+
"cell_type": "markdown",
46+
"id": "2295da8f-3623-43ba-af46-6fe19ec9ffca",
4847
"metadata": {},
49-
"outputs": [],
5048
"source": [
5149
"# If something is nullabe, you need to wrap the value type in Option[] - this helps enforce assumptions about the pipeline"
5250
]
@@ -58,15 +56,16 @@
5856
"metadata": {},
5957
"outputs": [
6058
{
61-
"ename": "<console>",
62-
"evalue": "80: error: illegal start of simple expression",
63-
"output_type": "error",
64-
"traceback": [
65-
"<console>:80: error: illegal start of simple expression",
66-
" .map( case (row: EventWithDeviceInfo) => {",
67-
" ^",
68-
""
69-
]
59+
"data": {
60+
"text/plain": [
61+
"import org.apache.spark.sql.SparkSession\n",
62+
"sparkSession: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@442b0b59\n",
63+
"defined class Event\n"
64+
]
65+
},
66+
"execution_count": 2,
67+
"metadata": {},
68+
"output_type": "execute_result"
7069
}
7170
],
7271
"source": [
@@ -84,12 +83,54 @@
8483
" url: String,\n",
8584
" event_time: String\n",
8685
")\n",
86+
"\n"
87+
]
88+
},
89+
{
90+
"cell_type": "code",
91+
"execution_count": 3,
92+
"id": "25c3320b-03ab-4969-88a7-f7771534d70c",
93+
"metadata": {},
94+
"outputs": [
95+
{
96+
"data": {
97+
"text/plain": [
98+
"dummyData: List[Event] = List(Event(Some(1),Some(2),Some(linkedin),eczachly.com,/signup,2023-01-01), Event(Some(3),Some(7),Some(twitter),eczachly.com,/signup,2023-01-01))\n"
99+
]
100+
},
101+
"execution_count": 3,
102+
"metadata": {},
103+
"output_type": "execute_result"
104+
}
105+
],
106+
"source": [
87107
"\n",
88-
"\n",
89-
"dummyData = List(\n",
90-
" Event(user_id=1, device_id=2, referrer=\"linkedin\", host=\"eczachly.com\", url=\"/signup\", event_time=\"2023-01-01\"),\n",
91-
" Event(user_id=3, device_id=7, referrer=\"twitter\", host=\"eczachly.com\", url=\"/signup\", event_time=\"2023-01-01\")\n",
108+
"val dummyData = List(\n",
109+
" Event(user_id=Some(1), device_id=Some(2), referrer=Some(\"linkedin\"), host=\"eczachly.com\", url=\"/signup\", event_time=\"2023-01-01\"),\n",
110+
" Event(user_id=Some(3), device_id=Some(7), referrer=Some(\"twitter\"), host=\"eczachly.com\", url=\"/signup\", event_time=\"2023-01-01\")\n",
92111
" )\n",
112+
"\n"
113+
]
114+
},
115+
{
116+
"cell_type": "code",
117+
"execution_count": 4,
118+
"id": "f7190f95-33b1-4779-8f0d-40e9046e8157",
119+
"metadata": {},
120+
"outputs": [
121+
{
122+
"data": {
123+
"text/plain": [
124+
"defined class Device\n",
125+
"defined class EventWithDeviceInfo\n"
126+
]
127+
},
128+
"execution_count": 4,
129+
"metadata": {},
130+
"output_type": "execute_result"
131+
}
132+
],
133+
"source": [
93134
"\n",
94135
"//TODO Illustrate how this fails if you change from Option[Long] to Long\n",
95136
"case class Device (\n",
@@ -109,9 +150,36 @@
109150
" host: String,\n",
110151
" url: String,\n",
111152
" event_time: String\n",
112-
")\n",
153+
")"
154+
]
155+
},
156+
{
157+
"cell_type": "code",
158+
"execution_count": 5,
159+
"id": "d045ae06-0f21-42f1-a271-f455e95b0879",
160+
"metadata": {},
161+
"outputs": [
162+
{
163+
"data": {
164+
"text/plain": [
165+
"import org.apache.spark.sql.Dataset\n",
166+
"import sparkSession.implicits._\n",
167+
"events: org.apache.spark.sql.Dataset[Event] = [user_id: int, device_id: int ... 4 more fields]\n",
168+
"devices: org.apache.spark.sql.Dataset[Device] = [device_id: int, browser_type: string ... 2 more fields]\n",
169+
"filteredViaDataset: org.apache.spark.sql.Dataset[Event] = [user_id: int, device_id: int ... 4 more fields]\n",
170+
"filteredViaDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: int, device_id: int ... 4 more fields]\n",
171+
"filteredViaSparkSql: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 4 more fields]\n"
172+
]
173+
},
174+
"execution_count": 5,
175+
"metadata": {},
176+
"output_type": "execute_result"
177+
}
178+
],
179+
"source": [
180+
"\n",
181+
"import org.apache.spark.sql.Dataset\n",
113182
"\n",
114-
"// When should you use each type?\n",
115183
"import sparkSession.implicits._\n",
116184
"\n",
117185
"// Applying this case class before hand is very powerful, enforces Nullability/non-nullability at runtime!\n",
@@ -132,7 +200,27 @@
132200
"val filteredViaDataset = events.filter(event => event.user_id.isDefined && event.device_id.isDefined)\n",
133201
"val filteredViaDataFrame = events.toDF().where($\"user_id\".isNotNull && $\"device_id\".isNotNull)\n",
134202
"val filteredViaSparkSql = sparkSession.sql(\"SELECT * FROM events WHERE user_id IS NOT NULL AND device_id IS NOT NULL\")\n",
135-
"\n",
203+
"\n"
204+
]
205+
},
206+
{
207+
"cell_type": "code",
208+
"execution_count": 6,
209+
"id": "80517409-a26e-4aa3-b383-56b928f0c9d3",
210+
"metadata": {},
211+
"outputs": [
212+
{
213+
"data": {
214+
"text/plain": [
215+
"combinedViaDatasets: org.apache.spark.sql.Dataset[EventWithDeviceInfo] = [user_id: int, device_id: int ... 7 more fields]\n"
216+
]
217+
},
218+
"execution_count": 6,
219+
"metadata": {},
220+
"output_type": "execute_result"
221+
}
222+
],
223+
"source": [
136224
"\n",
137225
"// This will fail if user_id is None\n",
138226
"val combinedViaDatasets = filteredViaDataset\n",
@@ -143,18 +231,70 @@
143231
" browser_type=device.browser_type,\n",
144232
" os_type=device.os_type,\n",
145233
" device_type=device.device_type,\n",
146-
" referrer=event.referrer,\n",
234+
" referrer=event.referrer.getOrElse(\"unknow\"),\n",
147235
" host=event.host,\n",
148236
" url=event.url,\n",
149237
" event_time=event.event_time\n",
150238
" ) }\n",
151-
" .map( case (row: EventWithDeviceInfo) => {\n",
152-
" row.browser_type = toUpperCase(row.browser_type)\n",
153-
" return row\n",
154-
" })\n",
155-
"\n",
156-
"\n",
157-
"\n",
239+
" .map { eventWithDevice =>\n",
240+
" // Convert browser_type to uppercase while maintaining immutability\n",
241+
" eventWithDevice.copy(browser_type = eventWithDevice.browser_type.toUpperCase)\n",
242+
" }\n"
243+
]
244+
},
245+
{
246+
"cell_type": "code",
247+
"execution_count": 9,
248+
"id": "3ce150e3-e5b7-4ece-8803-8189762625ea",
249+
"metadata": {},
250+
"outputs": [
251+
{
252+
"name": "stdout",
253+
"output_type": "stream",
254+
"text": [
255+
"EventWithDeviceInfo(1037710827,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-03-08 17:27:24.241)\n",
256+
"EventWithDeviceInfo(925588856,532630305,OTHER,Other,Other,unknow,www.eczachly.com,/,2021-05-10 11:26:21.247)\n",
257+
"EventWithDeviceInfo(-1180485268,532630305,OTHER,Other,Other,unknow,admin.zachwilson.tech,/,2021-02-17 16:19:30.738)\n",
258+
"EventWithDeviceInfo(-1044833855,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-09-24 15:53:14.466)\n",
259+
"EventWithDeviceInfo(747494706,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-09-26 16:03:17.535)\n",
260+
"+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n",
261+
"| user_id|device_id|browser_type|os_type|device_type|referrer| host|url| event_time|\n",
262+
"+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n",
263+
"| 1037710827|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-03-08 17:27:...|\n",
264+
"| 925588856|532630305| Other| Other| Other| NULL| www.eczachly.com| /|2021-05-10 11:26:...|\n",
265+
"|-1180485268|532630305| Other| Other| Other| NULL|admin.zachwilson....| /|2021-02-17 16:19:...|\n",
266+
"|-1044833855|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-09-24 15:53:...|\n",
267+
"| 747494706|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-09-26 16:03:...|\n",
268+
"+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n",
269+
"only showing top 5 rows\n",
270+
"\n",
271+
"+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n",
272+
"| user_id|device_id|browser_type|os_type|device_type|referrer| host|url| event_time|\n",
273+
"+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n",
274+
"| 1037710827|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-03-08 17:27:...|\n",
275+
"| 925588856|532630305| Other| Other| Other| NULL| www.eczachly.com| /|2021-05-10 11:26:...|\n",
276+
"|-1180485268|532630305| Other| Other| Other| NULL|admin.zachwilson....| /|2021-02-17 16:19:...|\n",
277+
"|-1044833855|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-09-24 15:53:...|\n",
278+
"| 747494706|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-09-26 16:03:...|\n",
279+
"+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n",
280+
"only showing top 5 rows\n",
281+
"\n"
282+
]
283+
},
284+
{
285+
"data": {
286+
"text/plain": [
287+
"combinedViaDataFrames: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 7 more fields]\n",
288+
"combinedViaSparkSQL: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 7 more fields]\n",
289+
"rows: Array[EventWithDeviceInfo] = Array(EventWithDeviceInfo(1037710827,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-03-08 17:27:24.241), EventWithDeviceInfo(925588856,532630305,OTHER,Other,Other,unknow,www.eczachly.com,/,2021-05-10 11:26:21.247), EventWithDeviceInfo(-1180485268,532630305,OTHER,Other,Other,unknow,admin.zachwilson.tech,/,2021-02-17 16:19:30.738), EventWithDeviceInfo(-1044833855,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-09-24 15:53:14.466), EventWithDeviceInfo(747494706,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-...\n"
290+
]
291+
},
292+
"execution_count": 9,
293+
"metadata": {},
294+
"output_type": "execute_result"
295+
}
296+
],
297+
"source": [
158298
"\n",
159299
"// DataFrames give up some of the intellisense because you no longer have static typing\n",
160300
"val combinedViaDataFrames = filteredViaDataFrame.as(\"e\")\n",
@@ -189,29 +329,17 @@
189329
" JOIN devices d ON fe.device_id = d.device_id\n",
190330
"\"\"\")\n",
191331
"\n",
192-
"combinedViaDatasets.take(5)\n"
332+
"\n",
333+
"val rows= combinedViaDatasets.take(5)\n",
334+
"rows.foreach(println)\n",
335+
"combinedViaDataFrames.show(5)\n",
336+
"combinedViaSparkSQL.show(5)"
193337
]
194338
},
195339
{
196340
"cell_type": "code",
197341
"execution_count": null,
198-
"id": "3ce150e3-e5b7-4ece-8803-8189762625ea",
199-
"metadata": {},
200-
"outputs": [],
201-
"source": []
202-
},
203-
{
204-
"cell_type": "code",
205-
"execution_count": null,
206-
"id": "dd4b33aa-98c4-4f8d-8d38-f5674eb4d8ed",
207-
"metadata": {},
208-
"outputs": [],
209-
"source": []
210-
},
211-
{
212-
"cell_type": "code",
213-
"execution_count": null,
214-
"id": "203ac70d-b0e5-474b-8c82-b1bae2624d51",
342+
"id": "1db45765-b3d1-4059-92f1-e0414f502eb7",
215343
"metadata": {},
216344
"outputs": [],
217345
"source": []

0 commit comments

Comments
 (0)