|
18 | 18 | {
|
19 | 19 | "data": {
|
20 | 20 | "text/plain": [
|
21 |
| - "Spark Web UI available at http://0a64d2ba5c88:4042\n", |
22 |
| - "SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1733519375641)\n", |
| 21 | + "Spark Web UI available at http://120fa5ac9a2d:4041\n", |
| 22 | + "SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1734316680820)\n", |
23 | 23 | "SparkSession available as 'spark'\n"
|
24 | 24 | ]
|
25 | 25 | },
|
|
29 | 29 | {
|
30 | 30 | "data": {
|
31 | 31 | "text/plain": [
|
32 |
| - "res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@591302c9\n" |
| 32 | + "res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@442b0b59\n" |
33 | 33 | ]
|
34 | 34 | },
|
35 | 35 | "execution_count": 1,
|
|
42 | 42 | ]
|
43 | 43 | },
|
44 | 44 | {
|
45 |
| - "cell_type": "code", |
46 |
| - "execution_count": null, |
47 |
| - "id": "d8d4270e-b96d-4437-808a-994b0bb996b5", |
| 45 | + "cell_type": "markdown", |
| 46 | + "id": "2295da8f-3623-43ba-af46-6fe19ec9ffca", |
48 | 47 | "metadata": {},
|
49 |
| - "outputs": [], |
50 | 48 | "source": [
|
51 | 49 | "# If something is nullabe, you need to wrap the value type in Option[] - this helps enforce assumptions about the pipeline"
|
52 | 50 | ]
|
|
58 | 56 | "metadata": {},
|
59 | 57 | "outputs": [
|
60 | 58 | {
|
61 |
| - "ename": "<console>", |
62 |
| - "evalue": "80: error: illegal start of simple expression", |
63 |
| - "output_type": "error", |
64 |
| - "traceback": [ |
65 |
| - "<console>:80: error: illegal start of simple expression", |
66 |
| - " .map( case (row: EventWithDeviceInfo) => {", |
67 |
| - " ^", |
68 |
| - "" |
69 |
| - ] |
| 59 | + "data": { |
| 60 | + "text/plain": [ |
| 61 | + "import org.apache.spark.sql.SparkSession\n", |
| 62 | + "sparkSession: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@442b0b59\n", |
| 63 | + "defined class Event\n" |
| 64 | + ] |
| 65 | + }, |
| 66 | + "execution_count": 2, |
| 67 | + "metadata": {}, |
| 68 | + "output_type": "execute_result" |
70 | 69 | }
|
71 | 70 | ],
|
72 | 71 | "source": [
|
|
84 | 83 | " url: String,\n",
|
85 | 84 | " event_time: String\n",
|
86 | 85 | ")\n",
|
| 86 | + "\n" |
| 87 | + ] |
| 88 | + }, |
| 89 | + { |
| 90 | + "cell_type": "code", |
| 91 | + "execution_count": 3, |
| 92 | + "id": "25c3320b-03ab-4969-88a7-f7771534d70c", |
| 93 | + "metadata": {}, |
| 94 | + "outputs": [ |
| 95 | + { |
| 96 | + "data": { |
| 97 | + "text/plain": [ |
| 98 | + "dummyData: List[Event] = List(Event(Some(1),Some(2),Some(linkedin),eczachly.com,/signup,2023-01-01), Event(Some(3),Some(7),Some(twitter),eczachly.com,/signup,2023-01-01))\n" |
| 99 | + ] |
| 100 | + }, |
| 101 | + "execution_count": 3, |
| 102 | + "metadata": {}, |
| 103 | + "output_type": "execute_result" |
| 104 | + } |
| 105 | + ], |
| 106 | + "source": [ |
87 | 107 | "\n",
|
88 |
| - "\n", |
89 |
| - "dummyData = List(\n", |
90 |
| - " Event(user_id=1, device_id=2, referrer=\"linkedin\", host=\"eczachly.com\", url=\"/signup\", event_time=\"2023-01-01\"),\n", |
91 |
| - " Event(user_id=3, device_id=7, referrer=\"twitter\", host=\"eczachly.com\", url=\"/signup\", event_time=\"2023-01-01\")\n", |
| 108 | + "val dummyData = List(\n", |
| 109 | + " Event(user_id=Some(1), device_id=Some(2), referrer=Some(\"linkedin\"), host=\"eczachly.com\", url=\"/signup\", event_time=\"2023-01-01\"),\n", |
| 110 | + " Event(user_id=Some(3), device_id=Some(7), referrer=Some(\"twitter\"), host=\"eczachly.com\", url=\"/signup\", event_time=\"2023-01-01\")\n", |
92 | 111 | " )\n",
|
| 112 | + "\n" |
| 113 | + ] |
| 114 | + }, |
| 115 | + { |
| 116 | + "cell_type": "code", |
| 117 | + "execution_count": 4, |
| 118 | + "id": "f7190f95-33b1-4779-8f0d-40e9046e8157", |
| 119 | + "metadata": {}, |
| 120 | + "outputs": [ |
| 121 | + { |
| 122 | + "data": { |
| 123 | + "text/plain": [ |
| 124 | + "defined class Device\n", |
| 125 | + "defined class EventWithDeviceInfo\n" |
| 126 | + ] |
| 127 | + }, |
| 128 | + "execution_count": 4, |
| 129 | + "metadata": {}, |
| 130 | + "output_type": "execute_result" |
| 131 | + } |
| 132 | + ], |
| 133 | + "source": [ |
93 | 134 | "\n",
|
94 | 135 | "//TODO Illustrate how this fails if you change from Option[Long] to Long\n",
|
95 | 136 | "case class Device (\n",
|
|
109 | 150 | " host: String,\n",
|
110 | 151 | " url: String,\n",
|
111 | 152 | " event_time: String\n",
|
112 |
| - ")\n", |
| 153 | + ")" |
| 154 | + ] |
| 155 | + }, |
| 156 | + { |
| 157 | + "cell_type": "code", |
| 158 | + "execution_count": 5, |
| 159 | + "id": "d045ae06-0f21-42f1-a271-f455e95b0879", |
| 160 | + "metadata": {}, |
| 161 | + "outputs": [ |
| 162 | + { |
| 163 | + "data": { |
| 164 | + "text/plain": [ |
| 165 | + "import org.apache.spark.sql.Dataset\n", |
| 166 | + "import sparkSession.implicits._\n", |
| 167 | + "events: org.apache.spark.sql.Dataset[Event] = [user_id: int, device_id: int ... 4 more fields]\n", |
| 168 | + "devices: org.apache.spark.sql.Dataset[Device] = [device_id: int, browser_type: string ... 2 more fields]\n", |
| 169 | + "filteredViaDataset: org.apache.spark.sql.Dataset[Event] = [user_id: int, device_id: int ... 4 more fields]\n", |
| 170 | + "filteredViaDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: int, device_id: int ... 4 more fields]\n", |
| 171 | + "filteredViaSparkSql: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 4 more fields]\n" |
| 172 | + ] |
| 173 | + }, |
| 174 | + "execution_count": 5, |
| 175 | + "metadata": {}, |
| 176 | + "output_type": "execute_result" |
| 177 | + } |
| 178 | + ], |
| 179 | + "source": [ |
| 180 | + "\n", |
| 181 | + "import org.apache.spark.sql.Dataset\n", |
113 | 182 | "\n",
|
114 |
| - "// When should you use each type?\n", |
115 | 183 | "import sparkSession.implicits._\n",
|
116 | 184 | "\n",
|
117 | 185 | "// Applying this case class before hand is very powerful, enforces Nullability/non-nullability at runtime!\n",
|
|
132 | 200 | "val filteredViaDataset = events.filter(event => event.user_id.isDefined && event.device_id.isDefined)\n",
|
133 | 201 | "val filteredViaDataFrame = events.toDF().where($\"user_id\".isNotNull && $\"device_id\".isNotNull)\n",
|
134 | 202 | "val filteredViaSparkSql = sparkSession.sql(\"SELECT * FROM events WHERE user_id IS NOT NULL AND device_id IS NOT NULL\")\n",
|
135 |
| - "\n", |
| 203 | + "\n" |
| 204 | + ] |
| 205 | + }, |
| 206 | + { |
| 207 | + "cell_type": "code", |
| 208 | + "execution_count": 6, |
| 209 | + "id": "80517409-a26e-4aa3-b383-56b928f0c9d3", |
| 210 | + "metadata": {}, |
| 211 | + "outputs": [ |
| 212 | + { |
| 213 | + "data": { |
| 214 | + "text/plain": [ |
| 215 | + "combinedViaDatasets: org.apache.spark.sql.Dataset[EventWithDeviceInfo] = [user_id: int, device_id: int ... 7 more fields]\n" |
| 216 | + ] |
| 217 | + }, |
| 218 | + "execution_count": 6, |
| 219 | + "metadata": {}, |
| 220 | + "output_type": "execute_result" |
| 221 | + } |
| 222 | + ], |
| 223 | + "source": [ |
136 | 224 | "\n",
|
137 | 225 | "// This will fail if user_id is None\n",
|
138 | 226 | "val combinedViaDatasets = filteredViaDataset\n",
|
|
143 | 231 | " browser_type=device.browser_type,\n",
|
144 | 232 | " os_type=device.os_type,\n",
|
145 | 233 | " device_type=device.device_type,\n",
|
146 |
| - " referrer=event.referrer,\n", |
| 234 | + " referrer=event.referrer.getOrElse(\"unknow\"),\n", |
147 | 235 | " host=event.host,\n",
|
148 | 236 | " url=event.url,\n",
|
149 | 237 | " event_time=event.event_time\n",
|
150 | 238 | " ) }\n",
|
151 |
| - " .map( case (row: EventWithDeviceInfo) => {\n", |
152 |
| - " row.browser_type = toUpperCase(row.browser_type)\n", |
153 |
| - " return row\n", |
154 |
| - " })\n", |
155 |
| - "\n", |
156 |
| - "\n", |
157 |
| - "\n", |
| 239 | + " .map { eventWithDevice =>\n", |
| 240 | + " // Convert browser_type to uppercase while maintaining immutability\n", |
| 241 | + " eventWithDevice.copy(browser_type = eventWithDevice.browser_type.toUpperCase)\n", |
| 242 | + " }\n" |
| 243 | + ] |
| 244 | + }, |
| 245 | + { |
| 246 | + "cell_type": "code", |
| 247 | + "execution_count": 9, |
| 248 | + "id": "3ce150e3-e5b7-4ece-8803-8189762625ea", |
| 249 | + "metadata": {}, |
| 250 | + "outputs": [ |
| 251 | + { |
| 252 | + "name": "stdout", |
| 253 | + "output_type": "stream", |
| 254 | + "text": [ |
| 255 | + "EventWithDeviceInfo(1037710827,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-03-08 17:27:24.241)\n", |
| 256 | + "EventWithDeviceInfo(925588856,532630305,OTHER,Other,Other,unknow,www.eczachly.com,/,2021-05-10 11:26:21.247)\n", |
| 257 | + "EventWithDeviceInfo(-1180485268,532630305,OTHER,Other,Other,unknow,admin.zachwilson.tech,/,2021-02-17 16:19:30.738)\n", |
| 258 | + "EventWithDeviceInfo(-1044833855,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-09-24 15:53:14.466)\n", |
| 259 | + "EventWithDeviceInfo(747494706,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-09-26 16:03:17.535)\n", |
| 260 | + "+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n", |
| 261 | + "| user_id|device_id|browser_type|os_type|device_type|referrer| host|url| event_time|\n", |
| 262 | + "+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n", |
| 263 | + "| 1037710827|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-03-08 17:27:...|\n", |
| 264 | + "| 925588856|532630305| Other| Other| Other| NULL| www.eczachly.com| /|2021-05-10 11:26:...|\n", |
| 265 | + "|-1180485268|532630305| Other| Other| Other| NULL|admin.zachwilson....| /|2021-02-17 16:19:...|\n", |
| 266 | + "|-1044833855|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-09-24 15:53:...|\n", |
| 267 | + "| 747494706|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-09-26 16:03:...|\n", |
| 268 | + "+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n", |
| 269 | + "only showing top 5 rows\n", |
| 270 | + "\n", |
| 271 | + "+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n", |
| 272 | + "| user_id|device_id|browser_type|os_type|device_type|referrer| host|url| event_time|\n", |
| 273 | + "+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n", |
| 274 | + "| 1037710827|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-03-08 17:27:...|\n", |
| 275 | + "| 925588856|532630305| Other| Other| Other| NULL| www.eczachly.com| /|2021-05-10 11:26:...|\n", |
| 276 | + "|-1180485268|532630305| Other| Other| Other| NULL|admin.zachwilson....| /|2021-02-17 16:19:...|\n", |
| 277 | + "|-1044833855|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-09-24 15:53:...|\n", |
| 278 | + "| 747494706|532630305| Other| Other| Other| NULL| www.zachwilson.tech| /|2021-09-26 16:03:...|\n", |
| 279 | + "+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+\n", |
| 280 | + "only showing top 5 rows\n", |
| 281 | + "\n" |
| 282 | + ] |
| 283 | + }, |
| 284 | + { |
| 285 | + "data": { |
| 286 | + "text/plain": [ |
| 287 | + "combinedViaDataFrames: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 7 more fields]\n", |
| 288 | + "combinedViaSparkSQL: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 7 more fields]\n", |
| 289 | + "rows: Array[EventWithDeviceInfo] = Array(EventWithDeviceInfo(1037710827,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-03-08 17:27:24.241), EventWithDeviceInfo(925588856,532630305,OTHER,Other,Other,unknow,www.eczachly.com,/,2021-05-10 11:26:21.247), EventWithDeviceInfo(-1180485268,532630305,OTHER,Other,Other,unknow,admin.zachwilson.tech,/,2021-02-17 16:19:30.738), EventWithDeviceInfo(-1044833855,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-09-24 15:53:14.466), EventWithDeviceInfo(747494706,532630305,OTHER,Other,Other,unknow,www.zachwilson.tech,/,2021-...\n" |
| 290 | + ] |
| 291 | + }, |
| 292 | + "execution_count": 9, |
| 293 | + "metadata": {}, |
| 294 | + "output_type": "execute_result" |
| 295 | + } |
| 296 | + ], |
| 297 | + "source": [ |
158 | 298 | "\n",
|
159 | 299 | "// DataFrames give up some of the intellisense because you no longer have static typing\n",
|
160 | 300 | "val combinedViaDataFrames = filteredViaDataFrame.as(\"e\")\n",
|
|
189 | 329 | " JOIN devices d ON fe.device_id = d.device_id\n",
|
190 | 330 | "\"\"\")\n",
|
191 | 331 | "\n",
|
192 |
| - "combinedViaDatasets.take(5)\n" |
| 332 | + "\n", |
| 333 | + "val rows= combinedViaDatasets.take(5)\n", |
| 334 | + "rows.foreach(println)\n", |
| 335 | + "combinedViaDataFrames.show(5)\n", |
| 336 | + "combinedViaSparkSQL.show(5)" |
193 | 337 | ]
|
194 | 338 | },
|
195 | 339 | {
|
196 | 340 | "cell_type": "code",
|
197 | 341 | "execution_count": null,
|
198 |
| - "id": "3ce150e3-e5b7-4ece-8803-8189762625ea", |
199 |
| - "metadata": {}, |
200 |
| - "outputs": [], |
201 |
| - "source": [] |
202 |
| - }, |
203 |
| - { |
204 |
| - "cell_type": "code", |
205 |
| - "execution_count": null, |
206 |
| - "id": "dd4b33aa-98c4-4f8d-8d38-f5674eb4d8ed", |
207 |
| - "metadata": {}, |
208 |
| - "outputs": [], |
209 |
| - "source": [] |
210 |
| - }, |
211 |
| - { |
212 |
| - "cell_type": "code", |
213 |
| - "execution_count": null, |
214 |
| - "id": "203ac70d-b0e5-474b-8c82-b1bae2624d51", |
| 342 | + "id": "1db45765-b3d1-4059-92f1-e0414f502eb7", |
215 | 343 | "metadata": {},
|
216 | 344 | "outputs": [],
|
217 | 345 | "source": []
|
|
0 commit comments