Skip to content

Commit bf57e04

Browse files
committed
ready for December NLP Live Training
1 parent 2b939a9 commit bf57e04

6 files changed

+182
-115
lines changed

notebooks/experimental/octopus-v1.0.ipynb

+104-89
Large diffs are not rendered by default.

notebooks/live_training/natural_language_preprocessing_best_practices_LT.ipynb

+78-26
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@
2626
{
2727
"cell_type": "code",
2828
"execution_count": null,
29-
"metadata": {},
29+
"metadata": {
30+
"collapsed": true
31+
},
3032
"outputs": [],
3133
"source": [
3234
"# the initial block is copied from creating_word_vectors_with_word2vec.ipynb\n",
@@ -44,7 +46,9 @@
4446
{
4547
"cell_type": "code",
4648
"execution_count": null,
47-
"metadata": {},
49+
"metadata": {
50+
"collapsed": true
51+
},
4852
"outputs": [],
4953
"source": [
5054
"nltk.download('punkt')"
@@ -69,7 +73,9 @@
6973
{
7074
"cell_type": "code",
7175
"execution_count": null,
72-
"metadata": {},
76+
"metadata": {
77+
"collapsed": true
78+
},
7379
"outputs": [],
7480
"source": [
7581
"nltk.download('stopwords')"
@@ -85,7 +91,9 @@
8591
{
8692
"cell_type": "code",
8793
"execution_count": null,
88-
"metadata": {},
94+
"metadata": {
95+
"collapsed": true
96+
},
8997
"outputs": [],
9098
"source": [
9199
"nltk.download('gutenberg')"
@@ -130,7 +138,9 @@
130138
{
131139
"cell_type": "code",
132140
"execution_count": null,
133-
"metadata": {},
141+
"metadata": {
142+
"collapsed": true
143+
},
134144
"outputs": [],
135145
"source": [
136146
"gberg_sents[4]"
@@ -146,7 +156,9 @@
146156
{
147157
"cell_type": "code",
148158
"execution_count": null,
149-
"metadata": {},
159+
"metadata": {
160+
"collapsed": true
161+
},
150162
"outputs": [],
151163
"source": [
152164
"# CODE HERE"
@@ -173,7 +185,9 @@
173185
{
174186
"cell_type": "code",
175187
"execution_count": null,
176-
"metadata": {},
188+
"metadata": {
189+
"collapsed": true
190+
},
177191
"outputs": [],
178192
"source": [
179193
"stpwrds"
@@ -182,7 +196,9 @@
182196
{
183197
"cell_type": "code",
184198
"execution_count": null,
185-
"metadata": {},
199+
"metadata": {
200+
"collapsed": true
201+
},
186202
"outputs": [],
187203
"source": [
188204
"# CODE HERE"
@@ -209,7 +225,9 @@
209225
{
210226
"cell_type": "code",
211227
"execution_count": null,
212-
"metadata": {},
228+
"metadata": {
229+
"collapsed": true
230+
},
213231
"outputs": [],
214232
"source": [
215233
"# CODE HERE"
@@ -247,7 +265,9 @@
247265
{
248266
"cell_type": "code",
249267
"execution_count": null,
250-
"metadata": {},
268+
"metadata": {
269+
"collapsed": true
270+
},
251271
"outputs": [],
252272
"source": [
253273
"bigram.phrasegrams # output count and score of each bigram"
@@ -256,7 +276,9 @@
256276
{
257277
"cell_type": "code",
258278
"execution_count": null,
259-
"metadata": {},
279+
"metadata": {
280+
"collapsed": true
281+
},
260282
"outputs": [],
261283
"source": [
262284
"\"Jon lives in New York City\".split()"
@@ -265,7 +287,9 @@
265287
{
266288
"cell_type": "code",
267289
"execution_count": null,
268-
"metadata": {},
290+
"metadata": {
291+
"collapsed": true
292+
},
269293
"outputs": [],
270294
"source": [
271295
"# CODE HERE"
@@ -294,7 +318,9 @@
294318
{
295319
"cell_type": "code",
296320
"execution_count": null,
297-
"metadata": {},
321+
"metadata": {
322+
"collapsed": true
323+
},
298324
"outputs": [],
299325
"source": [
300326
"lower_sents[0:5]"
@@ -314,7 +340,9 @@
314340
{
315341
"cell_type": "code",
316342
"execution_count": null,
317-
"metadata": {},
343+
"metadata": {
344+
"collapsed": true
345+
},
318346
"outputs": [],
319347
"source": [
320348
"lower_bigram.phrasegrams # miss taylor, mr woodhouse, mr weston"
@@ -323,7 +351,9 @@
323351
{
324352
"cell_type": "code",
325353
"execution_count": null,
326-
"metadata": {},
354+
"metadata": {
355+
"collapsed": true
356+
},
327357
"outputs": [],
328358
"source": [
329359
"lower_bigram[\"jon lives in new york city\".split()]"
@@ -332,7 +362,9 @@
332362
{
333363
"cell_type": "code",
334364
"execution_count": null,
335-
"metadata": {},
365+
"metadata": {
366+
"collapsed": true
367+
},
336368
"outputs": [],
337369
"source": [
338370
"lower_bigram = Phraser(Phrases(lower_sents, min_count=32, threshold=64))\n",
@@ -358,7 +390,9 @@
358390
{
359391
"cell_type": "code",
360392
"execution_count": null,
361-
"metadata": {},
393+
"metadata": {
394+
"collapsed": true
395+
},
362396
"outputs": [],
363397
"source": [
364398
"clean_sents[0:9]"
@@ -367,7 +401,9 @@
367401
{
368402
"cell_type": "code",
369403
"execution_count": null,
370-
"metadata": {},
404+
"metadata": {
405+
"collapsed": true
406+
},
371407
"outputs": [],
372408
"source": [
373409
"clean_sents[6] # could consider removing stop words or common words"
@@ -415,7 +451,9 @@
415451
{
416452
"cell_type": "code",
417453
"execution_count": null,
418-
"metadata": {},
454+
"metadata": {
455+
"collapsed": true
456+
},
419457
"outputs": [],
420458
"source": [
421459
"len(model.wv.vocab) # down from 17k in previous notebook"
@@ -424,7 +462,9 @@
424462
{
425463
"cell_type": "code",
426464
"execution_count": null,
427-
"metadata": {},
465+
"metadata": {
466+
"collapsed": true
467+
},
428468
"outputs": [],
429469
"source": [
430470
"model['ma_am']"
@@ -433,7 +473,9 @@
433473
{
434474
"cell_type": "code",
435475
"execution_count": null,
436-
"metadata": {},
476+
"metadata": {
477+
"collapsed": true
478+
},
437479
"outputs": [],
438480
"source": [
439481
"model.most_similar('ma_am') "
@@ -442,7 +484,9 @@
442484
{
443485
"cell_type": "code",
444486
"execution_count": null,
445-
"metadata": {},
487+
"metadata": {
488+
"collapsed": true
489+
},
446490
"outputs": [],
447491
"source": [
448492
"model.most_similar(positive=['father', 'woman'], negative=['man']) "
@@ -492,7 +536,9 @@
492536
{
493537
"cell_type": "code",
494538
"execution_count": null,
495-
"metadata": {},
539+
"metadata": {
540+
"collapsed": true
541+
},
496542
"outputs": [],
497543
"source": [
498544
"# coords_df.head()"
@@ -530,7 +576,9 @@
530576
{
531577
"cell_type": "code",
532578
"execution_count": null,
533-
"metadata": {},
579+
"metadata": {
580+
"collapsed": true
581+
},
534582
"outputs": [],
535583
"source": [
536584
"_ = coords_df.plot.scatter('x', 'y', figsize=(12,12), marker='.', s=10, alpha=0.2)"
@@ -539,7 +587,9 @@
539587
{
540588
"cell_type": "code",
541589
"execution_count": null,
542-
"metadata": {},
590+
"metadata": {
591+
"collapsed": true
592+
},
543593
"outputs": [],
544594
"source": [
545595
"output_notebook()"
@@ -571,7 +621,9 @@
571621
{
572622
"cell_type": "code",
573623
"execution_count": null,
574-
"metadata": {},
624+
"metadata": {
625+
"collapsed": true
626+
},
575627
"outputs": [],
576628
"source": [
577629
"show(p)"

0 commit comments

Comments
 (0)