From 42522693fcbea2e7dc7a5c1270d7bdab8710a50e Mon Sep 17 00:00:00 2001 From: Bryce McWilliams Date: Fri, 10 Jul 2020 19:29:23 +0200 Subject: [PATCH] feat: add nlp 101 --- nlp_101.ipynb | 1085 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1085 insertions(+) create mode 100644 nlp_101.ipynb diff --git a/nlp_101.ipynb b/nlp_101.ipynb new file mode 100644 index 0000000..fc61bf3 --- /dev/null +++ b/nlp_101.ipynb @@ -0,0 +1,1085 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "nlp_101.ipynb", + "provenance": [], + "authorship_tag": "ABX9TyNjj+II+0ZLdlUhZ9ycFP4B", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5dmft22JszFn", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "80354877-3829-4aff-d362-b3e5102eacc4" + }, + "source": [ + "import nltk\n", + "#nltk.download('movie_reviews')\n", + "from nltk.corpus import movie_reviews\n", + "movie_reviews.words()" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 2 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4-B-SrRtrB75", + "colab_type": "text" + }, + "source": [ + "#Data Preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qlJ1EJewrFx1", + "colab_type": "text" + }, + "source": [ + "## 1. 
Tokenization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z0kVzWJmraHy", + "colab_type": "text" + }, + "source": [ + "###1.1 Word" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "KsGROjs7qLOE", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + }, + "outputId": "aed49aac-8936-47c9-b1b4-3afba2a360e5" + }, + "source": [ + "from nltk.tokenize import word_tokenize\n", + "\n", + "#nltk.download('punkt')\n", + "\n", + "data = \"I like skewl\"\n", + "\n", + "tokenized_text=word_tokenize(data)\n", + "print(tokenized_text)\n", + "print(type(tokenized_text))" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['I', 'like', 'skewl']\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xozestRYrNIO", + "colab_type": "text" + }, + "source": [ + "\n", + "###1.2 Sentence" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "H6td1iLuqfon", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 71 + }, + "outputId": "bc299b80-5eb5-4dd7-a00d-acd46c6f2669" + }, + "source": [ + "from nltk.tokenize import sent_tokenize\n", + "\n", + "paragraph=\"\"\"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, and pies.The most commonly used cake ingredients include flour, sugar, eggs, butter or oil or margarine, a liquid, and leavening agents, such as baking soda or baking powder. 
Common additional ingredients and flavourings include dried, candied, or fresh fruit, nuts, cocoa, and extracts such as vanilla, with numerous substitutions for the primary ingredients.Cakes can also be filled with fruit preserves, nuts or dessert sauces (like pastry cream), iced with buttercream or other icings, and decorated with marzipan, piped borders, or candied fruit.\"\"\"\n", + "\n", + "tokenized_paragraph=sent_tokenize(paragraph)\n", + "print(tokenized_paragraph)\n", + "print(type(tokenized_paragraph))" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, and pies.The most commonly used cake ingredients include flour, sugar, eggs, butter or oil or margarine, a liquid, and leavening agents, such as baking soda or baking powder.', 'Common additional ingredients and flavourings include dried, candied, or fresh fruit, nuts, cocoa, and extracts such as vanilla, with numerous substitutions for the primary ingredients.Cakes can also be filled with fruit preserves, nuts or dessert sauces (like pastry cream), iced with buttercream or other icings, and decorated with marzipan, piped borders, or candied fruit.']\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4gjpctQrkaY", + "colab_type": "text" + }, + "source": [ + "##2. 
Punctuation Removal" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "X6NE31DXqs6o", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "7abe3695-4c28-4213-9ebb-1a703910afa5" + }, + "source": [ + "from nltk.tokenize import RegexpTokenizer\n", + "\n", + "tokenizer = RegexpTokenizer(r'\\w+')\n", + "\n", + "result = tokenizer.tokenize(\"Wow! I am excited to learn science\")\n", + "print(result)" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['Wow', 'I', 'am', 'excited', 'to', 'learn', 'science']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "40A-9-eKruLk", + "colab_type": "text" + }, + "source": [ + "##3. Stop Words Removal" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YfLgTlJ2q53Q", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 71 + }, + "outputId": "6274393a-9a44-4e76-b8cd-e1e51821a821" + }, + "source": [ + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "#nltk.download('stopwords')\n", + "\n", + "to_be_removed = set(stopwords.words('english'))\n", + "\n", + "para=\"\"\"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.\n", + "In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations \n", + "that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, \n", + "and pies.\"\"\"\n", + "\n", + "tokenized_para=word_tokenize(para)\n", + "print(tokenized_para)\n", + "\n", + "modified_token_list=[word for word in tokenized_para if not word in to_be_removed]\n", + "print(modified_token_list)" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['Cake', 'is', 'a', 'form', 'of', 'sweet', 'food', 'made', 'from', 'flour', 
',', 'sugar', ',', 'and', 'other', 'ingredients', ',', 'that', 'is', 'usually', 'baked', '.', 'In', 'their', 'oldest', 'forms', ',', 'cakes', 'were', 'modifications', 'of', 'bread', ',', 'but', 'cakes', 'now', 'cover', 'a', 'wide', 'range', 'of', 'preparations', 'that', 'can', 'be', 'simple', 'or', 'elaborate', ',', 'and', 'that', 'share', 'features', 'with', 'other', 'desserts', 'such', 'as', 'pastries', ',', 'meringues', ',', 'custards', ',', 'and', 'pies', '.']\n", + "['Cake', 'form', 'sweet', 'food', 'made', 'flour', ',', 'sugar', ',', 'ingredients', ',', 'usually', 'baked', '.', 'In', 'oldest', 'forms', ',', 'cakes', 'modifications', 'bread', ',', 'cakes', 'cover', 'wide', 'range', 'preparations', 'simple', 'elaborate', ',', 'share', 'features', 'desserts', 'pastries', ',', 'meringues', ',', 'custards', ',', 'pies', '.']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GCrBnfsIsLKU", + "colab_type": "text" + }, + "source": [ + "##4. Stemming" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b1qksx6esSL6", + "colab_type": "text" + }, + "source": [ + "###4.1 Porter Stemmer" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5z-Y8O1jr0Bm", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "outputId": "f3fe6773-276d-4ae1-dcf4-06c794efc619" + }, + "source": [ + "from nltk.stem import PorterStemmer\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "stemmer = PorterStemmer()\n", + "\n", + "content = \"\"\"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations \n", + "that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, and pies.\"\"\"\n", + "\n", + "tk_content=word_tokenize(content)\n", + "\n", + "stemmed_words = 
[stemmer.stem(i) for i in tk_content] \n", + "print(stemmed_words)" + ], + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['cake', 'is', 'a', 'form', 'of', 'sweet', 'food', 'made', 'from', 'flour', ',', 'sugar', ',', 'and', 'other', 'ingredi', ',', 'that', 'is', 'usual', 'baked.in', 'their', 'oldest', 'form', ',', 'cake', 'were', 'modif', 'of', 'bread', ',', 'but', 'cake', 'now', 'cover', 'a', 'wide', 'rang', 'of', 'prepar', 'that', 'can', 'be', 'simpl', 'or', 'elabor', ',', 'and', 'that', 'share', 'featur', 'with', 'other', 'dessert', 'such', 'as', 'pastri', ',', 'meringu', ',', 'custard', ',', 'and', 'pie', '.']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AUK64-yVseOW", + "colab_type": "text" + }, + "source": [ + "###4.2 Lancaster Stemmer" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EaJJzeJisW8R", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "outputId": "6c3c2064-3944-4e6f-ace9-5b35a89f7c9c" + }, + "source": [ + "from nltk.stem import LancasterStemmer\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "# Instantiate the Lancaster stemmer imported above (section 4.1 already covers Porter).\n", + "stemmer = LancasterStemmer()\n", + "\n", + "content = \"\"\"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.\n", + "In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations \n", + "that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, \n", + "and pies.\"\"\"\n", + "\n", + "tk_content=word_tokenize(content)\n", + "\n", + "stemmed_words = [stemmer.stem(i) for i in tk_content]\n", + "print(stemmed_words)" + ], + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['cake', 'is', 'a', 'form', 'of', 'sweet', 'food', 'made', 'from', 'flour', ',', 'sugar', ',', 'and', 'other', 'ingredi', ',', 'that', 'is', 'usual', 
'bake', '.', 'In', 'their', 'oldest', 'form', ',', 'cake', 'were', 'modif', 'of', 'bread', ',', 'but', 'cake', 'now', 'cover', 'a', 'wide', 'rang', 'of', 'prepar', 'that', 'can', 'be', 'simpl', 'or', 'elabor', ',', 'and', 'that', 'share', 'featur', 'with', 'other', 'dessert', 'such', 'as', 'pastri', ',', 'meringu', ',', 'custard', ',', 'and', 'pie', '.']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pkt2nvyLswWD", + "colab_type": "text" + }, + "source": [ + "##5. Lemmatization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s3Rfm6W7tBw8", + "colab_type": "text" + }, + "source": [ + "###5.1 WordNet Lemmatizer" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-awZ_d49sk7U", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 88 + }, + "outputId": "a95d4c6c-2ee7-4bf7-fce6-48597e9ed2ac" + }, + "source": [ + "from nltk.stem import WordNetLemmatizer\n", + "\n", + "nltk.download('wordnet')\n", + "\n", + "lemmatizer=WordNetLemmatizer()\n", + "\n", + "content = \"\"\"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.\n", + "In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations \n", + "that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, \n", + "and pies.\"\"\"\n", + "\n", + "tk_content=word_tokenize(content)\n", + "\n", + "lemmatized_words = [lemmatizer.lemmatize(i) for i in tk_content] \n", + "print(lemmatized_words)" + ], + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n", + "['Cake', 'is', 'a', 'form', 'of', 'sweet', 'food', 'made', 'from', 'flour', ',', 'sugar', ',', 'and', 'other', 'ingredient', ',', 'that', 'is', 'usually', 'baked', 
'.', 'In', 'their', 'oldest', 'form', ',', 'cake', 'were', 'modification', 'of', 'bread', ',', 'but', 'cake', 'now', 'cover', 'a', 'wide', 'range', 'of', 'preparation', 'that', 'can', 'be', 'simple', 'or', 'elaborate', ',', 'and', 'that', 'share', 'feature', 'with', 'other', 'dessert', 'such', 'a', 'pastry', ',', 'meringue', ',', 'custard', ',', 'and', 'pie', '.']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6t864u2wtHlS", + "colab_type": "text" + }, + "source": [ + "## 6. POS Tagging" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2dXnFzZ0s8AN", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "outputId": "48e664ae-2e28-4220-9069-fd8855569cb1" + }, + "source": [ + "from nltk.tokenize import sent_tokenize, word_tokenize\n", + "\n", + "#nltk.download('averaged_perceptron_tagger')\n", + "#nltk.download('universal_tagset')\n", + "\n", + "content = \"\"\"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.\n", + "In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations \n", + "that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, \n", + "and pies.\"\"\"\n", + "\n", + "words = [word_tokenize(i) for i in sent_tokenize(content)]\n", + "\n", + "pos_tag = [nltk.pos_tag(i,tagset=\"universal\") for i in words]\n", + "print(pos_tag)" + ], + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[[('Cake', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('form', 'NOUN'), ('of', 'ADP'), ('sweet', 'ADJ'), ('food', 'NOUN'), ('made', 'VERB'), ('from', 'ADP'), ('flour', 'NOUN'), (',', '.'), ('sugar', 'NOUN'), (',', '.'), ('and', 'CONJ'), ('other', 'ADJ'), ('ingredients', 'NOUN'), (',', '.'), ('that', 'DET'), ('is', 'VERB'), ('usually', 'ADV'), ('baked', 'VERB'), ('.', '.')], [('In', 'ADP'), 
('their', 'PRON'), ('oldest', 'ADJ'), ('forms', 'NOUN'), (',', '.'), ('cakes', 'NOUN'), ('were', 'VERB'), ('modifications', 'NOUN'), ('of', 'ADP'), ('bread', 'NOUN'), (',', '.'), ('but', 'CONJ'), ('cakes', 'NOUN'), ('now', 'ADV'), ('cover', 'VERB'), ('a', 'DET'), ('wide', 'ADJ'), ('range', 'NOUN'), ('of', 'ADP'), ('preparations', 'NOUN'), ('that', 'DET'), ('can', 'VERB'), ('be', 'VERB'), ('simple', 'ADJ'), ('or', 'CONJ'), ('elaborate', 'ADJ'), (',', '.'), ('and', 'CONJ'), ('that', 'ADP'), ('share', 'NOUN'), ('features', 'NOUN'), ('with', 'ADP'), ('other', 'ADJ'), ('desserts', 'NOUN'), ('such', 'ADJ'), ('as', 'ADP'), ('pastries', 'NOUN'), (',', '.'), ('meringues', 'NOUN'), (',', '.'), ('custards', 'NOUN'), (',', '.'), ('and', 'CONJ'), ('pies', 'NOUN'), ('.', '.')]]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C6AYSxBQtj4Y", + "colab_type": "text" + }, + "source": [ + "##7. Chunking" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "JjTVvvoDtN3n", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 374 + }, + "outputId": "99d48872-13b9-415d-87c8-47b835e20e12" + }, + "source": [ + "from nltk.tokenize import word_tokenize\n", + "\n", + "content = \"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.\"\n", + "\n", + "tokenized_text = nltk.word_tokenize(content)\n", + "tagged_token = nltk.pos_tag(tokenized_text)\n", + "\n",
+ "# NP chunk = optional determiner, any number of adjectives, then one singular noun (NN).\n", + "grammar = \"NP: {<DT>?<JJ>*<NN>}\"\n", + "phrases = nltk.RegexpParser(grammar)\n", + "\n", + "result = phrases.parse(tagged_token)\n", + "print(result)\n", + "\n", + "#result.draw()" + ], + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(S\n", + " Cake/NNP\n", + " is/VBZ\n", + " (NP a/DT form/NN)\n", + " of/IN\n", + " (NP sweet/JJ food/NN)\n", + " made/VBN\n", + " from/IN\n", + " (NP flour/NN)\n", + " ,/,\n", + " (NP sugar/NN)\n", + " ,/,\n", + " and/CC\n", + " other/JJ\n", + " ingredients/NNS\n", + " ,/,\n", + " that/DT\n", + " is/VBZ\n", + " usually/RB\n", + " baked/VBN\n", + " ./.)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jhae39lQuZCy", + "colab_type": "text" + }, + "source": [ + "##8. Bag of Words" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "mJ1HKNB1tqL1", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 131 + }, + "outputId": "9dacc291-4676-4226-99cd-ba23070c42ca" + }, + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "import pandas as pd\n", + "\n", + "content = \"\"\"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.\n", + "In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, and pies.\"\"\"\n", + "\n", + "count_vectorizer = CountVectorizer()\n", + "\n", + "bag_of_words = count_vectorizer.fit_transform(content.splitlines())\n", + "\n", + "pd.DataFrame(bag_of_words.toarray(), columns = count_vectorizer.get_feature_names())" + ], + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
andasbakedbebreadbutcakecakescancovercustardsdessertselaboratefeaturesflourfoodformformsfrominingredientsismademeringuesmodificationsnowofoldestorotherpastriespiespreparationsrangesharesimplesuchsugarsweetthattheirusuallywerewidewith
0101000100000001110101210001001000000011101000
1210111021111110001010001112111111111100210111
\n", + "
" + ], + "text/plain": [ + " and as baked be bread but ... that their usually were wide with\n", + "0 1 0 1 0 0 0 ... 1 0 1 0 0 0\n", + "1 2 1 0 1 1 1 ... 2 1 0 1 1 1\n", + "\n", + "[2 rows x 45 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 22 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t-PUyYohulkQ", + "colab_type": "text" + }, + "source": [ + "###8.1 Synonyms using wordnet" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "c9O3VvmLueas", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85 + }, + "outputId": "bf680489-b5f6-4c18-8374-a700fc13a31f" + }, + "source": [ + "from nltk.corpus import wordnet\n", + "\n", + "syns = wordnet.synsets(\"dog\") \n", + " \n", + "print(syns[0].name()) \n", + " \n", + "print(syns[0].lemmas()[0].name()) \n", + " \n", + "print(syns[0].definition()) \n", + " \n", + "print(syns[0].examples())" + ], + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "text": [ + "dog.n.01\n", + "dog\n", + "a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds\n", + "['the dog barked all night']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nLdgG6MjuyUJ", + "colab_type": "text" + }, + "source": [ + "###8.2 Frequency distribution of words" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "okhy1hfMurbI", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 337 + }, + "outputId": "3f94e773-f3a2-4963-9af6-262f4a6edf2f" + }, + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "content = \"\"\"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.\n", + "In their oldest forms, cakes were modifications of bread\"\"\"\n", + "\n", + "words = nltk.tokenize.word_tokenize(content)\n", + "\n", + "fd 
= nltk.FreqDist(words)\n", + "fd.plot()" + ], + "execution_count": 24, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N3BmcD2Uu85e", + "colab_type": "text" + }, + "source": [ + "##9. Word Embeddings" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "kbEUVpk3u30F", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 564 + }, + "outputId": "a4ad4e2e-c6d9-419e-ec4d-6af060008841" + }, + "source": [ + "from gensim.models import Word2Vec\n", + "\n", + "# define training data\n", + "content=\"\"\"Cake is a form of sweet food made from flour, sugar, and other ingredients, that is usually baked.\n", + "In their oldest forms, cakes were modifications of bread, but cakes now cover a wide range of preparations that can be simple or elaborate, and that share features with other desserts such as pastries, meringues, custards, and pies.\"\"\"\n", + "sentences=nltk.sent_tokenize(content)\n", + "words=[]\n", + "\n", + "for i in sentences:\n", + " words.append(nltk.word_tokenize(i))\n", + "\n", + "# train model\n", + "model = Word2Vec(words, min_count=1)\n", + "\n", + "# summarize the loaded model\n", + "print(model)\n", + "\n", + "# summarize vocabulary\n", + "word_vec_words = list(model.wv.vocab)\n", + "print(word_vec_words)\n", + "\n", + "# access vector for one word\n", + "# (index through model.wv: model[...] is deprecated and removed in gensim 4.0)\n", + "print(model.wv['sugar'])\n", + "\n", + "# save model\n", + "model.save('model.bin')\n", + "\n", + "# load model\n", + "new_model = Word2Vec.load('model.bin')\n", + "print(new_model)" + ], + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Word2Vec(vocab=48, size=100, alpha=0.025)\n", + "['Cake', 'is', 'a', 'form', 'of', 'sweet', 'food', 'made', 'from', 'flour', ',', 'sugar', 'and', 'other', 'ingredients', 'that', 'usually', 'baked', '.', 'In', 'their', 'oldest', 'forms', 'cakes', 'were', 'modifications', 'bread', 'but', 'now', 'cover', 'wide', 'range', 'preparations', 'can', 'be', 'simple', 'or', 
'elaborate', 'share', 'features', 'with', 'desserts', 'such', 'as', 'pastries', 'meringues', 'custards', 'pies']\n", + "[ 1.2900397e-03 -4.4050827e-03 -3.3263196e-03 -2.0251162e-03\n", + " -1.9256959e-03 4.5178290e-03 4.6570050e-03 2.8711003e-03\n", + " -4.2531281e-03 -3.7845614e-04 3.8574554e-03 -1.9891351e-03\n", + " 4.5824298e-03 1.5097363e-03 3.1842457e-03 3.1077143e-04\n", + " -4.2730705e-03 1.6763955e-03 -5.4607273e-04 -4.6840194e-04\n", + " -1.2934895e-03 4.8788548e-03 3.9951564e-03 1.8809187e-03\n", + " -4.5051551e-04 -3.6797088e-03 -2.6806856e-03 -4.9093249e-03\n", + " 1.0661986e-05 -2.5036826e-03 3.6932496e-04 -2.4800377e-03\n", + " -4.7568125e-03 -4.7006006e-03 4.6129087e-03 6.1959872e-04\n", + " 3.6717989e-03 -2.4429176e-03 -1.2363732e-04 -3.5046753e-03\n", + " -2.0565446e-03 -2.3353908e-03 -2.7658308e-03 -1.8586651e-03\n", + " -3.7130108e-03 -6.0016935e-04 -2.9897159e-03 -3.7880535e-03\n", + " -3.9280793e-03 3.9332923e-03 1.6729704e-03 -4.9394034e-03\n", + " 4.6572643e-03 -4.3637827e-03 -9.3136783e-05 -2.9492262e-03\n", + " -1.9744544e-03 -3.6059276e-03 -3.9423611e-03 -2.9691169e-03\n", + " -1.7650127e-04 -3.2259724e-03 9.9363422e-04 1.0665000e-03\n", + " -2.2204674e-03 3.3678003e-03 4.4750911e-03 2.2151966e-03\n", + " 5.6144886e-04 4.4316403e-03 -4.0480960e-03 2.9111751e-03\n", + " -4.8157568e-03 -3.9936928e-03 1.0673276e-04 -4.9511571e-03\n", + " 3.2988950e-03 3.1355142e-03 3.5804932e-03 -2.7380732e-03\n", + " -3.0919246e-03 -6.9842924e-04 3.2996046e-03 1.4009699e-04\n", + " -1.5377058e-03 1.6681730e-03 -2.1100568e-03 4.0061162e-03\n", + " 7.1020133e-04 -4.9544768e-05 -4.6256362e-03 -1.6403218e-03\n", + " 4.4054287e-03 -4.3784659e-03 2.6041116e-03 -3.6177207e-03\n", + " 2.0688606e-04 -1.4998632e-03 9.1215217e-04 -3.8086215e-03]\n", + "Word2Vec(vocab=48, size=100, alpha=0.025)\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:23: DeprecationWarning: Call to 
deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n", + "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:254: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", + " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GE_C9QAZvbHt", + "colab_type": "text" + }, + "source": [ + "##10. Complete example" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LnILqHx1vCqQ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 459 + }, + "outputId": "d921c903-56dd-4655-98d3-cc12a5d85f96" + }, + "source": [ + "import numpy as np\n", + "from keras.utils import to_categorical\n", + "from keras import models\n", + "from keras import layers\n", + "from keras.datasets import imdb\n", + " \n", + "(train_data, train_target), (test_data, test_target) = imdb.load_data(num_words=10000)\n", + "\n", + "dt = np.concatenate((train_data, test_data), axis=0)\n", + "tar = np.concatenate((train_target, test_target), axis=0)\n", + " \n", + "def convert(sequences, dimension = 10000):\n", + " results = np.zeros((len(sequences), dimension))\n", + " for i, sequence in enumerate(sequences):\n", + " results[i, sequence] = 1\n", + " return results\n", + " \n", + "dt = convert(dt)\n", + "tar = np.array(tar).astype(\"float32\")\n", + "\n", + "test_x = dt[:9000]\n", + "test_y = tar[:9000]\n", + "train_x = dt[9000:]\n", + "train_y = tar[9000:]\n", + "\n", + "model = models.Sequential()\n", + "# Input - Layer\n", + "model.add(layers.Dense(50, activation = \"relu\", input_shape=(10000, )))\n", + "# Hidden - Layers\n", + "model.add(layers.Dropout(0.4, noise_shape=None, seed=None))\n", + "model.add(layers.Dense(50, activation = 
\"relu\"))\n", + "model.add(layers.Dropout(0.3, noise_shape=None, seed=None))\n", + "model.add(layers.Dense(50, activation = \"relu\"))\n", + "# Output- Layer\n", + "model.add(layers.Dense(1, activation = \"sigmoid\"))\n", + "\n", + "model.summary()\n", + "\n", + "# compiling the model\n", + "model.compile(\n", + " optimizer = \"adam\",\n", + " loss = \"binary_crossentropy\",\n", + " metrics = [\"accuracy\"]\n", + ")\n", + "\n", + "results = model.fit(\n", + " train_x,\n", + " train_y,\n", + " epochs= 2,\n", + " batch_size = 500,\n", + " validation_data = (test_x, test_y)\n", + ")\n", + "\n", + "print(\"Test-Accuracy:\", np.mean(results.history[\"val_accuracy\"]))" + ], + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Model: \"sequential_2\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "dense_5 (Dense) (None, 50) 500050 \n", + "_________________________________________________________________\n", + "dropout_3 (Dropout) (None, 50) 0 \n", + "_________________________________________________________________\n", + "dense_6 (Dense) (None, 50) 2550 \n", + "_________________________________________________________________\n", + "dropout_4 (Dropout) (None, 50) 0 \n", + "_________________________________________________________________\n", + "dense_7 (Dense) (None, 50) 2550 \n", + "_________________________________________________________________\n", + "dense_8 (Dense) (None, 1) 51 \n", + "=================================================================\n", + "Total params: 505,201\n", + "Trainable params: 505,201\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "Train on 41000 samples, validate on 9000 samples\n", + "Epoch 1/2\n", + "41000/41000 [==============================] - 5s 126us/step - loss: 0.4352 - accuracy: 0.7917 - 
val_loss: 0.2614 - val_accuracy: 0.8930\n", + "Epoch 2/2\n", + "41000/41000 [==============================] - 5s 122us/step - loss: 0.2279 - accuracy: 0.9121 - val_loss: 0.2599 - val_accuracy: 0.8968\n", + "Test-Accuracy: 0.8948888778686523\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "KErz2zNGvhNV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": 26, + "outputs": [] + } + ] +} \ No newline at end of file