1
+ {
2
+ "nbformat" : 4 ,
3
+ "nbformat_minor" : 0 ,
4
+ "metadata" : {
5
+ "colab" : {
6
+ "name" : " Bag_Of_Words.ipynb" ,
7
+ "provenance" : [],
8
+ "authorship_tag" : " ABX9TyO632z+jWs35D5lpZDncmwt" ,
9
+ "include_colab_link" : true
10
+ },
11
+ "kernelspec" : {
12
+ "name" : " python3" ,
13
+ "display_name" : " Python 3"
14
+ },
15
+ "language_info" : {
16
+ "name" : " python"
17
+ }
18
+ },
19
+ "cells" : [
20
+ {
21
+ "cell_type" : " markdown" ,
22
+ "metadata" : {
23
+ "id" : " view-in-github" ,
24
+ "colab_type" : " text"
25
+ },
26
+ "source" : [
27
+ " <a href=\" https://colab.research.google.com/github/DataMinati/NLP-Legion/blob/main/Bag_Of_Words.ipynb\" target=\" _parent\" ><img src=\" https://colab.research.google.com/assets/colab-badge.svg\" alt=\" Open In Colab\" /></a>"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type" : " code" ,
32
+ "metadata" : {
33
+ "colab" : {
34
+ "base_uri" : " https://localhost:8080/"
35
+ },
36
+ "id" : " MwGTmNkIl88E" ,
37
+ "outputId" : " 16ef351d-1cfb-4e6c-f1a1-69f7d86452e7"
38
+ },
39
+ "source" : [
40
+ " # Fetch the NLTK data packages (tokenizer model, stop-word list, WordNet) up front.\n " ,
41
+ " import nltk  # imported in this cell too: the notebook's import cell comes after this one\n " ,
42
+ " nltk.download('punkt')\n " ,
43
+ " nltk.download('stopwords')\n " ,
44
+ " nltk.download('wordnet')"
43
+ ],
44
+ "execution_count" : null ,
45
+ "outputs" : [
46
+ {
47
+ "output_type" : " stream" ,
48
+ "text" : [
49
+ " [nltk_data] Downloading package punkt to /root/nltk_data...\n " ,
50
+ " [nltk_data] Unzipping tokenizers/punkt.zip.\n " ,
51
+ " [nltk_data] Downloading package stopwords to /root/nltk_data...\n " ,
52
+ " [nltk_data] Unzipping corpora/stopwords.zip.\n " ,
53
+ " [nltk_data] Downloading package wordnet to /root/nltk_data...\n " ,
54
+ " [nltk_data] Unzipping corpora/wordnet.zip.\n "
55
+ ],
56
+ "name" : " stdout"
57
+ },
58
+ {
59
+ "output_type" : " execute_result" ,
60
+ "data" : {
61
+ "text/plain" : [
62
+ " True"
63
+ ]
64
+ },
65
+ "metadata" : {
66
+ "tags" : []
67
+ },
68
+ "execution_count" : 3
69
+ }
70
+ ]
71
+ },
72
+ {
73
+ "cell_type" : " code" ,
74
+ "metadata" : {
75
+ "id" : " CVxFbs5hmSz5"
76
+ },
77
+ "source" : [
78
+ " import nltk\n " ,
79
+ " import re\n " ,
80
+ " from nltk.corpus import stopwords\n " ,
81
+ " from nltk.stem.porter import PorterStemmer\n " ,
82
+ " from nltk.stem import WordNetLemmatizer\n " ,
83
+ " from sklearn.feature_extraction.text import CountVectorizer"
84
+ ],
85
+ "execution_count" : null ,
86
+ "outputs" : []
87
+ },
88
+ {
89
+ "cell_type" : " code" ,
90
+ "metadata" : {
91
+ "id" : " -eGTmbzWmZMV"
92
+ },
93
+ "source" : [
94
+ " # Sample text for the bag-of-words demo; a later cell splits it with nltk.sent_tokenize.\n " ,
95
+ " paragraph = \"\"\" I have three visions for India. In 3000 years of our history, people from all over \n " ,
96
+ " the world have come and invaded us, captured our lands, conquered our minds. \n " ,
97
+ " From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,\n " ,
98
+ " the French, the Dutch, all of them came and looted us, took over what was ours. \n " ,
99
+ " Yet we have not done this to any other nation. We have not conquered anyone. \n " ,
100
+ " We have not grabbed their land, their culture, \n " ,
101
+ " their history and tried to enforce our way of life on them. \n " ,
102
+ " Why? Because we respect the freedom of others. That is why my \n " ,
103
+ " first vision is that of freedom. I believe that India got its first vision of \n " ,
104
+ " this in 1857, when we started the War of Independence. It is this freedom that\n " ,
105
+ " we must protect and nurture and build on. If we are not free, no one will respect us.\n " ,
106
+ " My second vision for India’s development. For fifty years we have been a developing nation.\n " ,
107
+ " It is time we see ourselves as a developed nation. We are among the top 5 nations of the world\n " ,
108
+ " in terms of GDP. We have a 10 percent growth rate in most areas. Our poverty levels are falling.\n " ,
109
+ " Our achievements are being globally recognised today. Yet we lack the self-confidence to\n " ,
110
+ " see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?\n " ,
111
+ " I have a third vision. India must stand up to the world. Because I believe that unless India \n " ,
112
+ " stands up to the world, no one will respect us. Only strength respects strength. We must be \n " ,
113
+ " strong not only as a military power but also as an economic power. Both must go hand-in-hand. \n " ,
114
+ " My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of \n " ,
115
+ " space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material.\n " ,
116
+ " I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. \n " ,
117
+ " I see four milestones in my career\"\"\" "
117
+ ],
118
+ "execution_count" : null ,
119
+ "outputs" : []
120
+ },
121
+ {
122
+ "cell_type" : " code" ,
123
+ "metadata" : {
124
+ "id" : " qAfq2diYmmJC"
125
+ },
126
+ "source" : [
127
+ " # Sentence-split the sample text and create the objects the cleaning loop works with.\n " ,
128
+ " sentences = nltk.sent_tokenize(paragraph)\n " ,
129
+ " corpus = []  # the cleaning loop appends one cleaned string per sentence\n " ,
130
+ " ps = PorterStemmer()\n " ,
131
+ " wordnet = WordNetLemmatizer()  # NOTE(review): not referenced by any other cell in this notebook"
131
+ ],
132
+ "execution_count" : null ,
133
+ "outputs" : []
134
+ },
135
+ {
136
+ "cell_type" : " code" ,
137
+ "metadata" : {
138
+ "id" : " dQbpjGI5mpEe"
139
+ },
140
+ "source" : [
141
+ " # Clean each sentence into a space-joined string of stemmed, non-stop-word tokens.\n " ,
142
+ " stop_words = set(stopwords.words('english'))  # built once, not once per word\n " ,
143
+ " corpus = []  # reset here so re-running this cell does not append duplicates\n " ,
144
+ " for sentence in sentences:\n " ,
145
+ "     # Replace every non-letter with a space, lowercase, then split on whitespace.\n " ,
146
+ "     words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()\n " ,
147
+ "     corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))"
148
+ ],
149
+ "execution_count" : null ,
150
+ "outputs" : []
151
+ },
152
+ {
153
+ "cell_type" : " code" ,
154
+ "metadata" : {
155
+ "id" : " ozeAyFDXmvSi"
156
+ },
157
+ "source" : [
158
+ " # Bag-of-words features: per-sentence term counts, vocabulary capped at 1500 terms.\n " ,
158
+ " cv = CountVectorizer(max_features=1500)\n " ,
159
+ " doc_term_matrix = cv.fit_transform(corpus)  # learn the vocabulary and count terms in one pass\n " ,
160
+ " X = doc_term_matrix.toarray()"
160
+ ],
161
+ "execution_count" : null ,
162
+ "outputs" : []
163
+ },
164
+ {
165
+ "cell_type" : " code" ,
166
+ "metadata" : {
167
+ "id" : " 317O_rAEmxjv" ,
168
+ "colab" : {
169
+ "base_uri" : " https://localhost:8080/"
170
+ },
171
+ "outputId" : " 0f0f70fc-06db-4b57-dc25-ac18f967df75"
172
+ },
173
+ "source" : [
174
+ " # Display the bag-of-words matrix X produced by the CountVectorizer cell.\n " ,
175
+ " X"
175
+ ],
176
+ "execution_count" : null ,
177
+ "outputs" : [
178
+ {
179
+ "output_type" : " execute_result" ,
180
+ "data" : {
181
+ "text/plain" : [
182
+ " array([[0, 0, 0, ..., 0, 0, 0],\n " ,
183
+ " [0, 0, 0, ..., 1, 1, 0],\n " ,
184
+ " [0, 1, 0, ..., 0, 0, 0],\n " ,
185
+ " ...,\n " ,
186
+ " [0, 0, 0, ..., 0, 0, 0],\n " ,
187
+ " [0, 0, 0, ..., 0, 0, 0],\n " ,
188
+ " [0, 0, 0, ..., 0, 0, 0]])"
189
+ ]
190
+ },
191
+ "metadata" : {
192
+ "tags" : []
193
+ },
194
+ "execution_count" : 10
195
+ }
196
+ ]
197
+ },
198
+ {
199
+ "cell_type" : " code" ,
200
+ "metadata" : {
201
+ "id" : " G1Q426GrmyeY"
202
+ },
203
+ "source" : [
204
+ " "
205
+ ],
206
+ "execution_count" : null ,
207
+ "outputs" : []
208
+ }
209
+ ]
210
+ }
0 commit comments