-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathremove-stopwords-online.html
235 lines (226 loc) · 12.1 KB
/
remove-stopwords-online.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
---
title: Remove Stopwords Online and Cleanse Text | Developer Tools
layout: post
---
<html>
<head>
<!-- Meta tags common for website -->
{% include common-meta %}
<title>{{ page.title }}</title>
<!-- Responsive viewport. Zoom is deliberately left enabled (no maximum-scale / user-scalable=no) so users can enlarge the text. -->
<meta content="width=device-width, initial-scale=1" name="viewport">
<meta name="description" content="This is an easy-to-use open source tool to cleanse English stopwords from any text or string." />
<meta name="keywords" content="online,tool,stopwords,english,online,cleanse,web,opensource" />
<!-- CSS for the site theme -->
{% include theme-css %}
<!-- Annoying IE fixes -->
{% include ie-fixes %}
</head>
<body class="hold-transition skin-green sidebar-mini">
<!-- Site wrapper -->
<div class="wrapper">
<!-- header tag from theme -->
{% include theme-header %}
<!-- Sidebar for the whole website -->
{% include theme-sidebar %}
<!-- Content Wrapper. Contains page content -->
<div class="content-wrapper">
<!-- Main content -->
<section class="content">
<div class="row">
<!-- left column -->
<div class="col-md-6">
<div class="box box-info">
<div class="box-header with-border">
<h1 class="box-title">Stop Words Removal Tool: Cleanse Text</h1>
</div>
<!-- /.box-header -->
<!-- form start -->
<div class="box-body">
<form role="form">
<div class="form-group">
<label for="intext">Your Text</label>
<textarea class="form-control" rows="10" id="intext" placeholder="Enter your text here" autofocus></textarea>
</div>
<div class="form-group">
<label for="customstopwords">Your Custom Stopwords (Comma separated)</label>
<textarea class="form-control" rows="5" id="customstopwords" placeholder="Enter your custom stopwords comma[,] separated here"></textarea>
</div>
</form>
</div>
<!-- /.box-body -->
<div class="box-footer">
<div class="row">
<div class="col-xs-3">
<button type="button" class="btn btn-info" id="removestopwords">Remove Stopwords<i class="fa fa-fw fa-arrow-right"></i></button>
</div>
</div>
</div>
<!-- /.box-footer -->
</div>
</div>
<div class="col-md-6">
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">Copy Your Clean Text From Here</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<div class="form-group">
<label for="output">Cleansed / Filtered Text</label>
<textarea class="form-control" id="output" rows="10" placeholder="Copy the cleansed text from here"></textarea>
</div>
</div>
<!-- /.box-body -->
</div>
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">Removed Words Count</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<p id="removedwordscount"></p>
</div>
<!-- /.box-body -->
</div>
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">Removed Words</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<p id="removedwords"></p>
</div>
<!-- /.box-body -->
</div>
</div>
</div>
</section>
<section class="content">
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">About Stopwords Cleanser Tool</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<img class="img-responsive" src="images/stopwords-word-cloud.png" alt="Online Stopwords removal tool" title="Online Stopwords removal tool">
<p>This is a free online tool to remove stopwords from any text and clean it up. The tool is open source and free to use. It works in any modern browser. This tool uses a default stopwords list in English.</p>
</div>
<!-- /.box-body -->
</div>
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">Can I remove my custom stopwords?</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<p>Yes, this tool supports custom stopwords. You can add your own words and use them as stopwords. This tool can remember your custom stopwords in your browser. This feature can be handy for repeat use. Please use the same browser each time so that the data saved in your browser can be reused. This site does not have any server-side storage, so if you change your browser, your custom stopwords will need to be added again.</p>
</div>
<!-- /.box-body -->
</div>
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">Can I remove non-English stopwords?</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<p>Yes, you can use custom stopwords in any language. However, we do not have a predefined list for each language. You can contribute a language if you would like.</p>
</div>
<!-- /.box-body -->
</div>
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">What are stopwords?</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<p>Stopwords are the words that commonly appear in natural language. The concept of stopwords is common in data mining, machine learning and natural language processing (NLP). These repeating words (stopwords) do not add much value in machine learning. Therefore, it has become a common practice to remove them from text under analysis.</p>
</div>
<!-- /.box-body -->
</div>
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">Why do we remove stop words?</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<p>Stop words may not add value in computing. Therefore, most machine learning and data processing tools remove them before processing.</p>
<p>This approach also reduces the size of text to process. Smaller text can be analyzed quicker. When machine learning is doing a big data analysis it becomes essential to clean up the text to save resources.</p>
</div>
<!-- /.box-body -->
</div>
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">What are common stopwords?</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<p>Some commonly removed stop words are listed below:</p>
<p><code>a</code> <code>the</code> <code>then</code> <code>than</code> <code>is</code> <code>was</code> <code>not</code></p>
</div>
<!-- /.box-body -->
</div>
<div class="box box-success">
<div class="box-header with-border">
<h3 class="box-title">How many stopwords in English?</h3>
</div>
<!-- /.box-header -->
<div class="box-body">
<p>The list of stopwords can grow based on the application and context of use. Some stopword lists have more than 800 words in them.</p>
</div>
<!-- /.box-body -->
</div>
</section>
{% include addthis %}
</div>
<!-- /.content-wrapper -->
{% include theme-footer %}
</div>
<!-- ./wrapper -->
{% include theme-bottom-js %}
</body>
<script src="plugins/selectOnFocus/jquery.selectOnFocus.min.js"></script>
<script src="javascripts/stopwords.js?v=3"></script>
<script src="javascripts/fromdev-utils.js"></script>
<script>
/**
 * Reads the comma-separated custom stopwords from the #customstopwords
 * textarea, normalizes them, persists them, and returns them as a Set.
 *
 * Each entry is trimmed and lower-cased. Blank entries (produced by doubled
 * or trailing commas, or by whitespace-only input) are dropped so that an
 * empty string never ends up in the returned Set or in the stored list.
 *
 * Side effect: writes the normalized, comma-joined list to StorageUtils under
 * the 'CUSTOM_STOP_WORDS' key; an empty string is written when no usable
 * words were entered.
 *
 * @returns {Set<string>} the normalized custom stopwords (empty when none).
 */
const processCustomStopwords = function() {
  // Fall back to '' so a missing or empty field is handled by the same path.
  const rawValue = $("#customstopwords").val() || '';
  const normalizedWords = rawValue
    .split(',')
    .map(str => str.trim().toLowerCase())
    .filter(str => str.length > 0);
  // Always refresh storage so clearing the textarea also clears the saved list.
  StorageUtils.setItem('CUSTOM_STOP_WORDS', normalizedWords.join(','));
  return new Set(normalizedWords);
};
// Page wiring: restores previously saved custom stopwords, binds the
// "Remove Stopwords" button, and applies a few presentation tweaks.
$(document).ready(function() {
  // Pre-fill the custom stopwords textarea with whatever was stored earlier.
  const storedCustomStopWords = StorageUtils.getItem('CUSTOM_STOP_WORDS');
  if (storedCustomStopWords) {
    $("#customstopwords").val(storedCustomStopWords);
  }

  $("#removestopwords").click(function() {
    const input = $("#intext").val();
    if (!input) {
      return;
    }

    const customStopwordSet = processCustomStopwords();
    const cleansedOutput = cleanse(input, customStopwordSet);
    if (!cleansedOutput) {
      return;
    }

    $("#output").val(cleansedOutput.out);

    const removedWords = cleansedOutput.removedWords || [];
    // The count covers every entry of removedWords; the list below is de-duplicated.
    $("#removedwordscount").text(removedWords.length);

    // removedWords comes from processing user-typed text (presumably tokens of
    // that text), so each word is inserted through .text() rather than being
    // concatenated into an HTML string; markup in a word is shown, not parsed.
    const $removedWords = $("#removedwords").empty();
    Array.from(new Set(removedWords)).forEach(function(word, index) {
      if (index > 0) {
        // Single space between entries, matching the previous join(' ') output.
        $removedWords.append(document.createTextNode(' '));
      }
      $removedWords.append($("<code>").text(word));
    });
  });

  $("#intext").selectOnFocus();
  $("#output").selectOnFocus();
  $('#string-utility-category').addClass('active');
  $('.markdown-body').attr('style', 'max-width:100%;');
});
</script>
</html>