Skip to content

Commit 9e9b27b

Browse files
committed
Add word count script
For IAE scenario
1 parent d62354b commit 9e9b27b

File tree

1 file changed

+29
-0
lines changed

1 file changed

+29
-0
lines changed

notebooks/wordcount.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,29 @@
1+
from __future__ import print_function
2+
from pyspark import SparkContext, SparkConf
3+
import sys
4+
5+
# Where the character counts are written when no output path is given on the
# command line (the location this script has always used).
DEFAULT_OUTPUT_PATH = '/home/clsadmin/output.txt'


def main(argv=None):
    """Count the characters of frequently occurring words with Spark.

    The input text file is split on single spaces into words, the words are
    counted, words occurring fewer than ``threshold`` times are dropped, and
    the characters of the remaining distinct words are counted.  The collected
    ``(character, count)`` pairs are written to the output file as the
    ``repr`` of the list with the enclosing brackets stripped.

    Args:
        argv: Argument vector laid out as
            ``[program, input_path, threshold, [output_path]]``.  Defaults to
            ``sys.argv``.  ``output_path`` is optional and falls back to
            ``DEFAULT_OUTPUT_PATH``.

    Raises:
        SystemExit: With status 2 when a required argument is missing or the
            threshold is not an integer.
    """
    args = sys.argv if argv is None else argv

    # Validate the command line before starting Spark so that a usage error
    # fails immediately instead of after the context has been created.
    if len(args) < 3:
        sys.stderr.write(
            "usage: wordcount.py <input-path> <threshold> [output-path]\n")
        sys.exit(2)

    input_path = args[1]

    # get threshold
    try:
        threshold = int(args[2])
    except ValueError:
        sys.stderr.write(
            "threshold must be an integer, got %r\n" % (args[2],))
        sys.exit(2)

    output_path = args[3] if len(args) > 3 else DEFAULT_OUTPUT_PATH

    # create Spark context with Spark configuration
    conf = SparkConf().setAppName("Spark Count")
    sc = SparkContext(conf=conf)
    try:
        # read in text file and split each document into words
        tokenized = sc.textFile(input_path).flatMap(
            lambda line: line.split(" "))

        # count the occurrence of each word
        word_counts = tokenized.map(lambda word: (word, 1)).reduceByKey(
            lambda v1, v2: v1 + v2)

        # filter out words with fewer than threshold occurrences
        filtered = word_counts.filter(lambda pair: pair[1] >= threshold)

        # count characters: flatMap over the word string yields one element
        # per character of each surviving word
        char_counts = (
            filtered.flatMap(lambda pair: pair[0])
            .map(lambda c: (c, 1))
            .reduceByKey(lambda v1, v2: v1 + v2)
            .collect()
        )
    finally:
        # Stop the context even when the job fails.
        sc.stop()

    # The context manager closes the file even if the write raises.
    with open(output_path, 'w') as out:
        out.write(repr(char_counts)[1:-1])


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)