File tree Expand file tree Collapse file tree 1 file changed +29
-0
lines changed Expand file tree Collapse file tree 1 file changed +29
-0
lines changed Original file line number Diff line number Diff line change 1+ from  __future__ import  print_function 
2+ from  pyspark  import  SparkContext , SparkConf 
3+ import  sys 
4+ 
if __name__ == "__main__":
    # Script entry point: count how often each character occurs across the
    # distinct words that appear at least `threshold` times in a text file.
    #
    # Usage: <script> <input-path> <threshold>
    #   sys.argv[1] -- path handed to SparkContext.textFile
    #   sys.argv[2] -- minimum number of occurrences (integer) a word needs
    #                  to be kept
    #
    # The (character, count) pairs are written to /home/clsadmin/output.txt
    # as the repr of the collected list with the enclosing brackets removed.

    # Validate the command line before starting Spark so that a bad
    # invocation fails fast with a readable message instead of a traceback.
    if len(sys.argv) < 3:
        sys.exit("Usage: {0} <input-path> <threshold>".format(sys.argv[0]))

    input_path = sys.argv[1]
    try:
        threshold = int(sys.argv[2])
    except ValueError:
        sys.exit("threshold must be an integer, got {0!r}".format(sys.argv[2]))

    # create Spark context with Spark configuration
    conf = SparkConf().setAppName("Spark Count")
    sc = SparkContext(conf=conf)

    try:
        # read in text file and split each line into words
        tokenized = sc.textFile(input_path).flatMap(
            lambda line: line.split(" "))

        # count the occurrence of each word
        word_counts = tokenized.map(lambda word: (word, 1)).reduceByKey(
            lambda v1, v2: v1 + v2)

        # keep only the words with at least `threshold` occurrences
        filtered = word_counts.filter(lambda pair: pair[1] >= threshold)

        # count characters: flatMap over the word string yields one element
        # per character, so each surviving word contributes its characters
        # once (not once per occurrence of the word)
        char_counts = (
            filtered.flatMap(lambda pair: pair[0])
            .map(lambda c: (c, 1))
            .reduceByKey(lambda v1, v2: v1 + v2)
        )

        # bring the (character, count) pairs back to the driver
        results = char_counts.collect()
    finally:
        # always release the Spark context, even if the job fails
        sc.stop()

    # `with` guarantees the file is closed even if the write raises
    with open('/home/clsadmin/output.txt', 'w') as out_file:
        # strip the leading '[' and trailing ']' of the list repr
        out_file.write(repr(results)[1:-1])
    
 
   
 
     
   
   
          
     
  
    
     
 
    
      
     
 
     
    You can’t perform that action at this time.
  
 
    
  
     
    
      
        
     
 
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments