Skip to content

Commit e79a604

Browse files
author
ballima
committed
Create reducer.py
1 parent c05b65e commit e79a604

File tree

1 file changed

+39
-0
lines changed

1 file changed

+39
-0
lines changed

reducer.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env python
2+
3+
from operator import itemgetter
4+
import sys
5+
6+
def _parse_record(line):
    """Split one mapper output line into a ``(word, count)`` pair.

    The line is stripped of surrounding whitespace and split at the first
    tab. Returns ``None`` when the line has no tab separator or when the
    count field is not an integer, so the caller can discard the record.
    """
    word, sep, raw_count = line.strip().partition('\t')
    if not sep:
        # Blank or tab-less line. Unpacking the result of split() used to
        # raise an uncaught ValueError here; such lines are now discarded
        # the same way as lines whose count is not a number.
        return None
    try:
        return word, int(raw_count)
    except ValueError:
        # count was not a number, so silently ignore/discard this line
        return None


def reduce_counts(lines):
    """Yield ``(word, total)`` for each run of consecutive equal words.

    ``lines`` is any iterable of ``"word<TAB>count"`` strings. Adjacent
    records with the same word are summed; this only produces one total
    per word because Hadoop sorts map output by key (here: word) before
    it is passed to the reducer. Malformed records are skipped and do not
    interrupt the run they appear in.
    """
    current_word = None
    current_count = 0

    for line in lines:
        record = _parse_record(line)
        if record is None:
            continue
        word, count = record

        if word == current_word:
            current_count += count
        else:
            if current_word is not None:
                yield current_word, current_count
            current_word = word
            current_count = count

    # Flush the last run. Testing the accumulator itself (rather than
    # comparing it with the most recently parsed word) avoids emitting a
    # bogus "None<TAB>0" record on empty input and avoids dropping the last
    # word when the final input line is malformed.
    if current_word is not None:
        yield current_word, current_count


def main(stdin=None, stdout=None):
    """Read mapper output from ``stdin`` and write word totals to ``stdout``.

    Both streams default to the process's standard streams. Each result is
    written as ``word<TAB>total`` followed by a newline.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for word, total in reduce_counts(stdin):
        # stdout.write works on both Python 2 and Python 3, unlike the
        # print statement.
        stdout.write('%s\t%s\n' % (word, total))


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)