Commit 3ff33bb

Fix trailing spaces
nielsbasjes committed Oct 29, 2020
1 parent 1d3a999 commit 3ff33bb
Showing 12 changed files with 150 additions and 150 deletions.
10 changes: 5 additions & 5 deletions Benchmark/GetStatistisc.make
@@ -6,7 +6,7 @@ LastJob = 0267

JobTargets = $(shell for I in `seq -w $(FirstJob) $(LastJob)` ; do echo Results-$$I.txt ; done)

Results.txt:: $(JobTargets)
@echo "Merge results"
@cat Results-*.txt > $@

@@ -17,18 +17,18 @@ Results-%.txt: Extract-%.txt
@( ID=$$(echo $@ | cut -d'-' -f 2 | cut -d'.' -f1) ; echo "Calculate run $${ID}" )
@cat $< | sed 's@\([0-9]*\)hrs, @(\1*3600)+@g;s@\([0-9]*\)mins, @(\1*60)+@g;s@\([0-9]*\)sec@(\1)@g;s@ @@g;s@|@ @g' |\
while read name splits elapsed avgmap ; do echo "$${name} | $${splits} | $$((elapsed)) | $$((avgmap)) | $$(( (avgmap) * (splits) )) " ; done >$@

clean::
rm -f $(JobTargets)

#.PRECIOUS: Extract-%.txt
Extract-%.txt: Output-%.txt
@cat $< | tr -d '\n' | sed 's@ *<@<@g;s@> *@>@g;s@.*>Job Name:<td>\([^<]*\).*>Elapsed:<td>\([^<]*\).*>Average Map Time<td>\([^<]*\).*>Map</a><td>\([^<]*\).*@\1 | \4 | \2 | \3 @;s@Wordcount-@@g;s@Wordcount@GzipCodec@g;' > $@ ;
@echo >> $@

clean::
rm -f Extract-*.txt

.PRECIOUS: Output-%.txt
Output-%.txt:
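
A note on the `Results-%.txt` rule above: the sed script rewrites scraped durations such as `1hrs, 2mins, 3sec` into a shell arithmetic expression, which the `$(( ... ))` in the `while read` loop then evaluates to plain seconds. A minimal standalone sketch in bash (the sample input is made up):

```bash
#!/usr/bin/env bash
# "1hrs, 2mins, 3sec"  ->  "(1*3600)+(2*60)+(3)"  ->  3723
echo "1hrs, 2mins, 3sec" |
  sed 's@\([0-9]*\)hrs, @(\1*3600)+@g;s@\([0-9]*\)mins, @(\1*60)+@g;s@\([0-9]*\)sec@(\1)@g' |
  while read expr ; do echo $(( expr )) ; done   # prints 3723
```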
6 changes: 3 additions & 3 deletions Benchmark/Makefile
@@ -3,10 +3,10 @@
IoBytesPerChecksum = 512
InputFile = words-20K.txt.gz

all:
@echo "Don't use the makefile directly. Run the ./run.sh instead."

clean::
-rm -f wordcount-normal.txt

wordcount-normal.txt: $(InputFile) target/wordcount-1.0-job.jar
@@ -24,7 +24,7 @@ wordcount-%.txt: $(InputFile) target/wordcount-1.0-job.jar
hdfs dfs -text wordcount-$${SPLITS}/part-r-* | sort > wordcount-$${SPLITS}.txt ; \
)

clean::
-rm -f wordcount-*.txt

words.txt.gz:
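
The hidden body of the `wordcount-%.txt` pattern rule presumably turns the requested number of splits into a byte-level split size; the Pig runner later in this commit shows the arithmetic explicitly. A sketch under that assumption:

```bash
# Derive a split size that yields the desired number of map tasks:
INPUT=words-20K.txt.gz                     # InputFile from the Makefile above
SPLITS=12                                  # hypothetical target split count
FILESIZE=$(stat -c%s "${INPUT}")           # input size in bytes
SPLITSIZE=$(( (FILESIZE / SPLITS) + 1 ))   # +1 so the division remainder doesn't add an extra split
echo "${SPLITS} splits -> ${SPLITSIZE} bytes per split"
```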
114 changes: 57 additions & 57 deletions Benchmark/README.md
@@ -34,8 +34,8 @@ After the maximal parallelization point (12) we see that the processing time jumps

From the ganglia graphs of the cluster you can see that beyond the point of `no more free mappers` the only thing that really changes is that the cluster is moving more and more data around over the network.

![CPU Usage](cpu-graph.gif)
![Load](load-graph.gif)
![Network traffic](network-graph.gif)

Splitsize (bytes) | Splits | Elapsed (sec) | Avg. Map Time (sec) | Total Map Time (sec)
@@ -101,57 +101,57 @@ On a single machine simply running a `gzip -t <file>` takes about 45 seconds. So
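
`gzip -t` verifies a file by decompressing the entire stream and discarding the output, so it is a fair single-threaded baseline for how long one mapper must spend just reading the gzip. Reproducing the shape of that measurement (filename assumed, any large gzip file will do):

```bash
time gzip -t words-1.1G.txt.gz   # hypothetical ~1.1 GB input; decompresses everything, writes nothing
```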

Splitsize (bytes) | Splits | Elapsed (sec) | Avg. Map Time (sec) | Total Map Time (sec)
--:|--:|--:|--:|--:|
GzipCodec | 1 | 1401 | 1393 | 1393
1101900001 | 1 | 1363 | 1355 | 1355
550950001 | 2 | 764 | 757 | 1514
367300001 | 3 | 559 | 528 | 1584
275475001 | 4 | 442 | 415 | 1660
220380001 | 5 | 365 | 315 | 1575
183650001 | 6 | 281 | 259 | 1554
157414286 | 7 | 246 | 224 | 1568
137737501 | 8 | 234 | 205 | 1640
122433334 | 9 | 208 | 182 | 1638
110190001 | 10 | 191 | 170 | 1700
100172728 | 11 | 174 | 151 | 1661
91825001 | 12 | 174 | 144 | 1728
84761539 | 13 | 247 | 132 | 1716
78707143 | 14 | 247 | 124 | 1736
73460001 | 15 | 238 | 117 | 1755
68868751 | 16 | 229 | 112 | 1792
64817648 | 17 | 248 | 106 | 1802
61216667 | 18 | 231 | 102 | 1836
57994737 | 19 | 217 | 98 | 1862
55095001 | 20 | 220 | 94 | 1880
52471429 | 21 | 195 | 90 | 1890
50086364 | 22 | 205 | 89 | 1958
47908696 | 23 | 210 | 85 | 1955
45912501 | 24 | 236 | 80 | 1920
44076001 | 25 | 226 | 80 | 2000
42380770 | 26 | 242 | 78 | 2028
40811112 | 27 | 234 | 75 | 2025
39353572 | 28 | 232 | 73 | 2044
37996552 | 29 | 228 | 71 | 2059
36730001 | 30 | 238 | 71 | 2130
35545162 | 31 | 234 | 70 | 2170
34434376 | 32 | 229 | 67 | 2144
33390910 | 33 | 223 | 65 | 2145
32408824 | 34 | 241 | 64 | 2176
31482858 | 35 | 247 | 62 | 2170
30608334 | 36 | 248 | 61 | 2196
29781082 | 37 | 248 | 60 | 2220
28997369 | 38 | 255 | 60 | 2280
28253847 | 39 | 238 | 58 | 2262
27547501 | 40 | 237 | 56 | 2240
26875610 | 41 | 252 | 56 | 2296
26235715 | 42 | 245 | 56 | 2352
25625582 | 43 | 245 | 55 | 2365
25043182 | 44 | 256 | 54 | 2376
24486667 | 45 | 238 | 52 | 2340
23954348 | 46 | 265 | 52 | 2392
23444681 | 47 | 258 | 51 | 2397
22956251 | 48 | 258 | 50 | 2400
22487756 | 49 | 270 | 51 | 2499
22038001 | 50 | 258 | 48 | 2400
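
The Total Map Time column is simply Splits × Avg. Map Time. For example, at the maximal parallelization point of 12 splits the job finishes in 174 seconds of wall-clock time while spending 12 × 144 = 1728 seconds of aggregate mapper time: roughly 24% more total work than the 1393 seconds of the unsplit GzipCodec run, in exchange for an elapsed time that is about 8 times shorter.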

## A 220 MB input file doing a wordcount.

@@ -218,7 +218,7 @@ Splitsize | Splits | Elapsed | Avg. Map Time | Total Map Time
70000000 | 2 | 90 sec | 83 sec | 166 sec |
50000000 | 3 | 66 sec | 58 sec | 174 sec |
35000000 | 4 | 55 sec | 48 sec | 192 sec |
28000000 | 5 | 49 sec | 42 sec | 210 sec |
25000000 | 6 | 39 sec | 32 sec | 192 sec |
20000000 | 7 | 38 sec | 30 sec | 210 sec |
17500000 | 8 | 32 sec | 26 sec | 208 sec |
@@ -227,15 +227,15 @@
11500000 | 13 | 44 sec | 19 sec | 247 sec |
10000000 | 14 | 39 sec | 18 sec | 252 sec |
5000000 | 28 | 42 sec | 11 sec | 308 sec |
2500000 | 56 | 59 sec | 8 sec | 448 sec |

Splitsize | Splits | Elapsed | Avg. Map Time | Total Map Time
--:|--:|--:|--:|--:|
139796017 | 1 | 167 sec | 159 sec | 159 sec |
70000000 | 2 | 90 sec | 82 sec | 164 sec |
50000000 | 3 | 65 sec | 57 sec | 171 sec |
35000000 | 4 | 57 sec | 48 sec | 192 sec |
28000000 | 5 | 48 sec | 41 sec | 205 sec |
25000000 | 6 | 44 sec | 35 sec | 210 sec |
20000000 | 7 | 38 sec | 29 sec | 203 sec |
17500000 | 8 | 34 sec | 28 sec | 224 sec |
@@ -244,6 +244,6 @@
11500000 | 13 | 43 sec | 19 sec | 247 sec |
10000000 | 14 | 40 sec | 18 sec | 252 sec |
5000000 | 28 | 43 sec | 12 sec | 336 sec |
2500000 | 56 | 62 sec | 9 sec | 504 sec |


16 changes: 8 additions & 8 deletions Benchmark/javamr/src/main/assembly/job.xml
@@ -1,5 +1,5 @@
<assembly
xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2 http://maven.apache.org/xsd/assembly-1.1.2.xsd">
<id>job</id>
@@ -20,12 +20,12 @@
<unpack>false</unpack>
<scope>runtime</scope>
</dependencySet>
<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>lib</outputDirectory>
<unpack>false</unpack>
<scope>provided</scope>
</dependencySet>
-->
</dependencySets>
<fileSets>
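
This assembly descriptor produces the self-contained job jar that the Benchmark Makefile picks up as `target/wordcount-1.0-job.jar`. Assuming the usual setup, where the assembly plugin is bound to the `package` phase in the project's pom, building it is just:

```bash
mvn clean package                 # runs the `job` assembly as part of package
ls target/wordcount-1.0-job.jar   # runtime dependencies end up under lib/ inside the jar
```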
4 changes: 2 additions & 2 deletions Benchmark/pig/src/main/bash/run.sh
@@ -9,12 +9,12 @@ for SPLITS in `seq 1 100`;
do
hdfs dfs -rm -R wordcount-${SPLITS}.txt
FILESIZE=$(stat -c%s "${INPUT}")
SPLITSIZE=$(((FILESIZE/SPLITS)+1))

exec 3>&1 4>&2
TIME=$(TIMEFORMAT="%R"; { time pig -param file=${INPUT} -param splits=${SPLITS} -param splitsize=${SPLITSIZE} pig/wordcount.pig 1>&3 2>&4; } 2>&1)
exec 3>&- 4>&-
echo $(date),${SPLITS},${FILESIZE},${SPLITSIZE},${TIME} >> output.txt
done

cat output.txt
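
The `exec 3>&1 4>&2` dance above captures only what the `time` builtin prints while Pig's own stdout and stderr still reach the console. A minimal standalone sketch of the same trick in bash, with `sleep 2` standing in for the Pig run:

```bash
#!/usr/bin/env bash
exec 3>&1 4>&2                                             # stash the real stdout/stderr on fds 3 and 4
TIME=$(TIMEFORMAT="%R"; { time sleep 2 1>&3 2>&4; } 2>&1)  # command output -> real fds; time's report -> captured
exec 3>&- 4>&-                                             # close the stashed fds
echo "elapsed: ${TIME} seconds"                            # e.g. "elapsed: 2.004 seconds"
```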
2 changes: 1 addition & 1 deletion Benchmark/pig/src/main/resources/log4j.properties
@@ -1,6 +1,6 @@
# Root logger option
log4j.rootLogger=WARN, stdout

# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
4 changes: 2 additions & 2 deletions Benchmark/run.sh
@@ -2,7 +2,7 @@

make wordcount-normal.txt

for I in `seq 1 50` ;
do
make wordcount-${I}.txt
done
2 changes: 1 addition & 1 deletion README-Spark.md
@@ -4,7 +4,7 @@
Thanks to [Nicholas Chammas](https://github.com/nchammas) for contributing this documentation.

# Common problem for Spark users
Apparently the fact that GZipped files are not splittable is a recurring problem in the Spark arena as well, as you can see
in this [Spark Jira ticket](https://issues.apache.org/jira/browse/SPARK-29102?focusedCommentId=16932921&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-16932921) and
these two questions on StackOverflow [Question 1](https://stackoverflow.com/q/28127119/877069) [Question 2](https://stackoverflow.com/q/27531816/877069).
