diff --git a/run.sh b/run.sh
index 0100a6b..a98cffc 100755
--- a/run.sh
+++ b/run.sh
@@ -8,7 +8,7 @@ fi
 set -e
 # erlc +bin_opt_info -W src/erlang_1brc.erl
 erlc -W src/erlang_1brc.erl || exit 1
-/bin/time -f "Elapsed time: %e seconds (%E)" \
+/bin/time -f "Elapsed time: %e seconds (%E) (%P CPU)" \
   erl \
   -noinput \
   -s erlang_1brc run "$1" \
diff --git a/src/erlang_1brc.erl b/src/erlang_1brc.erl
index 6888adc..7dfe42f 100644
--- a/src/erlang_1brc.erl
+++ b/src/erlang_1brc.erl
@@ -8,20 +8,55 @@
         , run/1   %% Entrypoint for run.sh
         ]).
 
+%% These inlinings are actually necessary. On my machine, they yield
+%% a 15-20% performance improvement.
 -compile({inline, [ {process_temp, 2}
                   , {process_line, 3}
                   , {process_station, 2}
                   ]}).
 
+%% This is the size of the chunks we read from the input file at a
+%% time. There is some overhead "stitching" buffers together, so this
+%% should be large enough to keep the worker threads busy in the
+%% process_* functions.
 -define(BUFSIZE, 2 * 1024 * 1024).
 
-%% 64k seems to be the smallest buffer we can read and still get all
-%% the city names. This should be computed dynamically instead.
+%% We pre-compute a mapping using the `KEY' macro from cities to
+%% (smallish) integers, stored in the process dictionary. When we
+%% iterate over the chunks, matching binaries as we go, this mapping
+%% is such that it can be computed byte-by-byte, so we do not need to
+%% keep the entire city name around. This makes it easier to leverage
+%% the "match context reuse" optimization, as we do not need any
+%% "look-back" to extract the station name once we reach the ";".
+%%
+%% The formula is totally non-scientific, but it seems to work well
+%% in practice.
+%%
+%% For the large inputs, the loop in `process_station/2' turns out to
+%% be the hottest part, and the match-context reuse optimization
+%% almost halves the total runtime.
+-define(KEY(C, Acc), ((C * 17) bxor Acc) bsl 1).
+
+%% The worker threads will produce a map of #{Key => TempData} where
+%% the key is an integer (see the `KEY' macro). To be able to convert
+%% the key back into a station name, we compute the mapping between
+%% keys and their station name upfront by reading a single chunk of
+%% size `MAP_CITIES_BUFSIZE'. This should be large enough to include
+%% all cities, but small enough that scanning it is several orders of
+%% magnitude faster than the total runtime.
+%%
+%% For my 1B-row file, 64k seems to be the smallest buffer we can
+%% read and still get all the city names.
+%%
+%% TODO Compute this dynamically instead. This can be done once we
+%% have collected all the temperature data; we can then scan the file
+%% from the beginning until we have found stations for all the keys
+%% used in the temperature data map.
 -define(MAP_CITIES_BUFSIZE, 64 * 1024).
--define(EXPECTED_NUM_CITIES, 413).
 
-%% Compute a compressed key one byte at a time
--define(KEY(C, Acc), ((C * 17) bxor Acc) bsl 1).
+%% Just as a precaution, check that we have actually found all the
+%% cities.
+-define(EXPECTED_NUM_CITIES, 413).
 
 options() ->
   [ {file, $f, "file", {string, "measurements.txt"}, "The input file."}
@@ -219,6 +254,7 @@ start_processors() ->
 start_processors(NumProcs) ->
   Self = self(),
+  io:format("Starting ~p parallel chunk processors~n", [NumProcs]),
   lists:foldl(
     fun(_, Pids) ->
         [spawn_link(fun() -> chunk_processor(Self) end)|Pids]
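
A note on the byte-by-byte `KEY' scanning described in the comments above. The
sketch below is illustrative only (the module name key_sketch and the function
station_key/1 are hypothetical; the real process_station/2 is not part of this
hunk): it shows how the key can be folded one byte at a time while scanning for
the ";" separator. Since no already-consumed byte is ever needed again, each
recursive call can reuse the binary match context instead of allocating a
sub-binary for the station name.

  -module(key_sketch).
  -export([station_key/1]).

  %% Same formula as the `KEY' macro in the patch.
  -define(KEY(C, Acc), ((C * 17) bxor Acc) bsl 1).

  %% Fold bytes into the compressed key until the ";" that separates
  %% the station name from the temperature; return the key and the
  %% rest of the buffer. Illustrative stand-in for the hot loop in
  %% process_station/2.
  station_key(Bin) when is_binary(Bin) ->
    station_key(Bin, 0).

  station_key(<<$;, Rest/binary>>, Acc) ->
    {Acc, Rest};
  station_key(<<C, Rest/binary>>, Acc) ->
    station_key(Rest, ?KEY(C, Acc)).

For example, key_sketch:station_key(<<"Stockholm;12.3\n">>) evaluates to
{Key, <<"12.3\n">>}, and the same station name always folds to the same
integer Key, so workers never need to carry the name itself.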
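Similarly, here is a minimal sketch of the upfront key-to-name pass described
in the `MAP_CITIES_BUFSIZE' comment. The module name city_map_sketch is
hypothetical, and the patch stores its mapping in the process dictionary,
which is omitted here: read a single chunk from the start of the file,
compute the key of each station name, and record the name for every key seen.

  -module(city_map_sketch).
  -export([map_cities/1]).

  -define(KEY(C, Acc), ((C * 17) bxor Acc) bsl 1).
  -define(MAP_CITIES_BUFSIZE, 64 * 1024).

  %% Build #{Key => StationName} from one chunk at the start of the
  %% input file. Anything after the last newline in the chunk is an
  %% incomplete line and is dropped.
  map_cities(Filename) ->
    {ok, Fd} = file:open(Filename, [read, raw, binary]),
    {ok, Buf} = file:read(Fd, ?MAP_CITIES_BUFSIZE),
    ok = file:close(Fd),
    map_lines(Buf, #{}).

  map_lines(Buf, Map) ->
    case binary:split(Buf, <<"\n">>) of
      [Line, Rest] when Line =/= <<>> ->
        [Name, _Temp] = binary:split(Line, <<";">>),
        map_lines(Rest, Map#{name_key(Name, 0) => Name});
      _ ->
        Map
    end.

  name_key(<<>>, Acc)               -> Acc;
  name_key(<<C, Rest/binary>>, Acc) -> name_key(Rest, ?KEY(C, Acc)).

The EXPECTED_NUM_CITIES check in the patch then guards against the chunk
having been too small, conceptually map_size(Map) =:= 413 for the 1B-row
input mentioned above.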