simplify src data env var as we don't use remote data like in db-benchmark v1.0.0
jangorecki committed Nov 25, 2020
1 parent edf9328 commit 601e0bd
Showing 22 changed files with 68 additions and 84 deletions.
21 changes: 3 additions & 18 deletions _launcher/solution.R
@@ -117,16 +117,6 @@ file.ext = function(x) {
   if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x))
   ans
 }
-# data_name env var for each task
-task.env = function(x) {
-  ans = switch(
-    x,
-    "groupby"="SRC_GRP_LOCAL",
-    "join"="SRC_JN_LOCAL"
-  )
-  if (is.null(ans)) stop(sprintf("task %s does not have data name environment variable defined in task.env helper function", x))
-  ans
-}
 # dynamic LHS in: Sys.setenv(var = value)
 setenv = function(var, value, quiet=TRUE) {
   stopifnot(is.character(var), !is.na(var), length(value)==1L, is.atomic(value))
@@ -145,9 +135,6 @@ data.desc = function(task, nrow, k, na, sort) {
   }
   sprintf("%s_%s_%s_%s_%s", prefix, nrow, k, na, sort)
 }
-data_name_exception = function(solution, task, d) {
-  d
-}
 # no dots solution name used in paths
 solution.path = function(x) {
   gsub(".", "", x, fixed=TRUE)
@@ -157,23 +144,21 @@ solution.path = function(x) {
 
 s = args[["solution"]]
 t = args[["task"]]
-data_name_env = task.env(t)
 d = data.desc(t, args[["nrow"]], args[["k"]], args[["na"]], args[["sort"]])
-d = data_name_exception(solution=s, task=t, d=d) # this is already handled in launch.R but here we handle ad-hoc single solution cmd runs
 
 Sys.setenv("CSV_TIME_FILE"=args[["out"]])
-setenv(data_name_env, d)
+setenv("SRC_DATANAME", d)
 
 ns = solution.path(s)
 ext = file.ext(s)
 localcmd = if (s %in% c("clickhouse","h2o")) { # custom launcher bash script, for clickhouse and h2o
-  sprintf("exec.sh %s %s", t, d)
+  sprintf("exec.sh %s", t)
 } else sprintf("%s-%s.%s", t, ns, ext)
 cmd = sprintf("./%s/%s", ns, localcmd)
 
 ret = system(cmd, ignore.stdout=as.logical(args[["quiet"]]))
 
-Sys.unsetenv(data_name_env)
+Sys.unsetenv("SRC_DATANAME")
 Sys.unsetenv("CSV_TIME_FILE")
 
 if (stdout && file.size(args[["out"]])) {
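The setenv helper kept in the hunk above exists because Sys.setenv only takes literal names on its LHS; the diff elides its body, so the following is a minimal sketch of how such a dynamic-LHS wrapper can be written (the do.call body and the example data name are assumptions, not code from this commit):

  # hypothetical body for the elided setenv helper: build a named list so the
  # variable *name* can come in as a character value
  setenv = function(var, value, quiet=TRUE) {
    stopifnot(is.character(var), !is.na(var), length(value)==1L, is.atomic(value))
    do.call(Sys.setenv, setNames(list(value), var))  # assumed implementation
    if (!quiet) cat(sprintf("%s=%s\n", var, value))
    invisible(TRUE)
  }
  setenv("SRC_DATANAME", "G1_1e7_1e2_0_0")  # example data name from data.desc()

After this commit the launcher always sets the same variable, SRC_DATANAME, so the dynamic form is kept only for generality.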
76 changes: 38 additions & 38 deletions clickhouse/exec.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
 set -e
 
-if [ "$#" -ne 2 ]; then
-  echo 'usage: ./clickhouse/exec.sh groupby G1_1e7_1e2_0_0';
-  exit 1
+if [ "$#" -ne 1 ]; then
+  echo 'usage: ./clickhouse/exec.sh groupby';
+  exit 1
 fi;
 
 source ./clickhouse/ch.sh
@@ -19,56 +19,56 @@ ch_active || exit 1
 
 # tune CH settings and load data
 CH_MEM=107374182400 # 100GB ## old value 128849018880 # 120GB ## now set to 96GB after cache=1 to in-memory temp tables because there was not enough mem for R to parse timings
-clickhouse-client --query "DROP TABLE IF EXISTS ans"
-echo "# clickhouse/exec.sh: creating tables and loading data"
+clickhouse-client --query 'DROP TABLE IF EXISTS ans'
+echo '# clickhouse/exec.sh: creating tables and loading data'
 if [ $1 == 'groupby' ]; then
   CH_EXT_GRP_BY=53687091200 # twice less than CH_MEM #96
   CH_EXT_SORT=53687091200
-  clickhouse-client --query "DROP TABLE IF EXISTS $2"
-  clickhouse-client --query "CREATE TABLE $2 (id1 String, id2 String, id3 String, id4 Int32, id5 Int32, id6 Int32, v1 Int32, v2 Int32, v3 Float64) ENGINE = MergeTree() ORDER BY tuple();"
-  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $2 SELECT id1, id2, id3, id4, id5, id6, v1, v2, v3 FROM file('data/$2.csv', 'CSVWithNames', 'id1 String, id2 String, id3 String, id4 Int32, id5 Int32, id6 Int32, v1 Int32, v2 Int32, v3 Float64')"
+  clickhouse-client --query "DROP TABLE IF EXISTS $SRC_DATANAME"
+  clickhouse-client --query "CREATE TABLE $SRC_DATANAME (id1 String, id2 String, id3 String, id4 Int32, id5 Int32, id6 Int32, v1 Int32, v2 Int32, v3 Float64) ENGINE = MergeTree() ORDER BY tuple();"
+  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $SRC_DATANAME SELECT id1, id2, id3, id4, id5, id6, v1, v2, v3 FROM file('data/$SRC_DATANAME.csv', 'CSVWithNames', 'id1 String, id2 String, id3 String, id4 Int32, id5 Int32, id6 Int32, v1 Int32, v2 Int32, v3 Float64')"
   # confirm all data loaded yandex/ClickHouse#4463
-  echo -e "clickhouse-client --query 'SELECT count(*) FROM $2'\n$2" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(strsplit(stdin[2L], "_", fixed=TRUE)[[1L]][2L])) stop("incomplete data load for ", stdin[2L],", loaded ", loaded, " rows only")'
+  echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
 elif [ $1 == 'join' ]; then
   # lhs
-  clickhouse-client --query "DROP TABLE IF EXISTS $2"
-  clickhouse-client --query "CREATE TABLE $2 (id1 Int32, id2 Int32, id3 Int32, id4 String, id5 String, id6 String, v1 Float64) ENGINE = MergeTree() ORDER BY tuple();"
-  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $2 SELECT id1, id2, id3, id4, id5, id6, v1 FROM file('data/$2.csv', 'CSVWithNames', 'id1 Int32, id2 Int32, id3 Int32, id4 String, id5 String, id6 String, v1 Float64')"
-  echo -e "clickhouse-client --query 'SELECT count(*) FROM $2'\n$2" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(strsplit(stdin[2L], "_", fixed=TRUE)[[1L]][2L])) stop("incomplete data load for ", stdin[2L],", loaded ", loaded, " rows only")'
-  rhs=$(join_to_tbls $2)
-  rhs1=$(echo $rhs | cut -d ' ' -f1)
-  clickhouse-client --query "DROP TABLE IF EXISTS $rhs1"
-  clickhouse-client --query "CREATE TABLE $rhs1 (id1 Int32, id4 String, v2 Float64) ENGINE = MergeTree() ORDER BY tuple();"
-  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $rhs1 SELECT id1, id4, v2 FROM file('data/$rhs1.csv', 'CSVWithNames', 'id1 Int32, id4 String, v2 Float64')"
-  echo -e "clickhouse-client --query 'SELECT count(*) FROM $rhs1'\n$rhs1" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(strsplit(stdin[2L], "_", fixed=TRUE)[[1L]][3L])) stop("incomplete data load for ", stdin[2L],", loaded ", loaded, " rows only")'
-  rhs2=$(echo $rhs | cut -d ' ' -f2)
-  clickhouse-client --query "DROP TABLE IF EXISTS $rhs2"
-  clickhouse-client --query "CREATE TABLE $rhs2 (id1 Int32, id2 Int32, id4 String, id5 String, v2 Float64) ENGINE = MergeTree() ORDER BY tuple();"
-  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $rhs2 SELECT id1, id2, id4, id5, v2 FROM file('data/$rhs2.csv', 'CSVWithNames', 'id1 Int32, id2 Int32, id4 String, id5 String, v2 Float64')"
-  echo -e "clickhouse-client --query 'SELECT count(*) FROM $rhs2'\n$rhs2" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(strsplit(stdin[2L], "_", fixed=TRUE)[[1L]][3L])) stop("incomplete data load for ", stdin[2L],", loaded ", loaded, " rows only")'
-  rhs3=$(echo $rhs | cut -d ' ' -f3)
-  clickhouse-client --query "DROP TABLE IF EXISTS $rhs3"
-  clickhouse-client --query "CREATE TABLE $rhs3 (id1 Int32, id2 Int32, id3 Int32, id4 String, id5 String, id6 String, v2 Float64) ENGINE = MergeTree() ORDER BY tuple();"
-  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $rhs3 SELECT id1, id2, id3, id4, id5, id6, v2 FROM file('data/$rhs3.csv', 'CSVWithNames', 'id1 Int32, id2 Int32, id3 Int32, id4 String, id5 String, id6 String, v2 Float64')"
-  echo -e "clickhouse-client --query 'SELECT count(*) FROM $rhs3'\n$rhs3" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(strsplit(stdin[2L], "_", fixed=TRUE)[[1L]][3L])) stop("incomplete data load for ", stdin[2L],", loaded ", loaded, " rows only")'
+  clickhouse-client --query "DROP TABLE IF EXISTS $SRC_DATANAME"
+  clickhouse-client --query "CREATE TABLE $SRC_DATANAME (id1 Int32, id2 Int32, id3 Int32, id4 String, id5 String, id6 String, v1 Float64) ENGINE = MergeTree() ORDER BY tuple();"
+  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $SRC_DATANAME SELECT id1, id2, id3, id4, id5, id6, v1 FROM file('data/$SRC_DATANAME.csv', 'CSVWithNames', 'id1 Int32, id2 Int32, id3 Int32, id4 String, id5 String, id6 String, v1 Float64')"
+  echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
+  RHS=$(join_to_tbls $SRC_DATANAME)
+  RHS1=$(echo $RHS | cut -d' ' -f1)
+  clickhouse-client --query "DROP TABLE IF EXISTS $RHS1"
+  clickhouse-client --query "CREATE TABLE $RHS1 (id1 Int32, id4 String, v2 Float64) ENGINE = MergeTree() ORDER BY tuple();"
+  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS1 SELECT id1, id4, v2 FROM file('data/$RHS1.csv', 'CSVWithNames', 'id1 Int32, id4 String, v2 Float64')"
+  echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS1'\n$(echo $RHS1 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
+  RHS2=$(echo $RHS | cut -d' ' -f2)
+  clickhouse-client --query "DROP TABLE IF EXISTS $RHS2"
+  clickhouse-client --query "CREATE TABLE $RHS2 (id1 Int32, id2 Int32, id4 String, id5 String, v2 Float64) ENGINE = MergeTree() ORDER BY tuple();"
+  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS2 SELECT id1, id2, id4, id5, v2 FROM file('data/$RHS2.csv', 'CSVWithNames', 'id1 Int32, id2 Int32, id4 String, id5 String, v2 Float64')"
+  echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS2'\n$(echo $RHS2 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
+  RHS3=$(echo $RHS | cut -d' ' -f3)
+  clickhouse-client --query "DROP TABLE IF EXISTS $RHS3"
+  clickhouse-client --query "CREATE TABLE $RHS3 (id1 Int32, id2 Int32, id3 Int32, id4 String, id5 String, id6 String, v2 Float64) ENGINE = MergeTree() ORDER BY tuple();"
+  clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS3 SELECT id1, id2, id3, id4, id5, id6, v2 FROM file('data/$RHS3.csv', 'CSVWithNames', 'id1 Int32, id2 Int32, id3 Int32, id4 String, id5 String, id6 String, v2 Float64')"
+  echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS3'\n$(echo $RHS3 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
 else
   echo "clickhouse task $1 not implemented" >&2 && exit 1
 fi
 
 # cleanup timings from last run if they have not been cleaned up after parsing
 mkdir -p clickhouse/log
-rm -f clickhouse/log/$1_$2_q*.csv
+rm -f clickhouse/log/$1_${SRC_DATANAME}_q*.csv
 
 # execute sql script on clickhouse
-clickhouse-client --query "TRUNCATE TABLE system.query_log"
-echo "# clickhouse/exec.sh: data loaded, logs truncated, preparing $1-$2 benchmark sql script and sending it clickhouse"
+clickhouse-client --query 'TRUNCATE TABLE system.query_log'
+echo "# clickhouse/exec.sh: data loaded, logs truncated, preparing $1-$SRC_DATANAME benchmark sql script and sending it clickhouse"
 if [ $1 == 'groupby' ]; then
   # for each data_name produce sql script
-  sed "s/DATA_NAME/$2/g" < "clickhouse/$1-clickhouse.sql.in" > "clickhouse/$1-clickhouse.sql"
-  cat "clickhouse/$1-clickhouse.sql" | clickhouse-client -mn --max_memory_usage $CH_MEM --max_bytes_before_external_group_by $CH_EXT_GRP_BY --max_bytes_before_external_sort $CH_EXT_SORT --receive_timeout 10800 --format Pretty --output_format_pretty_max_rows 1 && echo '# clickhouse/exec.sh: benchmark sql script finished' || echo "# clickhouse/exec.sh: benchmark sql script for $2 terminated with error"
+  sed "s/DATA_NAME/$SRC_DATANAME/g" < "clickhouse/$1-clickhouse.sql.in" > "clickhouse/$1-clickhouse.sql"
+  cat "clickhouse/$1-clickhouse.sql" | clickhouse-client -mn --max_memory_usage $CH_MEM --max_bytes_before_external_group_by $CH_EXT_GRP_BY --max_bytes_before_external_sort $CH_EXT_SORT --receive_timeout 10800 --format Pretty --output_format_pretty_max_rows 1 && echo '# clickhouse/exec.sh: benchmark sql script finished' || echo "# clickhouse/exec.sh: benchmark sql script for $SRC_DATANAME terminated with error"
 elif [ $1 == 'join' ]; then
-  sed "s/DATA_NAME/$2/g; s/RHS_SMALL/$rhs1/g; s/RHS_MEDIUM/$rhs2/g; s/RHS_BIG/$rhs3/g" < "clickhouse/join-clickhouse.sql.in" > "clickhouse/join-clickhouse.sql"
-  cat "clickhouse/$1-clickhouse.sql" | clickhouse-client -mn --max_memory_usage $CH_MEM --receive_timeout 10800 --format Pretty --output_format_pretty_max_rows 1 && echo '# clickhouse/exec.sh: benchmark sql script finished' || echo "# clickhouse/exec.sh: benchmark sql script for $2 terminated with error"
+  sed "s/DATA_NAME/$SRC_DATANAME/g; s/RHS_SMALL/$RHS1/g; s/RHS_MEDIUM/$RHS2/g; s/RHS_BIG/$RHS3/g" < "clickhouse/join-clickhouse.sql.in" > "clickhouse/join-clickhouse.sql"
+  cat "clickhouse/$1-clickhouse.sql" | clickhouse-client -mn --max_memory_usage $CH_MEM --receive_timeout 10800 --format Pretty --output_format_pretty_max_rows 1 && echo '# clickhouse/exec.sh: benchmark sql script finished' || echo "# clickhouse/exec.sh: benchmark sql script for $SRC_DATANAME terminated with error"
 else
   echo "clickhouse task $1 benchmark script launching not defined" >&2 && exit 1
 fi
@@ -77,7 +77,7 @@ fi
 sleep 90
 
 # cleanup data
-ch_active && echo '# clickhouse/exec.sh: finishing, cleaning up' && clickhouse-client --query "DROP TABLE IF EXISTS ans" && clickhouse-client --query "DROP TABLE IF EXISTS $2" || echo '# clickhouse/exec.sh: finishing, clickhouse server down, possibly crashed, could not clean up'
+ch_active && echo '# clickhouse/exec.sh: finishing, cleaning up' && clickhouse-client --query "DROP TABLE IF EXISTS ans" && clickhouse-client --query "DROP TABLE IF EXISTS $SRC_DATANAME" || echo '# clickhouse/exec.sh: finishing, clickhouse server down, possibly crashed, could not clean up'
 
 # stop server
 ch_stop && echo '# clickhouse/exec.sh: stopping server finished' || echo '# clickhouse/exec.sh: stopping server failed'
@@ -86,4 +86,4 @@ ch_stop && echo '# clickhouse/exec.sh: stopping server finished' || echo '# clickhouse/exec.sh: stopping server failed'
 sleep 30
 
 # parse timings from clickhouse/log/[task]_[data_name]_q[i]_r[j].csv
-Rscript clickhouse/clickhouse-parse-log.R "$1" "$2"
+Rscript clickhouse/clickhouse-parse-log.R $1 $SRC_DATANAME
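The load validation above no longer compares against a table name passed as $2; it now pipes the expected row count, extracted from the data name with cut, into a one-line Rscript check. A standalone sketch of the same logic in R (the values are examples, not output from this commit):

  # expected rows are encoded in the 2nd underscore-separated field of the
  # data name (field 3 for the RHS join tables)
  data_name = "G1_1e7_1e2_0_0"   # example value of $SRC_DATANAME
  expected = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L])  # 1e7
  loaded = 1e7                   # in exec.sh this comes from 'SELECT count(*) ...'
  if (loaded != expected) stop("incomplete data load, expected: ", expected, ", loaded: ", loaded)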
2 changes: 1 addition & 1 deletion cudf/groupby-cudf.py
@@ -16,7 +16,7 @@
 fun = ".groupby"
 cache = "TRUE"
 
-data_name = os.environ['SRC_GRP_LOCAL']
+data_name = os.environ['SRC_DATANAME']
 src_grp = os.path.join("data", data_name+".csv")
 print("loading dataset %s" % data_name, flush=True)
 
2 changes: 1 addition & 1 deletion cudf/join-cudf.py
@@ -16,7 +16,7 @@
 fun = ".merge"
 cache = "TRUE"
 
-data_name = os.environ['SRC_JN_LOCAL']
+data_name = os.environ['SRC_DATANAME']
 src_jn_x = os.path.join("data", data_name+".csv")
 y_data_name = join_to_tbls(data_name)
 src_jn_y = [os.path.join("data", y_data_name[0]+".csv"), os.path.join("data", y_data_name[1]+".csv"), os.path.join("data", y_data_name[2]+".csv")]
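join_to_tbls, used above, is defined in the repository's shared helpers rather than in this diff. Judging from how exec.sh validates the RHS tables (row count taken from field 3 of the name), its assumed behavior is to derive the three right-hand-side table names from the LHS name; a speculative R sketch:

  # assumed semantics of join_to_tbls, for illustration only
  join_to_tbls_sketch = function(data_name) {
    parts = strsplit(data_name, "_", fixed=TRUE)[[1L]]
    x_n = as.numeric(parts[2L])                                 # LHS rows, e.g. 1e7
    pretty = function(n) sub("e\\+0?", "e", sprintf("%.0e", n)) # 1e+07 -> 1e7
    sapply(c(small=x_n/1e6, medium=x_n/1e3, big=x_n),
           function(n) { parts[3L] = pretty(n); paste(parts, collapse="_") })
  }
  join_to_tbls_sketch("J1_1e7_NA_0_0")
  #          small           medium              big
  # "J1_1e7_1e1_0_0" "J1_1e7_1e4_0_0" "J1_1e7_1e7_0_0"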
2 changes: 1 addition & 1 deletion dask/groupby-dask2.py
@@ -25,7 +25,7 @@
 # since we are running on local cluster of processes, we would prefer to keep the communication between workers to relative minimum, thus it's better to trade some tasks granularity for better processing locality
 dk.config.set({"optimization.fuse.ave-width": 20})
 
-data_name = os.environ['SRC_GRP_LOCAL']
+data_name = os.environ['SRC_DATANAME']
 on_disk = False #data_name.split("_")[1] == "1e9" # on-disk data storage #126
 fext = "parquet" if on_disk else "csv"
 src_grp = os.path.join("data", data_name+"."+fext)
2 changes: 1 addition & 1 deletion dask/join-dask.py
@@ -18,7 +18,7 @@
 fun = ".merge"
 cache = "TRUE"
 
-data_name = os.environ['SRC_JN_LOCAL']
+data_name = os.environ['SRC_DATANAME']
 on_disk = False #data_name.split("_")[1] == "1e9" # on-disk data storage #126
 fext = "parquet" if on_disk else "csv"
 src_jn_x = os.path.join("data", data_name+"."+fext)
2 changes: 1 addition & 1 deletion datatable/groupby-datatable.R
@@ -15,7 +15,7 @@ fun = "[.data.table"
 cache = TRUE
 on_disk = FALSE
 
-data_name = Sys.getenv("SRC_GRP_LOCAL")
+data_name = Sys.getenv("SRC_DATANAME")
 src_grp = file.path("data", paste(data_name, "csv", sep="."))
 #src_grp = file.path("data", paste(data_name, "rds", sep="."))
 cat(sprintf("loading dataset %s\n", data_name))
2 changes: 1 addition & 1 deletion datatable/join-datatable.R
@@ -14,7 +14,7 @@ fun = "[.data.table"
 cache = TRUE
 on_disk = FALSE
 
-data_name = Sys.getenv("SRC_JN_LOCAL")
+data_name = Sys.getenv("SRC_DATANAME")
 src_jn_x = file.path("data", paste(data_name, "csv", sep="."))
 y_data_name = join_to_tbls(data_name)
 src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name))
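With the single env var, a join run resolves all of its input paths from SRC_DATANAME alone; a worked example in R (the data name and the join_to_tbls return value are assumed, per the sketch earlier):

  data_name = "J1_1e7_NA_0_0"   # example SRC_DATANAME
  src_jn_x = file.path("data", paste(data_name, "csv", sep="."))
  # what join_to_tbls would return under the assumption above:
  y_data_name = c(small="J1_1e7_1e1_0_0", medium="J1_1e7_1e4_0_0", big="J1_1e7_1e7_0_0")
  src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name))
  # src_jn_y: data/J1_1e7_1e1_0_0.csv, data/J1_1e7_1e4_0_0.csv, data/J1_1e7_1e7_0_0.csv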
2 changes: 1 addition & 1 deletion dplyr/groupby-dplyr.R
@@ -15,7 +15,7 @@ fun = "group_by"
 cache = TRUE
 on_disk = FALSE
 
-data_name = Sys.getenv("SRC_GRP_LOCAL")
+data_name = Sys.getenv("SRC_DATANAME")
 src_grp = file.path("data", paste(data_name, "csv", sep="."))
 cat(sprintf("loading dataset %s\n", data_name))
 
2 changes: 1 addition & 1 deletion dplyr/join-dplyr.R
@@ -14,7 +14,7 @@ solution = "dplyr"
 cache = TRUE
 on_disk = FALSE
 
-data_name = Sys.getenv("SRC_JN_LOCAL")
+data_name = Sys.getenv("SRC_DATANAME")
 src_jn_x = file.path("data", paste(data_name, "csv", sep="."))
 y_data_name = join_to_tbls(data_name)
 src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name))
17 changes: 8 additions & 9 deletions h2o/exec.sh
@@ -1,28 +1,27 @@
 #!/bin/bash
 set -e
 
-if [ "$#" -ne 2 ]; then
-  echo "usage: ./h2o/exec.sh groupby G1_1e7_1e2_0_0";
-  exit 1
+if [ "$#" -ne 1 ]; then
+  echo 'usage: ./h2o/exec.sh groupby';
+  exit 1
 fi;
 
 source ./h2o/h2o.sh
 
-h2o_active && echo "h2o instance should not be already running, investigate" >&2
+h2o_active && echo 'h2o instance should not be already running, investigate' >&2
 h2o_active && exit 1
 
 # start h2o
-h2o_start "h2o_$1_$2"
+h2o_start "h2o_$1_""$SRC_DATANAME"
 
 # confirm h2o working
 h2o_active || sleep 30
-h2o_active || echo "h2o instance should be already running, investigate" >&2
+h2o_active || echo 'h2o instance should be already running, investigate' >&2
 h2o_active || exit 1
 
 # execute benchmark script
-./h2o/$1-h2o.R || echo "# h2o/exec.sh: benchmark script for $2 terminated with error" >&2
+./h2o/$1-h2o.R || echo "# h2o/exec.sh: benchmark script for $SRC_DATANAME terminated with error" >&2
 
 # stop h2o instance
-h2o_stop && echo "# h2o/exec.sh: stopping h2o instance finished" || echo "# h2o/exec.sh: stopping h2o instance failed" >&2
+h2o_stop && echo '# h2o/exec.sh: stopping h2o instance finished' || echo '# h2o/exec.sh: stopping h2o instance failed' >&2
 h2o_active || exit 1

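Since exec.sh now reads the data name from the environment instead of argv, an ad-hoc run outside the launcher must export SRC_DATANAME first. A sketch mirroring what _launcher/solution.R does (the file names here are example values):

  Sys.setenv("CSV_TIME_FILE"="time.csv")       # timings output, example path
  Sys.setenv("SRC_DATANAME"="G1_1e7_1e2_0_0")  # example data name
  system("./h2o/exec.sh groupby")              # exec.sh reads $SRC_DATANAME
  Sys.unsetenv("SRC_DATANAME"); Sys.unsetenv("CSV_TIME_FILE")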
2 changes: 1 addition & 1 deletion h2o/groupby-h2o.R
@@ -16,7 +16,7 @@ on_disk = FALSE
 h = h2o.init(startH2O=FALSE, port=55888)
 h2o.no_progress()
 
-data_name = Sys.getenv("SRC_GRP_LOCAL")
+data_name = Sys.getenv("SRC_DATANAME")
 src_grp = file.path("data", paste(data_name, "csv", sep="."))
 cat(sprintf("loading dataset %s\n", data_name))
 
2 changes: 1 addition & 1 deletion h2o/join-h2o.R
@@ -16,7 +16,7 @@ on_disk = FALSE
 h = h2o.init(startH2O=FALSE, port=55888)
 h2o.no_progress()
 
-data_name = Sys.getenv("SRC_JN_LOCAL")
+data_name = Sys.getenv("SRC_DATANAME")
 src_jn_x = file.path("data", paste(data_name, "csv", sep="."))
 y_data_name = join_to_tbls(data_name)
 src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name))
2 changes: 1 addition & 1 deletion juliadf/groupby-juliadf.jl
@@ -21,7 +21,7 @@ fun = "by";
 cache = true;
 on_disk = false;
 
-data_name = ENV["SRC_GRP_LOCAL"];
+data_name = ENV["SRC_DATANAME"];
 src_grp = string("data/", data_name, ".csv");
 println(string("loading dataset ", data_name)); flush(stdout);
 
2 changes: 1 addition & 1 deletion juliadf/join-juliadf.jl
@@ -20,7 +20,7 @@ fun = "join";
 cache = true;
 on_disk = false;
 
-data_name = ENV["SRC_JN_LOCAL"];
+data_name = ENV["SRC_DATANAME"];
 src_jn_x = string("data/", data_name, ".csv");
 y_data_name = join_to_tbls(data_name);
 src_jn_y = [string("data/", y_data_name[1], ".csv"), string("data/", y_data_name[2], ".csv"), string("data/", y_data_name[3], ".csv")];
2 changes: 1 addition & 1 deletion modin/groupby-modin.py
@@ -18,7 +18,7 @@
 cache = "TRUE"
 on_disk = "FALSE"
 
-data_name = os.environ['SRC_GRP_LOCAL']
+data_name = os.environ['SRC_DATANAME']
 src_grp = os.path.join("data", data_name+".csv")
 print("loading dataset %s" % data_name, flush=True)
 
