|
| 1 | +--- |
| 2 | +description: 'A benchmark dataset used for comparing the performance of data warehousing |
| 3 | + solutions.' |
| 4 | +sidebar_label: 'AMPLab Big Data Benchmark' |
| 5 | +slug: /getting-started/example-datasets/amplab-benchmark |
| 6 | +title: 'AMPLab Big Data Benchmark' |
| 7 | +--- |
| 8 | + |
| 9 | +See https://amplab.cs.berkeley.edu/benchmark/ |
| 10 | + |
| 11 | +Sign up for a free account at https://aws.amazon.com. Registration requires a credit card, an email address, and a phone number. Then create a new access key at https://console.aws.amazon.com/iam/home?nc2=h_m_sc#security_credential |
| 12 | + |
| 13 | +Run the following in the console: |
| 14 | + |
| 15 | +```bash |
| 16 | +$ sudo apt-get install s3cmd # installs the s3cmd client used by the sync commands below |
| 17 | +$ mkdir tiny; cd tiny; # each dataset gets its own directory |
| 18 | +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . # fetch the "tiny" dataset into the current directory |
| 19 | +$ cd .. # back to the parent directory before the next dataset |
| 20 | +$ mkdir 1node; cd 1node; |
| 21 | +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ . # fetch the "1node" dataset |
| 22 | +$ cd .. |
| 23 | +$ mkdir 5nodes; cd 5nodes; |
| 24 | +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . # fetch the "5nodes" dataset |
| 25 | +$ cd .. # the load commands later on use paths relative to this directory |
| 26 | +``` |
| 27 | + |
| 28 | +Run the following ClickHouse queries: |
| 29 | + |
| 30 | +```sql |
| 31 | +CREATE TABLE rankings_tiny -- rankings of the "tiny" dataset; filled from tiny/rankings/*.deflate as CSV |
| 32 | +( |
| 33 | + pageURL String, |
| 34 | + pageRank UInt32, |
| 35 | + avgDuration UInt32 |
| 36 | +) ENGINE = Log; |
| 37 | + |
| 38 | +CREATE TABLE uservisits_tiny -- user visits of the "tiny" dataset; filled from tiny/uservisits/*.deflate as CSV |
| 39 | +( |
| 40 | + sourceIP String, |
| 41 | + destinationURL String, |
| 42 | + visitDate Date, -- partitioning (by month) and sorting key, see the ENGINE clause |
| 43 | + adRevenue Float32, |
| 44 | + UserAgent String, |
| 45 | + cCode FixedString(3), |
| 46 | + lCode FixedString(6), |
| 47 | + searchWord String, |
| 48 | + duration UInt32 |
| 49 | +) ENGINE = MergeTree PARTITION BY toYYYYMM(visitDate) ORDER BY visitDate SETTINGS index_granularity = 8192; -- explicit form of the deprecated MergeTree(visitDate, visitDate, 8192) |
| 50 | + |
| 51 | +CREATE TABLE rankings_1node -- rankings of the "1node" dataset; filled from 1node/rankings/*.deflate as CSV |
| 52 | +( |
| 53 | + pageURL String, -- join key: matched against uservisits_1node.destinationURL in the sample join query |
| 54 | + pageRank UInt32, -- filtered (> 1000) and averaged in the sample queries |
| 55 | + avgDuration UInt32 |
| 56 | +) ENGINE = Log; |
| 57 | + |
| 58 | +CREATE TABLE uservisits_1node -- user visits of the "1node" dataset; filled from 1node/uservisits/*.deflate as CSV |
| 59 | +( |
| 60 | + sourceIP String, |
| 61 | + destinationURL String, |
| 62 | + visitDate Date, -- partitioning (by month) and sorting key, see the ENGINE clause |
| 63 | + adRevenue Float32, |
| 64 | + UserAgent String, |
| 65 | + cCode FixedString(3), |
| 66 | + lCode FixedString(6), |
| 67 | + searchWord String, |
| 68 | + duration UInt32 |
| 69 | +) ENGINE = MergeTree PARTITION BY toYYYYMM(visitDate) ORDER BY visitDate SETTINGS index_granularity = 8192; -- explicit form of the deprecated MergeTree(visitDate, visitDate, 8192) |
| 70 | + |
| 71 | +CREATE TABLE rankings_5nodes_on_single -- rankings of the "5nodes" dataset; filled from 5nodes/rankings/*.deflate as CSV |
| 72 | +( |
| 73 | + pageURL String, |
| 74 | + pageRank UInt32, |
| 75 | + avgDuration UInt32 |
| 76 | +) ENGINE = Log; |
| 77 | + |
| 78 | +CREATE TABLE uservisits_5nodes_on_single -- user visits of the "5nodes" dataset; filled from 5nodes/uservisits/*.deflate as CSV |
| 79 | +( |
| 80 | + sourceIP String, |
| 81 | + destinationURL String, |
| 82 | + visitDate Date, -- partitioning (by month) and sorting key, see the ENGINE clause |
| 83 | + adRevenue Float32, |
| 84 | + UserAgent String, |
| 85 | + cCode FixedString(3), |
| 86 | + lCode FixedString(6), |
| 87 | + searchWord String, |
| 88 | + duration UInt32 |
| 89 | +) ENGINE = MergeTree PARTITION BY toYYYYMM(visitDate) ORDER BY visitDate SETTINGS index_granularity = 8192; -- explicit form of the deprecated MergeTree(visitDate, visitDate, 8192) |
| 90 | +``` |
| 91 | + |
| 92 | +Go back to the console and load the downloaded files into the tables: |
| 93 | + |
| 94 | +```bash |
| 95 | +$ for i in tiny/rankings/*.deflate; do echo "$i"; zlib-flate -uncompress < "$i" | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done # each file is decompressed and streamed as CSV; example-perftest01j looks like a placeholder host, so substitute your own server |
| 96 | +$ for i in tiny/uservisits/*.deflate; do echo "$i"; zlib-flate -uncompress < "$i" | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done |
| 97 | +$ for i in 1node/rankings/*.deflate; do echo "$i"; zlib-flate -uncompress < "$i" | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done |
| 98 | +$ for i in 1node/uservisits/*.deflate; do echo "$i"; zlib-flate -uncompress < "$i" | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done |
| 99 | +$ for i in 5nodes/rankings/*.deflate; do echo "$i"; zlib-flate -uncompress < "$i" | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done |
| 100 | +$ for i in 5nodes/uservisits/*.deflate; do echo "$i"; zlib-flate -uncompress < "$i" | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done |
| 101 | +``` |
| 102 | + |
| 103 | +Queries for obtaining data samples: |
| 104 | + |
| 105 | +```sql |
| 106 | +SELECT pageURL, pageRank FROM rankings_1node WHERE pageRank > 1000; -- scan: pages ranked above 1000 |
| 107 | + |
| 108 | +SELECT substring(sourceIP, 1, 8), sum(adRevenue) FROM uservisits_1node GROUP BY substring(sourceIP, 1, 8); -- aggregation: revenue per 8-character sourceIP prefix |
| 109 | + |
| 110 | +SELECT -- join: the sourceIP with the highest total revenue in the date window, with its average pageRank |
| 111 | + sourceIP, |
| 112 | + sum(adRevenue) AS totalRevenue, |
| 113 | + avg(pageRank) AS pageRank |
| 114 | +FROM rankings_1node ALL INNER JOIN |
| 115 | +( |
| 116 | + SELECT |
| 117 | + sourceIP, |
| 118 | + destinationURL AS pageURL, -- renamed so it can be matched with USING pageURL |
| 119 | + adRevenue |
| 120 | + FROM uservisits_1node |
| 121 | + WHERE (visitDate > '1980-01-01') AND (visitDate < '1980-04-01') -- both bounds are exclusive |
| 122 | +) USING pageURL |
| 123 | +GROUP BY sourceIP |
| 124 | +ORDER BY totalRevenue DESC |
| 125 | +LIMIT 1; |
| 126 | +``` |
0 commit comments