Skip to content

Commit e316b8f

Browse files
authored
Merge pull request #115 from HTTPArchive/incremental-timeseries
Incremental time series
2 parents 490b75d + 22e452f commit e316b8f

File tree

5 files changed

+77
-18
lines changed

5 files changed

+77
-18
lines changed

crontab

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
0 10 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_har.sh chrome' >> /var/log/HA-import-har-chrome.log 2>&1
55
0 11 * * * /bin/bash -l -c 'cd /home/igrigorik/code && ./sync_har.sh android' >> /var/log/HA-import-har-android.log 2>&1
66

7-
0 7 15 * * /bin/bash -l -c 'cd /home/igrigorik/code && sql/generate_reports.sh -fth `date -d "-1 month" "+\%Y_\%m_01"` -r "*crux*"' >> /var/log/crux_reruns.log 2>&1
7+
0 7 15 * * /bin/bash -l -c 'cd /home/igrigorik/code && sql/generate_reports.sh -th `date -d "-1 month" "+\%Y_\%m_01"` -r "*crux*"' >> /var/log/crux_reruns.log 2>&1

docs/README.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@ Sometimes it's necessary to manually run this process, for example if a new metr
3838
# -h: Whether to generate histograms. Must be accompanied by the date to query.
3939
#
4040
# -f: Whether to force histogram querying and updating even if the data exists.
41-
# Timeseries are always overwritten.
41+
# Timeseries are usually appended to from the last date, but this flag forces a complete rerun.
42+
#
43+
# -r: Optional pattern match for reports to be run. Use quotes to avoid the shell expanding names
44+
# (e.g. "*crux*")
4245
```
4346

4447
You can omit one of the `-t` or `-h` flags to focus only on histogram or timeseries generation. The `-f` flag ensures that histogram data gets overwritten. Omit this flag to skip queries for dates that already exist (much faster for batch jobs, see below).

sql/generate_reports.sh

+68-12
Original file line numberDiff line numberDiff line change
@@ -146,34 +146,84 @@ else
146146
# Extract the metric name from the file path.
147147
metric=$(echo $(basename $query) | cut -d"." -f1)
148148

149+
date_join=""
150+
max_date=""
151+
current_contents=""
149152
gs_url="gs://httparchive/reports/$gs_lens_dir${metric}.json"
150153
gsutil ls $gs_url &> /dev/null
151-
if [ $? -eq 0 ] && [ $FORCE -eq 0 ]; then
152-
# The file already exists, so skip the query.
153-
echo -e "Skipping $metric timeseries"
154-
continue
154+
if [ $? -eq 0 ]; then
155+
# The file already exists, so check max date
156+
if [ $FORCE -eq 0 ]; then
157+
current_contents=$(gsutil cat $gs_url)
158+
max_date=$(echo $current_contents | jq -r '[ .[] | .date ] | max')
159+
160+
if [[ "${max_date}" == "${YYYY_MM_DD}" || "${max_date}" > "${YYYY_MM_DD}" ]]; then
161+
echo -e "Skipping $metric timeseries"
162+
continue
163+
elif [[ $metric != crux* ]]; then # CrUX is quick and join is more complicated so just do a full run of that
164+
date_join="SUBSTR(_TABLE_SUFFIX, 0, 10) > \"$max_date\""
165+
if [[ -n "$YYYY_MM_DD" ]]; then
166+
date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) <= \"$YYYY_MM_DD\""
167+
fi
168+
fi
169+
fi
155170
fi
156171

157-
echo -e "Generating $metric timeseries"
172+
if [[ -n "${date_join}" && -n "${max_date}" ]]; then
173+
echo -e "Generating $metric timeseries in incremental mode from ${max_date} to ${YYYY_MM_DD}"
174+
else
175+
echo -e "Generating $metric timeseries from start"
176+
fi
158177

159178
# Run the query on BigQuery.
160179
START_TIME=$SECONDS
161180
if [[ $LENS != "" ]]; then
181+
182+
if [[ $(grep "httparchive.blink_features.usage" $query) ]]; then
183+
echo "blink_features.usage queries do not support lens's so skipping lens"
184+
continue
185+
fi
186+
162187
lens_join="JOIN ($(cat sql/lens/$LENS/timeseries.sql | tr '\n' ' ')) USING (url, _TABLE_SUFFIX)"
163188
if [[ $metric == crux* ]]; then
164189
echo "CrUX query so using alternative lens join"
165190
lens_join="JOIN ($(cat sql/lens/$LENS/timeseries.sql | tr '\n' ' ')) ON (origin || '\/' = url AND REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\\\\\\\d{4})(\\\\\\\\d{2})', '\\\\\\\\1_\\\\\\\\2_01') || '_' || IF(device = 'phone', 'mobile', device) = _TABLE_SUFFIX)"
166191
fi
167-
if [[ $(grep "httparchive.blink_features.usage" $query) ]]; then
168-
echo "blink_features.usage queries do not support lens's so skipping lens"
169-
continue
192+
193+
if [[ -n "${date_join}" ]]; then
194+
if [[ $(grep -i "WHERE" $query) ]]; then
195+
# If WHERE clause already exists then add to it, before GROUP BY
196+
result=$(sed -e "s/\(GROUP BY\)/AND $date_join \1/" $query \
197+
| sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/" \
198+
| $BQ_CMD)
199+
else
200+
# If WHERE clause doesn't exist then add it, before GROUP BY
201+
result=$(sed -e "s/\(GROUP BY\)/WHERE $date_join \1/" $query \
202+
| sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/" \
203+
| $BQ_CMD)
204+
fi
205+
else
206+
result=$(sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/" $query \
207+
| $BQ_CMD)
170208
fi
171-
result=$(sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/" $query \
172-
| $BQ_CMD)
209+
173210
else
174-
result=$(cat $query \
175-
| $BQ_CMD)
211+
# blink_features do not support date_join so do full run for them
212+
if [[ -z "${date_join}" || $(grep "httparchive.blink_features.usage" $query) ]]; then
213+
date_join=""
214+
result=$(cat $query \
215+
| $BQ_CMD)
216+
elif [[ $(grep -i "WHERE" $query) ]]; then
217+
# If WHERE clause already exists then add to it, before GROUP BY
218+
result=$(sed -e "s/\(GROUP BY\)/AND $date_join \1/" $query \
219+
| $BQ_CMD)
220+
else
221+
# If WHERE clause doesn't exist then add it, before GROUP BY
222+
result=$(sed -e "s/\(GROUP BY\)/WHERE $date_join \1/" $query \
223+
| $BQ_CMD)
224+
fi
176225
fi
226+
177227
# Make sure the query succeeded.
178228
if [ $? -eq 0 ]; then
179229
ELAPSED_TIME=$(($SECONDS - $START_TIME))
@@ -182,6 +232,12 @@ else
182232
else
183233
echo "$metric took $ELAPSED_TIME seconds"
184234
fi
235+
236+
# If it's a partial run, then combine with the current results.
237+
if [[ -n "${date_join}" ]]; then
238+
result=$(echo ${result} ${current_contents} | jq '.+= input')
239+
fi
240+
185241
# Upload the response to Google Storage.
186242
echo $result \
187243
| gsutil -h "Content-Type:application/json" cp - $gs_url

sync_csv.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,8 @@ cd $HOME/code
135135

136136
gsutil -q stat gs://httparchive/reports/$table/*
137137
if [ $? -eq 1 ]; then
138-
. sql/generate_reports.sh -fth $table
139-
ls -1 sql/lens | xargs -I lens sql/generate_reports.sh -fth $table -l lens
138+
. sql/generate_reports.sh -th $table
139+
ls -1 sql/lens | xargs -I lens sql/generate_reports.sh -th $table -l lens
140140
else
141141
echo -e "Reports for ${table} already exist, skipping."
142142
fi

sync_har.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ cd $HOME/code
8787

8888
gsutil -q stat gs://httparchive/reports/$table/*
8989
if [ $? -eq 1 ]; then
90-
. sql/generate_reports.sh -fth $table
91-
ls -1 sql/lens | xargs -I lens sql/generate_reports.sh -fth $table -l lens
90+
. sql/generate_reports.sh -th $table
91+
ls -1 sql/lens | xargs -I lens sql/generate_reports.sh -th $table -l lens
9292
else
9393
echo -e "Reports for ${table} already exist, skipping."
9494
fi

0 commit comments

Comments
 (0)