Skip to content

Commit

Permalink
queries: improve perf of collection queries (#1697)
Browse files Browse the repository at this point in the history
Co-authored-by: goroutine <[email protected]>
  • Loading branch information
Mini256 and ngaut authored May 20, 2024
1 parent e08bcfb commit c7f597c
Show file tree
Hide file tree
Showing 10 changed files with 124 additions and 19 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- mv_events_increment_intervals: one row per `record_time`, each carrying an
-- integer `interval` value.
-- NOTE(review): the unit/meaning of `interval` is not visible in this file — confirm with the job that writes it.
CREATE TABLE IF NOT EXISTS `mv_events_increment_intervals` (
    -- Primary-key column; NOT NULL is spelled out although the key already implies it.
    `record_time` DATETIME NOT NULL,
    -- `interval` is a reserved word, so it must stay backtick-quoted.
    `interval`    INT(11),
    PRIMARY KEY (`record_time`)
);
14 changes: 14 additions & 0 deletions configs/materialized_views/mv_events_increment_list/ddl.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- mv_events_increment_list: event rows keyed by `id`, with a secondary index
-- on `created_at`. Every column is NOT NULL and carries an explicit default.
CREATE TABLE IF NOT EXISTS `mv_events_increment_list` (
    `id`          BIGINT(20)   NOT NULL DEFAULT '0',
    `type`        VARCHAR(29)  NOT NULL DEFAULT 'Event',
    `repo_id`     BIGINT(20)   NOT NULL DEFAULT '0',
    `repo_name`   VARCHAR(140) NOT NULL DEFAULT '',
    `actor_id`    BIGINT(20)   NOT NULL DEFAULT '0',
    `actor_login` VARCHAR(40)  NOT NULL DEFAULT '',
    `number`      INT(11)      NOT NULL DEFAULT '0',
    `pr_merged`   TINYINT(1)   NOT NULL DEFAULT '0',
    `created_at`  DATETIME     NOT NULL DEFAULT '1970-01-01 00:00:00',
    PRIMARY KEY (`id`),
    KEY `idx_meil_on_created_at` (`created_at`)
);
7 changes: 7 additions & 0 deletions configs/materialized_views/mv_events_total/ddl.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- mv_events_total: one row per `record_time` holding two counters,
-- `events_increment` and `events_total`.
-- NOTE(review): presumably `events_total` is a running total and `events_increment`
-- the delta since the previous row — confirm against the job that fills this table.
CREATE TABLE IF NOT EXISTS `mv_events_total` (
    -- Primary-key column; NOT NULL is spelled out although the key already implies it.
    `record_time`      DATETIME NOT NULL,
    `events_increment` BIGINT(11),
    `events_total`     BIGINT(11),
    PRIMARY KEY (`record_time`)
);
11 changes: 11 additions & 0 deletions configs/materialized_views/mv_repo_monthly_summary/ddl.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- mv_repo_monthly_summary: one row per (repo_id, month) holding that month's
-- counters for stars, pull requests, issues and their distinct creators.
--
-- `repo_id` is BIGINT(20) so it has the same type as `repo_id` in
-- mv_events_increment_list; a narrower INT here would make joins compare
-- mismatched types and would cap the id range below the sibling table's.
-- Counter columns are NOT NULL DEFAULT '0' so readers never have to handle NULL.
--
-- NOTE: CREATE TABLE IF NOT EXISTS does not alter a table that already exists;
-- deployments created with the previous definition need a separate ALTER TABLE.
CREATE TABLE IF NOT EXISTS `mv_repo_monthly_summary` (
    `repo_id`               BIGINT(20) NOT NULL,
    -- First day of the month the counters belong to.
    `month`                 DATE       NOT NULL,
    `stars`                 INT(11)    NOT NULL DEFAULT '0',
    `pull_requests`         INT(11)    NOT NULL DEFAULT '0',
    `pull_request_creators` INT(11)    NOT NULL DEFAULT '0',
    `issues`                INT(11)    NOT NULL DEFAULT '0',
    `issue_creators`        INT(11)    NOT NULL DEFAULT '0',
    PRIMARY KEY (`repo_id`, `month`)
);
8 changes: 8 additions & 0 deletions configs/pipelines/calc_repo_monthly_summary/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"name": "mv_repo_monthly_summary",
"description": "None",
"cron": "0 0 4 * * *",
"incremental": {
"timeRange": "last_month"
}
}
34 changes: 34 additions & 0 deletions configs/pipelines/calc_repo_monthly_summary/process.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

-- Upsert per-repository, per-month activity counters into mv_repo_monthly_summary
-- for every repository listed in collection_items.
--
-- Parameters (bound by the caller):
--   :from  inclusive lower bound on github_events.created_at
--   :to    exclusive upper bound on github_events.created_at
--
-- INSERT ... SELECT maps columns by POSITION, not by alias, so the SELECT list
-- below must stay in exactly the same order as the INSERT column list.
INSERT INTO mv_repo_monthly_summary (
    repo_id,
    month,
    stars,
    pull_requests,
    pull_request_creators,
    issues,
    issue_creators
)
-- Distinct repositories that belong to at least one collection.
WITH repos AS (
    SELECT repo_id
    FROM collection_items ci
    GROUP BY repo_id
)
SELECT
    /*+ READ_FROM_STORAGE(TIFLASH[ge, ci]) */
    ge.repo_id,
    -- Bucket events by the first day of their month.
    DATE_FORMAT(ge.created_at, '%Y-%m-01') AS month,
    COUNT(IF(ge.type = 'WatchEvent', 1, NULL)) AS new_stars,
    COUNT(IF(ge.type = 'PullRequestEvent', 1, NULL)) AS new_pull_requests,
    COUNT(DISTINCT IF(ge.type = 'PullRequestEvent', ge.actor_login, NULL)) AS new_pull_request_creators,
    COUNT(IF(ge.type = 'IssuesEvent', 1, NULL)) AS new_issues,
    COUNT(DISTINCT IF(ge.type = 'IssuesEvent', ge.actor_login, NULL)) AS new_issue_creators
FROM github_events ge
JOIN repos r ON ge.repo_id = r.repo_id
WHERE
    ge.type IN ('WatchEvent', 'PullRequestEvent', 'IssuesEvent')
    AND ge.action IN ('opened', 'created', 'started')
    AND ge.created_at >= :from
    AND ge.created_at < :to
GROUP BY
    -- Qualified: both ge and r expose a repo_id column.
    ge.repo_id, month
ORDER BY
    ge.repo_id, month
-- When the (repo_id, month) row already exists, overwrite each counter with
-- the freshly computed value of the matching alias.
ON DUPLICATE KEY UPDATE
    stars = new_stars,
    pull_requests = new_pull_requests,
    pull_request_creators = new_pull_request_creators,
    issues = new_issues,
    issue_creators = new_issue_creators
;
8 changes: 8 additions & 0 deletions configs/pipelines/sync_events_increment_list/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"name": "sync_events_increment_list",
"description": "None",
"cron": "0 0 3 * * *",
"incremental": {
"timeRange": "last_day"
}
}
18 changes: 18 additions & 0 deletions configs/pipelines/sync_events_increment_list/process.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- Refresh collections.past_month_visits from the API request log.
--
-- The CTE counts rows of stats_api_requests per collection id (read from the
-- `$.collectionId` field of the JSON `query` column) for requests whose path
-- starts with '/q/collection-' and that finished within the last month.
-- The UPDATE then copies each count onto the collection with the matching id;
-- collections without a matching count are left untouched.
WITH collection_visit_counts AS (
    SELECT
        /*+ READ_FROM_STORAGE(TIFLASH[sar]) */
        CAST(JSON_EXTRACT(sar.query, '$.collectionId') AS SIGNED) AS collection_id,
        COUNT(*) AS past_month_visit
    FROM stats_api_requests sar
    WHERE
        sar.finished_at > DATE_SUB(NOW(), INTERVAL 1 MONTH)
        AND sar.path LIKE '/q/collection-%'
    GROUP BY collection_id
)
UPDATE collections c
INNER JOIN collection_visit_counts cv
    ON cv.collection_id = c.id
SET
    c.past_month_visits = cv.past_month_visit
;
25 changes: 6 additions & 19 deletions configs/queries/collection-stars-history/template.sql
Original file line number Diff line number Diff line change
@@ -1,31 +1,18 @@
-- Stars history per repository for one collection (collection_id = 10001).
-- NOTE(review): this hunk appears to be a diff rendered without +/- markers, so
-- lines of the previous query (window functions over github_events) and of the
-- new query (reading mv_repo_monthly_summary) are interleaved below and the text
-- is not runnable as shown — TODO confirm against the actual template.sql.
WITH accumulative_stars_by_month AS (
SELECT
t_month,
repo_id,
COUNT(*) OVER (PARTITION BY repo_id ORDER BY t_month) AS total,
-- De-duplicate by t_month column, keeping only the first accumulative value of each month.
ROW_NUMBER() OVER (PARTITION BY repo_id, t_month) AS row_num_by_month
FROM (
SELECT
repo_id,
DATE_FORMAT(created_at, '%Y-%m-01') AS t_month,
-- De-duplicate by actor_login column, keeping only the first event of each star.
ROW_NUMBER() OVER (PARTITION BY ge.repo_id, actor_login ORDER BY created_at) AS row_num_by_actor_login
FROM github_events ge
WHERE
type = 'WatchEvent'
AND repo_id IN (SELECT repo_id FROM collection_items ci WHERE collection_id = 10001)
) sub
month,
-- NOTE(review): `stars` is selected as-is, with no running sum, although the CTE
-- is named "accumulative" — confirm whether `total` is meant to be cumulative.
stars AS total
FROM mv_repo_monthly_summary mrms
WHERE
row_num_by_actor_login = 1
repo_id IN (SELECT repo_id FROM collection_items ci WHERE collection_id = 10001)
)
-- Attach the repository name from collection_items to each monthly row.
SELECT
ci.repo_id AS repo_id,
ci.repo_name AS repo_name,
acc.t_month AS event_month,
acc.month AS event_month,
acc.total
FROM accumulative_stars_by_month acc
JOIN collection_items ci ON collection_id = 10001 AND ci.repo_id = acc.repo_id
WHERE row_num_by_month = 1
ORDER BY t_month
ORDER BY repo_id, month
;
12 changes: 12 additions & 0 deletions packages/pipeline/src/plugins/pipelines/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ export interface Pipeline {
}

export enum PresetIncrementalTimeRange {
LAST_MONTH = 'last_month',
PAST_MONTH = 'past_month',
YESTERDAY = 'yesterday',
LAST_DAY = 'last_day',
LAST_HOUR = 'last_hour',
Expand Down Expand Up @@ -121,6 +123,16 @@ export interface TimeRange {
export function resolveTimeRange(timeRange: PresetIncrementalTimeRange = PresetIncrementalTimeRange.YESTERDAY): TimeRange {
const now = DateTime.now();
switch (timeRange) {
case PresetIncrementalTimeRange.PAST_MONTH:
return {
from: now.minus({month: 1}),
to: now
};
case PresetIncrementalTimeRange.LAST_MONTH:
return {
from: now.startOf('month').minus({ month: 1 }),
to: now.startOf('month')
};
case PresetIncrementalTimeRange.PAST_HOUR:
return {
from: now.minus({hours: 1}),
Expand Down

0 comments on commit c7f597c

Please sign in to comment.