
Commit a319c4d: initial working commit
0 parents

13 files changed: +488 -0 lines changed

.Rbuildignore (+2)

^.*\.Rproj$
^\.Rproj\.user$

.gitignore (+4)

.Rproj.user
.Rhistory
.RData
.Ruserdata

DESCRIPTION (+18)

Package: redshiftTools
Type: Package
Title: Redshift Tools
Version: 0.1.0
Authors@R: person("Pablo", "Seibelt", email = "[email protected]",
    role = c("aut", "cre"))
Maintainer: Pablo Seibelt <[email protected]>
Depends:
    R (>= 3.1.2)
Imports:
    DBI,
    RPostgres,
    aws.s3
Description: Tools to upload data to an Amazon Redshift database with good performance.
License: MIT + file LICENSE
LazyData: TRUE
RoxygenNote: 5.0.1

LICENSE (+9)

The MIT License (MIT)

Copyright (c) 2016 Pablo Alejandro Seibelt <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

NAMESPACE (+9)

# Generated by roxygen2: do not edit by hand

export(rs_replace_table)
export(rs_upsert_table)
importFrom("aws.s3","bucket_exists")
importFrom("aws.s3","delete_object")
importFrom("aws.s3","put_object")
importFrom("utils","write.csv")
importFrom(DBI,dbGetQuery)

R/internal.R (+45)

# Internal utility functions used by the redshift tools

#' @importFrom "aws.s3" "put_object" "bucket_exists"
#' @importFrom "utils" "write.csv"
uploadToS3 = function (data, bucket, split_files){
  prefix=paste0(sample(letters,16),collapse = "")
  if(!bucket_exists(bucket)){
    stop("Bucket does not exist")
  }
  if(nrow(data) == 0){
    stop("Input data is empty")
  }
  if(nrow(data) < split_files){
    split_files = nrow(data)
  }

  # Split the data frame into split_files chunks, assigning rows round-robin
  splitted = suppressWarnings(split(data, seq(1:split_files)))

  for (i in 1:split_files) {
    part = data.frame(splitted[i])

    # Write each chunk as a gzip-compressed CSV and upload it as <prefix>.0001, <prefix>.0002, ...
    tmpFile = tempfile()
    s3Name=paste(prefix, ".", formatC(i, width = 4, format = "d", flag = "0"), sep="")
    write.csv(part, gzfile(tmpFile), na='', row.names=F)

    print(paste("Uploading", s3Name))
    put_object(file = tmpFile, object = s3Name, bucket = bucket)
  }

  return(prefix)
}

# Remove all uploaded chunk files sharing the given prefix
#' @importFrom "aws.s3" "delete_object"
deletePrefix = function(prefix, bucket, split_files){
  for (i in 1:split_files) {
    s3Name=paste(prefix, ".", formatC(i, width = 4, format = "d", flag = "0"), sep="")
    print(paste("Deleting", s3Name))
    delete_object(s3Name, bucket)
  }
}

# Thin wrapper around DBI::dbGetQuery
#' @importFrom DBI dbGetQuery
queryDo = function(dbcon, query){
  dbGetQuery(dbcon, query)
}

R/redshift-replace.R (+79)

#' Replace redshift table
#'
#' Upload a table to S3 and then load it with redshift, replacing the contents of that table.
#' The table on redshift has to have the same structure and column ordering to work correctly.
#'
#' @param data a data frame
#' @param dbcon an RPostgres connection to the redshift server
#' @param tableName the name of the table to replace
#' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
#' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME in the environment if not specified.
#' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION in the environment if not specified.
#' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID in the environment if not specified.
#' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY in the environment if not specified.
#' @examples
#' library(DBI)
#'
#' a=data.frame(a=seq(1,10000), b=seq(10000,1))
#'
#'\dontrun{
#' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
#' host='my-redshift-url.amazon.com', port='5439',
#' user='myuser', password='mypassword',sslmode='require')
#'
#' rs_replace_table(data=a, dbcon=con, tableName='testTable',
#' bucket="my-bucket", split_files=4)
#'
#' }
#' @export
rs_replace_table = function(
    data,
    dbcon,
    tableName,
    split_files,
    bucket=Sys.getenv('AWS_BUCKET_NAME'),
    region=Sys.getenv('AWS_DEFAULT_REGION'),
    access_key=Sys.getenv('AWS_ACCESS_KEY_ID'),
    secret_key=Sys.getenv('AWS_SECRET_ACCESS_KEY')
    )
{
  if(missing(split_files)){
    print("Getting number of slices from Redshift")
    slices = queryDo(dbcon,"select count(*) from stv_slices")
    split_files = unlist(slices[1]*4)
    print(sprintf("%s slices detected, will split into %s files", slices, split_files))
  }
  split_files = min(split_files, nrow(data))

  prefix = uploadToS3(data, bucket, split_files)

  result = tryCatch({
    print("Truncating target table")
    queryDo(dbcon, sprintf("truncate table %s", tableName))

    print("Copying data from S3 into Redshift")
    queryDo(dbcon, sprintf("copy %s from 's3://%s/%s.' region '%s' csv gzip ignoreheader 1 emptyasnull credentials 'aws_access_key_id=%s;aws_secret_access_key=%s';",
      tableName,
      bucket,
      prefix,
      region,
      access_key,
      secret_key
    ))

    print("Committing changes")
    queryDo(dbcon, "COMMIT;")
  }, warning = function(w) {
    print(w)
  }, error = function(e) {
    print(e$message)
    queryDo(dbcon, 'ROLLBACK;')
  }, finally = {
    print("Deleting temporary files from S3 bucket")
    deletePrefix(prefix, bucket, split_files)
  })
}

R/redshift-upsert.R (+98)

#' Upsert redshift table
#'
#' Upload a table to S3 and then load it with redshift, replacing rows with the same
#' keys, and inserting rows with new keys.
#' The table on redshift has to have the same structure and column ordering to work correctly.
#'
#' @param data a data frame
#' @param dbcon an RPostgres connection to the redshift server
#' @param tableName the name of the table to upsert into
#' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
#' @param keys this optional vector contains the variables by which to upsert. If not defined, the upsert becomes an append.
#' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME in the environment if not specified.
#' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION in the environment if not specified.
#' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID in the environment if not specified.
#' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY in the environment if not specified.
#' @examples
#' library(DBI)
#'
#' a=data.frame(a=seq(1,10000), b=seq(10000,1))
#' n=head(a,n=5000)
#' n$b=n$a
#' nx=rbind(n, data.frame(a=seq(99999,104000), b=seq(104000,99999)))
#'
#'\dontrun{
#' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
#' host='my-redshift-url.amazon.com', port='5439',
#' user='myuser', password='mypassword',sslmode='require')
#'
#' rs_upsert_table(data=nx, dbcon=con, tableName='testTable',
#' bucket="my-bucket", split_files=4, keys=c('a'))
#'
#'}
#' @export
rs_upsert_table = function(
    data,
    dbcon,
    tableName,
    keys,
    split_files,
    bucket=Sys.getenv('AWS_BUCKET_NAME'),
    region=Sys.getenv('AWS_DEFAULT_REGION'),
    access_key=Sys.getenv('AWS_ACCESS_KEY_ID'),
    secret_key=Sys.getenv('AWS_SECRET_ACCESS_KEY')
    )
{
  if(missing(split_files)){
    print("Getting number of slices from Redshift")
    slices = queryDo(dbcon,"select count(*) from stv_slices")
    split_files = unlist(slices[1]*4)
    print(sprintf("%s slices detected, will split into %s files", slices, split_files))
  }
  split_files = min(split_files, nrow(data))

  prefix = uploadToS3(data, bucket, split_files)

  result = tryCatch({
    # Stage the uploaded data in a temporary table with the same structure as the target
    stageTable=paste0(sample(letters,16),collapse = "")

    queryDo(dbcon, sprintf("create temp table %s (like %s)", stageTable, tableName))

    print("Copying data from S3 into Redshift")
    queryDo(dbcon, sprintf("copy %s from 's3://%s/%s.' region '%s' csv gzip ignoreheader 1 emptyasnull credentials 'aws_access_key_id=%s;aws_secret_access_key=%s';",
      stageTable,
      bucket,
      prefix,
      region,
      access_key,
      secret_key
    ))
    if(!missing(keys)){
      print("Deleting rows with same keys")
      keysCond = paste(stageTable,".",keys, "=", tableName,".",keys, sep="")
      keysWhere = sub(" and $", "", paste0(keysCond, collapse="", sep=" and "))
      queryDo(dbcon, sprintf('delete from %s using %s where %s',
        tableName,
        stageTable,
        keysWhere
      ))
    }
    print("Inserting new rows")
    queryDo(dbcon, sprintf('insert into %s select * from %s', tableName, stageTable))

    queryDo(dbcon, sprintf("drop table %s", stageTable))
    print("Committing")
    queryDo(dbcon, "COMMIT;")
  }, warning = function(w) {
    print(w)
  }, error = function(e) {
    print(e$message)
    queryDo(dbcon, 'ROLLBACK;')
  }, finally = {
    print("Deleting temporary files from S3 bucket")
    deletePrefix(prefix, bucket, split_files)
  })
}

README.Rmd (+54)

---
output:
  md_document:
    variant: markdown_github
---

<!-- README.md is generated from README.Rmd. Please edit that file -->

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# redshiftTools

This is an R package meant to make it easy to upload bulk data into Amazon Redshift.


## Installation

To install this package, you'll need to execute these commands:

```R
install.packages('devtools')
devtools::install_github("RcppCore/Rcpp")
devtools::install_github("rstats-db/DBI")
devtools::install_github("rstats-db/RPostgres")
install.packages("aws.s3", repos = c(getOption("repos"), "http://cloudyr.github.io/drat"))
devtools::install_github("sicarul/redshiftTools")
```

## Usage

You'll now have two functions available: `rs_replace_table` and `rs_upsert_table`. Both are called with almost the same parameters, except that on upsert you can specify which keys to use when searching for matching rows.

For example, suppose we have a table to load with two integer columns; we could use the following code:

```R
library("aws.s3")
library(RPostgres)
library(redshiftTools)

a=data.frame(a=seq(1,10000), b=seq(10000,1))
n=head(a,n=10)
n$b=n$a
nx=rbind(n, data.frame(a=seq(5,10), b=seq(10,5)))

con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
host='my-redshift-url.amazon.com', port='5439',
user='myuser', password='mypassword',sslmode='require')

b=rs_replace_table(a, dbcon=con, tableName='mytable', bucket="mybucket", split_files=4)
c=rs_upsert_table(nx, dbcon=con, tableName='mytable', split_files=4, bucket="mybucket", keys=c('a'))
```
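
The `bucket`, `region`, `access_key` and `secret_key` parameters all fall back to environment variables, so you can set those once per session instead of passing them on every call. A minimal sketch, assuming you already have a bucket and valid credentials (the values below are placeholders):

```R
# Set the AWS environment variables the package falls back to (placeholder values)
Sys.setenv(
  "AWS_BUCKET_NAME" = "mybucket",
  "AWS_DEFAULT_REGION" = "us-east-1",
  "AWS_ACCESS_KEY_ID" = "my-access-key",
  "AWS_SECRET_ACCESS_KEY" = "my-secret-key"
)

# With the variables set, bucket/region/credentials can be omitted from the call
b=rs_replace_table(a, dbcon=con, tableName='mytable', split_files=4)
```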

README.md (+45)

<!-- README.md is generated from README.Rmd. Please edit that file -->

redshiftTools
=============

This is an R package meant to make it easy to upload bulk data into Amazon Redshift.

Installation
------------

To install this package, you'll need to execute these commands:

```R
install.packages('devtools')
devtools::install_github("RcppCore/Rcpp")
devtools::install_github("rstats-db/DBI")
devtools::install_github("rstats-db/RPostgres")
install.packages("aws.s3", repos = c(getOption("repos"), "http://cloudyr.github.io/drat"))
devtools::install_github("sicarul/redshiftTools")
```

Usage
-----

You'll now have two functions available: `rs_replace_table` and `rs_upsert_table`. Both are called with almost the same parameters, except that on upsert you can specify which keys to use when searching for matching rows.

For example, suppose we have a table to load with two integer columns, we could use the following code:

```R
library("aws.s3")
library(RPostgres)
library(redshiftTools)

a=data.frame(a=seq(1,10000), b=seq(10000,1))
n=head(a,n=10)
n$b=n$a
nx=rbind(n, data.frame(a=seq(5,10), b=seq(10,5)))

con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
host='my-redshift-url.amazon.com', port='5439',
user='myuser', password='mypassword',sslmode='require')

b=rs_replace_table(a, dbcon=con, tableName='mytable', bucket="mybucket", split_files=4)
c=rs_upsert_table(nx, dbcon=con, tableName='mytable', split_files=4, bucket="mybucket", keys=c('a'))
```
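
Since `keys` is optional, `rs_upsert_table` can also be used as a plain append: when no keys are given, nothing is deleted and all rows are simply inserted. A minimal sketch, reusing the connection and placeholder table/bucket names from above:

```R
# Append-only load: with no keys, no existing rows are deleted before the insert
d=rs_upsert_table(nx, dbcon=con, tableName='mytable', split_files=4, bucket="mybucket")
```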
