
Commit a319c4d: initial working commit
0 parents

13 files changed: +488 -0 lines changed

.Rbuildignore (+2)

^.*\.Rproj$
^\.Rproj\.user$

.gitignore (+4)

.Rproj.user
.Rhistory
.RData
.Ruserdata

DESCRIPTION (+18)

Package: redshiftTools
Type: Package
Title: Redshift Tools
Version: 0.1.0
Authors@R: person("Pablo", "Seibelt", email = "[email protected]",
    role = c("aut", "cre"))
Maintainer: Pablo Seibelt <[email protected]>
Depends:
    R (>= 3.1.2)
Imports:
    DBI,
    RPostgres,
    aws.s3
Description: Tools to upload data to an Amazon Redshift database with good performance.
License: MIT + file LICENSE
LazyData: TRUE
RoxygenNote: 5.0.1

LICENSE (+9)

The MIT License (MIT)

Copyright (c) 2016 Pablo Alejandro Seibelt <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

NAMESPACE (+9)

# Generated by roxygen2: do not edit by hand

export(rs_replace_table)
export(rs_upsert_table)
importFrom("aws.s3","bucket_exists")
importFrom("aws.s3","delete_object")
importFrom("aws.s3","put_object")
importFrom("utils","write.csv")
importFrom(DBI,dbGetQuery)

R/internal.R (+45)

# Internal utility functions used by the redshift tools

#' @importFrom "aws.s3" "put_object" "bucket_exists"
#' @importFrom "utils" "write.csv"
uploadToS3 = function (data, bucket, split_files){
  prefix=paste0(sample(letters,16),collapse = "")
  if(!bucket_exists(bucket)){
    stop("Bucket does not exist")
  }
  if(nrow(data) == 0){
    stop("Input data is empty")
  }
  if(nrow(data) < split_files){
    split_files = nrow(data)
  }

  # Split the data frame into split_files chunks, assigning rows round-robin
  splitted = suppressWarnings(split(data, seq(1:split_files)))

  for (i in 1:split_files) {
    part = data.frame(splitted[i])

    # Write each chunk as a gzip-compressed CSV and upload it as <prefix>.0001, <prefix>.0002, ...
    tmpFile = tempfile()
    s3Name=paste(prefix, ".", formatC(i, width = 4, format = "d", flag = "0"), sep="")
    write.csv(part, gzfile(tmpFile), na='', row.names=F)

    print(paste("Uploading", s3Name))
    put_object(file = tmpFile, object = s3Name, bucket = bucket)
  }

  return(prefix)
}

# Remove all uploaded chunk files sharing the given prefix
#' @importFrom "aws.s3" "delete_object"
deletePrefix = function(prefix, bucket, split_files){
  for (i in 1:split_files) {
    s3Name=paste(prefix, ".", formatC(i, width = 4, format = "d", flag = "0"), sep="")
    print(paste("Deleting", s3Name))
    delete_object(s3Name, bucket)
  }
}

# Thin wrapper around DBI::dbGetQuery
#' @importFrom DBI dbGetQuery
queryDo = function(dbcon, query){
  dbGetQuery(dbcon, query)
}

R/redshift-replace.R (+79)

#' Replace redshift table
#'
#' Upload a table to S3 and then load it with redshift, replacing the contents of that table.
#' The table on redshift has to have the same structure and column ordering to work correctly.
#'
#' @param data a data frame
#' @param dbcon an RPostgres connection to the redshift server
#' @param tableName the name of the table to replace
#' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
#' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME in the environment if not specified.
#' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION in the environment if not specified.
#' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID in the environment if not specified.
#' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY in the environment if not specified.
#' @examples
#' library(DBI)
#'
#' a=data.frame(a=seq(1,10000), b=seq(10000,1))
#'
#'\dontrun{
#' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
#' host='my-redshift-url.amazon.com', port='5439',
#' user='myuser', password='mypassword',sslmode='require')
#'
#' rs_replace_table(data=a, dbcon=con, tableName='testTable',
#' bucket="my-bucket", split_files=4)
#'
#' }
#' @export
rs_replace_table = function(
    data,
    dbcon,
    tableName,
    split_files,
    bucket=Sys.getenv('AWS_BUCKET_NAME'),
    region=Sys.getenv('AWS_DEFAULT_REGION'),
    access_key=Sys.getenv('AWS_ACCESS_KEY_ID'),
    secret_key=Sys.getenv('AWS_SECRET_ACCESS_KEY')
    )
{
  if(missing(split_files)){
    print("Getting number of slices from Redshift")
    slices = queryDo(dbcon,"select count(*) from stv_slices")
    split_files = unlist(slices[1]*4)
    print(sprintf("%s slices detected, will split into %s files", slices, split_files))
  }
  split_files = min(split_files, nrow(data))

  prefix = uploadToS3(data, bucket, split_files)

  result = tryCatch({
    print("Truncating target table")
    queryDo(dbcon, sprintf("truncate table %s", tableName))

    print("Copying data from S3 into Redshift")
    queryDo(dbcon, sprintf("copy %s from 's3://%s/%s.' region '%s' csv gzip ignoreheader 1 emptyasnull credentials 'aws_access_key_id=%s;aws_secret_access_key=%s';",
      tableName,
      bucket,
      prefix,
      region,
      access_key,
      secret_key
    ))

    print("Committing changes")
    queryDo(dbcon, "COMMIT;")
  }, warning = function(w) {
    print(w)
  }, error = function(e) {
    print(e$message)
    queryDo(dbcon, 'ROLLBACK;')
  }, finally = {
    print("Deleting temporary files from S3 bucket")
    deletePrefix(prefix, bucket, split_files)
  })
}

R/redshift-upsert.R (+98)

#' Upsert redshift table
#'
#' Upload a table to S3 and then load it with redshift, replacing rows with the same
#' keys, and inserting rows with new keys.
#' The table on redshift has to have the same structure and column ordering to work correctly.
#'
#' @param data a data frame
#' @param dbcon an RPostgres connection to the redshift server
#' @param tableName the name of the table to upsert into
#' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
#' @param keys this optional vector contains the variables by which to upsert. If not defined, the upsert becomes an append.
#' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME in the environment if not specified.
#' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION in the environment if not specified.
#' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID in the environment if not specified.
#' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY in the environment if not specified.
#' @examples
#' library(DBI)
#'
#' a=data.frame(a=seq(1,10000), b=seq(10000,1))
#' n=head(a,n=5000)
#' n$b=n$a
#' nx=rbind(n, data.frame(a=seq(99999,104000), b=seq(104000,99999)))
#'
#'\dontrun{
#' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
#' host='my-redshift-url.amazon.com', port='5439',
#' user='myuser', password='mypassword',sslmode='require')
#'
#' rs_upsert_table(data=nx, dbcon=con, tableName='testTable',
#' bucket="my-bucket", split_files=4, keys=c('a'))
#'
#'}
#' @export
rs_upsert_table = function(
    data,
    dbcon,
    tableName,
    keys,
    split_files,
    bucket=Sys.getenv('AWS_BUCKET_NAME'),
    region=Sys.getenv('AWS_DEFAULT_REGION'),
    access_key=Sys.getenv('AWS_ACCESS_KEY_ID'),
    secret_key=Sys.getenv('AWS_SECRET_ACCESS_KEY')
    )
{
  if(missing(split_files)){
    print("Getting number of slices from Redshift")
    slices = queryDo(dbcon,"select count(*) from stv_slices")
    split_files = unlist(slices[1]*4)
    print(sprintf("%s slices detected, will split into %s files", slices, split_files))
  }
  split_files = min(split_files, nrow(data))

  prefix = uploadToS3(data, bucket, split_files)

  result = tryCatch({
    # Stage the uploaded data in a temporary table with the same structure as the target
    stageTable=paste0(sample(letters,16),collapse = "")

    queryDo(dbcon, sprintf("create temp table %s (like %s)", stageTable, tableName))

    print("Copying data from S3 into Redshift")
    queryDo(dbcon, sprintf("copy %s from 's3://%s/%s.' region '%s' csv gzip ignoreheader 1 emptyasnull credentials 'aws_access_key_id=%s;aws_secret_access_key=%s';",
      stageTable,
      bucket,
      prefix,
      region,
      access_key,
      secret_key
    ))
    if(!missing(keys)){
      print("Deleting rows with same keys")
      keysCond = paste(stageTable,".",keys, "=", tableName,".",keys, sep="")
      keysWhere = sub(" and $", "", paste0(keysCond, collapse="", sep=" and "))
      queryDo(dbcon, sprintf('delete from %s using %s where %s',
        tableName,
        stageTable,
        keysWhere
      ))
    }
    print("Inserting new rows")
    queryDo(dbcon, sprintf('insert into %s select * from %s', tableName, stageTable))

    queryDo(dbcon, sprintf("drop table %s", stageTable))
    print("Committing")
    queryDo(dbcon, "COMMIT;")
  }, warning = function(w) {
    print(w)
  }, error = function(e) {
    print(e$message)
    queryDo(dbcon, 'ROLLBACK;')
  }, finally = {
    print("Deleting temporary files from S3 bucket")
    deletePrefix(prefix, bucket, split_files)
  })
}

README.Rmd (+54)

---
output:
  md_document:
    variant: markdown_github
---

<!-- README.md is generated from README.Rmd. Please edit that file -->

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# redshiftTools

This is an R package meant to make it easy to upload bulk data into Amazon Redshift.


## Installation

To install this package, you'll need to execute these commands:

```R
install.packages('devtools')
devtools::install_github("RcppCore/Rcpp")
devtools::install_github("rstats-db/DBI")
devtools::install_github("rstats-db/RPostgres")
install.packages("aws.s3", repos = c(getOption("repos"), "http://cloudyr.github.io/drat"))
devtools::install_github("sicarul/redshiftTools")
```

## Usage

You'll now have two functions available: `rs_replace_table` and `rs_upsert_table`. Both are called with almost the same parameters, except that on upsert you can specify which keys to use when searching for matching rows.

For example, suppose we have a table to load with two integer columns; we could use the following code:

```R
library("aws.s3")
library(RPostgres)
library(redshiftTools)

a=data.frame(a=seq(1,10000), b=seq(10000,1))
n=head(a,n=10)
n$b=n$a
nx=rbind(n, data.frame(a=seq(5,10), b=seq(10,5)))

con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
host='my-redshift-url.amazon.com', port='5439',
user='myuser', password='mypassword',sslmode='require')

b=rs_replace_table(a, dbcon=con, tableName='mytable', bucket="mybucket", split_files=4)
c=rs_upsert_table(nx, dbcon=con, tableName='mytable', split_files=4, bucket="mybucket", keys=c('a'))
```
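
The `bucket`, `region`, `access_key` and `secret_key` parameters all fall back to environment variables, so you can set those once per session instead of passing them on every call. A minimal sketch, assuming you already have a bucket and valid credentials (the values below are placeholders):

```R
# Set the AWS environment variables the package falls back to (placeholder values)
Sys.setenv(
  "AWS_BUCKET_NAME" = "mybucket",
  "AWS_DEFAULT_REGION" = "us-east-1",
  "AWS_ACCESS_KEY_ID" = "my-access-key",
  "AWS_SECRET_ACCESS_KEY" = "my-secret-key"
)

# With the variables set, bucket/region/credentials can be omitted from the call
b=rs_replace_table(a, dbcon=con, tableName='mytable', split_files=4)
```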

README.md (+45)

<!-- README.md is generated from README.Rmd. Please edit that file -->

redshiftTools
=============

This is an R package meant to make it easy to upload bulk data into Amazon Redshift.

Installation
------------

To install this package, you'll need to execute these commands:

```R
install.packages('devtools')
devtools::install_github("RcppCore/Rcpp")
devtools::install_github("rstats-db/DBI")
devtools::install_github("rstats-db/RPostgres")
install.packages("aws.s3", repos = c(getOption("repos"), "http://cloudyr.github.io/drat"))
devtools::install_github("sicarul/redshiftTools")
```

Usage
-----

You'll now have two functions available: `rs_replace_table` and `rs_upsert_table`. Both are called with almost the same parameters, except that on upsert you can specify which keys to use when searching for matching rows.

For example, suppose we have a table to load with two integer columns, we could use the following code:

```R
library("aws.s3")
library(RPostgres)
library(redshiftTools)

a=data.frame(a=seq(1,10000), b=seq(10000,1))
n=head(a,n=10)
n$b=n$a
nx=rbind(n, data.frame(a=seq(5,10), b=seq(10,5)))

con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
host='my-redshift-url.amazon.com', port='5439',
user='myuser', password='mypassword',sslmode='require')

b=rs_replace_table(a, dbcon=con, tableName='mytable', bucket="mybucket", split_files=4)
c=rs_upsert_table(nx, dbcon=con, tableName='mytable', split_files=4, bucket="mybucket", keys=c('a'))
```
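
Since `keys` is optional, `rs_upsert_table` can also be used as a plain append: when no keys are given, nothing is deleted and all rows are simply inserted. A minimal sketch, reusing the connection and placeholder table/bucket names from above:

```R
# Append-only load: with no keys, no existing rows are deleted before the insert
d=rs_upsert_table(nx, dbcon=con, tableName='mytable', split_files=4, bucket="mybucket")
```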
