
Commit 53b8ebe

Author: Pablo Seibelt (committed)

Refactoring, support RJDBC, and add rs_create_table function

1 parent 33054be · commit 53b8ebe

12 files changed: +332 -61 lines

DESCRIPTION (+6 -4)
@@ -1,18 +1,20 @@
 Package: redshiftTools
 Type: Package
 Title: Redshift Tools
-Version: 0.2.900
+Version: 0.3.900
 Authors@R: person("Pablo", "Seibelt", email = "[email protected]",
     role = c("aut", "cre"))
 Mantainers@R: person("Pablo", "Seibelt", email = "[email protected]",
     role = c("aut", "cre"))
 Depends:
-    R (>= 3.1.2)
+    R (>= 3.3.0)
 Imports:
     DBI,
-    RPostgres,
     aws.s3
-Description: Tools to upload data to an Amazon Redshift Database with good performance.
+Suggests:
+    RJDBC,
+    RPostgres
+Description: Tools for uploading data into Amazon Redshift from R.
 License: MIT + file LICENSE
 LazyData: TRUE
 RoxygenNote: 6.0.1
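
With RPostgres moving from Imports to Suggests (and RJDBC added as a suggested package), neither driver is installed automatically any more. A minimal sketch of guarding for a driver at runtime; the choice of RPostgres here is just an example, not a requirement of the package:

``` r
# Neither driver is a hard dependency after this commit; install the one you
# plan to use before calling the rs_* functions.
if (!requireNamespace("RPostgres", quietly = TRUE)) {
  install.packages("RPostgres")  # or install.packages("RJDBC") for the JDBC route
}
library(RPostgres)
```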

NAMESPACE (+1 -0)
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(rs_create_statement)
+export(rs_create_table)
 export(rs_replace_table)
 export(rs_upsert_table)
 importFrom("aws.s3","bucket_exists")

R/create.R (+62 -0)
@@ -0,0 +1,62 @@
+#' Create a table from scratch, guessing the table schema
+#'
+#'
+#' @param data a data frame
+#' @param dbcon an RPostgres connection to the redshift server
+#' @param table_name the name of the table to create
+#' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
+#' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME on environment if not specified.
+#' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION on environment if not specified.
+#' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID on environment if not specified.
+#' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY on environment if not specified.
+#' @param iam_role_arn an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN on environment if not specified. If set, access_key and secret_key are ignored.
+#' @param wlm_slots number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html
+#' @param sortkeys Column or columns to sort the table by
+#' @param sortkey_style Sortkey style, can be compound or interleaved http://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data-compare-sort-styles.html
+#' @param distkey Distkey column, can only be one; if chosen, the table is distributed among clusters according to a hash of this column's value.
+#' @param distkey_style Distkey style, can be even or all; for key distribution use the distkey parameter. http://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
+#' @param compression Add encoding for columns whose compression algorithm is easy to guess; for the rest you should upload to Redshift and run ANALYZE COMPRESSION
+#'
+#' @examples
+#' library(DBI)
+#'
+#' a=data.frame(a=seq(1,10000), b=seq(10000,1))
+#'
+#'\dontrun{
+#' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
+#' host='my-redshift-url.amazon.com', port='5439',
+#' user='myuser', password='mypassword',sslmode='require')
+#'
+#' rs_create_table(data=a, dbcon=con, table_name='testTable',
+#' bucket="my-bucket", split_files=4)
+#'
+#' }
+#' @export
+rs_create_table = function(
+    data,
+    dbcon,
+    table_name=deparse(substitute(data)),
+    split_files,
+    bucket=Sys.getenv('AWS_BUCKET_NAME'),
+    region=Sys.getenv('AWS_DEFAULT_REGION'),
+    access_key=Sys.getenv('AWS_ACCESS_KEY_ID'),
+    secret_key=Sys.getenv('AWS_SECRET_ACCESS_KEY'),
+    iam_role_arn=Sys.getenv('AWS_IAM_ROLE_ARN'),
+    wlm_slots=1,
+    sortkeys,
+    sortkey_style='compound',
+    distkey,
+    distkey_style='even',
+    compression=T
+    )
+{
+
+  tableSchema = rs_create_statement(data, table_name = table_name, sortkeys=sortkeys,
+                  sortkey_style = sortkey_style, distkey=distkey, distkey_style = distkey_style,
+                  compression = compression)
+
+  queryStmt(dbcon, tableSchema)
+
+  return(rs_replace_table(data, dbcon, table_name, split_files, bucket, region, access_key, secret_key, iam_role_arn, wlm_slots))
+
+}
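
For a quick read of the new function, a hedged usage sketch follows; the connection details, the `flights` data frame and the column chosen as sort/distribution key are illustrative only, while the parameters themselves are the ones documented above:

``` r
library(DBI)
library(redshiftTools)

# Illustrative data frame; any data frame works, the schema is guessed from it.
flights = data.frame(carrier = c("AA", "DL", "UA"), delay = c(12, 3, 25))

con <- dbConnect(RPostgres::Postgres(), dbname = "dbname",
                 host = "my-redshift-url.amazon.com", port = "5439",
                 user = "myuser", password = "mypassword", sslmode = "require")

# Create the table (schema guessed by rs_create_statement), then bulk-load it via S3.
rs_create_table(flights, dbcon = con, table_name = "flights",
                bucket = "my-bucket", split_files = 4,
                sortkeys = c("carrier"), sortkey_style = "compound",
                distkey = "carrier", compression = TRUE)
```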

R/internal.R (+25 -1)
@@ -45,5 +45,29 @@ queryDo = function(dbcon, query){
 
 #' @importFrom DBI dbExecute
 queryStmt = function(dbcon, query){
-  dbExecute(dbcon, query)
+  if(inherits(dbcon, 'JDBCConnection')){
+    RJDBC::dbSendUpdate(dbcon, query)
+  }else{
+    dbExecute(dbcon, query)
+  }
+}
+
+s3ToRedshift = function(dbcon, table_name, bucket, prefix, region, access_key, secret_key, iam_role_arn){
+  stageTable=paste0(sample(letters,16),collapse = "")
+  # Create temporary table for staging data
+  queryStmt(dbcon, sprintf("create temp table %s (like %s)", stageTable, table_name))
+
+  print("Copying data from S3 into Redshift")
+  copyStr = "copy %s from 's3://%s/%s.' region '%s' csv gzip ignoreheader 1 emptyasnull COMPUPDATE FALSE %s"
+
+  # Use IAM Role if available
+  if (nchar(iam_role_arn) > 0) {
+    credsStr = sprintf("iam_role '%s'", iam_role_arn)
+  } else {
+    credsStr = sprintf("credentials 'aws_access_key_id=%s;aws_secret_access_key=%s'", access_key, secret_key)
+  }
+  statement = sprintf(copyStr, stageTable, bucket, prefix, region, credsStr)
+  queryStmt(dbcon,statement)
+
+  return(stageTable)
 }
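
To make the refactor easier to follow, this is roughly the statement the new s3ToRedshift helper ends up issuing. The staging table name, bucket, prefix and role ARN below are placeholders, not values from this commit, and the output is a single line wrapped here for readability:

``` r
# Rebuild the COPY statement the same way s3ToRedshift does (IAM-role branch).
copyStr  = "copy %s from 's3://%s/%s.' region '%s' csv gzip ignoreheader 1 emptyasnull COMPUPDATE FALSE %s"
credsStr = sprintf("iam_role '%s'", "arn:aws:iam::123456789012:role/redshift-copy")
cat(sprintf(copyStr, "abcdefghijklmnop", "my-bucket", "my-prefix", "us-east-1", credsStr))
#> copy abcdefghijklmnop from 's3://my-bucket/my-prefix.' region 'us-east-1' csv gzip
#>   ignoreheader 1 emptyasnull COMPUPDATE FALSE iam_role 'arn:aws:iam::123456789012:role/redshift-copy'
```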

R/redshift-replace.R → R/replace.R (+9 -17)
@@ -5,7 +5,7 @@
 #'
 #' @param data a data frame
 #' @param dbcon an RPostgres connection to the redshift server
-#' @param tableName the name of the table to replace
+#' @param table_name the name of the table to replace
 #' @param split_files optional parameter to specify amount of files to split into. If not specified will look at amount of slices in Redshift to determine an optimal amount.
 #' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME on environment if not specified.
 #' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION on environment if not specified.
@@ -23,15 +23,15 @@
 #' host='my-redshift-url.amazon.com', port='5439',
 #' user='myuser', password='mypassword',sslmode='require')
 #'
-#' rs_replace_table(data=a, dbcon=con, tableName='testTable',
+#' rs_replace_table(data=a, dbcon=con, table_name='testTable',
 #' bucket="my-bucket", split_files=4)
 #'
 #' }
 #' @export
 rs_replace_table = function(
     data,
     dbcon,
-    tableName,
+    table_name,
     split_files,
     bucket=Sys.getenv('AWS_BUCKET_NAME'),
     region=Sys.getenv('AWS_DEFAULT_REGION'),
@@ -62,26 +62,18 @@ rs_replace_table = function(
   }
 
   result = tryCatch({
-    stageTable=paste0(sample(letters,16),collapse = "")
+    stageTable=s3ToRedshift(dbcon, table_name, bucket, prefix, region, access_key, secret_key, iam_role_arn)
 
-    queryStmt(dbcon, sprintf("create temp table %s (like %s)", stageTable, tableName))
-
-    print("Copying data from S3 into Redshift")
-    copyStr = "copy %s from 's3://%s/%s.' region '%s' csv gzip ignoreheader 1 emptyasnull COMPUPDATE FALSE"
-    if (nchar(iam_role_arn) > 0) {
-      copyStr = paste(copyStr, sprintf("iam_role '%s'", iam_role_arn), sep=" ")
-    } else {
-      copyStr = paste(copyStr, sprintf("credentials 'aws_access_key_id=%s;aws_secret_access_key=%s'", access_key, secret_key), sep=" ")
+    # Use a single transaction if using RJDBC
+    if(inherits(dbcon, 'RJDBC')){
+      queryStmt(dbcon, 'begin')
     }
-    statement = sprintf(copyStr, stageTable, bucket, prefix, region)
-    queryStmt(dbcon, statement)
-
 
     print("Deleting target table for replacement")
-    queryStmt(dbcon, sprintf("delete from %s", tableName))
+    queryStmt(dbcon, sprintf("delete from %s", table_name))
 
     print("Insert new rows")
-    queryStmt(dbcon, sprintf('insert into %s select * from %s', tableName, stageTable))
+    queryStmt(dbcon, sprintf('insert into %s select * from %s', table_name, stageTable))
 
     print("Drop staging table")
     queryStmt(dbcon, sprintf("drop table %s", stageTable))

R/redshift-upsert.R → R/upsert.R (+12 -21)
@@ -6,7 +6,7 @@
 #'
 #' @param data a data frame
 #' @param dbcon an RPostgres connection to the redshift server
-#' @param tableName the name of the table to replace
+#' @param table_name the name of the table to replace
 #' @param split_files optional parameter to specify amount of files to split into. If not specified will look at amount of slices in Redshift to determine an optimal amount.
 #' @param keys this optional vector contains the variables by which to upsert. If not defined, the upsert becomes an append.
 #' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME on environment if not specified.
@@ -28,15 +28,15 @@
 #' host='my-redshift-url.amazon.com', port='5439',
 #' user='myuser', password='mypassword',sslmode='require')
 #'
-#' rs_upsert_table(data=nx, dbcon=con, tableName='testTable',
+#' rs_upsert_table(data=nx, dbcon=con, table_name='testTable',
 #' bucket="my-bucket", split_files=4, keys=c('a'))
 #'
 #'}
 #' @export
 rs_upsert_table = function(
     data,
     dbcon,
-    tableName,
+    table_name,
     keys,
     split_files,
     bucket=Sys.getenv('AWS_BUCKET_NAME'),
@@ -68,36 +68,27 @@ rs_upsert_table = function(
   }
 
   result = tryCatch({
-    stageTable=paste0(sample(letters,16),collapse = "")
+    stageTable=s3ToRedshift(dbcon, table_name, bucket, prefix, region, access_key, secret_key, iam_role_arn)
 
-    queryStmt(dbcon, sprintf("create temp table %s (like %s)", stageTable, tableName))
-
-    print("Copying data from S3 into Redshift")
-    copyStr = "copy %s from 's3://%s/%s.' region '%s' csv gzip ignoreheader 1 emptyasnull COMPUPDATE FALSE"
-
-    # Use iam role if available
-    if ((nchar(iam_role_arn) > 0)) {
-      copyStr = paste(copyStr, sprintf("iam_role '%s'", iam_role_arn), sep=" ")
-    } else {
-      copyStr = paste(copyStr, sprintf("credentials 'aws_access_key_id=%s;aws_secret_access_key=%s'", access_key, secret_key), sep=" ")
+    # Use a single transaction if using RJDBC
+    if(inherits(dbcon, 'RJDBC')){
+      queryStmt(dbcon, 'begin')
     }
-    statement = sprintf(copyStr, stageTable, bucket, prefix, region)
-    queryStmt(dbcon,statement)
-    if(!missing(keys)){
-      print("Deleting rows with same keys")
 
+    if(!missing(keys)){
       # where stage.key = table.key and...
-      keysCond = paste(stageTable,".",keys, "=", tableName,".",keys, sep="")
+      keysCond = paste(stageTable,".",keys, "=", table_name,".",keys, sep="")
       keysWhere = sub(" and $", "", paste0(keysCond, collapse="", sep=" and "))
 
       queryStmt(dbcon, sprintf('delete from %s using %s where %s',
-                tableName,
+                table_name,
                 stageTable,
                 keysWhere
               ))
     }
+
     print("Insert new rows")
-    queryStmt(dbcon, sprintf('insert into %s select * from %s', tableName, stageTable))
+    queryStmt(dbcon, sprintf('insert into %s select * from %s', table_name, stageTable))
 
     print("Drop staging table")
     queryStmt(dbcon, sprintf("drop table %s", stageTable))
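
The key-matching clause above can be hard to read inside the sprintf calls, so here is a small sketch that reruns the same keysCond/keysWhere construction with made-up table and key names (again a single-line output, wrapped for readability):

``` r
# Same construction as in rs_upsert_table, with placeholder names.
stageTable = "abcdefghijklmnop"
table_name = "mytable"
keys       = c("a", "b")

keysCond  = paste(stageTable, ".", keys, "=", table_name, ".", keys, sep = "")
keysWhere = sub(" and $", "", paste0(keysCond, collapse = "", sep = " and "))
cat(sprintf("delete from %s using %s where %s", table_name, stageTable, keysWhere))
#> delete from mytable using abcdefghijklmnop where
#>   abcdefghijklmnop.a=mytable.a and abcdefghijklmnop.b=mytable.b
```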

README.Rmd (+73 -4)
@@ -23,10 +23,57 @@ To install this package, you'll need to execute these commands:
 install.packages(c('devtools', 'httr', 'aws.s3'))
 devtools::install_github("RcppCore/Rcpp")
 devtools::install_github("r-dbi/DBI")
-devtools::install_github("r-dbi/RPostgres")
 devtools::install_github("sicarul/redshiftTools")
 ```
 
+
+## Drivers
+
+This library supports two officially tested ways of connecting to Amazon Redshift (others may work, but are untested):
+
+### RPostgres
+This Postgres library is great, and it works even with Amazon Redshift servers with SSL enabled. Its only weak point with Redshift is that it doesn't support transactions.
+
+To use it, please configure it like this:
+
+``` r
+devtools::install_github("r-dbi/RPostgres")
+library(RPostgres)
+
+con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
+host='my-redshift-url.amazon.com', port='5439',
+user='myuser', password='mypassword',sslmode='require')
+test=dbGetQuery(con, 'select 1')
+```
+
+### RJDBC
+If you download the official Redshift driver .jar, you can use it with this R library. It is not ideal in that you can't use it with dplyr, for example, since it doesn't implement all the standard DBI interfaces; however, it has the great advantage of supporting transactions when uploading data, which keeps your data consistent while new data is loaded.
+
+To use it, please configure it like this:
+
+``` r
+install.packages('RJDBC')
+library(RJDBC)
+
+# Save the driver into a directory
+dir.create('~/.redshiftTools')
+download.file('http://s3.amazonaws.com/redshift-downloads/drivers/RedshiftJDBC41-1.1.9.1009.jar','~/.redshiftTools/redshift-driver.jar')
+
+# Use Redshift driver
+driver <- JDBC("com.amazon.redshift.jdbc41.Driver", "~/.redshiftTools/redshift-driver.jar", identifier.quote="`")
+
+# Create connection
+dbname="dbname"
+host='my-redshift-url.amazon.com'
+port='5439'
+user='myuser'
+password='mypassword'
+ssl='true'
+url <- sprintf("jdbc:redshift://%s:%s/%s?tcpKeepAlive=true&ssl=%s&sslfactory=com.amazon.redshift.ssl.NonValidatingFactory", host, port, dbname, ssl)
+conn <- dbConnect(driver, url, user, password)
+
+```
+
 ## Usage
 
 ### Creating tables
@@ -43,7 +90,7 @@ d=rep(as.POSIXct('2017-01-01 20:01:32'), n),
 e=rep(as.POSIXlt('2017-01-01 20:01:32'), n),
 f=rep(paste0(rep('a', 4000), collapse=''), n) )
 
-cat(rs_create_statement(testdf, tableName='dm_great_table'))
+cat(rs_create_statement(testdf, table_name='dm_great_table'))
 
 ```
 
@@ -82,7 +129,29 @@ For example, suppose we have a table to load with 2 integer columns, we could us
 host='my-redshift-url.amazon.com', port='5439',
 user='myuser', password='mypassword',sslmode='require')
 
-b=rs_replace_table(a, dbcon=con, tableName='mytable', bucket="mybucket", split_files=4)
-c=rs_upsert_table(nx, dbcon=con, tableName = 'mytable', split_files=4, bucket="mybucket", keys=c('a'))
+b=rs_replace_table(a, dbcon=con, table_name='mytable', bucket="mybucket", split_files=4)
+c=rs_upsert_table(nx, dbcon=con, table_name = 'mytable', split_files=4, bucket="mybucket", keys=c('a'))
+
+```
+
+
+### Creating tables with data
+
+`rs_create_table` combines `rs_create_statement` and `rs_replace_table`: you can create a table from scratch in R and upload the contents of the data frame without needing to write any SQL code at all.
+
+
+``` r
+library("aws.s3")
+library(RPostgres)
+library(redshiftTools)
+
+a=data.frame(a=seq(1,10000), b=seq(10000,1))
+
+con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
+host='my-redshift-url.amazon.com', port='5439',
+user='myuser', password='mypassword',sslmode='require')
+
+b=rs_create_table(a, dbcon=con, table_name='mytable', bucket="mybucket", split_files=4)
+
 
 ```
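
Tying the README additions together, a sketch of an upsert over an RJDBC connection; the driver path, URL, credentials, table and keys are placeholders based on the sections above, and the transaction comment reflects the commit's stated intent for RJDBC connections:

``` r
library(RJDBC)
library(redshiftTools)

# Connection as in the RJDBC section above; paths and credentials are placeholders.
driver <- JDBC("com.amazon.redshift.jdbc41.Driver",
               "~/.redshiftTools/redshift-driver.jar", identifier.quote = "`")
url  <- "jdbc:redshift://my-redshift-url.amazon.com:5439/dbname?tcpKeepAlive=true&ssl=true&sslfactory=com.amazon.redshift.ssl.NonValidatingFactory"
conn <- dbConnect(driver, url, "myuser", "mypassword")

nx = data.frame(a = seq(1, 100), b = seq(100, 1))

# Upsert by key 'a'; this commit intends RJDBC uploads to run inside a transaction.
res = rs_upsert_table(nx, dbcon = conn, table_name = "mytable",
                      bucket = "mybucket", split_files = 4, keys = c("a"))
```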
