
Commit 0015f40

Merge branch 'master' into graph-tables
2 parents: cb15588 + e328e20

9 files changed (+103, -76 lines)

.github/workflows/scala.yml

Lines changed: 9 additions & 4 deletions
@@ -8,14 +8,19 @@ on:

 jobs:
   build:
-
-    runs-on: ubuntu-latest
+    name: ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macOS-latest]
+        profile: ['spark31']
+    timeout-minutes: 15

     steps:
     - uses: actions/checkout@v2
     - name: Set up JDK 1.8
       uses: actions/setup-java@v1
       with:
         java-version: 1.8
-    - name: Run tests
-      run: sbt test
+    - name: Build and run tests
+      run: mvn clean package -P ${{ matrix.profile }}

.gitignore

Lines changed: 6 additions & 0 deletions
@@ -353,3 +353,9 @@ project/project/
 project/target/
 target/
 .idea
+
+# VS Code
+.vscode
+.settings
+.classpath
+.project

README.md

Lines changed: 12 additions & 8 deletions
@@ -9,17 +9,21 @@ This library contains the source code for the Apache Spark Connector for SQL Ser

 [Apache Spark](https://spark.apache.org/) is a unified analytics engine for large-scale data processing.

-There are two versions of the connector available through Maven, a 2.4.5 compatible version and a 3.0.0 compatible version. Both versions can be found [here](https://search.maven.org/search?q=spark-mssql-connector) and can be imported using the coordinates below:
+There are three version sets of the connector available through Maven, a 2.4.x, a 3.0.x and a 3.1.x compatible version. All versions can be found [here](https://search.maven.org/search?q=spark-mssql-connector) and can be imported using the coordinates below:

-| Connector | Maven Coordinate |
-| --------- | ------------------ |
-| Spark 2.4.5 compatible connnector | `com.microsoft.azure:spark-mssql-connector:1.0.1` |
-| Spark 3.0.0 compatible connnector | `com.microsoft.azure:spark-mssql-connector_2.12_3.0:1.0.0-alpha` |
+| Connector | Maven Coordinate | Scala Version |
+| --------- | ---------------- | ------------- |
+| Spark 2.4.x compatible connnector | `com.microsoft.azure:spark-mssql-connector:1.0.2` | 2.11 |
+| Spark 3.0.x compatible connnector | `com.microsoft.azure:spark-mssql-connector_2.12:1.1.0` | 2.12 |
+| Spark 3.1.x compatible connnector | `com.microsoft.azure:spark-mssql-connector_2.12:1.2.0` | 2.12 |

 ## Current Releases

-The Spark 2.4.5 compatible connector is on v1.0.1.
-The Spark 3.0.0 compatible connector is on v1.0.0-alpha.
+The latest Spark 2.4.x compatible connector is on v1.0.2.
+
+The latest Spark 3.0.x compatible connector is on v1.1.0.
+
+The latest Spark 3.1.x compatible connector is on v1.2.0.

 For main changes from previous releases and known issues please refer to [CHANGELIST](docs/CHANGELIST.md)

@@ -33,7 +37,7 @@ For main changes from previous releases and known issues please refer to [CHANGE

 | Component | Versions Supported |
 | --------- | ------------------ |
-| Apache Spark | 2.4.5, 3.0.0 |
+| Apache Spark | 2.4.x, 3.0.x, 3.1.x |
 | Scala | 2.11, 2.12 |
 | Microsoft JDBC Driver for SQL Server | 8.4.1 |
 | Microsoft SQL Server | SQL Server 2008 or later |
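
As a quick orientation to the coordinates above, a minimal Scala write sketch (the data source name com.microsoft.sqlserver.jdbc.spark is the one documented for this connector; server, database, table, and credentials are placeholders):

    // Minimal write sketch. Assumes the connector's documented data source name
    // "com.microsoft.sqlserver.jdbc.spark"; all connection values are placeholders.
    import org.apache.spark.sql.{SaveMode, SparkSession}

    object ConnectorWriteSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("mssql-write-sketch").getOrCreate()
        val df = spark.range(0, 100).toDF("value")

        df.write
          .format("com.microsoft.sqlserver.jdbc.spark")
          .mode(SaveMode.Overwrite)
          .option("url", "jdbc:sqlserver://<server>:1433;databaseName=<database>")
          .option("dbtable", "dbo.sample_table")
          .option("user", "<user>")
          .option("password", "<password>")
          .save()

        spark.stop()
      }
    }

A job built against Spark 3.1.x would then be submitted with the matching coordinate from the table, e.g. spark-submit --packages com.microsoft.azure:spark-mssql-connector_2.12:1.2.0.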

pom.xml

Lines changed: 25 additions & 43 deletions
@@ -4,7 +4,7 @@
     <groupId>com.microsoft.azure</groupId>
     <artifactId>spark-mssql-connector</artifactId>
     <packaging>jar</packaging>
-    <version>1.0.0</version>
+    <version>1.2.0</version>
     <name>${project.groupId}:${project.artifactId}</name>
     <description>The Apache Spark Connector for SQL Server and Azure SQL is a high-performance connector that enables you to use transactional data in big data analytics and persists results for ad-hoc queries or reporting.</description>
     <url>https://github.com/microsoft/sql-spark-connector</url>
@@ -200,47 +200,29 @@
         </plugins>
     </build>
     <profiles>
-        <profile>
-            <id>spark24</id>
-            <properties>
-                <scala.binary.version>2.11</scala.binary.version>
-                <scala.version>2.11.12</scala.version>
-                <spark.version>2.4.6</spark.version>
-            </properties>
-            <dependencies>
-                <dependency>
-                    <groupId>org.scalatest</groupId>
-                    <artifactId>scalatest_${scala.binary.version}</artifactId>
-                    <version>3.0.5</version>
-                    <scope>test</scope>
-                </dependency>
-                <dependency>
-                    <groupId>com.microsoft.sqlserver</groupId>
-                    <artifactId>mssql-jdbc</artifactId>
-                    <version>8.4.1.jre8</version>
-                </dependency>
-            </dependencies>
-        </profile>
-        <profile>
-            <id>spark30</id>
-            <properties>
-                <scala.binary.version>2.12</scala.binary.version>
-                <scala.version>2.12.11</scala.version>
-                <spark.version>3.0.0</spark.version>
-            </properties>
-            <dependencies>
-                <dependency>
-                    <groupId>org.scalatest</groupId>
-                    <artifactId>scalatest_${scala.binary.version}</artifactId>
-                    <version>3.0.8</version>
-                    <scope>test</scope>
-                </dependency>
-                <dependency>
-                    <groupId>com.microsoft.sqlserver</groupId>
-                    <artifactId>mssql-jdbc</artifactId>
-                    <version>8.4.1.jre8</version>
-                </dependency>
-            </dependencies>
-        </profile>
+        <profile>
+            <id>spark31</id>
+            <activation>
+                <activeByDefault>true</activeByDefault>
+            </activation>
+            <properties>
+                <scala.binary.version>2.12</scala.binary.version>
+                <scala.version>2.12.11</scala.version>
+                <spark.version>3.1.2</spark.version>
+            </properties>
+            <dependencies>
+                <dependency>
+                    <groupId>org.scalatest</groupId>
+                    <artifactId>scalatest_${scala.binary.version}</artifactId>
+                    <version>3.2.6</version>
+                    <scope>test</scope>
+                </dependency>
+                <dependency>
+                    <groupId>com.microsoft.sqlserver</groupId>
+                    <artifactId>mssql-jdbc</artifactId>
+                    <version>8.4.1.jre8</version>
+                </dependency>
+            </dependencies>
+        </profile>
     </profiles>
 </project>
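
Usage note: since the remaining spark31 profile is marked activeByDefault, a plain mvn clean package now builds against Spark 3.1.2 and Scala 2.12; the CI workflow above selects the same profile explicitly via -P ${{ matrix.profile }}.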

src/main/scala/com/microsoft/sqlserver/jdbc/spark/connectors/ReliableSingleInstanceStrategy.scala

Lines changed: 12 additions & 8 deletions
@@ -36,7 +36,7 @@ object ReliableSingleInstanceStrategy extends DataIOStrategy with Logging {
   * Phase 2 Driver combines all staging tables to transactionally write to user specified table.
   * Driver does a cleanup of staging tables as a good practice. Staging tables are temporary tables
   * and should be cleanup automatically on job completion. Staging table names are prefixed
-  * with appId to allow for identification.
+  * with appId and destination fully qualified table name to allow for identification.
   * @param df dataframe to write
   * @param dfColMetaData for the table
   * @param options user specified options
@@ -50,7 +50,7 @@ object ReliableSingleInstanceStrategy extends DataIOStrategy with Logging {
     logInfo("write : reliable write to single instance called")
     // Initialize - create connection and cleanup existing tables if any
     val conn = createConnectionFactory(options)()
-    val stagingTableList = getStagingTableNames(appId, df.rdd.getNumPartitions)
+    val stagingTableList = getStagingTableNames(appId, options.dbtable, df.rdd.getNumPartitions)
     cleanupStagingTables(conn, stagingTableList, options)
     createStagingTables(conn, stagingTableList,options)
     // Phase1 - Executors write partitions to staging tables.
@@ -93,7 +93,7 @@ object ReliableSingleInstanceStrategy extends DataIOStrategy with Logging {
     try {
       df.rdd.mapPartitionsWithIndex(
         (index, iterator) => {
-          val table_name = getStagingTableName(appId,index)
+          val table_name = getStagingTableName(appId,options.dbtable,index)
           logDebug(s"writeToStagingTables: Writing partition index $index to Table $table_name")
           val newOptions = new SQLServerBulkJdbcOptions(options.parameters + ("tableLock" -> "true"))
           idempotentInsertToTable(iterator, table_name, dfColMetadata, newOptions)
@@ -157,28 +157,32 @@ object ReliableSingleInstanceStrategy extends DataIOStrategy with Logging {

   /**
    * utility function to get all global temp table names as a list.
-   * @param appId appId used as prefix of tablename
-   * @param nrOfPartitions number of paritions in dataframe used as suffix
+   * @param appId appId used as prefix of staging table name
+   * @param dbtable destination fully qualified table name used as part of temp staging table name
+   * @param nrOfPartitions number of partitions in dataframe used as suffix
    */
   private def getStagingTableNames(
       appId: String,
+      dbtable: String,
       nrOfPartitions: Int): IndexedSeq[String] = {
     val stagingTableList = for (index <- 0 until nrOfPartitions) yield {
-      getStagingTableName(appId, index)
+      getStagingTableName(appId, dbtable, index)
     }
     stagingTableList
   }

   /**
    * utility function to create a staging table name
-   * @param appId appId used as prefix of tablename
+   * @param appId appId used as prefix of table name
+   * @param dbtable destination fully qualified table name used as part of temp staging table name
    * @param index used as suffix
    */
   private def getStagingTableName(
       appId: String,
+      dbtable: String,
       index:Int) : String = {
     // Global table names in SQLServer are prefixed with ##
-    s"[##${appId}_${index}]"
+    s"[##${appId}_${dbtable}_${index}]"
   }

   /**
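
To make the renaming concrete, a small self-contained sketch of the staging-table naming scheme after this change (a hypothetical standalone helper for illustration, not the connector's internal code):

    // Staging table names now embed both the Spark appId and the destination table,
    // so two writes from the same application to different tables get distinct
    // global temp tables instead of colliding on appId + partition index alone.
    object StagingNameSketch {
      // Global temp table names in SQL Server are prefixed with ##
      def stagingTableName(appId: String, dbtable: String, index: Int): String =
        s"[##${appId}_${dbtable}_${index}]"

      def main(args: Array[String]): Unit = {
        // Prints: [##app-20210601123456-0001_dbo.target_table_0]
        println(stagingTableName("app-20210601123456-0001", "dbo.target_table", 0))
      }
    }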

src/main/scala/com/microsoft/sqlserver/jdbc/spark/utils/BulkCopyUtils.scala

Lines changed: 2 additions & 2 deletions
@@ -507,7 +507,7 @@ object BulkCopyUtils extends Logging {
       df: DataFrame,
       options: SQLServerBulkJdbcOptions): Unit = {
     logDebug("Creating table")
-    val strSchema = schemaString(df, options.url, options.createTableColumnTypes)
+    val strSchema = schemaString(df.schema, true, options.url, options.createTableColumnTypes)
     val createTableStr = s"CREATE TABLE ${options.dbtable} (${strSchema}) ${options.createTableOptions}"
     executeUpdate(conn,createTableStr)
     logDebug("Creating table succeeded")
@@ -525,7 +525,7 @@ object BulkCopyUtils extends Logging {
       df: DataFrame,
       options: SQLServerBulkJdbcOptions): Unit = {
     logDebug(s"Creating external table ${options.dbtable}")
-    val strSchema = schemaString(df, "jdbc:sqlserver")
+    val strSchema = schemaString(df.schema, true, "jdbc:sqlserver")
     val createExTableStr = s"CREATE EXTERNAL TABLE ${options.dbtable} (${strSchema}) " +
       s"WITH (DATA_SOURCE=${options.dataPoolDataSource}, DISTRIBUTION=${options.dataPoolDistPolicy});"
     executeUpdate(conn,createExTableStr)
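
The extra arguments appear to track the Spark 3.1 shape of JdbcUtils.schemaString, which takes a schema and a case-sensitivity flag rather than a DataFrame. A rough sketch of the new call shape, under that assumption:

    // Sketch only: assumes Spark 3.1's JdbcUtils.schemaString(schema, caseSensitive, url, ...)
    // signature, matching the two calls in the diff above.
    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils.schemaString

    object SchemaStringSketch {
      def createTableDdl(df: DataFrame, dbtable: String, url: String): String = {
        // 'true' is the case-sensitivity flag added in the new signature
        val strSchema = schemaString(df.schema, true, url)
        s"CREATE TABLE $dbtable ($strSchema)"
      }
    }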

src/test/scala/com/microsoft/sqlserver/jdbc/spark/bulkwrite/DataSourceTest.scala

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 package com.microsoft.sqlserver.jdbc.spark
 import java.sql.Connection

-import org.scalatest.Matchers
+import org.scalatest.matchers.should.Matchers
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.test.SharedSparkSession
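
The import moves because ScalaTest 3.x relocated the matchers DSL; with the ScalaTest 3.2.6 dependency introduced in the pom above, suites mix in Matchers from its new package. A minimal illustration (a hypothetical standalone suite, using AnyFunSuite rather than the project's SparkFunSuite):

    // Shows the ScalaTest 3.2.x import style used by this commit.
    import org.scalatest.funsuite.AnyFunSuite
    import org.scalatest.matchers.should.Matchers

    class MatchersImportSketch extends AnyFunSuite with Matchers {
      test("should-style matchers resolve from the new package") {
        val stagingName = s"[##app_dbo.table_${0}]"
        stagingName should include ("dbo.table")
      }
    }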

test/scala_test/pom.xml

Lines changed: 5 additions & 10 deletions
@@ -123,19 +123,14 @@
     </build>
     <profiles>
         <profile>
-            <id>spark24</id>
-            <properties>
-                <scala.binary.version>2.11</scala.binary.version>
-                <scala.version>2.11.12</scala.version>
-                <spark.version>2.4.6</spark.version>
-            </properties>
-        </profile>
-        <profile>
-            <id>spark30</id>
+            <id>spark31</id>
+            <activation>
+                <activeByDefault>true</activeByDefault>
+            </activation>
             <properties>
                 <scala.binary.version>2.12</scala.binary.version>
                 <scala.version>2.12.11</scala.version>
-                <spark.version>3.0.0</spark.version>
+                <spark.version>3.1.2</spark.version>
             </properties>
         </profile>
     </profiles>

test/scala_test/src/main/scala/MasterInstanceTest.scala

Lines changed: 31 additions & 0 deletions
@@ -17,6 +17,10 @@ import org.apache.spark.sql.{SparkSession, SaveMode, Row, DataFrame}
 import org.apache.spark.sql.functions.asc
 import org.apache.spark.sql.types._

+import java.util.concurrent.Executors
+import scala.concurrent.duration.Duration
+import scala.concurrent.{Await, ExecutionContext, Future}
+
 /*
  * MasterInstanceTest
  * test cases for master instance. Most test can we used for Spark JDBC
@@ -536,4 +540,31 @@ class MasterInstanceTest(testUtils:Connector_TestUtils) {
     log.info("test_gci_reordered_columns : Reordered Write overwrite without truncate")
     testUtils.drop_test_table(table_name)
   }
+
+  // Test basic functionalities of writing to different databases in parallel
+  def test_gci_write_parallel() {
+    // Allowing a maximum of 2 threads to run
+    val executorService = Executors.newFixedThreadPool(2)
+    implicit val executionContext = ExecutionContext.fromExecutorService(executorService)
+
+    val table_name1 = s"test_write_parallel_1_${testType}"
+    val table_name2 = s"test_write_parallel_2_${testType}"
+    val df = testUtils.create_toy_df()
+    val futureA = Future {
+      testUtils.df_write(df, SaveMode.Overwrite, table_name1)
+    }
+    val futureB = Future {
+      testUtils.df_write(df, SaveMode.Overwrite, table_name2)
+    }
+    Await.result(futureA, Duration.Inf)
+    Await.result(futureB, Duration.Inf)
+
+    var result1 = testUtils.df_read(table_name1)
+    assert(df.schema == result1.schema)
+    var result2 = testUtils.df_read(table_name2)
+    assert(df.schema == result2.schema)
+    log.info("test_write_parallel : Exit")
+    testUtils.drop_test_table(table_name1)
+    testUtils.drop_test_table(table_name2)
+  }
 }
