Commit e098334

chore: downgrade to spark3.3
Co-authored-by: Brendan Walsh <[email protected]>
Co-authored-by: Mark Hamilton <[email protected]>
1 parent d9149d1 commit e098334

File tree: 21 files changed, +64 -74 lines


build.sbt  (+7, -9)

@@ -7,10 +7,10 @@ import scala.xml.transform.{RewriteRule, RuleTransformer}
 import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _}

 val condaEnvName = "synapseml"
-val sparkVersion = "3.4.1"
+val sparkVersion = "3.3.3"
 name := "synapseml"
 ThisBuild / organization := "com.microsoft.azure"
-ThisBuild / scalaVersion := "2.12.17"
+ThisBuild / scalaVersion := "2.12.15"

 val scalaMajorVersion = 2.12

@@ -20,27 +20,25 @@ val excludes = Seq(
 )

 val coreDependencies = Seq(
-  // Excluding protobuf-java, as spark-core is bringing the older version transitively.
-  "org.apache.spark" %% "spark-core" % sparkVersion % "compile" exclude("com.google.protobuf", "protobuf-java"),
+  "org.apache.spark" %% "spark-core" % sparkVersion % "compile",
   "org.apache.spark" %% "spark-mllib" % sparkVersion % "compile",
-  "org.apache.spark" %% "spark-avro" % sparkVersion % "compile",
+  "org.apache.spark" %% "spark-avro" % sparkVersion % "provided",
   "org.apache.spark" %% "spark-tags" % sparkVersion % "test",
   "com.globalmentor" % "hadoop-bare-naked-local-fs" % "0.1.0" % "test",
   "org.scalatest" %% "scalatest" % "3.2.14" % "test")
 val extraDependencies = Seq(
-  "commons-lang" % "commons-lang" % "2.6",
   "org.scalactic" %% "scalactic" % "3.2.14",
   "io.spray" %% "spray-json" % "1.3.5",
   "com.jcraft" % "jsch" % "0.1.54",
   "org.apache.httpcomponents.client5" % "httpclient5" % "5.1.3",
   "org.apache.httpcomponents" % "httpmime" % "4.5.13",
-  "com.linkedin.isolation-forest" %% "isolation-forest_3.4.2" % "3.0.4"
+  "com.linkedin.isolation-forest" %% "isolation-forest_3.3.3" % "3.0.4"
     exclude("com.google.protobuf", "protobuf-java") exclude("org.apache.spark", "spark-mllib_2.12")
     exclude("org.apache.spark", "spark-core_2.12") exclude("org.apache.spark", "spark-avro_2.12")
     exclude("org.apache.spark", "spark-sql_2.12"),
-  // Although breeze 2.1.0 is already provided by Spark, this is needed for Azure Synapse Spark 3.4 pools.
+  // Although breeze 1.2 is already provided by Spark, this is needed for Fabric Spark 3.3 pools.
   // Otherwise a NoSuchMethodError will be thrown by interpretability code.
-  "org.scalanlp" %% "breeze" % "2.1.0"
+  "org.scalanlp" %% "breeze" % "1.2"
 ).map(d => d excludeAll (excludes: _*))
 val dependencies = coreDependencies ++ extraDependencies

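Note on the breeze pin above: the in-diff comment explains that breeze 1.2 has to be packaged even though Spark already ships breeze, otherwise interpretability code throws a NoSuchMethodError on Fabric Spark 3.3 pools. A small diagnostic sketch, not part of the commit and with a made-up object name, for checking which breeze jar a pool actually loads:

// Hypothetical check, assuming breeze is on the classpath: print the jar that
// provides breeze.linalg.DenseVector, to confirm whether the pinned breeze 1.2
// or the runtime-provided copy wins at class-loading time.
object BreezeJarCheck {
  def main(args: Array[String]): Unit = {
    val source = Option(classOf[breeze.linalg.DenseVector[Double]]
      .getProtectionDomain.getCodeSource)
    println(source.map(_.getLocation.toString).getOrElse("unknown (no code source)"))
  }
}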
core/src/main/scala/com/microsoft/azure/synapse/ml/codegen/PyCodegen.scala  (+1, -1)

@@ -70,7 +70,7 @@ object PyCodegen {
     // There's `Already borrowed` error found in transformers 4.16.2 when using tokenizers
     s"""extras_require={"extras": [
        |  "cmake",
-       |  "horovod==0.28.1",
+       |  "horovod==0.27.0",
        |  "pytorch_lightning>=1.5.0,<1.5.10",
        |  "torch==1.13.1",
        |  "torchvision>=0.14.1",

core/src/main/scala/com/microsoft/azure/synapse/ml/core/env/PackageUtils.scala  (+1, -1)

@@ -21,7 +21,7 @@ object PackageUtils {
   // Use a fixed version for local testing
   // val PackageMavenCoordinate = s"$PackageGroup:$PackageName:1.0.8"

-  private val AvroCoordinate = "org.apache.spark:spark-avro_2.12:3.4.1"
+  private val AvroCoordinate = "org.apache.spark:spark-avro_2.12:3.3.3"
   val PackageRepository: String = SparkMLRepository

   // If testing onnx package with snapshots repo, make sure to switch to using

core/src/main/scala/com/microsoft/azure/synapse/ml/exploratory/DistributionBalanceMeasure.scala  (+1, -2)

@@ -3,7 +3,7 @@

 package com.microsoft.azure.synapse.ml.exploratory

-import breeze.stats.distributions.{ChiSquared, RandBasis}
+import breeze.stats.distributions.{ChiSquared}
 import com.microsoft.azure.synapse.ml.codegen.Wrappable
 import com.microsoft.azure.synapse.ml.core.schema.DatasetExtensions
 import com.microsoft.azure.synapse.ml.logging.{FeatureNames, SynapseMLLogging}

@@ -261,7 +261,6 @@ private[exploratory] case class DistributionMetrics(numFeatures: Int,

   // Calculates left-tailed p-value from degrees of freedom and chi-squared test statistic
   def chiSquaredPValue: Column = {
-    implicit val rand: RandBasis = RandBasis.mt0
     val degOfFreedom = numFeatures - 1
     val scoreCol = chiSquaredTestStatistic
     val chiSqPValueUdf = udf(

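The dropped RandBasis import and implicit reflect a breeze API difference: breeze 2.x makes ChiSquared take an explicit implicit RandBasis, while breeze 1.x falls back to a default one. A minimal sketch of a left-tailed p-value under that assumption (the helper name is illustrative, not the project's code):

import breeze.stats.distributions.ChiSquared

// Sketch assuming breeze 1.x, where ChiSquared(df) needs no explicit RandBasis.
// Left-tailed p-value = CDF of the chi-squared distribution at the test statistic.
def leftTailedPValue(testStatistic: Double, degreesOfFreedom: Int): Double =
  ChiSquared(degreesOfFreedom.toDouble).cdf(testStatistic)

// Example: statistic 7.8 with 3 degrees of freedom.
println(leftTailedPValue(7.8, 3))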
core/src/main/scala/com/microsoft/azure/synapse/ml/nn/BallTree.scala  (+8, -7)

@@ -8,6 +8,7 @@ import com.microsoft.azure.synapse.ml.core.env.StreamUtilities.using

 import java.io._
 import scala.collection.JavaConverters._
+import breeze.linalg.functions.euclideanDistance

 private case class Query(point: DenseVector[Double],
                          normOfQueryPoint: Double,

@@ -100,16 +101,16 @@ trait BallTreeBase[V] {
 }

 /** Performs fast lookups of nearest neighbors using the Ball Tree algorithm for space partitioning
-  *
-  * Note that this code borrows heavily from
-  * https://github.com/felixmaximilian/mips
-  *
-  * @author Felix Maximilian
-  */
+ *
+ * Note that this code borrows heavily from
+ * https://github.com/felixmaximilian/mips
+ *
+ * @author Felix Maximilian
+ */
 case class BallTree[V](override val keys: IndexedSeq[DenseVector[Double]],
                        override val values: IndexedSeq[V],
                        override val leafSize: Int = 50) //scalastyle:ignore magic.number
-  extends Serializable with BallTreeBase[V] {
+    extends Serializable with BallTreeBase[V] {

   private val root: Node = makeBallTree(pointIdx)

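The newly imported euclideanDistance is breeze's built-in UFunc for plain L2 distance between dense vectors. A tiny standalone usage sketch (not taken from the file):

import breeze.linalg.DenseVector
import breeze.linalg.functions.euclideanDistance

// euclideanDistance: square root of the summed squared element-wise differences.
val a = DenseVector(1.0, 2.0, 3.0)
val b = DenseVector(4.0, 6.0, 3.0)
println(euclideanDistance(a, b)) // 5.0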
core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala  (+5, -8)

@@ -199,20 +199,17 @@ object SparkHelpers {

   def flatten(ratings: Dataset[_], num: Int, dstOutputColumn: String, srcOutputColumn: String): DataFrame = {
     import ratings.sparkSession.implicits._
-    import org.apache.spark.sql.functions.{collect_top_k, struct}
+
+    val topKAggregator = new TopByKeyAggregator[Int, Int, Float](num, Ordering.by(_._2))
+    val recs = ratings.as[(Int, Int, Float)].groupByKey(_._1).agg(topKAggregator.toColumn)
+      .toDF("id", "recommendations")

     val arrayType = ArrayType(
       new StructType()
         .add(dstOutputColumn, IntegerType)
         .add(Constants.RatingCol, FloatType)
     )
-
-    ratings.toDF(srcOutputColumn, dstOutputColumn, Constants.RatingCol).groupBy(srcOutputColumn)
-      .agg(collect_top_k(struct(Constants.RatingCol, dstOutputColumn), num, false))
-      .as[(Int, Seq[(Float, Int)])]
-      .map(t => (t._1, t._2.map(p => (p._2, p._1))))
-      .toDF(srcOutputColumn, Constants.Recommendations)
-      .withColumn(Constants.Recommendations, col(Constants.Recommendations).cast(arrayType))
+    recs.select(col("id").as(srcOutputColumn), col("recommendations").cast(arrayType))
   }
 }

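The rewritten flatten drops collect_top_k, which only exists in Spark 3.4+, and recovers the per-key top-k with MLlib's TopByKeyAggregator, which Spark 3.3 already ships. A rough standalone sketch of that aggregation pattern, with toy data and made-up names; TopByKeyAggregator is package-private, which is why the sketch (like RecommendationHelper.scala) sits inside org.apache.spark.ml.recommendation:

// A rough sketch, not the commit's code: keep the two highest-rated dst values per src
// using TopByKeyAggregator on Spark 3.3.
package org.apache.spark.ml.recommendation

import org.apache.spark.sql.SparkSession

object TopKSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("topk-sketch").getOrCreate()
    import spark.implicits._

    // Toy (src, dst, rating) triples.
    val ratings = Seq((1, 10, 0.9f), (1, 11, 0.7f), (1, 12, 0.95f), (2, 10, 0.4f)).toDS()

    val topK = new TopByKeyAggregator[Int, Int, Float](2, Ordering.by(_._2))
    val recs = ratings
      .groupByKey(_._1)
      .agg(topK.toColumn)
      .toDF("src", "recommendations") // recommendations: array of (dst, rating) pairs

    recs.show(truncate = false)
    spark.stop()
  }
}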
core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala  (+1, -1)

@@ -101,7 +101,7 @@ object RTestGen {
          | "spark.sql.shuffle.partitions=10",
          | "spark.sql.crossJoin.enabled=true")
          |
-         |sc <- spark_connect(master = "local", version = "3.4.1", config = conf)
+         |sc <- spark_connect(master = "local", version = "3.3.3", config = conf)
          |
          |""".stripMargin, StandardOpenOption.CREATE)

core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala  (+8, -8)

@@ -31,11 +31,11 @@ object DatabricksUtilities {

   // ADB Info
   val Region = "eastus"
-  val PoolName = "synapseml-build-13.3"
-  val GpuPoolName = "synapseml-build-13.3-gpu"
-  val AdbRuntime = "13.3.x-scala2.12"
-  // https://docs.databricks.com/en/release-notes/runtime/13.3lts-ml.html
-  val AdbGpuRuntime = "13.3.x-gpu-ml-scala2.12"
+  val PoolName = "synapseml-build-12.2"
+  val GpuPoolName = "synapseml-build-12.2-gpu"
+  val AdbRuntime = "12.2.x-scala2.12"
+  // https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/
+  val AdbGpuRuntime = "12.2.x-gpu-ml-scala2.12"
   val NumWorkers = 5
   val AutoTerminationMinutes = 15

@@ -84,9 +84,9 @@ object DatabricksUtilities {
     Map("maven" -> Map("coordinates" -> PackageMavenCoordinate, "repo" -> PackageRepository)),
     Map("pypi" -> Map("package" -> "pytorch-lightning==1.5.0")),
     Map("pypi" -> Map("package" -> "torchvision==0.14.1")),
-    Map("pypi" -> Map("package" -> "transformers==4.32.1")),
-    Map("pypi" -> Map("package" -> "petastorm==0.12.0")),
-    Map("pypi" -> Map("package" -> "protobuf==3.20.3"))
+    Map("pypi" -> Map("package" -> "transformers==4.25.1")),
+    Map("pypi" -> Map("package" -> "petastorm==0.12.1")),
+    Map("pypi" -> Map("package" -> "protobuf==3.19.4"))
   ).toJson.compactPrint

   val RapidsInitScripts: String = List(

core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseExtension/SynapseExtensionUtilities.scala  (+1, -1)

@@ -83,7 +83,7 @@ object SynapseExtensionUtilities {
        |"{
        |  'Default${store}ArtifactId': '$storeId',
        |  'ExecutableFile': '$path',
-       |  'SparkVersion':'3.4',
+       |  'SparkVersion':'3.3',
        |  'SparkSettings': {
        |    'spark.jars.packages' : '$SparkMavenPackageList',
        |    'spark.jars.repositories' : '$SparkMavenRepositoryList',

core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala  (+1, -1)

@@ -255,7 +255,7 @@ object SynapseUtilities {
        |    "nodeSizeFamily": "MemoryOptimized",
        |    "provisioningState": "Succeeded",
        |    "sessionLevelPackagesEnabled": "true",
-       |    "sparkVersion": "3.4"
+       |    "sparkVersion": "3.3"
        |  }
        |}
        |""".stripMargin

deep-learning/src/main/python/horovod_installation.sh  (+5, -5)

@@ -8,9 +8,9 @@ set -eu
 # Install prerequisite libraries that horovod depends on
 pip install pytorch-lightning==1.5.0
 pip install torchvision==0.14.1
-pip install transformers==4.32.1
+pip install transformers==4.25.1
 pip install petastorm>=0.12.0
-pip install protobuf==3.20.3
+pip install protobuf==3.19.1

 # Remove Outdated Signing Key:
 sudo apt-key del 7fa2af80

@@ -35,13 +35,13 @@ libcusparse-dev-11-0=11.1.1.245-1

 git clone --recursive https://github.com/horovod/horovod.git
 cd horovod
-# git fetch origin refs/tags/v0.28.1:tags/v0.28.1
-git checkout 1d217b59949986d025f6db93c49943fb6b6cc78f
+# git fetch origin refs/tags/v0.27.0:tags/v0.27.0
+git checkout bfaca90d5cf66780a97d8799d4e1573855b64560
 git checkout -b tmp-branch
 rm -rf build/ dist/
 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11/ HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 \
   /databricks/python3/bin/python setup.py bdist_wheel

 readlink -f dist/horovod-*.whl

-pip install --no-cache-dir dist/horovod-0.28.1-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps
+pip install --no-cache-dir dist/horovod-0.27.0-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps

deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py  (+3, -3)

@@ -11,12 +11,12 @@
 if _TRANSFORMERS_AVAILABLE:
     import transformers

-    _TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1"
-    if _TRANSFORMERS_EQUAL_4_32_1:
+    _TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1"
+    if _TRANSFORMERS_EQUAL_4_25_1:
         from transformers import AutoTokenizer
     else:
         raise RuntimeError(
-            "transformers should be == 4.32.1, found: {}".format(
+            "transformers should be == 4.25.1, found: {}".format(
                 transformers.__version__
             )
         )

deep-learning/src/main/python/synapse/ml/dl/DeepVisionClassifier.py  (+3, -3)

@@ -19,10 +19,10 @@
 if _HOROVOD_AVAILABLE:
     import horovod

-    _HOROVOD_EQUAL_0_28_1 = horovod.__version__ == "0.28.1"
-    if not _HOROVOD_EQUAL_0_28_1:
+    _HOROVOD_EQUAL_0_27_0 = horovod.__version__ == "0.27.0"
+    if not _HOROVOD_EQUAL_0_27_0:
         raise RuntimeError(
-            "horovod should be of version 0.28.1, found: {}".format(horovod.__version__)
+            "horovod should be of version 0.27.0, found: {}".format(horovod.__version__)
         )
 else:
     raise ModuleNotFoundError("module not found: horovod")

deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py  (+3, -3)

@@ -13,12 +13,12 @@
 if _TRANSFORMERS_AVAILABLE:
     import transformers

-    _TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1"
-    if _TRANSFORMERS_EQUAL_4_32_1:
+    _TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1"
+    if _TRANSFORMERS_EQUAL_4_25_1:
         from transformers import AutoModelForSequenceClassification
     else:
         raise RuntimeError(
-            "transformers should be == 4.32.1, found: {}".format(
+            "transformers should be == 4.25.1, found: {}".format(
                 transformers.__version__
             )
         )

environment.yml  (+9, -9)

@@ -3,18 +3,18 @@ channels:
   - conda-forge
   - default
 dependencies:
-  - python=3.11.8
+  - python=3.8.8
   - requests=2.26.0
   - pip=21.3
   - r-base=4.1.1
   - r-sparklyr=1.8.1
   - r-devtools=2.4.2
   - pip:
       - pyarrow>=0.15.0
-      - pyspark==3.4.1
-      - pandas==1.4.0
+      - pyspark==3.3.3
+      - pandas==1.2.5
       - wheel
-      - sphinx==5.0.2
+      - sphinx==4.2.0
       - sphinx_paramlinks==0.5.2
       - sphinx_rtd_theme
       - coverage

@@ -32,17 +32,17 @@ dependencies:
       - twine
       - mlflow
       - numpy
-      - torch==2.0.0
-      - torchvision==0.15.1
-      - horovod==0.28.1
+      - torch==1.13.1
+      - torchvision==0.14.1
+      - horovod==0.27.0
       - petastorm>=0.11.0
       - pytorch_lightning==1.5.0
       - onnxmltools==1.7.0
       - matplotlib
       - Pillow
-      - transformers==4.32.1
+      - transformers==4.25.1
       - huggingface-hub>=0.8.1
-      - langchain==0.0.152
+      - langchain==0.0.151
       - openai==0.27.5
       - black==22.3.0
       - black[jupyter]==22.3.0

pipeline.yaml  (+1, -1)

@@ -461,7 +461,7 @@ jobs:
           (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup)
           sbt codegen
           sbt publishM2
-          SPARK_VERSION=3.4.1
+          SPARK_VERSION=3.3.3
           HADOOP_VERSION=3
           wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
       - task: AzureCLI@2

project/plugins.sbt  (+2, -6)

@@ -4,10 +4,6 @@ addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0")
 addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8")
 addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")
 addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.0.0")
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.8")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.2")
 addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.10.0-RC1")
-addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.26.0")
-
-ThisBuild / libraryDependencySchemes ++= Seq(
-  "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always
-)
+addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.26.0")

start  (+1, -2)

@@ -1,8 +1,7 @@
 #!/bin/bash

 export OPENMPI_VERSION="3.1.2"
-
-export SPARK_VERSION="3.4.1"
+export SPARK_VERSION="3.3.3"
 export HADOOP_VERSION="3.3"
 export SYNAPSEML_VERSION="1.0.8" # Binder compatibility version

tools/docker/demo/Dockerfile  (+1, -1)

@@ -3,7 +3,7 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
 ARG SYNAPSEML_VERSION=1.0.8
 ARG DEBIAN_FRONTEND=noninteractive

-ENV SPARK_VERSION=3.4.1
+ENV SPARK_VERSION=3.3.3
 ENV HADOOP_VERSION=3
 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
 ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64

tools/docker/minimal/Dockerfile  (+1, -1)

@@ -3,7 +3,7 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
 ARG SYNAPSEML_VERSION=1.0.8
 ARG DEBIAN_FRONTEND=noninteractive

-ENV SPARK_VERSION=3.4.1
+ENV SPARK_VERSION=3.3.3
 ENV HADOOP_VERSION=3
 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
 ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64

tools/tests/run_r_tests.R  (+1, -1)

@@ -3,7 +3,7 @@ if (!require("sparklyr")) {
   library("sparklyr")
 }

-spark_install_tar(paste(getwd(), "/../../../../../../spark-3.4.1-bin-hadoop3.tgz", sep = ""))
+spark_install_tar(paste(getwd(), "/../../../../../../spark-3.3.3-bin-hadoop3.tgz", sep = ""))

 options("testthat.output_file" = "../../../../r-test-results.xml")
 devtools::test(reporter = JunitReporter$new())
