Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 218 additions & 0 deletions platforms/emr-airgapped/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,224 @@ Define the necessary Spark configuration parameters under the **Software Setting
}
]
```

**Config for Python 3.11**
```
[
{
"Classification": "spark",
"Properties": {
"maximizeResourceAllocation": "true"
}
},
{
"Classification": "spark-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"JSL_EMR": "1",
"PYSPARK_DRIVER_PYTHON": "/usr/bin/python3.11",
"PYSPARK_PYTHON": "/usr/bin/python3.11",
"SPARK_NLP_LICENSE": ""
}
}
],
"Properties": {}
},
{
"Classification": "yarn-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"JSL_EMR": "1",
"SPARK_NLP_LICENSE": ""
}
}
],
"Properties": {}
},
{
"Classification": "livy-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"PYSPARK_DRIVER_PYTHON": "/usr/bin/python3.11",
"PYSPARK_PYTHON": "/usr/bin/python3.11"
}
}
],
"Properties": {}
},
{
"Classification": "livy-conf",
"Properties": {
"livy.server.session.conf.spark.executorEnv.PYSPARK_PYTHON": "/usr/bin/python3.11",
"livy.server.session.conf.spark.pyspark.driver.python": "/usr/bin/python3.11",
"livy.server.session.conf.spark.pyspark.python": "/usr/bin/python3.11",
"livy.server.session.conf.spark.pyspark.virtualenv.enabled": "false",
"livy.server.session.conf.spark.yarn.appMasterEnv.PYSPARK_PYTHON": "/usr/bin/python3.11"
}
},
{
"Classification": "spark-defaults",
"Properties": {
"spark.driver.maxResultSize": "0",
"spark.dynamicAllocation.enabled": "true",
"spark.executorEnv.SPARK_NLP_LICENSE": "",
"spark.jsl.settings.aws.credentials.access_key_id": "",
"spark.jsl.settings.aws.credentials.secret_access_key": "",
"spark.jsl.settings.aws.region": "us-east-1",
"spark.jsl.settings.pretrained.credentials.access_key_id": "",
"spark.jsl.settings.pretrained.credentials.secret_access_key": "",
"spark.jsl.settings.storage.cluster_tmp_dir": "hdfs:///tmp",
"spark.kryoserializer.buffer.max": "2000M",
"spark.pyspark.driver.python": "/usr/bin/python3.11",
"spark.pyspark.python": "/usr/bin/python3.11",
"spark.rpc.message.maxSize": "1024",
"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
"spark.yarn.appMasterEnv.SPARK_NLP_LICENSE": "",
"spark.yarn.preserve.staging.files": "true",
"spark.yarn.stagingDir": "hdfs:///tmp"
}
}
]
```

**Config for GPU setup**
For a GPU setup, you need to select a GPU instance type.
![gpu instance](gpu-instance.png)

The config for the GPU setup:
```
[
{
"Classification": "spark",
"Properties": {
"maximizeResourceAllocation": "true"
}
},
{
"Classification": "spark-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"JSL_EMR": "1",
"PYSPARK_DRIVER_PYTHON": "/usr/bin/python3.11",
"PYSPARK_PYTHON": "/usr/bin/python3.11",
"SPARK_NLP_LICENSE": "XXXXXXXXX"
}
}
],
"Properties": {}
},
{
"Classification": "yarn-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"JSL_EMR": "1",
"SPARK_NLP_LICENSE": "XXXXXXXXX"
}
}
],
"Properties": {}
},
{
"Classification": "livy-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"PYSPARK_DRIVER_PYTHON": "/usr/bin/python3.11",
"PYSPARK_PYTHON": "/usr/bin/python3.11"
}
}
],
"Properties": {}
},
{
"Classification": "livy-conf",
"Properties": {
"livy.server.session.conf.spark.executorEnv.PYSPARK_PYTHON": "/usr/bin/python3.11",
"livy.server.session.conf.spark.pyspark.driver.python": "/usr/bin/python3.11",
"livy.server.session.conf.spark.pyspark.python": "/usr/bin/python3.11",
"livy.server.session.conf.spark.pyspark.virtualenv.enabled": "false",
"livy.server.session.conf.spark.yarn.appMasterEnv.PYSPARK_PYTHON": "/usr/bin/python3.11"
}
},
{
"Classification": "yarn-site",
"Properties": {
      "yarn.nodemanager.container-executor.class": "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor",
"yarn.nodemanager.linux-container-executor.cgroups.hierarchy": "yarn",
"yarn.nodemanager.linux-container-executor.cgroups.mount": "true",
      "yarn.nodemanager.linux-container-executor.cgroups.mount-path": "/spark-rapids-cgroup",
"yarn.nodemanager.resource-plugins": "yarn.io/gpu",
"yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices": "auto",
"yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables": "/usr/bin",
"yarn.resource-types": "yarn.io/gpu"
}
},
{
"Classification": "container-executor",
"Configurations": [
{
"Classification": "gpu",
"Properties": {
"module.enabled": "true"
}
},
{
"Classification": "cgroups",
"Properties": {
"root": "/spark-rapids-cgroup",
"yarn-hierarchy": "yarn"
}
}
],
"Properties": {}
},
{
"Classification": "spark-defaults",
"Properties": {
"spark.driver.maxResultSize": "0",
"spark.dynamicAllocation.enabled": "true",
"spark.executor.extraLibraryPath": "/usr/local/cuda/targets/x86_64-linux/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/compat/lib:/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native",
"spark.executor.resource.gpu.discoveryScript": "/usr/lib/spark/scripts/gpu/getGpusResources.sh",
"spark.executorEnv.SPARK_NLP_LICENSE": "XXXXXXX",
"spark.jsl.settings.aws.credentials.access_key_id": "XXXXXXXXX",
"spark.jsl.settings.aws.credentials.secret_access_key": "XXXXXXXXX",
"spark.jsl.settings.aws.region": "us-east-1",
"spark.jsl.settings.pretrained.credentials.access_key_id": "XXXXXXXX",
"spark.jsl.settings.pretrained.credentials.secret_access_key": "XXXXXXXXXX",
"spark.jsl.settings.storage.cluster_tmp_dir": "hdfs:///tmp",
"spark.kryoserializer.buffer.max": "2000M",
"spark.plugins": "com.nvidia.spark.SQLPlugin",
"spark.pyspark.driver.python": "/usr/bin/python3.11",
"spark.pyspark.python": "/usr/bin/python3.11",
"spark.rpc.message.maxSize": "1024",
"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
"spark.yarn.appMasterEnv.SPARK_NLP_LICENSE": "XXXXXXXXXXX",
"spark.yarn.preserve.staging.files": "true",
"spark.yarn.stagingDir": "hdfs:///tmp"
}
}
]
```

**For Python 3.11**
To use Python 3.11, specify the following block of config in your notebook's first cell.
```
%%configure -f
{ "conf": { "spark.yarn.appMasterEnv.PYSPARK_PYTHON": "/usr/bin/python3.11",
"spark.executorEnv.PYSPARK_PYTHON": "/usr/bin/python3.11" } }
```

**__Important__**
Make sure to replace all placeholder values (marked as `XXXXXXXXX`) with the actual credentials provided with your license.
If your EMR cluster is truly **air-gapped**, you do **not** need to specify `access_key_id` or `secret_access_key` in the configuration — since the `pretrained()` function cannot be used to download models in an offline environment.
Expand Down
Binary file added platforms/emr-airgapped/gpu-instance.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file.