fix: emr eks examples (#537)

vgkowski · web-flow · commit 091761a5747e · 2023-01-06T17:45:04.000+01:00
* change EMR on EKS example
diff --git a/.gitignore b/.gitignore
@@ -33,3 +33,4 @@ core/.github/*
 *.iml
 
 core/tmp.yaml
+examples/emr-eks-app/bin
diff --git a/core/src/integ.default.ts b/core/src/integ.default.ts
@@ -9,7 +9,7 @@ const mockApp = new App();
 const stack = new Stack(mockApp, 'EmrEksClustereE2eTest');
 
 const emrEks = EmrEksCluster.getOrCreate(stack, {
-  eksAdminRoleArn: 'arn:aws:iam::111111111111:role/myrole',
+  eksAdminRoleArn: 'arn:aws:iam::123445678912:role/gromav',
   autoscaling: Autoscaler.KARPENTER,
 });
 
diff --git a/examples/emr-eks-app/.gitignore b/examples/emr-eks-app/.gitignore
@@ -46,3 +46,4 @@ junit.xml
 .jsii
 tsconfig.json
 !/API.md
+bin/*
diff --git a/examples/emr-eks-app/README.md b/examples/emr-eks-app/README.md
@@ -1,14 +1,60 @@
-# Welcome to your CDK TypeScript project
+# Data platform with Amazon EMR on EKS and EMR Studio
+This example demonstrates how to consume the AWS Analytics Reference Architecture constructs library to implement a data platform on Amazon EKS. The constructs simplify the setup of Amazon EKS, Amazon EMR on EKS and Amazon EMR Studio.
 
-This is a blank project for CDK development with TypeScript.
+This example will provision the following resources and features:
 
-The `cdk.json` file tells the CDK Toolkit how to execute your app.
+* An EKS cluster configured with Karpenter autoscaler (can be replaced with the Cluster Autoscaler)
+* Predefined Karpenter provisioners (or managed nodegroups) to autoscale the EKS resources from 0 to X
+* EMR on EKS virtual clusters for batch job and notebooks with best practices configured (autoscaling, spot, graviton, local disks...)
+* IAM execution roles scoped down to only the EMR on EKS pods of a given namespace
+* An EMR Studio with an IAM user (IAM authentication can be replaced by SSO)
+* A Managed endpoint for integration with EMR Studio with permissions scoped to the user
+* The demo can be extended using the EmrEksCluster construct API and the NotebookDataPlatform construct API
 
-## Useful commands
+## Getting started
 
-* `npm run build`   compile typescript to js
-* `npm run watch`   watch for changes and compile
-* `npm run test`    perform the jest unit tests
-* `cdk deploy`      deploy this stack to your default AWS account/region
-* `cdk diff`        compare deployed stack with current state
-* `cdk synth`       emits the synthesized CloudFormation template
+### Set up the environment
+
+The `EmrEksCluster` construct requires an IAM Role to be set as the Amazon EKS administrator.
+Edit the `lib/emr-eks-app-stack.ts` file (line 16) and add the ARN of the IAM Role you want to use as administrator.
+
+The code should look like this:
+
+```
+    const emrEks = ara.EmrEksCluster.getOrCreate(this,{
+      eksAdminRoleArn: 'arn:aws:iam::1234567890:role/AdminAccess',
+      eksClusterName:'dataplatform',
+      autoscaling: ara.Autoscaler.KARPENTER,
+    });
+```
+
+### Provision the data platform
+
+1. Run `npm install` to install all the dependencies
+2. Run `npm run build && cdk deploy --app './bin/emr-eks-app.js'`
+3. The CDK application generates various outputs:
+    * The EMR on EKS configuration for submitting jobs of different SLA (critical or shared)
+    * The execution role ARN to use for submitting jobs
+    * The results bucket used to write the output of the example job
+    * The EMR on EKS virtual cluster ID, which is the entrypoint for submitting the example job
+    * Configuration commands for interacting with the EKS cluster (kubeconfig update, cluster access token, kubedashboard URL)
+    * EMR Studio URL
+
+### Run a batch job
+
+1. Modify the `spark-jobs/critical-job.json` or `spark-jobs/shared-job.json` file. Replace the following placeholders with the corresponding values from the `cdk deploy` outputs:
+
+    `<VIRTUAL_CLUSTER_ID>`
+    `<EXECUTION_ROLE_ARN>`
+    `<RESULTS_BUCKET>`
+    `<CRITICAL_CONFIG_JSON>` (or `<SHARED_CONFIG_JSON>` when using shared-job.json)
+
+2. Submit the job using the following CLI command:
+
+   `aws emr-containers start-job-run --cli-input-json file://spark-jobs/critical-job.json`
+
+### Cleanup
+
+1. Wait for jobs to finish
+2. Delete workspaces created in Amazon EMR Studio
+3. Run `cdk destroy`
diff --git a/examples/emr-eks-app/bin/emr-eks-app.ts b/examples/emr-eks-app/bin/emr-eks-app.ts
diff --git a/examples/emr-eks-app/lib/emr-eks-app-stack.ts b/examples/emr-eks-app/lib/emr-eks-app-stack.ts
@@ -2,30 +2,57 @@ import * as cdk from 'aws-cdk-lib';
 import { Construct } from 'constructs';
 import * as ara from 'aws-analytics-reference-architecture';
 import * as iam from 'aws-cdk-lib/aws-iam' ;
+import { User } from 'aws-cdk-lib/aws-iam';
 
 
 export class EmrEksAppStack extends cdk.Stack {
   constructor(scope: Construct, id: string, props?: cdk.StackProps) {
     super(scope, id, props);
 
+    const resultsBucket = ara.AraBucket.getOrCreate(this, {
+      bucketName: 'results-bucket',
+    })
+
     const emrEks = ara.EmrEksCluster.getOrCreate(this,{
-      eksAdminRoleArn:'',
-      eksClusterName:'',
+      eksAdminRoleArn: '<YOUR_ADMIN_ROLE>>',
+      eksClusterName:'dataplatform',
       autoscaling: ara.Autoscaler.KARPENTER,
     });
 
     const virtualCluster = emrEks.addEmrVirtualCluster(this,{
-      name:'my-emr-eks-cluster',
+      name:'batch-job-cluster',
       eksNamespace: 'batchjob',
       createNamespace: true,
     });
 
-    const emrEksPolicy = new iam.ManagedPolicy(this,'managed-policy',{
+    const emrEksPolicy = new iam.ManagedPolicy(this,'EmrPolicy',{
       statements: [
         new iam.PolicyStatement({
           effect: iam.Effect.ALLOW,
           actions:['s3:PutObject','s3:GetObject','s3:ListBucket'],
-          resources:['YOUR-S3-BUCKET'],
+          resources:[
+            resultsBucket.bucketArn,
+            resultsBucket.arnForObjects('*'),
+          ],
+        }),
+        new iam.PolicyStatement({
+          effect: iam.Effect.ALLOW,
+          actions:['s3:GetObject','s3:ListBucket'],
+          resources:[
+            "arn:aws:s3:::nyc-tlc",
+            "arn:aws:s3:::nyc-tlc/*",
+            "arn:aws:s3:::aws-data-lake-workshop/spark-eks/spark-eks-assembly-3.3.0.jar",
+          ],
+        }),
+        new iam.PolicyStatement({
+          effect: iam.Effect.ALLOW,
+          actions:['glue:*'],
+          resources:[
+            `arn:aws:glue:${cdk.Aws.REGION}:${cdk.Aws.ACCOUNT_ID}:catalog`,
+            `arn:aws:glue:${cdk.Aws.REGION}:${cdk.Aws.ACCOUNT_ID}:database/emr_eks_demo`,
+            `arn:aws:glue:${cdk.Aws.REGION}:${cdk.Aws.ACCOUNT_ID}:table/emr_eks_demo/value_rides`,
+            `arn:aws:glue:${cdk.Aws.REGION}:${cdk.Aws.ACCOUNT_ID}:table/emr_eks_demo/raw_rides`
+          ],
         }),
         new iam.PolicyStatement({
           effect: iam.Effect.ALLOW, actions:['logs:PutLogEvents','logs:CreateLogStream','logs:DescribeLogGroups','logs:DescribeLogStreams'],
@@ -35,32 +62,35 @@ export class EmrEksAppStack extends cdk.Stack {
     });
 
 
-    const role = emrEks.createExecutionRole(this,'emr-eks-execution-role',emrEksPolicy, 'batchjob','execRoleJob');
-
-    // Virtual cluster Id to reference in jobs
-    new cdk.CfnOutput(this, 'VirtualClusterId', { value: virtualCluster.attrId });
-    // Job config for each nodegroup
-    new cdk.CfnOutput(this, 'CriticalConfig', { value: emrEks.criticalDefaultConfig });
-    // Execution role arn
-    new cdk.CfnOutput(this, 'ExecRoleArn', { value: role.roleArn });
+    const role = emrEks.createExecutionRole(this,'EmrExecRole',emrEksPolicy, 'batchjob','execRoleJob');
 
+    const notebookUser = new User(this, 'NotebookUser', {userName: 'test'});
 
     const notebookPlatform = new ara.NotebookPlatform(this, 'platform-notebook', {
       emrEks: emrEks,
-      eksNamespace: 'dataanalysis',
+      eksNamespace: 'notebook',
       studioName: 'platform',
       studioAuthMode: ara.StudioAuthMode.IAM,
       });
     
       notebookPlatform.addUser([{
-        identityName:'',
+        iamUser: notebookUser,
         notebookManagedEndpoints: [{
-        emrOnEksVersion: ara.EmrVersion.V6_9,
-        executionPolicy: emrEksPolicy,
-        managedEndpointName: 'myendpoint'
-              }],
+          emrOnEksVersion: ara.EmrVersion.V6_9,
+          executionPolicy: emrEksPolicy,
+          managedEndpointName: 'platform-notebook'
+        }],
       }]);
  
 
+    // Virtual cluster Id to reference in jobs
+    new cdk.CfnOutput(this, 'VirtualClusterId', { value: virtualCluster.attrId });
+    // Job config for each nodegroup
+    new cdk.CfnOutput(this, 'CriticalConfig', { value: emrEks.criticalDefaultConfig });
+    new cdk.CfnOutput(this, 'SharedConfig', { value: emrEks.sharedDefaultConfig });
+    // Execution role arn
+    new cdk.CfnOutput(this, 'ExecRoleArn', { value: role.roleArn });
+    // Results bucket name
+    new cdk.CfnOutput(this, 'ResultsBucketName', { value: resultsBucket.bucketName });
   }
 }
diff --git a/examples/emr-eks-app/package.json b/examples/emr-eks-app/package.json
@@ -21,7 +21,7 @@
     "typescript": "^4.9.4"
   },
   "dependencies": {
-    "aws-analytics-reference-architecture": "2.6.0",
+    "aws-analytics-reference-architecture": "2.6.3",
     "aws-cdk-lib": "2.51.0",
     "constructs": "^10.0.0",
     "source-map-support": "^0.5.21"
diff --git a/examples/emr-eks-app/spark-jobs/critical-job-test.json b/examples/emr-eks-app/spark-jobs/critical-job-test.json
diff --git a/examples/emr-eks-app/spark-jobs/critical-job.json b/examples/emr-eks-app/spark-jobs/critical-job.json
@@ -5,9 +5,9 @@
   "releaseLabel": "emr-6.9.0-latest",
   "jobDriver": {
     "sparkSubmitJobDriver": {
-      "entryPoint": "s3://<REGION>.elasticmapreduce/emr-containers/samples/wordcount/scripts/wordcount.py",
-      "entryPointArguments": ["s3://<BUCKET>/wordcount_output"],
-      "sparkSubmitParameters": "--conf spark.executor.instances=2"
+      "entryPoint": "s3://aws-data-lake-workshop/spark-eks/spark-eks-assembly-3.3.0.jar",
+      "entryPointArguments": ["s3://nyc-tlc/csv_backup", "2017","s3://nyc-tlc/misc/taxi _zone_lookup.csv","s3://<RESULTS_BUCKET>/emr-eks-results", "emr_eks_demo"],
+      "sparkSubmitParameters": "--class ValueZones --conf spark.executor.instances=10"
     }
   },
   "configurationOverrides": <CRITICAL_CONFIG_JSON>
diff --git a/examples/emr-eks-app/spark-jobs/shared-job.json b/examples/emr-eks-app/spark-jobs/shared-job.json
@@ -5,9 +5,9 @@
   "releaseLabel": "emr-6.9.0-latest",
   "jobDriver": {
     "sparkSubmitJobDriver": {
-      "entryPoint": "s3://<REGION>.elasticmapreduce/emr-containers/samples/wordcount/scripts/wordcount.py",
-      "entryPointArguments": ["s3://<BUCKET>/wordcount_output"],
-      "sparkSubmitParameters": "--conf spark.executor.instances=2"
+      "entryPoint": "s3://aws-data-lake-workshop/spark-eks/spark-eks-assembly-3.3.0.jar",
+      "entryPointArguments": ["s3://nyc-tlc/csv_backup", "2017","s3://nyc-tlc/misc/taxi _zone_lookup.csv","s3://<RESULTS_BUCKET>/emr-eks-results", "emr_eks_demo"],
+      "sparkSubmitParameters": "--class ValueZones --conf spark.executor.instances=10"
     }
   },
   "configurationOverrides": <SHARED_CONFIG_JSON>

Original file line number	Diff line number	Diff line change
`@@ -33,3 +33,4 @@ core/.github/*`
`33`	`33`	`*.iml`
`34`	`34`
`35`	`35`	`core/tmp.yaml`
	`36`	`+examples/emr-eks-app/bin`