Skip to content
This repository was archived by the owner on Jul 16, 2024. It is now read-only.

Commit 091761a

Browse files
authored
fix: emr eks examples (#537)
* change EMR on EKS example
1 parent d07a599 commit 091761a

File tree

10 files changed

+115
-72
lines changed

10 files changed

+115
-72
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,4 @@ core/.github/*
3333
*.iml
3434

3535
core/tmp.yaml
36+
examples/emr-eks-app/bin

core/src/integ.default.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ const mockApp = new App();
99
const stack = new Stack(mockApp, 'EmrEksClustereE2eTest');
1010

1111
const emrEks = EmrEksCluster.getOrCreate(stack, {
12-
eksAdminRoleArn: 'arn:aws:iam::111111111111:role/myrole',
12+
eksAdminRoleArn: 'arn:aws:iam::123445678912:role/gromav',
1313
autoscaling: Autoscaler.KARPENTER,
1414
});
1515

examples/emr-eks-app/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,4 @@ junit.xml
4646
.jsii
4747
tsconfig.json
4848
!/API.md
49+
bin/*

examples/emr-eks-app/README.md

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,60 @@
1-
# Welcome to your CDK TypeScript project
1+
# Data platform with Amazon EMR on EKS and EMR Studio
2+
This example demonstrates how to consume the AWS Analytics Reference Architecture constructs library to implement a data platform on Amazon EKS. The constructs simplify the setup of Amazon EKS, Amazon EMR on EKS and Amazon EMR Studio.
23

3-
This is a blank project for CDK development with TypeScript.
4+
This example will provision the following resources and features:
45

5-
The `cdk.json` file tells the CDK Toolkit how to execute your app.
6+
* An EKS cluster configured with Karpenter autoscaler (can be replaced with the Cluster Autoscaler)
7+
* Predefined Karpenter provisioners (or managed nodegroups) to autoscale the EKS resources from 0 to X
8+
* EMR on EKS virtual clusters for batch jobs and notebooks with best practices configured (autoscaling, spot, graviton, local disks...)
9+
* IAM execution roles scoped down to EMR on EKS pods for a given namespace
10+
* An EMR Studio with IAM (can be replaced by SSO) user
11+
* A Managed endpoint for integration with EMR Studio with permissions scoped to the user
12+
* The demo can be extended using the EmrEksCluster construct API and the NotebookDataPlatform construct API
613

7-
## Useful commands
14+
## Getting started
815

9-
* `npm run build` compile typescript to js
10-
* `npm run watch` watch for changes and compile
11-
* `npm run test` perform the jest unit tests
12-
* `cdk deploy` deploy this stack to your default AWS account/region
13-
* `cdk diff` compare deployed stack with current state
14-
* `cdk synth` emits the synthesized CloudFormation template
16+
### Set up the environment
17+
18+
The `EmrEksCluster` construct requires an IAM Role to be set as the Amazon EKS administrator.
19+
Edit the `lib/emr-eks-app-stack.ts` file (line 16) and add the ARN of the IAM Role you want to use as administrator.
20+
21+
The code should look like this:
22+
23+
```
24+
const emrEks = ara.EmrEksCluster.getOrCreate(this,{
25+
eksAdminRoleArn: 'arn:aws:iam::1234567890:role/AdminAccess',
26+
eksClusterName:'dataplatform',
27+
autoscaling: ara.Autoscaler.KARPENTER,
28+
});
29+
```
30+
31+
### Provision the data platform
32+
33+
1. Run `npm install` to install all the dependencies
34+
2. Run `npm run build && cdk deploy --app './bin/emr-eks-app.js'`
35+
3. The CDK application generates various outputs:
36+
* The EMR on EKS configurations for submitting jobs with different SLAs (critical or shared)
37+
* The execution role ARN to use for submitting jobs
38+
* The results bucket used to write the output of the example job
39+
* The EMR on EKS virtual cluster ID, which is the entry point for submitting the example job
40+
* Configuration commands for interacting with the EKS cluster (kubeconfig update, cluster access token, kubedashboard URL)
41+
* EMR Studio URL
42+
43+
### Run a batch job
44+
45+
1. Modify the `critical-job.json` or `shared-job.json` file in the `spark-jobs` folder. Replace the following placeholders with the values from the `cdk deploy` outputs:
46+
47+
`<VIRTUAL_CLUSTER_ID>`
48+
`<EXECUTION_ROLE_ARN>`
49+
`<RESULTS_BUCKET>`
50+
`<CRITICAL_CONFIG_JSON>`
51+
52+
2. Submit the job using the following CLI command
53+
54+
`aws emr-containers start-job-run --cli-input-json file://spark-jobs/critical-job.json`
55+
56+
### Cleanup
57+
58+
1. Wait for jobs to finish
59+
2. Delete workspaces created in Amazon EMR Studio
60+
3. Run `cdk destroy`

examples/emr-eks-app/bin/emr-eks-app.ts

Lines changed: 0 additions & 21 deletions
This file was deleted.

examples/emr-eks-app/lib/emr-eks-app-stack.ts

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,57 @@ import * as cdk from 'aws-cdk-lib';
22
import { Construct } from 'constructs';
33
import * as ara from 'aws-analytics-reference-architecture';
44
import * as iam from 'aws-cdk-lib/aws-iam' ;
5+
import { User } from 'aws-cdk-lib/aws-iam';
56

67

78
export class EmrEksAppStack extends cdk.Stack {
89
constructor(scope: Construct, id: string, props?: cdk.StackProps) {
910
super(scope, id, props);
1011

12+
const resultsBucket = ara.AraBucket.getOrCreate(this, {
13+
bucketName: 'results-bucket',
14+
})
15+
1116
const emrEks = ara.EmrEksCluster.getOrCreate(this,{
12-
eksAdminRoleArn:'',
13-
eksClusterName:'',
17+
eksAdminRoleArn: '<YOUR_ADMIN_ROLE>>',
18+
eksClusterName:'dataplatform',
1419
autoscaling: ara.Autoscaler.KARPENTER,
1520
});
1621

1722
const virtualCluster = emrEks.addEmrVirtualCluster(this,{
18-
name:'my-emr-eks-cluster',
23+
name:'batch-job-cluster',
1924
eksNamespace: 'batchjob',
2025
createNamespace: true,
2126
});
2227

23-
const emrEksPolicy = new iam.ManagedPolicy(this,'managed-policy',{
28+
const emrEksPolicy = new iam.ManagedPolicy(this,'EmrPolicy',{
2429
statements: [
2530
new iam.PolicyStatement({
2631
effect: iam.Effect.ALLOW,
2732
actions:['s3:PutObject','s3:GetObject','s3:ListBucket'],
28-
resources:['YOUR-S3-BUCKET'],
33+
resources:[
34+
resultsBucket.bucketArn,
35+
resultsBucket.arnForObjects('*'),
36+
],
37+
}),
38+
new iam.PolicyStatement({
39+
effect: iam.Effect.ALLOW,
40+
actions:['s3:GetObject','s3:ListBucket'],
41+
resources:[
42+
"arn:aws:s3:::nyc-tlc",
43+
"arn:aws:s3:::nyc-tlc/*",
44+
"arn:aws:s3:::aws-data-lake-workshop/spark-eks/spark-eks-assembly-3.3.0.jar",
45+
],
46+
}),
47+
new iam.PolicyStatement({
48+
effect: iam.Effect.ALLOW,
49+
actions:['glue:*'],
50+
resources:[
51+
`arn:aws:glue:${cdk.Aws.REGION}:${cdk.Aws.ACCOUNT_ID}:catalog`,
52+
`arn:aws:glue:${cdk.Aws.REGION}:${cdk.Aws.ACCOUNT_ID}:database/emr_eks_demo`,
53+
`arn:aws:glue:${cdk.Aws.REGION}:${cdk.Aws.ACCOUNT_ID}:table/emr_eks_demo/value_rides`,
54+
`arn:aws:glue:${cdk.Aws.REGION}:${cdk.Aws.ACCOUNT_ID}:table/emr_eks_demo/raw_rides`
55+
],
2956
}),
3057
new iam.PolicyStatement({
3158
effect: iam.Effect.ALLOW, actions:['logs:PutLogEvents','logs:CreateLogStream','logs:DescribeLogGroups','logs:DescribeLogStreams'],
@@ -35,32 +62,35 @@ export class EmrEksAppStack extends cdk.Stack {
3562
});
3663

3764

38-
const role = emrEks.createExecutionRole(this,'emr-eks-execution-role',emrEksPolicy, 'batchjob','execRoleJob');
39-
40-
// Virtual cluster Id to reference in jobs
41-
new cdk.CfnOutput(this, 'VirtualClusterId', { value: virtualCluster.attrId });
42-
// Job config for each nodegroup
43-
new cdk.CfnOutput(this, 'CriticalConfig', { value: emrEks.criticalDefaultConfig });
44-
// Execution role arn
45-
new cdk.CfnOutput(this, 'ExecRoleArn', { value: role.roleArn });
65+
const role = emrEks.createExecutionRole(this,'EmrExecRole',emrEksPolicy, 'batchjob','execRoleJob');
4666

67+
const notebookUser = new User(this, 'NotebookUser', {userName: 'test'});
4768

4869
const notebookPlatform = new ara.NotebookPlatform(this, 'platform-notebook', {
4970
emrEks: emrEks,
50-
eksNamespace: 'dataanalysis',
71+
eksNamespace: 'notebook',
5172
studioName: 'platform',
5273
studioAuthMode: ara.StudioAuthMode.IAM,
5374
});
5475

5576
notebookPlatform.addUser([{
56-
identityName:'',
77+
iamUser: notebookUser,
5778
notebookManagedEndpoints: [{
58-
emrOnEksVersion: ara.EmrVersion.V6_9,
59-
executionPolicy: emrEksPolicy,
60-
managedEndpointName: 'myendpoint'
61-
}],
79+
emrOnEksVersion: ara.EmrVersion.V6_9,
80+
executionPolicy: emrEksPolicy,
81+
managedEndpointName: 'platform-notebook'
82+
}],
6283
}]);
6384

6485

86+
// Virtual cluster Id to reference in jobs
87+
new cdk.CfnOutput(this, 'VirtualClusterId', { value: virtualCluster.attrId });
88+
// Job config for each nodegroup
89+
new cdk.CfnOutput(this, 'CriticalConfig', { value: emrEks.criticalDefaultConfig });
90+
new cdk.CfnOutput(this, 'SharedConfig', { value: emrEks.sharedDefaultConfig });
91+
// Execution role arn
92+
new cdk.CfnOutput(this, 'ExecRoleArn', { value: role.roleArn });
93+
// Results bucket name
94+
new cdk.CfnOutput(this, 'ResultsBucketName', { value: resultsBucket.bucketName });
6595
}
6696
}

examples/emr-eks-app/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"typescript": "^4.9.4"
2222
},
2323
"dependencies": {
24-
"aws-analytics-reference-architecture": "2.6.0",
24+
"aws-analytics-reference-architecture": "2.6.3",
2525
"aws-cdk-lib": "2.51.0",
2626
"constructs": "^10.0.0",
2727
"source-map-support": "^0.5.21"

examples/emr-eks-app/spark-jobs/critical-job-test.json

Lines changed: 0 additions & 14 deletions
This file was deleted.

examples/emr-eks-app/spark-jobs/critical-job.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
"releaseLabel": "emr-6.9.0-latest",
66
"jobDriver": {
77
"sparkSubmitJobDriver": {
8-
"entryPoint": "s3://<REGION>.elasticmapreduce/emr-containers/samples/wordcount/scripts/wordcount.py",
9-
"entryPointArguments": ["s3://<BUCKET>/wordcount_output"],
10-
"sparkSubmitParameters": "--conf spark.executor.instances=2"
8+
"entryPoint": "s3://aws-data-lake-workshop/spark-eks/spark-eks-assembly-3.3.0.jar",
9+
"entryPointArguments": ["s3://nyc-tlc/csv_backup", "2017","s3://nyc-tlc/misc/taxi _zone_lookup.csv","s3://<RESULTS_BUCKET>/emr-eks-results", "emr_eks_demo"],
10+
"sparkSubmitParameters": "--class ValueZones --conf spark.executor.instances=10"
1111
}
1212
},
1313
"configurationOverrides": <CRITICAL_CONFIG_JSON>

examples/emr-eks-app/spark-jobs/shared-job.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
"releaseLabel": "emr-6.9.0-latest",
66
"jobDriver": {
77
"sparkSubmitJobDriver": {
8-
"entryPoint": "s3://<REGION>.elasticmapreduce/emr-containers/samples/wordcount/scripts/wordcount.py",
9-
"entryPointArguments": ["s3://<BUCKET>/wordcount_output"],
10-
"sparkSubmitParameters": "--conf spark.executor.instances=2"
8+
"entryPoint": "s3://aws-data-lake-workshop/spark-eks/spark-eks-assembly-3.3.0.jar",
9+
"entryPointArguments": ["s3://nyc-tlc/csv_backup", "2017","s3://nyc-tlc/misc/taxi _zone_lookup.csv","s3://<RESULTS_BUCKET>/emr-eks-results", "emr_eks_demo"],
10+
"sparkSubmitParameters": "--class ValueZones --conf spark.executor.instances=10"
1111
}
1212
},
1313
"configurationOverrides": <SHARED_CONFIG_JSON>

0 commit comments

Comments
 (0)