
Commit 12468fc

final launch and all executions of models saved

1 parent a9b7fd9 · commit 12468fc

File tree

55 files changed: +1742386 additions, -0 deletions

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
#!/bin/bash
#
# Prepare the local landing directory used by the S3 download script.
mkdir -p data/raw
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
#!/bin/bash
#
## Download one day of logs from S3 and land them in HDFS.
#
# Date parameter in YYMMDD form; defaults to 190101.
DATE_V1=$1
if [ -z "$DATE_V1" ]
then
    DATE_V1=190101
fi
#
echo "$DATE_V1"
#
# Make sure a Kerberos ticket is cached before touching HDFS.
#
if [ -f ~/.keytabs/$(whoami).keytab ]; then
    # for any:any credentials
    export KRB5CCNAME=/tmp/krb5cc_$(id -u)

    kinit -kt ~/.keytabs/$(whoami).keytab $(whoami)/[email protected] -c /tmp/krb5cc_$(id -u)
    klist /tmp/krb5cc_$(id -u)
else
    echo "No Kerberos here"
fi
#
# GENERAL ACTION STARTS HERE
#
MYFOLDER=/home/siemanalyst/projects/logs-archive-production/aws_cli
#
mkdir -p ${MYFOLDER}/data/raw/${DATE_V1}
#
#/opt/cloudera/parcels/Anaconda/bin/python3.6 ~/.local/bin/aws --version
#
# List the day's objects in the bucket.
/opt/cloudera/parcels/Anaconda/bin/python3.6 ~/.local/bin/aws s3 ls s3://logs-archive-production/${DATE_V1}/
#
# Clear any previous local copy, then pull only the gzipped logs for the day.
rm -rf ${MYFOLDER}/data/raw/${DATE_V1}/*.*
/opt/cloudera/parcels/Anaconda/bin/python3.6 ~/.local/bin/aws s3 cp s3://logs-archive-production/${DATE_V1}/ ${MYFOLDER}/data/raw/${DATE_V1}/ --recursive --exclude "*" --include "*.log.gz"
#
# Land the files in HDFS under a dt=YYYYMMDD partition, then free the local copy.
hdfs dfs -mkdir -p hdfs:///data/raw/ott_dazn/logs-archive-production/dt=20${DATE_V1}
hdfs dfs -rm -f -skipTrash hdfs:///data/raw/ott_dazn/logs-archive-production/dt=20${DATE_V1}/*.*
hdfs dfs -copyFromLocal ${MYFOLDER}/data/raw/${DATE_V1}/*.* hdfs:///data/raw/ott_dazn/logs-archive-production/dt=20${DATE_V1}
rm -rf ${MYFOLDER}/data/raw/${DATE_V1}/*.*
#
exit 0
#
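A minimal sketch of how this ingest script might be driven for a multi-day backfill, assuming it is saved as download_day.sh (the commit does not show the filename, so the name and the loop are illustrative) and that dates are passed in the script's YYMMDD form, since the script itself prefixes "20" when building the HDFS partition:

#!/bin/bash
# Hypothetical backfill driver: ingest the first week of January 2019, one day at a time.
for d in 190101 190102 190103 190104 190105 190106 190107; do
    ./download_day.sh "$d"
done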
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
#!/bin/bash
#
MYUSER="$(whoami)"
#
# Do not install user-local packages as root.
if [ "$MYUSER" = "root" ]
then
    exit 0
fi
#
# Install or upgrade the AWS CLI into the invoking user's ~/.local via the Anaconda parcel's pip.
/opt/cloudera/parcels/Anaconda/bin/python3.6 /opt/cloudera/parcels/Anaconda/bin/pip install awscli --upgrade --user
#
exit 0
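Because the CLI is installed with --user, it lands under ~/.local/bin and is invoked through the parcel's interpreter rather than the system PATH. A quick post-install check, using the same invocation style the download script itself has commented out:

# Print the installed awscli version via the Anaconda interpreter.
/opt/cloudera/parcels/Anaconda/bin/python3.6 ~/.local/bin/aws --version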
@@ -0,0 +1,68 @@
#!/usr/bin/env bash

# Date parameter in YYYYMMDD form; defaults to 20180214.
DATE_V1=$1
if [ -z "$DATE_V1" ]
then
    DATE_V1=20180214
fi

# Make sure a Kerberos ticket is cached before touching HDFS.
if [ -f ~/.keytabs/$(whoami).keytab ]; then
    # for any:any credentials
    export KRB5CCNAME=/tmp/krb5cc_$(id -u)

    kinit -kt ~/.keytabs/$(whoami).keytab $(whoami)/[email protected] -c /tmp/krb5cc_$(id -u)
    klist /tmp/krb5cc_$(id -u)
else
    echo "No Kerberos here"
fi

echo "RAW dt=${DATE_V1} FILES COUNT : "
hdfs dfs -ls /data/raw/ott_dazn/logs-archive-production/dt=${DATE_V1}/*.gz | wc -l

hdfs dfs -mkdir -p /data/staged/ott_dazn/logs-archive-production/parquet

echo "Clean stage to avoid duplicates of dt=${DATE_V1}"
hdfs dfs -rm -r -f -skipTrash /data/staged/ott_dazn/logs-archive-production/parquet/dt=${DATE_V1}/*.parquet

echo "TOTAL STAGED FILES COUNT : "
hdfs dfs -ls -R /data/staged/ott_dazn/logs-archive-production/parquet/dt=*/*.parquet | wc -l

echo "STAGE RAW DATA AS PARQUET FOR DATE ${DATE_V1} WITH SPARK"

# Spark job settings
MAIN_CLASS1=ptv.content.akamailogs.staged.parquetindexer.SparkStageLogsArchiveProdWorkArround
APP_JAR=/home/siemanalyst/projects/logs-archive-production/data_etl/parquetindexer-1.0-TPOC-jar-with-dependencies.jar
EXTRA_JAR=/home/siemanalyst/projects/logs-archive-production/data_etl/parquetindexer-1.0-TPOC-jar-with-dependencies.jar

# Spark container sizing
MASTER_URL=yarn
DEPLOY_MODE=client

DRIVER_MEMORY=17G
DRIVER_CORES=4
EXECUTOR_MEMORY=11G
EXECUTOR_CORES=2
NUM_EXECUTORS=48

# Coalesce factor - covers size growth of the daily input.
#COALESCEFACTOR=72
#COALESCEFACTOR=144
COALESCEFACTOR=288
#COALESCEFACTOR=576

# Additional logic workaround for the new null-token DI
#
IN_PATH="/data/raw/ott_dazn/logs-archive-production/"
OUT_PATH="/data/staged/ott_dazn/logs-archive-production/parquet/"

# Note: ${EXTRA_JAR} following --jars is the application JAR (here the same file as
# ${APP_JAR}), and everything after it (--processdate, --inputpath, --outputpath,
# --coalescefactor) is passed as arguments to the job's main class.
spark2-submit --class ${MAIN_CLASS1} --master ${MASTER_URL} --deploy-mode ${DEPLOY_MODE} \
    --principal $(whoami)/[email protected] --keytab ~/.keytabs/$(whoami).keytab \
    --conf "spark.debug.maxToStringFields=1500" --conf "spark.dynamicAllocation.enabled=false" \
    --driver-memory ${DRIVER_MEMORY} --driver-cores ${DRIVER_CORES} \
    --num-executors ${NUM_EXECUTORS} --executor-memory ${EXECUTOR_MEMORY} --executor-cores ${EXECUTOR_CORES} \
    --jars ${APP_JAR} ${EXTRA_JAR} --processdate ${DATE_V1} --inputpath ${IN_PATH} --outputpath ${OUT_PATH} --coalescefactor ${COALESCEFACTOR} >> /home/siemanalyst/projects/logs-archive-production/data_etl/logs/SparkStageLogsArchiveProd-1-${DATE_V1}.log
#

echo 1
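After the job returns, the same file-count checks from the top of the script can confirm the day actually landed in the staged layout. A minimal sketch for the default date, assuming the paths and log location defined above:

# Count the parquet files written for the day's partition.
hdfs dfs -ls /data/staged/ott_dazn/logs-archive-production/parquet/dt=20180214/*.parquet | wc -l
# Tail the job log this script appends to.
tail -n 50 /home/siemanalyst/projects/logs-archive-production/data_etl/logs/SparkStageLogsArchiveProd-1-20180214.log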
