
Commit 12468fc

final launch and all executions of models saved

1 parent a9b7fd9 · commit 12468fc

File tree

55 files changed: +1742386 additions, -0 deletions

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
#!/bin/bash
#
# Prepare the local landing directory used by the S3 download script.
mkdir -p data/raw
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
#!/bin/bash
#
## Download one day of logs from S3 and land them in HDFS.
#
# Date parameter in YYMMDD form; defaults to 190101.
DATE_V1=$1
if [ -z "$DATE_V1" ]
then
    DATE_V1=190101
fi
#
echo "$DATE_V1"
#
# Make sure a Kerberos ticket is cached before touching HDFS.
#
if [ -f ~/.keytabs/$(whoami).keytab ]; then
    # for any:any credentials
    export KRB5CCNAME=/tmp/krb5cc_$(id -u)

    kinit -kt ~/.keytabs/$(whoami).keytab $(whoami)/[email protected] -c /tmp/krb5cc_$(id -u)
    klist /tmp/krb5cc_$(id -u)
else
    echo "No Kerberos here"
fi
#
# GENERAL ACTION STARTS HERE
#
MYFOLDER=/home/siemanalyst/projects/logs-archive-production/aws_cli
#
mkdir -p ${MYFOLDER}/data/raw/${DATE_V1}
#
#/opt/cloudera/parcels/Anaconda/bin/python3.6 ~/.local/bin/aws --version
#
# List the day's objects in the bucket.
/opt/cloudera/parcels/Anaconda/bin/python3.6 ~/.local/bin/aws s3 ls s3://logs-archive-production/${DATE_V1}/
#
# Clear any previous local copy, then pull only the gzipped logs for the day.
rm -rf ${MYFOLDER}/data/raw/${DATE_V1}/*.*
/opt/cloudera/parcels/Anaconda/bin/python3.6 ~/.local/bin/aws s3 cp s3://logs-archive-production/${DATE_V1}/ ${MYFOLDER}/data/raw/${DATE_V1}/ --recursive --exclude "*" --include "*.log.gz"
#
# Land the files in HDFS under a dt=YYYYMMDD partition, then free the local copy.
hdfs dfs -mkdir -p hdfs:///data/raw/ott_dazn/logs-archive-production/dt=20${DATE_V1}
hdfs dfs -rm -f -skipTrash hdfs:///data/raw/ott_dazn/logs-archive-production/dt=20${DATE_V1}/*.*
hdfs dfs -copyFromLocal ${MYFOLDER}/data/raw/${DATE_V1}/*.* hdfs:///data/raw/ott_dazn/logs-archive-production/dt=20${DATE_V1}
rm -rf ${MYFOLDER}/data/raw/${DATE_V1}/*.*
#
exit 0
#
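A minimal sketch of how this ingest script might be driven for a multi-day backfill, assuming it is saved as download_day.sh (the commit does not show the filename, so the name and the loop are illustrative) and that dates are passed in the script's YYMMDD form, since the script itself prefixes "20" when building the HDFS partition:

#!/bin/bash
# Hypothetical backfill driver: ingest the first week of January 2019, one day at a time.
for d in 190101 190102 190103 190104 190105 190106 190107; do
    ./download_day.sh "$d"
done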
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
#!/bin/bash
#
MYUSER="$(whoami)"
#
# Do not install user-local packages as root.
if [ "$MYUSER" = "root" ]
then
    exit 0
fi
#
# Install or upgrade the AWS CLI into the invoking user's ~/.local via the Anaconda parcel's pip.
/opt/cloudera/parcels/Anaconda/bin/python3.6 /opt/cloudera/parcels/Anaconda/bin/pip install awscli --upgrade --user
#
exit 0
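Because the CLI is installed with --user, it lands under ~/.local/bin and is invoked through the parcel's interpreter rather than the system PATH. A quick post-install check, using the same invocation style the download script itself has commented out:

# Print the installed awscli version via the Anaconda interpreter.
/opt/cloudera/parcels/Anaconda/bin/python3.6 ~/.local/bin/aws --version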
@@ -0,0 +1,68 @@
#!/usr/bin/env bash

# Date parameter in YYYYMMDD form; defaults to 20180214.
DATE_V1=$1
if [ -z "$DATE_V1" ]
then
    DATE_V1=20180214
fi

# Make sure a Kerberos ticket is cached before touching HDFS.
if [ -f ~/.keytabs/$(whoami).keytab ]; then
    # for any:any credentials
    export KRB5CCNAME=/tmp/krb5cc_$(id -u)

    kinit -kt ~/.keytabs/$(whoami).keytab $(whoami)/[email protected] -c /tmp/krb5cc_$(id -u)
    klist /tmp/krb5cc_$(id -u)
else
    echo "No Kerberos here"
fi

echo "RAW dt=${DATE_V1} FILES COUNT : "
hdfs dfs -ls /data/raw/ott_dazn/logs-archive-production/dt=${DATE_V1}/*.gz | wc -l

hdfs dfs -mkdir -p /data/staged/ott_dazn/logs-archive-production/parquet

echo "Clean stage to avoid duplicates of dt=${DATE_V1}"
hdfs dfs -rm -r -f -skipTrash /data/staged/ott_dazn/logs-archive-production/parquet/dt=${DATE_V1}/*.parquet

echo "TOTAL STAGED FILES COUNT : "
hdfs dfs -ls -R /data/staged/ott_dazn/logs-archive-production/parquet/dt=*/*.parquet | wc -l

echo "STAGE RAW DATA AS PARQUET FOR DATE ${DATE_V1} WITH SPARK"

# Spark job settings
MAIN_CLASS1=ptv.content.akamailogs.staged.parquetindexer.SparkStageLogsArchiveProdWorkArround
APP_JAR=/home/siemanalyst/projects/logs-archive-production/data_etl/parquetindexer-1.0-TPOC-jar-with-dependencies.jar
EXTRA_JAR=/home/siemanalyst/projects/logs-archive-production/data_etl/parquetindexer-1.0-TPOC-jar-with-dependencies.jar

# Spark container sizing
MASTER_URL=yarn
DEPLOY_MODE=client

DRIVER_MEMORY=17G
DRIVER_CORES=4
EXECUTOR_MEMORY=11G
EXECUTOR_CORES=2
NUM_EXECUTORS=48

# Coalesce factor - covers size growth of the daily input.
#COALESCEFACTOR=72
#COALESCEFACTOR=144
COALESCEFACTOR=288
#COALESCEFACTOR=576

# Additional logic workaround for the new null-token DI
#
IN_PATH="/data/raw/ott_dazn/logs-archive-production/"
OUT_PATH="/data/staged/ott_dazn/logs-archive-production/parquet/"

# Note: ${EXTRA_JAR} following --jars is the application JAR (here the same file as
# ${APP_JAR}), and everything after it (--processdate, --inputpath, --outputpath,
# --coalescefactor) is passed as arguments to the job's main class.
spark2-submit --class ${MAIN_CLASS1} --master ${MASTER_URL} --deploy-mode ${DEPLOY_MODE} \
    --principal $(whoami)/[email protected] --keytab ~/.keytabs/$(whoami).keytab \
    --conf "spark.debug.maxToStringFields=1500" --conf "spark.dynamicAllocation.enabled=false" \
    --driver-memory ${DRIVER_MEMORY} --driver-cores ${DRIVER_CORES} \
    --num-executors ${NUM_EXECUTORS} --executor-memory ${EXECUTOR_MEMORY} --executor-cores ${EXECUTOR_CORES} \
    --jars ${APP_JAR} ${EXTRA_JAR} --processdate ${DATE_V1} --inputpath ${IN_PATH} --outputpath ${OUT_PATH} --coalescefactor ${COALESCEFACTOR} >> /home/siemanalyst/projects/logs-archive-production/data_etl/logs/SparkStageLogsArchiveProd-1-${DATE_V1}.log
#

echo 1
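After the job returns, the same file-count checks from the top of the script can confirm the day actually landed in the staged layout. A minimal sketch for the default date, assuming the paths and log location defined above:

# Count the parquet files written for the day's partition.
hdfs dfs -ls /data/staged/ott_dazn/logs-archive-production/parquet/dt=20180214/*.parquet | wc -l
# Tail the job log this script appends to.
tail -n 50 /home/siemanalyst/projects/logs-archive-production/data_etl/logs/SparkStageLogsArchiveProd-1-20180214.log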
