diff --git a/SPARK2_THRIFT1/README.md b/SPARK2_THRIFT1/README.md
new file mode 100644
index 0000000..93ef1a0
--- /dev/null
+++ b/SPARK2_THRIFT1/README.md
@@ -0,0 +1,11 @@
+This CSD can be used to add Spark2 ThriftServer as a service to Cloudera Manager.
+Following are the dependencies before adding the Spark2 TS1 service:
+1. This CSD has a dependency on the Spark2 service, which needs to be installed on the cluster before trying to install this service.
+2. The nodes on which Spark2 TS1 is added should also have Spark2 Gateway roles present.
+3. The distribution of Spark2 supported by Cloudera doesn't have Spark Thriftserver support, hence you need to build one with Thriftserver support. You can follow this link https://www.linkedin.com/pulse/running-spark-2xx-cloudera-hadoop-distro-cdh-deenar-toraskar-cfa/
+ to create one that can be used with this Spark2 TS1 service.
+4. The property Custom Spark Home needs to point to the location where the Custom Spark tar file created in the above step has been extracted.
+5. To enable high availability for multiple Spark2 ThriftServer hosts, configure a load balancer to manage them, set the property sparkthrift.ha.enable to yes, and fill in the sparkthrift.loadbalancer.host and sparkthrift.loadbalancer.port properties with the respective values.
+6. The logs can be viewed at /var/log/sparkthrift
+7. The service runs as the hive user and on port 20000 by default
+8. Change the sparkthrift.cmd.opts property to --conf spark.eventLog.enabled=true --conf spark.eventLog.dir=hdfs:///user/spark/spark2ApplicationHistory --conf spark.yarn.historyServer.address=http://:18089 if one needs to integrate with the Spark History Server. Otherwise, one needs to create a local folder /tmp/spark-events on all the nodes where this service is installed and set the folder ownership to hive:hive recursively.
diff --git a/SPARK2_THRIFT1/pom.xml b/SPARK2_THRIFT1/pom.xml new file mode 100644 index 0000000..f9318a1 --- /dev/null +++ b/SPARK2_THRIFT1/pom.xml @@ -0,0 +1,97 @@ + + + 4.0.0 + + + org.apache.spark + spark-parent_2.11 + 2.0.0.cloudera2 + ../pom.xml + + + com.cloudera.csd + SPARK2_THRIFT1 + 2.0.0.cloudera2 + The Spark2 TS1 CSD + pom + + + none + true + + + + + + ${project.build.directory}/generated-resources + + + + + org.apache.maven.plugins + maven-antrun-plugin + 1.8 + + + create-version-file + generate-resources + + run + + + + + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + false + + ../assembly.xml + + + + + make-assembly + package + + single + + + + + + + org.apache.maven.plugins + maven-install-plugin + + true + + + + + diff --git a/SPARK2_THRIFT1/src/aux/client/spark-defaults.conf b/SPARK2_THRIFT1/src/aux/client/spark-defaults.conf new file mode 100644 index 0000000..4713b04 --- /dev/null +++ b/SPARK2_THRIFT1/src/aux/client/spark-defaults.conf @@ -0,0 +1,25 @@ +spark.authenticate=false +spark.dynamicAllocation.enabled=true +spark.dynamicAllocation.executorIdleTimeout=60 +spark.dynamicAllocation.minExecutors=0 +spark.dynamicAllocation.schedulerBacklogTimeout=1 +spark.eventLog.enabled=true +spark.serializer=org.apache.spark.serializer.KryoSerializer +spark.shuffle.service.enabled=true +spark.shuffle.service.port=7337 +spark.ui.killEnabled=true +spark.master=yarn +spark.submit.deployMode=client +spark.sql.hive.metastore.version=1.1.0 +spark.sql.catalogImplementation=hive +spark.driver.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native +spark.executor.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native +spark.yarn.am.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native +spark.hadoop.mapreduce.application.classpath= +spark.hadoop.yarn.application.classpath= +spark.yarn.config.gatewayPath=/opt/cloudera/parcels +spark.yarn.config.replacementPath={{HADOOP_COMMON_HOME}}/../../.. 
+spark.akka.frameSize=100 +spark.akka.threads=10 +spark.io.compression.snappy.blockSize=8192 +spark.executor.extraJavaOptions=-XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:+CMSParallelRemarkEnabled diff --git a/SPARK2_THRIFT1/src/aux/client/spark-env.sh b/SPARK2_THRIFT1/src/aux/client/spark-env.sh new file mode 100644 index 0000000..1615c1e --- /dev/null +++ b/SPARK2_THRIFT1/src/aux/client/spark-env.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +## +# Generated by Cloudera Manager and should not be modified directly +## + +SELF="$(cd $(dirname $BASH_SOURCE) && pwd)" +if [ -z "$SPARK_CONF_DIR" ]; then + export SPARK_CONF_DIR="$SELF" +fi +## +# Needs to be changed if parcel is implemented +## +CLOUDERA_HOME=/opt/cloudera/parcels +#export SPARK_HOME=$CLOUDERA_HOME/spark-2.0.0.cloudera2-bin-custom-spark-1 +export SPARK_HOME=CUSTOM_SPARK_HOME +SPARK_LOG_DIR=/var/log/spark2/ts1 +SPARK_PID_DIR=/var/log/spark2/ts1 +SPARK_PYTHON_PATH="" +if [ -n "$SPARK_PYTHON_PATH" ]; then + export PYTHONPATH="$PYTHONPATH:$SPARK_PYTHON_PATH" +fi + +export HADOOP_HOME="/opt/cloudera/parcels/CDH/lib/hadoop" +export HADOOP_COMMON_HOME="$HADOOP_HOME" + +if [ -n "$HADOOP_HOME" ]; then + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${HADOOP_HOME}/lib/native +fi + +SPARK_EXTRA_LIB_PATH="" +if [ -n "$SPARK_EXTRA_LIB_PATH" ]; then + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$SPARK_EXTRA_LIB_PATH +fi + +export LD_LIBRARY_PATH + +HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-$SPARK_CONF_DIR/yarn-conf} +export HADOOP_CONF_DIR + +PYLIB="$SPARK_HOME/python/lib" +if [ -f "$PYLIB/pyspark.zip" ]; then + PYSPARK_ARCHIVES_PATH= + for lib in "$PYLIB"/*.zip; do + if [ -n "$PYSPARK_ARCHIVES_PATH" ]; then + PYSPARK_ARCHIVES_PATH="$PYSPARK_ARCHIVES_PATH,local:$lib" + else + PYSPARK_ARCHIVES_PATH="local:$lib" + fi + done + export PYSPARK_ARCHIVES_PATH +fi + +# Spark uses `set -a` to export all variables created or modified in this +# script as env vars. 
We use a temporary variables to avoid env var name +# collisions. +# If PYSPARK_PYTHON is unset, set to CDH_PYTHON +TMP_PYSPARK_PYTHON=${PYSPARK_PYTHON:-'{{CDH_PYTHON}}'} +# If PYSPARK_DRIVER_PYTHON is unset, set to CDH_PYTHON +TMP_PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON:-{{CDH_PYTHON}}} + +if [ -n "$TMP_PYSPARK_PYTHON" ] && [ -n "$TMP_PYSPARK_DRIVER_PYTHON" ]; then + export PYSPARK_PYTHON="$TMP_PYSPARK_PYTHON" + export PYSPARK_DRIVER_PYTHON="$TMP_PYSPARK_DRIVER_PYTHON" +fi + +export SPARK_DIST_CLASSPATH=$(paste -sd: "$SELF/classpath.txt") diff --git a/SPARK2_THRIFT1/src/descriptor/service.sdl b/SPARK2_THRIFT1/src/descriptor/service.sdl new file mode 100644 index 0000000..5a44c22 --- /dev/null +++ b/SPARK2_THRIFT1/src/descriptor/service.sdl @@ -0,0 +1,233 @@ +{ + "name": "SPARK2_THRIFT1", + "label": "Spark2 TS1", + "description": "Apache Spark Thrift is an open source cluster computing system. This service runs Spark2 Thriftserver as an application on YARN.", + "version": "2.0.0.cloudera2", + "compatibility": { + "cdhVersion": { + "min": "5.7.0" + } + }, + "runAs": { + "user": "hive", + "group": "hive", + "principal": "hive" + + }, + "inExpressWizard": true, + "icon": "images/icon.png", + "serviceDependencies": [ + { + "name": "HIVE", + "required": "true" + }, + { + "name": "SPARK2_ON_YARN", + "required": "false" + }, + { + "name": "YARN", + "required": "true" + } + ], + "kerberos": "${kerberos.auth.enable}", + "parameters": [ + { + "name": "kerberos.auth.enable", + "label": "Enable Kerberos Authentication", + "description": "Enable Kerberos authentication for this THRIFT service.", + "type": "boolean", + "default": "false", + "configurableInWizard": true + }, + { + "name": "spark.home", + "label": "Custom Spark Home", + "description": "This is the path to the custom built spark folder.", + "type": "string", + "default": "/opt/cloudera/parcels/SPARK2_WITH_TS", + "configurableInWizard": true + }, + { + "name": "default.realm", + "label": "Kerberos Security 
Realm", + "description": "The realm to use for Kerberos security.", + "type": "string", + "default": "EXAMPLE.COM", + "configurableInWizard": true + }, + { + "name": "spark.thrift.server.port", + "label": "Spark ThriftServer running Port", + "description": "The port on which the spark thriftserver will run.", + "required": "true", + "type": "port", + "default": 20000, + "configurableInWizard": false + }, + { + "name": "spark.exec.mem", + "label": "Spark ThriftServer executor memory", + "description": "Executor memory of Spark ThriftServer in gb", + "required": "true", + "type": "string", + "default": "1g", + "configurableInWizard": true + }, + { + "name": "spark.driver.memory", + "label": "Spark ThriftServer driver memory", + "description": "Driver memory of Spark ThriftServer in gb", + "required": "true", + "type": "string", + "default": "1g", + "configurableInWizard": true + }, + { + "name": "sparkthrift.queuename", + "label": "Spark ThriftServer queue name", + "description": "The queue on which Spark ThriftServer will execute jobs", + "required": "true", + "type": "string", + "default": "root.sts1", + "configurableInWizard": true + }, + { + "name": "spark.exec.cores", + "label": "Spark ThriftServer executor cores", + "description": "The number of executor cores for Spark ThriftServer1", + "required": "true", + "type": "string", + "default": "1", + "configurableInWizard": true + }, + { + "name": "spark.dynamicAllocation.maxExecutors", + "label": "Spark ThriftServer dynamicAllocation maximum Executors.", + "description": "The maximum number of executors that can be allocated for Spark ThriftServer1", + "required": "true", + "type": "string", + "default": "16", + "configurableInWizard": true + }, + { + "name": "spark.dynamicAllocation.minExecutors", + "label": "Spark ThriftServer dynamicAllocation minimum Executors.", + "description": "The minimum number of executors that can be allocated for Spark ThriftServer1", + "required": "true", + "type": "string", + 
"default": "0", + "configurableInWizard": true + }, + { + "name": "sparkthrift.webui.port", + "label": "Spark ThriftServer web ui port", + "description": "The web ui port of Spark ThriftServer", + "required": "true", + "type": "string", + "default": "4040", + "configurableInWizard": false + }, + { + "name": "sparkthrift.ha.enable", + "label": "Spark thrift ha status", + "description": "Ha status", + "required": "true", + "type": "boolean", + "default": "false", + "configurableInWizard": true + }, + { + "name": "sparkthrift.loadbalancer.host", + "label": "Spark ThriftServer Load Balancer Host", + "description": "Address of the load balancer used for Spark ThriftServer roles", + "required": "false", + "type": "string", + "default": "", + "configurableInWizard": true + }, + { + "name": "sparkthrift.loadbalancer.port", + "label": "Spark ThriftServer Load Balancer Port", + "description": "Port of the load balancer used for Spark ThriftServer roles", + "required": "false", + "type": "port", + "default": "8010", + "configurableInWizard": true + }, + { + "name": "sparkthrift.cmd.opts", + "label": "Spark ThriftServer Command options", + "description": "Extra properties used for starting Spark ThriftServer roles", + "required": "false", + "type": "string", + "default": "", + "configurableInWizard": true + } + ], + "roles": [ + { + "name": "SPARK2_THRIFT1", + "label": "Spark2 Thrift Server", + "pluralLabel": "Spark2 Thrift Servers", + "jvmBased": true, + "startRunner": { + "program": "scripts/control.sh", + "args": [ "start_thrift_server" ], + "environmentVariables" : { + "SPARK_THRIFT_SERVER_PORT" : "${spark.thrift.server.port}", + "SPARK_EXEC_MEM" : "${spark.exec.mem}", + "SPARK_DRIVER_MEM" : "${spark.driver.memory}", + "PORTAL_QUEUE" : "${sparkthrift.queuename}", + "CUSTOM_SPARK_HOME_DIR": "${spark.home}", + "KEYTAB_USER": "hive", + "SPARK_HOST": "${host}", + "SPARK_HA_STATUS": "${sparkthrift.ha.enable}", + "DEFAULT_REALM": "${default.realm}", + "SPARK_EXEC_CORES": 
"${spark.exec.cores}", + "SPARK_MAX_EXEC": "${spark.dynamicAllocation.maxExecutors}", + "SPARK_MIN_EXEC": "${spark.dynamicAllocation.minExecutors}", + "SPARKTHRIFT_WEBUI_PORT": "${sparkthrift.webui.port}", + "KERBEROS_AUTH_ENABLE": "${kerberos.auth.enable}", + "SPARK_LOADBALANCER_HOST": "${sparkthrift.loadbalancer.host}", + "SPARK_LOADBALANCER_PORT": "${sparkthrift.loadbalancer.port}", + "SPARKTHRIFT_CMD_OPTS": "${sparkthrift.cmd.opts}" + } + }, + "kerberosPrincipals": [ + { + "name": "THRIFT1_PRINCIPAL", + "primary": "${principal}", + "instance": "${host}" + }, + { + "name": "THRIFT1_LOADBALANCER_PRINCIPAL", + "primary": "${principal}", + "instance": "${sparkthrift.loadbalancer.host}" + } + ], + "topology": { + "minInstances": 1 + }, + "logging": { + "configFilename": "spark2-ts-conf/log4j.properties", + "dir": "/var/log/sparkthrift", + "filename": "spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-1-${host}.out", + "modifiable": true, + "loggingType": "log4j" + }, + "configWriter" : { + "auxConfigGenerators" : [ + { + "filename" : "spark2-ts-conf/spark-env.sh", + "sourceFilename" : "aux/client/spark-env.sh" + }, + { + "filename" : "spark2-ts-conf/spark-defaults.conf", + "sourceFilename" : "aux/client/spark-defaults.conf" + } + ] + } + } + ] +} diff --git a/SPARK2_THRIFT1/src/images/icon.png b/SPARK2_THRIFT1/src/images/icon.png new file mode 100644 index 0000000..ca78824 Binary files /dev/null and b/SPARK2_THRIFT1/src/images/icon.png differ diff --git a/SPARK2_THRIFT1/src/scripts/common.sh b/SPARK2_THRIFT1/src/scripts/common.sh new file mode 100644 index 0000000..bf5f070 --- /dev/null +++ b/SPARK2_THRIFT1/src/scripts/common.sh @@ -0,0 +1,53 @@ +#!/bin/bash +CURRENT_DIRECTORY=$(cd $(dirname $0) && pwd) +cp -r $CURRENT_DIRECTORY/../yarn-conf/ $CURRENT_DIRECTORY/../spark2-ts-conf/ +#Delete hive.server2.authentication.kerberos.principal from hive-site.xml +line_number=`grep -in hive.server2.authentication.kerberos.principal 
$CURRENT_DIRECTORY/../hive-conf/hive-site.xml | cut -d : -f 1` +if [ ! -z "$line_number" ] +then + start_line_number=$(($line_number -1)) + end_line_number=$(($line_number +2)) + sed -i "${start_line_number},${end_line_number}d" $CURRENT_DIRECTORY/../hive-conf/hive-site.xml +fi +#Delete hive.server2.enable.doAs from hive-site.xml +line_number=`grep -in hive.server2.enable.doAs $CURRENT_DIRECTORY/../hive-conf/hive-site.xml | cut -d : -f 1` +if [ ! -z "$line_number" ] +then + start_line_number=$(($line_number -1)) + end_line_number=$(($line_number +2)) + sed -i "${start_line_number},${end_line_number}d" $CURRENT_DIRECTORY/../hive-conf/hive-site.xml +fi +#Copy hive-site to yarn and spark conf folders +cp $CURRENT_DIRECTORY/../hive-conf/hive-site.xml $CURRENT_DIRECTORY/../spark2-ts-conf/yarn-conf/ +cp $CURRENT_DIRECTORY/../hive-conf/hive-site.xml $CURRENT_DIRECTORY/../spark2-ts-conf/ + +#Change SPARK_HOME in spark-defaults.conf and spark-env.sh +sed -i "s,CUSTOM_SPARK_HOME,$CUSTOM_SPARK_HOME_DIR,g" $CURRENT_DIRECTORY/../spark2-ts-conf/spark-defaults.conf +sed -i "s,CUSTOM_SPARK_HOME,$CUSTOM_SPARK_HOME_DIR,g" $CURRENT_DIRECTORY/../spark2-ts-conf/spark-env.sh + +#Exporting environment variables needed to start spark thrift server +export HADOOP_CONF_DIR=$CURRENT_DIRECTORY/../spark2-ts-conf/yarn-conf +export SPARK_HOME=$CUSTOM_SPARK_HOME_DIR +export SPARK_CONF_DIR=$CURRENT_DIRECTORY/../spark2-ts-conf + +function start_thrift_server { + EXEC_CMD="$CUSTOM_SPARK_HOME_DIR/bin/spark-submit --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 1 --master yarn --executor-memory ${SPARK_EXEC_MEM} --driver-memory ${SPARK_DRIVER_MEM} --queue $PORTAL_QUEUE --executor-cores $SPARK_EXEC_CORES --conf spark.dynamicAllocation.maxExecutors=$SPARK_MAX_EXEC --conf spark.dynamicAllocation.minExecutors=$SPARK_MIN_EXEC --conf spark.ui.port=$SPARKTHRIFT_WEBUI_PORT $HISTORY_SERVER_CONFIG --hiveconf hive.server2.thrift.port=$SPARK_THRIFT_SERVER_PORT $SPARKTHRIFT_CMD_OPTS 
--driver-java-options -Dlog4j.configuration=file:$CURRENT_DIRECTORY/../spark2-ts-conf/log4j.properties"
+  if [ "$SPARK_HA_STATUS" = true ] && { [ -z "$SPARK_LOADBALANCER_HOST" ] || [ -z "$SPARK_LOADBALANCER_PORT" ]; }; then
+    echo "Load balancer host and port should be defined if HA state is enabled"
+    exit 1
+  fi
+  if [ "$SPARK_HA_STATUS" = true ]; then
+    export SCM_KERBEROS_PRINCIPAL=$THRIFT1_LOADBALANCER_PRINCIPAL
+
+  else
+    export SCM_KERBEROS_PRINCIPAL=$THRIFT1_PRINCIPAL
+  fi
+#Check if Kerberos is enabled and if so start the service with proper principal and keytab
+  if [ "$KERBEROS_AUTH_ENABLE" = true ]; then
+    export KEYTAB_FILE=$CURRENT_DIRECTORY/../spark_thrift.keytab
+    kinit -kt $KEYTAB_FILE $SCM_KERBEROS_PRINCIPAL
+    EXEC_CMD=$EXEC_CMD" --hiveconf hive.server2.authentication.kerberos.principal=$SCM_KERBEROS_PRINCIPAL --hiveconf hive.server2.authentication.kerberos.keytab=$KEYTAB_FILE"
+  fi
+  echo $EXEC_CMD
+  exec $EXEC_CMD
+}
diff --git a/SPARK2_THRIFT1/src/scripts/control.sh b/SPARK2_THRIFT1/src/scripts/control.sh
new file mode 100644
index 0000000..2a9fa29
--- /dev/null
+++ b/SPARK2_THRIFT1/src/scripts/control.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+. $(cd $(dirname $0) && pwd)/common.sh
+case $1 in
+  (start_thrift_server)
+    start_thrift_server
+    ;;
+
+  (*)
+    echo "Don't understand [$1]" >&2
+    exit 1
+    ;;
+esac