Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions SPARK2_THRIFT1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
This CSD can be used to add Spark2 ThriftServer as a service to Cloudera Manager.
The following are the dependencies that must be satisfied before adding the Spark2 TS1 service:
1. This CSD has a dependency on the Spark2 service, which needs to be installed on the cluster before trying to install this service.
2. The nodes on which Spark2 TS1 is added should also have Spark2 Gateways roles present.
3. The distribution of Spark2 supported by Cloudera doesn't have Spark Thriftserver support; hence you need to build one with Thriftserver support. You can follow this link https://www.linkedin.com/pulse/running-spark-2xx-cloudera-hadoop-distro-cdh-deenar-toraskar-cfa/
to create one that can be used with this Spark2 TS1 service.
4. The property Custom Spark Home needs to point to the location where the Custom Spark tar file created in the above step has been extracted.
5. To enable high availability for multiple Spark2 ThriftServer hosts, configure a load balancer to manage them, set the property sparkthrift.ha.enable to yes, and fill in the sparkthrift.loadbalancer.host and sparkthrift.loadbalancer.port properties with the respective values.
6. The logs can be viewed at /var/log/sparkthrift
7. The service runs as the hive user and listens on port 20000 by default.
8. Change the sparkthrift.cmd.opts property to --conf spark.eventLog.enabled=true --conf spark.eventLog.dir=hdfs://<namenode_hostname:8020 or NameNode Nameservice>/user/spark/spark2ApplicationHistory --conf spark.yarn.historyServer.address=http://<spark_history_server_hostname>:18089 if one needs to integrate with Spark History Server. Else one needs to create a local folder /tmp/spark-events on all the nodes where this service is installed and set the folder ownership to hive:hive recursively.
97 changes: 97 additions & 0 deletions SPARK2_THRIFT1/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
<!-- Copyright (c) 2016 Cloudera, Inc. All rights reserved. -->
<!-- Maven build descriptor for the SPARK2_THRIFT1 CSD. It produces a CSD jar
     (via the assembly plugin) containing the service descriptor and scripts;
     no Java sources are compiled here (packaging is "pom"). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Inherit plugin/dependency management from the Spark parent POM; the CSD
     version is kept in lockstep with the Spark distribution it wraps. -->
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.0.0.cloudera2</version>
<relativePath>../pom.xml</relativePath>
</parent>

<groupId>com.cloudera.csd</groupId>
<artifactId>SPARK2_THRIFT1</artifactId>
<version>2.0.0.cloudera2</version>
<name>The Spark2 TS1 CSD</name>
<packaging>pom</packaging>

<!-- No tests exist for this module; skip the test phases inherited from the
     parent build. -->
<properties>
<build.testJarPhase>none</build.testJarPhase>
<skipTests>true</skipTests>
</properties>

<build>
<!-- Include the generated version marker (see create-version-file below)
     in the packaged artifact. -->
<resources>
<resource>
<directory>${project.build.directory}/generated-resources</directory>
</resource>
</resources>
<plugins>
<!-- Write the project version into meta/version so Cloudera Manager can
     identify the CSD build. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.8</version>
<executions>
<execution>
<id>create-version-file</id>
<phase>generate-resources</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<target>
<echo message="${project.version}"
file="${project.build.directory}/generated-resources/meta/version"/>
</target>
</configuration>
</execution>
</executions>
</plugin>
<!-- Package the CSD layout described in ../assembly.xml; the assembly id is
     suppressed so the jar name is just artifactId-version. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptors>
<descriptor>../assembly.xml</descriptor>
</descriptors>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Disabled: CSD schema validation requires Cloudera's internal
     schema-validator plugin, which is not available in public repos. -->
<!-- <plugin>
<groupId>com.cloudera.enterprise</groupId>
<artifactId>schema-validator-maven-plugin</artifactId>
<version>5.12.0</version>
<executions>
<execution>
<id>validate-schema</id>
<phase>test</phase>
<goals>
<goal>validate</goal>
</goals>
<configuration>
<sourceDirectory>src</sourceDirectory>
<strictMode>true</strictMode>
</configuration>
</execution>
</executions>
</plugin>-->
<!-- The CSD jar is an installer artifact, not a library; do not install it
     into the local Maven repository. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project>
25 changes: 25 additions & 0 deletions SPARK2_THRIFT1/src/aux/client/spark-defaults.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Default Spark configuration deployed to Spark2 ThriftServer clients.
# Loaded by spark-submit/Thrift Server at startup; values here can be
# overridden per-job with --conf.

# Security and executor scaling: dynamic allocation requires the external
# shuffle service (enabled below on the standard CDH port 7337).
spark.authenticate=false
spark.dynamicAllocation.enabled=true
spark.dynamicAllocation.executorIdleTimeout=60
spark.dynamicAllocation.minExecutors=0
spark.dynamicAllocation.schedulerBacklogTimeout=1
spark.eventLog.enabled=true
spark.serializer=org.apache.spark.serializer.KryoSerializer
spark.shuffle.service.enabled=true
spark.shuffle.service.port=7337
spark.ui.killEnabled=true
# Run on YARN in client mode, using the Hive metastore as the SQL catalog.
spark.master=yarn
spark.submit.deployMode=client
spark.sql.hive.metastore.version=1.1.0
spark.sql.catalogImplementation=hive
# Native Hadoop libraries from the CDH parcel for driver, executors and AM.
spark.driver.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.yarn.am.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native
# Intentionally blank: prevent MR/YARN classpath entries from leaking into
# the Spark classpath; the parcel path is rewritten per-node via gatewayPath.
spark.hadoop.mapreduce.application.classpath=
spark.hadoop.yarn.application.classpath=
spark.yarn.config.gatewayPath=/opt/cloudera/parcels
spark.yarn.config.replacementPath={{HADOOP_COMMON_HOME}}/../../..
# RPC/compression tuning and CMS GC options for executors.
spark.akka.frameSize=100
spark.akka.threads=10
spark.io.compression.snappy.blockSize=8192
spark.executor.extraJavaOptions=-XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:+CMSParallelRemarkEnabled
66 changes: 66 additions & 0 deletions SPARK2_THRIFT1/src/aux/client/spark-env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
##
# Generated by Cloudera Manager and should not be modified directly
##
# Environment setup sourced by Spark launch scripts for the Spark2
# ThriftServer CSD. Spark sources this with `set -a`, so plain assignments
# below are effectively exported.

# Absolute path of the directory containing this script, used as the default
# conf dir. Quoted throughout — the original `cd $(dirname $BASH_SOURCE)`
# broke on paths containing spaces.
SELF="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [ -z "$SPARK_CONF_DIR" ]; then
  export SPARK_CONF_DIR="$SELF"
fi
##
# Needs to be changed if parcel is implemented
##
CLOUDERA_HOME=/opt/cloudera/parcels
#export SPARK_HOME=$CLOUDERA_HOME/spark-2.0.0.cloudera2-bin-custom-spark-1
# CUSTOM_SPARK_HOME is a literal placeholder replaced by the CSD control
# script with the configured "Custom Spark Home" value — do not quote-expand.
export SPARK_HOME=CUSTOM_SPARK_HOME
SPARK_LOG_DIR=/var/log/spark2/ts1
SPARK_PID_DIR=/var/log/spark2/ts1
SPARK_PYTHON_PATH=""
if [ -n "$SPARK_PYTHON_PATH" ]; then
  export PYTHONPATH="$PYTHONPATH:$SPARK_PYTHON_PATH"
fi

export HADOOP_HOME="/opt/cloudera/parcels/CDH/lib/hadoop"
export HADOOP_COMMON_HOME="$HADOOP_HOME"

# Native Hadoop libraries (compression codecs etc.).
if [ -n "$HADOOP_HOME" ]; then
  LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${HADOOP_HOME}/lib/native"
fi

SPARK_EXTRA_LIB_PATH=""
if [ -n "$SPARK_EXTRA_LIB_PATH" ]; then
  LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$SPARK_EXTRA_LIB_PATH"
fi

export LD_LIBRARY_PATH

# Default to the CM-deployed YARN client config next to this script.
HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"$SPARK_CONF_DIR/yarn-conf"}
export HADOOP_CONF_DIR

# Build a comma-separated "local:" list of every pyspark zip so YARN ships
# them to executors.
PYLIB="$SPARK_HOME/python/lib"
if [ -f "$PYLIB/pyspark.zip" ]; then
  PYSPARK_ARCHIVES_PATH=
  for lib in "$PYLIB"/*.zip; do
    if [ -n "$PYSPARK_ARCHIVES_PATH" ]; then
      PYSPARK_ARCHIVES_PATH="$PYSPARK_ARCHIVES_PATH,local:$lib"
    else
      PYSPARK_ARCHIVES_PATH="local:$lib"
    fi
  done
  export PYSPARK_ARCHIVES_PATH
fi

# Spark uses `set -a` to export all variables created or modified in this
# script as env vars. We use temporary variables to avoid env var name
# collisions.
# If PYSPARK_PYTHON is unset, set to CDH_PYTHON
TMP_PYSPARK_PYTHON=${PYSPARK_PYTHON:-'{{CDH_PYTHON}}'}
# If PYSPARK_DRIVER_PYTHON is unset, set to CDH_PYTHON. The default is now
# quoted like the line above (it was previously unquoted, which is fragile
# once the {{CDH_PYTHON}} template is substituted with a real path).
TMP_PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON:-'{{CDH_PYTHON}}'}

if [ -n "$TMP_PYSPARK_PYTHON" ] && [ -n "$TMP_PYSPARK_DRIVER_PYTHON" ]; then
  export PYSPARK_PYTHON="$TMP_PYSPARK_PYTHON"
  export PYSPARK_DRIVER_PYTHON="$TMP_PYSPARK_DRIVER_PYTHON"
fi

# Flatten the CM-generated classpath.txt into a single ':'-joined entry.
export SPARK_DIST_CLASSPATH="$(paste -sd: "$SELF/classpath.txt")"
Loading