Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions SPARK2_THRIFT1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
This CSD can be used to add Spark2 ThriftServer as a service to Cloudera Manager.
The following are the dependencies that must be satisfied before adding the Spark2 TS1 service:
1. This CSD has a dependency on the Spark2 service, which needs to be installed on the cluster before trying to install this service.
2. The nodes on which Spark2 TS1 is added should also have Spark2 Gateways roles present.
3. The distribution of Spark2 supported by Cloudera doesn't have Spark Thriftserver support; hence you need to build one with Thriftserver support. You can follow this link https://www.linkedin.com/pulse/running-spark-2xx-cloudera-hadoop-distro-cdh-deenar-toraskar-cfa/
to create one that can be used with this Spark2 TS1 service.
4. The property Custom Spark Home needs to point to the location where the Custom Spark tar file created in the above step has been extracted.
5. To enable high availability for multiple Spark2 ThriftServer hosts, configure a load balancer to manage them, set the property sparkthrift.ha.enable to yes, and fill in the sparkthrift.loadbalancer.host and sparkthrift.loadbalancer.port properties with the respective values.
6. The logs can be viewed at /var/log/sparkthrift
7. The service runs as the hive user and listens on port 20000 by default.
8. Change the sparkthrift.cmd.opts property to --conf spark.eventLog.enabled=true --conf spark.eventLog.dir=hdfs://<namenode_hostname:8020 or NameNode Nameservice>/user/spark/spark2ApplicationHistory --conf spark.yarn.historyServer.address=http://<spark_history_server_hostname>:18089 if one needs to integrate with Spark History Server. Else one needs to create a local folder /tmp/spark-events on all the nodes where this service is installed and set the folder ownership to hive:hive recursively.
97 changes: 97 additions & 0 deletions SPARK2_THRIFT1/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
<!-- Copyright (c) 2016 Cloudera, Inc. All rights reserved. -->
<!-- Maven build descriptor for the SPARK2_THRIFT1 CSD. It produces a CSD jar
     (via the assembly plugin) containing the service descriptor and scripts;
     no Java sources are compiled here (packaging is "pom"). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Inherit plugin/dependency management from the Spark parent POM; the CSD
     version is kept in lockstep with the Spark distribution it wraps. -->
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.0.0.cloudera2</version>
<relativePath>../pom.xml</relativePath>
</parent>

<groupId>com.cloudera.csd</groupId>
<artifactId>SPARK2_THRIFT1</artifactId>
<version>2.0.0.cloudera2</version>
<name>The Spark2 TS1 CSD</name>
<packaging>pom</packaging>

<!-- No tests exist for this module; skip the test phases inherited from the
     parent build. -->
<properties>
<build.testJarPhase>none</build.testJarPhase>
<skipTests>true</skipTests>
</properties>

<build>
<!-- Include the generated version marker (see create-version-file below)
     in the packaged artifact. -->
<resources>
<resource>
<directory>${project.build.directory}/generated-resources</directory>
</resource>
</resources>
<plugins>
<!-- Write the project version into meta/version so Cloudera Manager can
     identify the CSD build. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.8</version>
<executions>
<execution>
<id>create-version-file</id>
<phase>generate-resources</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<target>
<echo message="${project.version}"
file="${project.build.directory}/generated-resources/meta/version"/>
</target>
</configuration>
</execution>
</executions>
</plugin>
<!-- Package the CSD layout described in ../assembly.xml; the assembly id is
     suppressed so the jar name is just artifactId-version. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptors>
<descriptor>../assembly.xml</descriptor>
</descriptors>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Disabled: CSD schema validation requires Cloudera's internal
     schema-validator plugin, which is not available in public repos. -->
<!-- <plugin>
<groupId>com.cloudera.enterprise</groupId>
<artifactId>schema-validator-maven-plugin</artifactId>
<version>5.12.0</version>
<executions>
<execution>
<id>validate-schema</id>
<phase>test</phase>
<goals>
<goal>validate</goal>
</goals>
<configuration>
<sourceDirectory>src</sourceDirectory>
<strictMode>true</strictMode>
</configuration>
</execution>
</executions>
</plugin>-->
<!-- The CSD jar is an installer artifact, not a library; do not install it
     into the local Maven repository. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project>
25 changes: 25 additions & 0 deletions SPARK2_THRIFT1/src/aux/client/spark-defaults.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Default Spark configuration deployed to Spark2 ThriftServer clients.
# Loaded by spark-submit/Thrift Server at startup; values here can be
# overridden per-job with --conf.

# Security and executor scaling: dynamic allocation requires the external
# shuffle service (enabled below on the standard CDH port 7337).
spark.authenticate=false
spark.dynamicAllocation.enabled=true
spark.dynamicAllocation.executorIdleTimeout=60
spark.dynamicAllocation.minExecutors=0
spark.dynamicAllocation.schedulerBacklogTimeout=1
spark.eventLog.enabled=true
spark.serializer=org.apache.spark.serializer.KryoSerializer
spark.shuffle.service.enabled=true
spark.shuffle.service.port=7337
spark.ui.killEnabled=true
# Run on YARN in client mode, using the Hive metastore as the SQL catalog.
spark.master=yarn
spark.submit.deployMode=client
spark.sql.hive.metastore.version=1.1.0
spark.sql.catalogImplementation=hive
# Native Hadoop libraries from the CDH parcel for driver, executors and AM.
spark.driver.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.yarn.am.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native
# Intentionally blank: prevent MR/YARN classpath entries from leaking into
# the Spark classpath; the parcel path is rewritten per-node via gatewayPath.
spark.hadoop.mapreduce.application.classpath=
spark.hadoop.yarn.application.classpath=
spark.yarn.config.gatewayPath=/opt/cloudera/parcels
spark.yarn.config.replacementPath={{HADOOP_COMMON_HOME}}/../../..
# RPC/compression tuning and CMS GC options for executors.
spark.akka.frameSize=100
spark.akka.threads=10
spark.io.compression.snappy.blockSize=8192
spark.executor.extraJavaOptions=-XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:+CMSParallelRemarkEnabled
66 changes: 66 additions & 0 deletions SPARK2_THRIFT1/src/aux/client/spark-env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
##
# Generated by Cloudera Manager and should not be modified directly
##
# Environment setup sourced by Spark launch scripts for the Spark2
# ThriftServer CSD. Spark sources this with `set -a`, so plain assignments
# below are effectively exported.

# Absolute path of the directory containing this script, used as the default
# conf dir. Quoted throughout — the original `cd $(dirname $BASH_SOURCE)`
# broke on paths containing spaces.
SELF="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [ -z "$SPARK_CONF_DIR" ]; then
  export SPARK_CONF_DIR="$SELF"
fi
##
# Needs to be changed if parcel is implemented
##
CLOUDERA_HOME=/opt/cloudera/parcels
#export SPARK_HOME=$CLOUDERA_HOME/spark-2.0.0.cloudera2-bin-custom-spark-1
# CUSTOM_SPARK_HOME is a literal placeholder replaced by the CSD control
# script with the configured "Custom Spark Home" value — do not quote-expand.
export SPARK_HOME=CUSTOM_SPARK_HOME
SPARK_LOG_DIR=/var/log/spark2/ts1
SPARK_PID_DIR=/var/log/spark2/ts1
SPARK_PYTHON_PATH=""
if [ -n "$SPARK_PYTHON_PATH" ]; then
  export PYTHONPATH="$PYTHONPATH:$SPARK_PYTHON_PATH"
fi

export HADOOP_HOME="/opt/cloudera/parcels/CDH/lib/hadoop"
export HADOOP_COMMON_HOME="$HADOOP_HOME"

# Native Hadoop libraries (compression codecs etc.).
if [ -n "$HADOOP_HOME" ]; then
  LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${HADOOP_HOME}/lib/native"
fi

SPARK_EXTRA_LIB_PATH=""
if [ -n "$SPARK_EXTRA_LIB_PATH" ]; then
  LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$SPARK_EXTRA_LIB_PATH"
fi

export LD_LIBRARY_PATH

# Default to the CM-deployed YARN client config next to this script.
HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"$SPARK_CONF_DIR/yarn-conf"}
export HADOOP_CONF_DIR

# Build a comma-separated "local:" list of every pyspark zip so YARN ships
# them to executors.
PYLIB="$SPARK_HOME/python/lib"
if [ -f "$PYLIB/pyspark.zip" ]; then
  PYSPARK_ARCHIVES_PATH=
  for lib in "$PYLIB"/*.zip; do
    if [ -n "$PYSPARK_ARCHIVES_PATH" ]; then
      PYSPARK_ARCHIVES_PATH="$PYSPARK_ARCHIVES_PATH,local:$lib"
    else
      PYSPARK_ARCHIVES_PATH="local:$lib"
    fi
  done
  export PYSPARK_ARCHIVES_PATH
fi

# Spark uses `set -a` to export all variables created or modified in this
# script as env vars. We use temporary variables to avoid env var name
# collisions.
# If PYSPARK_PYTHON is unset, set to CDH_PYTHON
TMP_PYSPARK_PYTHON=${PYSPARK_PYTHON:-'{{CDH_PYTHON}}'}
# If PYSPARK_DRIVER_PYTHON is unset, set to CDH_PYTHON. The default is now
# quoted like the line above (it was previously unquoted, which is fragile
# once the {{CDH_PYTHON}} template is substituted with a real path).
TMP_PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON:-'{{CDH_PYTHON}}'}

if [ -n "$TMP_PYSPARK_PYTHON" ] && [ -n "$TMP_PYSPARK_DRIVER_PYTHON" ]; then
  export PYSPARK_PYTHON="$TMP_PYSPARK_PYTHON"
  export PYSPARK_DRIVER_PYTHON="$TMP_PYSPARK_DRIVER_PYTHON"
fi

# Flatten the CM-generated classpath.txt into a single ':'-joined entry.
export SPARK_DIST_CLASSPATH="$(paste -sd: "$SELF/classpath.txt")"
Loading