path: root/apache-spark
author    Erich Eckner <git@eckner.net>  2017-09-20 12:20:41 +0200
committer Erich Eckner <git@eckner.net>  2017-09-20 12:20:41 +0200
commit    046b5d8334ec0b16253079407c6b1c652eb8b925 (patch)
tree      cd0941f169b77bf10bb78affa6ca3188c02d22e3 /apache-spark
parent    c0a1033ef093daa270ec480effee6b709b3e8ab9 (diff)
download  archlinuxewe-046b5d8334ec0b16253079407c6b1c652eb8b925.tar.xz
apache-spark: new package
Diffstat (limited to 'apache-spark')
-rw-r--r--  apache-spark/PKGBUILD                     |  76
-rw-r--r--  apache-spark/apache-spark-master.service  |  12
-rw-r--r--  apache-spark/apache-spark-slave@.service  |  13
-rw-r--r--  apache-spark/apache-spark.install         |  14
-rw-r--r--  apache-spark/other-pkgbuild               |  60
-rwxr-xr-x  apache-spark/run-master.sh                |  81
-rwxr-xr-x  apache-spark/run-slave.sh                 |  91
-rwxr-xr-x  apache-spark/spark-daemon-run.sh          | 139
-rw-r--r--  apache-spark/spark-env.sh                 |   6
9 files changed, 492 insertions, 0 deletions
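
For reference, the package would typically be built and installed with the usual Arch workflow (a sketch; run from the apache-spark directory of this tree):

    cd apache-spark
    makepkg -si    # resolve dependencies, build, and install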
diff --git a/apache-spark/PKGBUILD b/apache-spark/PKGBUILD
new file mode 100644
index 000000000..4999cd5a1
--- /dev/null
+++ b/apache-spark/PKGBUILD
@@ -0,0 +1,76 @@
+# Maintainer: Erich Eckner <arch at eckner dot net>
+# Contributor: François Garillot ("huitseeker") <francois [at] garillot.net>
+# Contributor: Christian Krause ("wookietreiber") <kizkizzbangbang@gmail.com>
+
+pkgname=apache-spark
+pkgver=2.2.0
+pkgrel=3
+pkgdesc="fast and general engine for large-scale data processing"
+arch=('any')
+url="http://spark.apache.org"
+license=('APACHE')
+depends=('java-environment>=6')
+optdepends=('python2: python2 support for pyspark'
+ 'ipython2: ipython2 support for pyspark'
+ 'python: python3 support for pyspark'
+ 'ipython: ipython3 support for pyspark'
+ 'r: support for sparkR'
+ 'rsync: support rsync hadoop binaries from master'
+ 'hadoop: support for running on YARN')
+install=apache-spark.install
+source=("http://d3kbcqa49mib13.cloudfront.net/spark-${pkgver}-bin-without-hadoop.tgz"
+ 'apache-spark-master.service'
+ 'apache-spark-slave@.service'
+ 'spark-env.sh'
+ 'spark-daemon-run.sh'
+ 'run-master.sh'
+ 'run-slave.sh')
+sha1sums=('15b9577049638fc1afe8d2843ac1ae9dec470962'
+ 'ac71d12070a9a10323e8ec5aed4346b1dd7f21c6'
+ 'a191e4f8f7f8bbc596f4fadfb3c592c3efbc4fc0'
+ 'e52d327571e84b9b350bc594131fcaf50a3dd0f4'
+ '08557d2d5328d5c99e533e16366fd893fffaad78'
+ '323445b8d64aea0534a2213d2600d438f406855b'
+ '65b1bc5fce63d1fa7a1b90f2d54a09acf62012a4')
+backup=('etc/apache-spark/spark-env.sh')
+
+PKGEXT=${PKGEXT:-'.pkg.tar.xz'}
+
+prepare() {
+ cd "$srcdir/spark-${pkgver}-bin-without-hadoop"
+}
+
+package() {
+ cd "$srcdir/spark-${pkgver}-bin-without-hadoop"
+
+ install -d "$pkgdir/usr/bin" "$pkgdir/opt" "$pkgdir/var/log/apache-spark" "$pkgdir/var/lib/apache-spark/work"
+ chmod 2775 "$pkgdir/var/log/apache-spark" "$pkgdir/var/lib/apache-spark/work"
+
+ cp -r "$srcdir/spark-${pkgver}-bin-without-hadoop" "$pkgdir/opt/apache-spark/"
+
+ cd "$pkgdir/usr/bin"
+ for binary in beeline pyspark sparkR spark-class spark-shell spark-sql spark-submit load-spark-env.sh; do
+ binpath="/opt/apache-spark/bin/$binary"
+    ln -s "$binpath" "$binary"
+ sed -i 's|^export SPARK_HOME=.*$|export SPARK_HOME=/opt/apache-spark|' "$pkgdir/$binpath"
+ done
+
+  mkdir -p "$pkgdir/etc/profile.d"
+  echo '#!/bin/sh' > "$pkgdir/etc/profile.d/apache-spark.sh"
+  echo 'SPARK_HOME=/opt/apache-spark' >> "$pkgdir/etc/profile.d/apache-spark.sh"
+  echo 'export SPARK_HOME' >> "$pkgdir/etc/profile.d/apache-spark.sh"
+  chmod 755 "$pkgdir/etc/profile.d/apache-spark.sh"
+
+ install -Dm644 "$srcdir/apache-spark-master.service" "$pkgdir/usr/lib/systemd/system/apache-spark-master.service"
+ install -Dm644 "$srcdir/apache-spark-slave@.service" "$pkgdir/usr/lib/systemd/system/apache-spark-slave@.service"
+ install -Dm644 "$srcdir/spark-env.sh" "$pkgdir/etc/apache-spark/spark-env.sh"
+ for script in run-master.sh run-slave.sh spark-daemon-run.sh; do
+ install -Dm755 "$srcdir/$script" "$pkgdir/opt/apache-spark/sbin/$script"
+ done
+ install -Dm644 "$srcdir/spark-${pkgver}-bin-without-hadoop/conf"/* "$pkgdir/etc/apache-spark"
+
+ cd "$pkgdir/opt/apache-spark"
+ mv conf conf-templates
+ ln -sf "/etc/apache-spark" conf
+ ln -sf "/var/lib/apache-spark/work" .
+}
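
Once the package is installed, the /usr/bin symlinks and the profile.d snippet should make Spark usable from a fresh shell; a quick sanity check might look like this (assuming the package is installed):

    . /etc/profile.d/apache-spark.sh
    echo "$SPARK_HOME"        # -> /opt/apache-spark
    spark-shell --version     # runs through the /usr/bin symlink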
diff --git a/apache-spark/apache-spark-master.service b/apache-spark/apache-spark-master.service
new file mode 100644
index 000000000..b8bc98bce
--- /dev/null
+++ b/apache-spark/apache-spark-master.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=Apache Spark Standalone Master
+After=network.target
+
+[Service]
+User=apache-spark
+Group=apache-spark
+Environment=SPARK_LOG_DIR=/var/log/apache-spark
+ExecStart=/opt/apache-spark/sbin/run-master.sh
+
+[Install]
+WantedBy=multi-user.target
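
A sketch of the usual systemd handling for this unit (standard systemctl commands; note that the daemon's own output is written to /var/log/apache-spark by spark-daemon-run.sh below, not to the journal):

    systemctl enable --now apache-spark-master.service
    systemctl status apache-spark-master.service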
diff --git a/apache-spark/apache-spark-slave@.service b/apache-spark/apache-spark-slave@.service
new file mode 100644
index 000000000..453b3465c
--- /dev/null
+++ b/apache-spark/apache-spark-slave@.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=Apache Spark Standalone Slave
+After=network.target
+
+[Service]
+User=apache-spark
+Group=apache-spark
+Environment=SPARK_LOG_DIR=/var/log/apache-spark
+ExecStart=/opt/apache-spark/sbin/run-slave.sh %i
+
+[Install]
+WantedBy=multi-user.target
+DefaultInstance=127.0.0.1:7077
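
This is a template unit: the instance name is the master's host:port and reaches run-slave.sh as %i. A sketch of starting a slave against a remote master (the address is a placeholder):

    systemctl enable --now 'apache-spark-slave@master.example.org:7077.service'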
diff --git a/apache-spark/apache-spark.install b/apache-spark/apache-spark.install
new file mode 100644
index 000000000..7aa034808
--- /dev/null
+++ b/apache-spark/apache-spark.install
@@ -0,0 +1,14 @@
+post_install() {
+ groupadd -r -f apache-spark
+ useradd -r -g apache-spark -s /usr/bin/nologin -d /var/lib/apache-spark apache-spark || true
+
+ [[ ! -d /var/lib/apache-spark/work ]] &&
+ install -d /var/lib/apache-spark/work
+
+ chown -R apache-spark:apache-spark /var/{lib,log}/apache-spark
+}
+
+post_remove() {
+ /usr/sbin/userdel apache-spark
+ /usr/sbin/groupdel apache-spark
+}
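
The accounts created by post_install can be checked with standard getent queries (a sketch; uid/gid are assigned at install time):

    getent passwd apache-spark    # …:/var/lib/apache-spark:/usr/bin/nologin
    getent group apache-spark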
diff --git a/apache-spark/other-pkgbuild b/apache-spark/other-pkgbuild
new file mode 100644
index 000000000..2e7d2aac2
--- /dev/null
+++ b/apache-spark/other-pkgbuild
@@ -0,0 +1,60 @@
+# Maintainer: François Garillot ("huitseeker") <francois [at] garillot.net>
+# Contributor: Christian Krause ("wookietreiber") <kizkizzbangbang@gmail.com>
+
+pkgname=apache-spark
+pkgver=1.4.0
+pkgrel=1
+pkgdesc="fast and general engine for large-scale data processing"
+arch=('any')
+url="http://spark.apache.org"
+license=('APACHE')
+depends=('maven' 'java-environment>=6' 'scala' 'python2>=2.7')
+optdepends=('python: PYSPARK_PYTHON=python3 pyspark'
+ 'ipython: PYSPARK_DRIVER_PYTHON=ipython pyspark; IPYTHON=1 pyspark')
+install=apache-spark.install
+source=("http://d3kbcqa49mib13.cloudfront.net/spark-$pkgver.tgz"
+ 'apache-spark-standalone.service'
+ 'spark-env.sh')
+md5sums=('180382ccce97616bcbf5f8278411519f'
+ 'bb7d8b85366e6f9cc0b2777eaea161a8'
+ '0913001583e607849270090555dbd309')
+backup=('etc/apache-spark/spark-env.sh')
+
+PKGEXT=${PKGEXT:-'.pkg.tar.xz'}
+
+prepare() {
+ cd "$srcdir/spark-$pkgver"
+
+ sed -i 's|pid=$SPARK_PID_DIR/spark-$SPARK_IDENT_STRING-$command-$instance.pid|pid=/var/lib/apache-spark/spark-daemon.pid|' sbin/spark-daemon.sh
+}
+
+build() {
+ cd "$srcdir/spark-$pkgver"
+
+ export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"
+
+ dev/change-version-to-2.11.sh
+
+ JAVA_HOME=/usr/lib/jvm/default-runtime ./make-distribution.sh -Dscala-2.11 -Dmaven.repo.local=/tmp
+}
+
+package() {
+ cd "$srcdir/spark-$pkgver"
+
+ install -d "$pkgdir/usr/bin" "$pkgdir/usr/share"
+
+ cp -r "$srcdir/spark-$pkgver/dist" "$pkgdir/usr/share/apache-spark/"
+
+ cd "$pkgdir/usr/bin"
+ for binary in beeline pyspark sparkR spark-class spark-shell spark-sql spark-submit load-spark-env.sh; do
+ binpath="/usr/share/apache-spark/bin/$binary"
+ ln -s "$binpath" $binary
+ sed -i 's|^export SPARK_HOME=.*$|export SPARK_HOME=/usr/share/apache-spark|' "$pkgdir/$binpath"
+ done
+
+ install -Dm644 "$srcdir/apache-spark-standalone.service" "$pkgdir/usr/lib/systemd/system/apache-spark-standalone.service"
+ install -Dm644 "$srcdir/spark-env.sh" "$pkgdir/etc/apache-spark/spark-env.sh"
+
+ cd "$pkgdir/usr/share/apache-spark/conf"
+ ln -sf "/etc/apache-spark/spark-env.sh" .
+}
diff --git a/apache-spark/run-master.sh b/apache-spark/run-master.sh
new file mode 100755
index 000000000..a60ca791a
--- /dev/null
+++ b/apache-spark/run-master.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Starts the master on the machine this script is executed on.
+
+if [ -z "${SPARK_HOME}" ]; then
+ export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+fi
+
+# NOTE: This exact class name is matched downstream by SparkSubmit.
+# Any changes need to be reflected there.
+CLASS="org.apache.spark.deploy.master.Master"
+
+if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
+ echo "Usage: ./sbin/run-master.sh [options]"
+ pattern="Usage:"
+ pattern+="\|Using Spark's default log4j profile:"
+ pattern+="\|Registered signal handlers for"
+
+ "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2
+ exit 1
+fi
+
+ORIGINAL_ARGS="$@"
+
+START_TACHYON=false
+
+while (( "$#" )); do
+case $1 in
+ --with-tachyon)
+ if [ ! -e "${SPARK_HOME}"/tachyon/bin/tachyon ]; then
+ echo "Error: --with-tachyon specified, but tachyon not found."
+      exit 1
+ fi
+ START_TACHYON=true
+ ;;
+ esac
+shift
+done
+
+. "${SPARK_HOME}/sbin/spark-config.sh"
+
+. "${SPARK_HOME}/bin/load-spark-env.sh"
+
+if [ "$SPARK_MASTER_PORT" = "" ]; then
+ SPARK_MASTER_PORT=7077
+fi
+
+if [ "$SPARK_MASTER_IP" = "" ]; then
+ SPARK_MASTER_IP=`hostname`
+fi
+
+if [ "$SPARK_MASTER_WEBUI_PORT" = "" ]; then
+ SPARK_MASTER_WEBUI_PORT=8080
+fi
+
+"${SPARK_HOME}/sbin"/spark-daemon-run.sh start $CLASS 1 \
+ --ip $SPARK_MASTER_IP --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT \
+ $ORIGINAL_ARGS
+
+if [ "$START_TACHYON" == "true" ]; then
+ "${SPARK_HOME}"/tachyon/bin/tachyon bootstrap-conf $SPARK_MASTER_IP
+ "${SPARK_HOME}"/tachyon/bin/tachyon format -s
+ "${SPARK_HOME}"/tachyon/bin/tachyon-start.sh master
+fi
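
The script honours the SPARK_MASTER_* variables set in spark-env.sh or the environment; a sketch of overriding the defaults from a shell (port values are illustrative):

    SPARK_MASTER_PORT=7177 SPARK_MASTER_WEBUI_PORT=8090 /opt/apache-spark/sbin/run-master.sh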
diff --git a/apache-spark/run-slave.sh b/apache-spark/run-slave.sh
new file mode 100755
index 000000000..1f92aa3be
--- /dev/null
+++ b/apache-spark/run-slave.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Starts a slave on the machine this script is executed on.
+#
+# Environment Variables
+#
+# SPARK_WORKER_INSTANCES The number of worker instances to run on this
+# slave. Default is 1.
+# SPARK_WORKER_PORT The base port number for the first worker. If set,
+# subsequent workers will increment this number. If
+# unset, Spark will find a valid port number, but
+# with no guarantee of a predictable pattern.
+# SPARK_WORKER_WEBUI_PORT The base port for the web interface of the first
+# worker. Subsequent workers will increment this
+# number. Default is 8081.
+
+if [ -z "${SPARK_HOME}" ]; then
+ export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+fi
+
+# NOTE: This exact class name is matched downstream by SparkSubmit.
+# Any changes need to be reflected there.
+CLASS="org.apache.spark.deploy.worker.Worker"
+
+if [[ $# -lt 1 ]] || [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
+ echo "Usage: ./sbin/run-slave.sh [options] <master>"
+ pattern="Usage:"
+ pattern+="\|Using Spark's default log4j profile:"
+ pattern+="\|Registered signal handlers for"
+
+ "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2
+ exit 1
+fi
+
+. "${SPARK_HOME}/sbin/spark-config.sh"
+
+. "${SPARK_HOME}/bin/load-spark-env.sh"
+
+# First argument should be the master; we need to store it aside because we may
+# need to insert arguments between it and the other arguments
+MASTER=$1
+shift
+
+# Determine desired worker port
+if [ "$SPARK_WORKER_WEBUI_PORT" = "" ]; then
+ SPARK_WORKER_WEBUI_PORT=8081
+fi
+
+# Start up the appropriate number of workers on this machine.
+# quick local function to start a worker
+function start_instance {
+ WORKER_NUM=$1
+ shift
+
+ if [ "$SPARK_WORKER_PORT" = "" ]; then
+ PORT_FLAG=
+ PORT_NUM=
+ else
+ PORT_FLAG="--port"
+ PORT_NUM=$(( $SPARK_WORKER_PORT + $WORKER_NUM - 1 ))
+ fi
+ WEBUI_PORT=$(( $SPARK_WORKER_WEBUI_PORT + $WORKER_NUM - 1 ))
+
+ "${SPARK_HOME}/sbin"/spark-daemon-run.sh start $CLASS $WORKER_NUM \
+ --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@"
+}
+
+if [ "$SPARK_WORKER_INSTANCES" = "" ]; then
+ start_instance 1 "$@"
+else
+ for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do
+ start_instance $(( 1 + $i )) "$@"
+ done
+fi
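
A sketch of starting two workers by hand against a given master (the URL is a placeholder; the systemd unit above passes its instance name instead):

    SPARK_WORKER_INSTANCES=2 /opt/apache-spark/sbin/run-slave.sh spark://master.example.org:7077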
diff --git a/apache-spark/spark-daemon-run.sh b/apache-spark/spark-daemon-run.sh
new file mode 100755
index 000000000..34e3a80fa
--- /dev/null
+++ b/apache-spark/spark-daemon-run.sh
@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Runs a Spark daemon in the foreground.
+#
+# Environment Variables
+#
+# SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf.
+# SPARK_LOG_DIR Where log files are stored. ${SPARK_HOME}/logs by default.
+# SPARK_MASTER host:path where spark code should be rsync'd from
+# SPARK_IDENT_STRING A string representing this instance of spark. $USER by default
+# SPARK_NICENESS The scheduling priority for daemons. Defaults to 0.
+#
+
+usage="Usage: spark-daemon-run.sh [--config <conf-dir>] (start|submit) <spark-command> <spark-instance-number> <args...>"
+
+# if no args specified, show usage
+if [ $# -le 1 ]; then
+ echo $usage
+ exit 1
+fi
+
+if [ -z "${SPARK_HOME}" ]; then
+ export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+fi
+
+. "${SPARK_HOME}/sbin/spark-config.sh"
+
+# get arguments
+
+# Check if --config is passed as an argument. It is an optional parameter.
+# Exit if the argument is not a directory.
+
+if [ "$1" == "--config" ]
+then
+ shift
+ conf_dir="$1"
+ if [ ! -d "$conf_dir" ]
+ then
+ echo "ERROR : $conf_dir is not a directory"
+ echo $usage
+ exit 1
+ else
+ export SPARK_CONF_DIR="$conf_dir"
+ fi
+ shift
+fi
+
+mode=$1
+shift
+command=$1
+shift
+instance=$1
+shift
+
+spark_rotate_log ()
+{
+ log=$1;
+ num=5;
+ if [ -n "$2" ]; then
+ num=$2
+ fi
+ if [ -f "$log" ]; then # rotate logs
+ while [ $num -gt 1 ]; do
+ prev=`expr $num - 1`
+ [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num"
+ num=$prev
+ done
+ mv "$log" "$log.$num";
+ fi
+}
+
+. "${SPARK_HOME}/bin/load-spark-env.sh"
+
+if [ "$SPARK_IDENT_STRING" = "" ]; then
+ export SPARK_IDENT_STRING="$USER"
+fi
+
+
+
+# get log directory
+if [ "$SPARK_LOG_DIR" = "" ]; then
+ export SPARK_LOG_DIR="${SPARK_HOME}/logs"
+fi
+mkdir -p "$SPARK_LOG_DIR"
+touch "$SPARK_LOG_DIR"/.spark_test > /dev/null 2>&1
+TEST_LOG_DIR=$?
+if [ "${TEST_LOG_DIR}" = "0" ]; then
+ rm -f "$SPARK_LOG_DIR"/.spark_test
+else
+ chown "$SPARK_IDENT_STRING" "$SPARK_LOG_DIR"
+fi
+
+# some variables
+log="$SPARK_LOG_DIR/spark-$SPARK_IDENT_STRING-$command-$instance-$HOSTNAME.out"
+
+# Set default scheduling priority
+if [ "$SPARK_NICENESS" = "" ]; then
+ export SPARK_NICENESS=0
+fi
+
+if [ "$SPARK_MASTER" != "" ]; then
+ echo rsync from "$SPARK_MASTER"
+ rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}"
+fi
+
+spark_rotate_log "$log"
+echo "running $command, logging to $log"
+
+case "$mode" in
+ (start)
+ exec nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-class $command "$@" >> "$log" 2>&1 < /dev/null
+ ;;
+
+ (submit)
+ exec nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-submit --class $command "$@" >> "$log" 2>&1 < /dev/null
+ ;;
+
+ (*)
+ echo "unknown mode: $mode"
+ exit 1
+ ;;
+esac
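
run-master.sh and run-slave.sh drive this script in start mode; a direct invocation would look roughly like this (class and arguments as used by run-master.sh above):

    /opt/apache-spark/sbin/spark-daemon-run.sh start org.apache.spark.deploy.master.Master 1 \
        --ip "$(hostname)" --port 7077 --webui-port 8080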
diff --git a/apache-spark/spark-env.sh b/apache-spark/spark-env.sh
new file mode 100644
index 000000000..146253997
--- /dev/null
+++ b/apache-spark/spark-env.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+export JAVA_HOME=/usr/lib/jvm/default-runtime
+export SPARK_DIST_CLASSPATH=$(hadoop classpath)
+SPARK_MASTER_IP=localhost
+SPARK_LOCAL_IP=localhost
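
localhost works for a single-node setup; for a cluster the two *_IP values would point at routable addresses instead, e.g. (addresses are placeholders):

    export JAVA_HOME=/usr/lib/jvm/default-runtime
    export SPARK_DIST_CLASSPATH=$(hadoop classpath)
    SPARK_MASTER_IP=10.0.0.1      # address the master binds and advertises
    SPARK_LOCAL_IP=10.0.0.1       # address this node binds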