Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restart services only if they are monitored by monit #13

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions common/utils.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Return (on stdout) the subset of the given services that monit reports
# as actively monitored, preserving the order in which they were passed.
#
# Arguments: one or more monit service names.
# Outputs:   space-separated list of monitored service names on stdout.
#
# NOTE(review): queries the local monit HTTP status endpoint on port 2812;
# assumes monit's xml interface is enabled — confirm in node bootstrap.
function get_running_services() {
    local -a running_svcs=()
    local s monitored
    for s in "$@"; do
        # <monitor> == 1 means the service is actively monitored by monit
        monitored=$(wget -q -O - localhost:2812/_status?format=xml | xmlstarlet sel -t -v "/monit/service[name='${s}']/monitor")
        if [[ "$monitored" == "1" ]]; then
            # Order of services is retained when returning.
            # += avoids the word-splitting/globbing of re-expanding the
            # array unquoted (running_svcs=(${running_svcs[@]} ...)).
            running_svcs+=("$s")
        fi
    done

    echo "${running_svcs[@]}"
}
89 changes: 65 additions & 24 deletions hadoop/util.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,71 @@
#!/bin/bash

source /usr/lib/hustler/bin/qubole-bash-lib.sh
source /usr/lib/qubole/bootstrap-functions/common/utils.sh
export PROFILE_FILE=${PROFILE_FILE:-/etc/profile}
export HADOOP_ETC_DIR=${HADOOP_ETC_DIR:-/usr/lib/hadoop2/etc/hadoop}
declare -A SVC_USERS=([namenode]=hdfs [timelineserver]=yarn [historyserver]=mapred [resourcemanager]=yarn [datanode]=hdfs)

# Start a single Hadoop daemon under its owning unix user (see SVC_USERS).
#
# Arguments: $1 - daemon name (namenode|datanode|timelineserver|
#                 historyserver|resourcemanager)
# Returns:   exit status of the start command; 1 for an unknown daemon.
function start_daemon() {
    local daemon=$1
    case "${SVC_USERS[$daemon]}" in
        yarn)
            /bin/su -s /bin/bash -c "/usr/lib/hadoop2/sbin/yarn-daemon.sh start $daemon" yarn
            ;;
        hdfs)
            /bin/su -s /bin/bash -c "/usr/lib/hadoop2/sbin/hadoop-daemon.sh start $daemon" hdfs
            ;;
        mapred)
            /bin/su -s /bin/bash -c "HADOOP_LIBEXEC_DIR=/usr/lib/hadoop2/libexec /usr/lib/hadoop2/sbin/mr-jobhistory-daemon.sh start $daemon" mapred
            ;;
        *)
            # Diagnostics belong on stderr; fail so callers can detect misuse
            # instead of silently returning 0.
            echo "Invalid daemon $daemon" >&2
            return 1
            ;;
    esac
}

# Stop a single Hadoop daemon under its owning unix user (see SVC_USERS).
#
# Arguments: $1 - daemon name (namenode|datanode|timelineserver|
#                 historyserver|resourcemanager)
# Returns:   exit status of the stop command; 1 for an unknown daemon.
function stop_daemon() {
    local daemon=$1
    case "${SVC_USERS[$daemon]}" in
        yarn)
            /bin/su -s /bin/bash -c "/usr/lib/hadoop2/sbin/yarn-daemon.sh stop $daemon" yarn
            ;;
        hdfs)
            /bin/su -s /bin/bash -c "/usr/lib/hadoop2/sbin/hadoop-daemon.sh stop $daemon" hdfs
            ;;
        mapred)
            /bin/su -s /bin/bash -c "HADOOP_LIBEXEC_DIR=/usr/lib/hadoop2/libexec /usr/lib/hadoop2/sbin/mr-jobhistory-daemon.sh stop $daemon" mapred
            ;;
        *)
            # Diagnostics belong on stderr; fail so callers can detect misuse
            # instead of silently returning 0.
            echo "Invalid daemon $daemon" >&2
            return 1
            ;;
    esac
}

# Restart the given services, restricted to those currently monitored by
# monit (per get_running_services). Flow: unmonitor all, stop them in the
# given order, start them again in REVERSE order, then re-monitor all.
#
# Arguments: one or more service names, in stop order.
function restart_services() {
    local -a svcs=("$@")
    local -a running_svcs
    local s i
    # get_running_services echoes a whitelist-ordered, space-separated list;
    # word-splitting back into an array is intentional here (names have no
    # whitespace).
    running_svcs=($(get_running_services "${svcs[@]}"))

    for s in "${running_svcs[@]}"; do
        monit unmonitor "$s"
    done

    for s in "${running_svcs[@]}"; do
        stop_daemon "$s"
    done

    # Restart services in reverse order of how they were stopped.
    # (Explicit reverse index loop instead of the ${arr[~i]} negated-index
    # trick, per review feedback — same traversal, more readable.)
    for (( i = ${#running_svcs[@]} - 1; i >= 0; i-- )); do
        start_daemon "${running_svcs[i]}"
    done

    # Order doesn't matter for (un)monitor
    for s in "${running_svcs[@]}"; do
        monit monitor "$s"
    done
}

##
# Restart hadoop services on the cluster master
Expand All @@ -11,26 +74,7 @@ export HADOOP_ETC_DIR=${HADOOP_ETC_DIR:-/usr/lib/hadoop2/etc/hadoop}
# of Java, for example
#
function restart_master_services() {
    # Delegates to restart_services, which only touches daemons that monit
    # is actively monitoring. Listed in stop order; they are started again
    # in reverse (namenode first, timelineserver last).
    restart_services timelineserver historyserver resourcemanager namenode
}


Expand All @@ -41,10 +85,7 @@ function restart_master_services() {
# nodemanager is started after the bootstrap is run
#
function restart_worker_services() {
    # Delegates to restart_services, which skips the restart entirely if
    # monit is not monitoring the datanode.
    restart_services datanode
    # No need to restart nodemanager since it starts only
    # after the bootstrap is finished
}
Expand Down