diff options
| -rw-r--r-- | doc/features/ganesha-ha.md | 43 | ||||
| -rw-r--r-- | extras/ganesha/ocf/ganesha_grace | 141 | ||||
| -rw-r--r-- | extras/ganesha/ocf/ganesha_mon | 110 | ||||
| -rw-r--r-- | extras/ganesha/ocf/ganesha_nfsd | 89 | ||||
| -rw-r--r-- | extras/ganesha/scripts/ganesha-ha.sh | 207 | 
5 files changed, 284 insertions, 306 deletions
diff --git a/doc/features/ganesha-ha.md b/doc/features/ganesha-ha.md new file mode 100644 index 00000000000..4b226a22ccf --- /dev/null +++ b/doc/features/ganesha-ha.md @@ -0,0 +1,43 @@ +# Overview of Ganesha HA Resource Agents in GlusterFS 3.7 + +The ganesha_mon RA monitors its ganesha.nfsd daemon. While the +daemon is running, it creates two attributes: ganesha-active and +grace-active. When the daemon stops for any reason, the attributes +are deleted. Deleting the ganesha-active attribute triggers the +failover of the virtual IP (the IPaddr RA) to another node — +according to constraint location rules — where ganesha.nfsd is +still running. + +The ganesha_grace RA monitors the grace-active attribute. When +the grace-active attribute is deleted, the ganesha_grace RA stops, +and will not restart. This triggers pacemaker to invoke the notify +action in the ganesha_grace RAs on the other nodes in the cluster; +which send a DBUS message to their respective ganesha.nfsd. + +(N.B. grace-active is a bit of a misnomer. While the grace-active +attribute exists, everything is normal and healthy. Deleting the +attribute triggers putting the surviving ganesha.nfsds into GRACE.) + +To ensure that the remaining/surviving ganesha.nfsds are put into + NFS-GRACE before the IPaddr (virtual IP) fails over there is a +short delay (sleep) between deleting the grace-active attribute +and the ganesha-active attribute. To summarize, e.g. in a four +node cluster: + +1. on node 2 ganesha_mon::monitor notices that ganesha.nfsd has died + +2. on node 2 ganesha_mon::monitor deletes its grace-active attribute + +3. on node 2 ganesha_grace::monitor notices that grace-active is gone +and returns OCF_ERR_GENERIC, a.k.a. new error. When pacemaker tries +to (re)start ganesha_grace, its start action will return +OCF_NOT_RUNNING, a.k.a. known error, don't attempt further restarts. + +4. 
on nodes 1, 3, and 4, ganesha_grace::notify receives a post-stop +notification indicating that node 2 is gone, and sends a DBUS message +to its ganesha.nfsd, putting it into NFS-GRACE. + +5. on node 2 ganesha_mon::monitor waits a short period, then deletes +its ganesha-active attribute. This triggers the IPaddr (virt IP) +failover according to constraint location rules. + diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace index 75ec16c0fd1..a82c9af417a 100644 --- a/extras/ganesha/ocf/ganesha_grace +++ b/extras/ganesha/ocf/ganesha_grace @@ -36,6 +36,9 @@ else  . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs  fi +OCF_RESKEY_grace_active_default="grace-active" +: ${OCF_RESKEY_grace_active=${OCF_RESKEY_grace_active_default}} +  ganesha_meta_data() {          cat <<END  <?xml version="1.0"?> @@ -51,19 +54,25 @@ resource agent for nfs-ganesha.  <shortdesc lang="en">Manages the user-space nfs-ganesha NFS server</shortdesc>  <parameters> +<parameter name="grace_active"> +<longdesc lang="en">NFS-Ganesha grace active attribute</longdesc> +<shortdesc lang="en">NFS-Ganesha grace active attribute</shortdesc> +<content type="string" default="grace-active" /> +</parameter>  </parameters>  <actions>  <action name="start"   timeout="40s" />  <action name="stop"    timeout="40s" /> -<action name="status" depth="0"  timeout="20s" interval="5s" /> -<action name="monitor" depth="0"  timeout="20s" interval="5s" /> +<action name="status"  timeout="20s" interval="60s" /> +<action name="monitor" depth="0" timeout="10s" interval="5s" /> +<action name="notify"  timeout="10s" />  <action name="meta-data"  timeout="20s" />  </actions>  </resource-agent>  END -return $OCF_SUCCESS +return ${OCF_SUCCESS}  }  ganesha_grace_usage() { @@ -73,10 +82,10 @@ ganesha_grace_usage() {  # Make sure meta-data and usage always succeed  case $__OCF_ACTION in  	meta-data)	ganesha_meta_data -			exit $OCF_SUCCESS +			exit ${OCF_SUCCESS}  			;;  	usage|help)	ganesha_usage -			exit $OCF_SUCCESS 
+			exit ${OCF_SUCCESS}  			;;  	*)  			;; @@ -84,81 +93,89 @@ esac  ganesha_grace_start()  { -	local result="" -	local resourcename="" -	local deadserver="" -	local tmpIFS=${IFS} -	local pid_file="/var/run/ganesha.nfsd.pid" - -	# RHEL6 /etc/init.d/nfs-ganesha adds "-p /var/run/ganesha.nfsd.pid" -	# RHEL7 systemd does not. Would be nicer if all distros used the -	# same pid file. -	if [ -e /usr/lib/systemd/system/nfs-ganesha.service ]; then -		pid_file="/var/run/ganesha.pid" +        rc=${OCF_ERR_GENERIC} +	ocf_log debug "ganesha_grace_start()" +	attr=$(attrd_updater -Q -n ${OCF_RESKEY_grace_active}) + +	# Three possibilities: +	# 1. There is no attribute at all and attr_updater returns +	#    a zero length string. This happens when +	#    ganesha_mon::monitor hasn't run at least once to set +	#    the attribute. The assumption here is that the system +	#    is coming up. We pretend, for now, that the node is +	#    healthy, to allow the system to continue coming up. +	#    It will cure itself in a few seconds +	# 2. There is an attribute, and it has the value "1"; this +	#    node is healthy. +	# 3. There is an attribute, but it has no value or the value +	#    "0"; this node is not healthy. + +	# case 1 +	if [[ -z "${attr}" ]]; then +		return ${OCF_SUCCESS}  	fi -	# logger "ganesha_grace_start()" -	# we're here because somewhere in the cluster one or more -	# of the ganesha.nfsds have died, triggering a floating IP -	# address to move. Resource constraint location rules ensure -	# that this is invoked before the floating IP is moved. -	if [ -e ${pid_file} -a \ -	     -d /proc/$(cat ${pid_file} ) ]; then -		# my ganesha.nfsd is still running -		# find out which one died? 
- -		pcs status | grep dead_ip-1 | sort > /tmp/.pcs_status - -		result=$(diff /var/run/ganesha/pcs_status /tmp/.pcs_status | grep '^>') -		if [[ ${result} ]]; then -			# logger "ganesha_grace_start(), ${result}" -			IFS=$'\n' -			for line in ${result}; do -				resourcename=$(echo ${line} | cut -f 1 | cut -d ' ' -f 3) -				deadserver=${resourcename%"-dead_ip-1"} - -				if [[ ${deadserver} ]]; then -					# logger "ganesha_grace_start(), ${line}" -					# logger "ganesha_grace_start(), dbus-send --print-reply --system --dest=org.ganesha.nfsd /org/ganesha/nfsd/admin org.ganesha.nfsd.admin.grace string:${deadserver}" -					dbus-send --print-reply --system --dest=org.ganesha.nfsd /org/ganesha/nfsd/admin org.ganesha.nfsd.admin.grace string:${deadserver} -					if [ $? -ne 0 ]; then -						logger "warning: dbus-send --print-reply --system --dest=org.ganesha.nfsd /org/ganesha/nfsd/admin org.ganesha.nfsd.admin.grace string:${deadserver} failed" -					fi -				fi -			done -			IFS=${tmpIFS} -		fi - +	# case 2 +	if [[ "${attr}" = *"value=\"1\"" ]]; then +		return ${OCF_SUCCESS}  	fi -	return $OCF_SUCCESS + +	# case 3 +	return ${OCF_NOT_RUNNING}  }  ganesha_grace_stop()  { -	# logger "ganesha_grace_stop()" -	return $OCF_SUCCESS +	ocf_log debug "ganesha_grace_stop()" +	return ${OCF_SUCCESS} +} + +ganesha_grace_notify() +{ +	mode="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" +	case "$mode" in +	post-stop) +		ocf_log debug "stop_uname:${OCF_RESKEY_CRM_meta_notify_stop_uname}" +		dbus-send --print-reply --system --dest=org.ganesha.nfsd /org/ganesha/nfsd/admin org.ganesha.nfsd.admin.grace string:${OCF_RESKEY_CRM_meta_notify_stop_uname} +		if [ $? 
-ne 0 ]; then +			ocf_log info "dbus-send --print-reply --system --dest=org.ganesha.nfsd /org/ganesha/nfsd/admin org.ganesha.nfsd.admin.grace string:${OCF_RESKEY_CRM_meta_notify_stop_uname} failed" +		fi +		;; +	esac + +	return ${OCF_SUCCESS}  }  ganesha_grace_monitor()  { -	# logger "ganesha_grace_monitor()" -	if [ ! -d /var/run/ganesha ]; then -		mkdir -p /var/run/ganesha +        rc=${OCF_ERR_GENERIC} +	ocf_log debug "monitor" + +	attr=$(attrd_updater -Q -n ${OCF_RESKEY_grace_active}) + +	# if there is no attribute (yet), maybe it's because +	# this RA started before ganesha_mon (nfs-mon) has had +	# chance to create it. In which case we'll pretend +	# everything is okay this time around +	if [[ -z "${attr}" ]]; then +		return ${OCF_SUCCESS} +	fi + +	if [[ "${attr}" = *"value=\"1\"" ]]; then +		rc=${OCF_SUCCESS}  	fi -	pcs status | grep dead_ip-1 | sort > /var/run/ganesha/pcs_status -	return $OCF_SUCCESS + +	return ${rc}  }  ganesha_grace_validate()  { -	return $OCF_SUCCESS +	return ${OCF_SUCCESS}  }  ganesha_grace_validate -# logger "ganesha_grace ${OCF_RESOURCE_INSTANCE} $__OCF_ACTION" -  # Translate each action into the appropriate function call  case $__OCF_ACTION in  start)          ganesha_grace_start @@ -167,14 +184,16 @@ stop)           ganesha_grace_stop  		;;  status|monitor) ganesha_grace_monitor  		;; +notify)         ganesha_grace_notify +		;;  *)              ganesha_grace_usage -                exit $OCF_ERR_UNIMPLEMENTED +                exit ${OCF_ERR_UNIMPLEMENTED}                  ;;  esac  rc=$?  
# The resource agent may optionally log a debug message -ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc" +ocf_log debug "${OCF_RESOURCE_INSTANCE} ${__OCF_ACTION} returned $rc"  exit $rc diff --git a/extras/ganesha/ocf/ganesha_mon b/extras/ganesha/ocf/ganesha_mon index c8e7de9c45e..f55cf7f2af3 100644 --- a/extras/ganesha/ocf/ganesha_mon +++ b/extras/ganesha/ocf/ganesha_mon @@ -29,14 +29,21 @@  : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}  . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs -if [ -n "$OCF_DEBUG_LIBRARY" ]; then -    . $OCF_DEBUG_LIBRARY +if [ -n "${OCF_DEBUG_LIBRARY}" ]; then +    . ${OCF_DEBUG_LIBRARY}  else      : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}  . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs  fi -GRACE_DELAY=7 +# Defaults +OCF_RESKEY_ganesha_active_default="ganesha-active" +OCF_RESKEY_grace_active_default="grace-active" +OCF_RESKEY_grace_delay_default="5" + +: ${OCF_RESKEY_ganesha_active=${OCF_RESKEY_ganesha_active_default}} +: ${OCF_RESKEY_grace_active=${OCF_RESKEY_grace_active_default}} +: ${OCF_RESKEY_grace_delay=${OCF_RESKEY_grace_delay_default}}  ganesha_meta_data() {          cat <<END @@ -53,19 +60,37 @@ resource agent for nfs-ganesha.  <shortdesc lang="en">Manages the user-space nfs-ganesha NFS server</shortdesc>  <parameters> +<parameter name="ganesha_active"> +<longdesc lang="en">NFS-Ganesha daemon active attribute</longdesc> +<shortdesc lang="en">NFS-Ganesha daemon active attribute</shortdesc> +<content type="string" default="ganesha-active" /> +</parameter> +<parameter name="grace_active"> +<longdesc lang="en">NFS-Ganesha grace active attribute</longdesc> +<shortdesc lang="en">NFS-Ganesha grace active attribute</shortdesc> +<content type="string" default="grace-active" /> +</parameter> +<parameter name="grace_delay"> +<longdesc lang="en"> +NFS-Ganesha grace delay. +When changing this, adjust the ganesha_grace RA's monitor interval to match. 
+</longdesc> +<shortdesc lang="en">NFS-Ganesha grace delay</shortdesc> +<content type="string" default="5" /> +</parameter>  </parameters>  <actions>  <action name="start"   timeout="40s" />  <action name="stop"    timeout="40s" /> -<action name="status" depth="0"  timeout="20s" interval="10s" /> +<action name="status"  timeout="20s" interval="60s" />  <action name="monitor" depth="0"  timeout="10s" interval="10s" />  <action name="meta-data"  timeout="20s" />  </actions>  </resource-agent>  END -return $OCF_SUCCESS +return ${OCF_SUCCESS}  }  ganesha_mon_usage() { @@ -73,12 +98,12 @@ ganesha_mon_usage() {  }  # Make sure meta-data and usage always succeed -case $__OCF_ACTION in +case ${__OCF_ACTION} in  	meta-data)	ganesha_meta_data -			exit $OCF_SUCCESS +			exit ${OCF_SUCCESS}  			;;  	usage|help)	ganesha_usage -			exit $OCF_SUCCESS +			exit ${OCF_SUCCESS}  			;;  	*)  			;; @@ -86,12 +111,15 @@ esac  ganesha_mon_start()  { -	return $OCF_SUCCESS +	ocf_log debug "ganesha_mon_start" +	ganesha_mon_monitor +        return $OCF_SUCCESS  }  ganesha_mon_stop()  { -	return $OCF_SUCCESS +	ocf_log debug "ganesha_mon_stop" +        return $OCF_SUCCESS  }  ganesha_mon_monitor() @@ -108,50 +136,56 @@ ganesha_mon_monitor()  	if [ -e ${pid_file} -a \  	     -d /proc/$(cat ${pid_file} ) ]; then -		( pcs resource delete ${short_host}-dead_ip-1 > /dev/null 2>&1 ) -		attrd_updater -n ganesha-active -v 1 +		attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1  		if [ $? -ne 0 ]; then -			logger "warning: attrd_updater -n ganesha-active -v 1 failed" +			ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1 failed"  		fi -	else -		( pcs resource create ${short_host}-dead_ip-1 ocf:heartbeat:Dummy > /dev/null 2>&1 ) +		attrd_updater -n ${OCF_RESKEY_grace_active} -v 1  		if [ $? 
-ne 0 ]; then -			logger "warning: pcs resource create ${short_host}-dead_ip-1 ocf:heartbeat:Dummy failed" +			ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_grace_active} -v 1 failed"  		fi -		# The ${this-node}-dead_ip-1 resource is used to indicate -		# that this ganesha.nfsd has died. -		# VIP fail-over is then triggered by clearing the -		# ganesha-active node attribute on this node. -		# -		# Meanwhile the ganesha_grace monitor() runs every 5 -		# seconds. We need to allow time for it to run and put -		# the remaining ganesha.nfsds into grace before initiating -		# the VIP fail-over. -		sleep ${GRACE_DELAY} - -		attrd_updater -D -n ganesha-active -		if [ $? -ne 0 ]; then -			logger "warning: attrd_updater -D -n ganesha-active failed" -		fi +		return ${OCF_SUCCESS}  	fi -	return $OCF_SUCCESS +	# VIP fail-over is triggered by clearing the +	# ganesha-active node attribute on this node. +	# +	# Meanwhile the ganesha_grace notify() runs when its +	# nfs-grace resource is disabled on a node; which +	# is triggered by clearing the ganesha-grace node +	# attribute on this node. +	# +	# We need to allow time for it to run and put +	# the remaining ganesha.nfsds into grace before +	# initiating the VIP fail-over. + +	attrd_updater -D -n ${OCF_RESKEY_grace_active} +	if [ $? -ne 0 ]; then +		ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_grace_active} failed" +	fi + +	sleep ${OCF_RESKEY_grace_delay} + +	attrd_updater -D -n ${OCF_RESKEY_ganesha_active} +	if [ $? 
-ne 0 ]; then +		ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_ganesha_active} failed" +	fi + +	return ${OCF_SUCCESS}  }  ganesha_mon_validate()  { -	return $OCF_SUCCESS +	return ${OCF_SUCCESS}  }  ganesha_mon_validate -# logger "ganesha_mon ${OCF_RESOURCE_INSTANCE} $__OCF_ACTION" -  # Translate each action into the appropriate function call -case $__OCF_ACTION in +case ${__OCF_ACTION} in  start)          ganesha_mon_start  		;;  stop)           ganesha_mon_stop @@ -159,13 +193,13 @@ stop)           ganesha_mon_stop  status|monitor) ganesha_mon_monitor  		;;  *)              ganesha_mon_usage -                exit $OCF_ERR_UNIMPLEMENTED +                exit ${OCF_ERR_UNIMPLEMENTED}                  ;;  esac  rc=$?  # The resource agent may optionally log a debug message -ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc" +ocf_log debug "${OCF_RESOURCE_INSTANCE} ${__OCF_ACTION} returned $rc"  exit $rc diff --git a/extras/ganesha/ocf/ganesha_nfsd b/extras/ganesha/ocf/ganesha_nfsd index e064183daef..a9d3e4d860f 100644 --- a/extras/ganesha/ocf/ganesha_nfsd +++ b/extras/ganesha/ocf/ganesha_nfsd @@ -29,13 +29,16 @@  : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}  . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs -if [ -n "$OCF_DEBUG_LIBRARY" ]; then -    . $OCF_DEBUG_LIBRARY +if [ -n "${OCF_DEBUG_LIBRARY}" ]; then +    . ${OCF_DEBUG_LIBRARY}  else      : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}  . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs  fi +OCF_RESKEY_ha_vol_mnt_default="/var/run/gluster/shared_storage" +: ${OCF_RESKEY_ha_vol_mnt=${OCF_RESKEY_ha_vol_mnt_default}} +  ganesha_meta_data() {          cat <<END  <?xml version="1.0"?> @@ -59,16 +62,16 @@ resource agent for nfs-ganesha.  
</parameters>  <actions> -<action name="start"   timeout="40s" /> -<action name="stop"    timeout="40s" /> -<action name="status" depth="0"  timeout="20s" interval="1m" /> -<action name="monitor" depth="0"  timeout="10s" interval="1m" /> +<action name="start"   timeout="5s" /> +<action name="stop"    timeout="5s" /> +<action name="status" depth="0"  timeout="5s" interval="0" /> +<action name="monitor" depth="0"  timeout="5s" interval="0" />  <action name="meta-data"  timeout="20s" />  </actions>  </resource-agent>  END -return $OCF_SUCCESS +return ${OCF_SUCCESS}  }  ganesha_nfsd_usage() { @@ -78,10 +81,10 @@ ganesha_nfsd_usage() {  # Make sure meta-data and usage always succeed  case $__OCF_ACTION in  	meta-data)	ganesha_meta_data -			exit $OCF_SUCCESS +			exit ${OCF_SUCCESS}  			;;  	usage|help)	ganesha_usage -			exit $OCF_SUCCESS +			exit ${OCF_SUCCESS}  			;;  	*)  			;; @@ -89,58 +92,60 @@ esac  ganesha_nfsd_start()  { -	return $OCF_SUCCESS +	local long_host=$(hostname) + +	if [[ -d /var/lib/nfs ]]; then +		mv /var/lib/nfs /var/lib/nfs.backup +		if [ $? -ne 0 ]; then +			ocf_log notice "mv /var/lib/nfs /var/lib/nfs.backup failed" +		fi +		ln -s ${OCF_RESKEY_ha_vol_mnt}/nfs-ganesha/${long_host}/nfs /var/lib/nfs +		if [ $? -ne 0 ]; then +			ocf_log notice "ln -s ${OCF_RESKEY_ha_vol_mnt}/nfs-ganesha/${long_host}/nfs /var/lib/nfs failed" +		fi +	fi + +	return ${OCF_SUCCESS}  }  ganesha_nfsd_stop()  { -	local short_host=$(hostname -s) -	local long_host="" - -	if [ "X${OCF_RESOURCE_INSTANCE:0:9}X" = "Xnfs_startX" ]; then - -		# if this is any nfs_start, go ahead. worst case we -		# find the link already exists and do nothing -		long_host=$(hostname) - -		if [ -d /var/lib/nfs ]; then -			mv /var/lib/nfs /var/lib/nfs.backup -			ln -s $OCF_RESKEY_ha_vol_mnt/nfs-ganesha/${long_host}/nfs /var/lib/nfs -			if [ $? 
-ne 0 ]; then -				logger "warning: ln -s $OCF_RESKEY_ha_vol_mnt/nfs-ganesha/${long_host}/nfs /var/lib/nfs failed" -			fi +	if [ -L /var/lib/nfs -a -d /var/lib/nfs.backup ]; then +		rm -f /var/lib/nfs +		if [ $? -ne 0 ]; then +			ocf_log notice "rm -f /var/lib/nfs failed"  		fi -	else - -		# if this is a clone resource or is specific to this node -		# remove the symlink and restore /var/lib/nfs - -		if [ "X${OCF_RESOURCE_INSTANCE}X" = "Xnfs_stopX" ] || -		   [ "X${OCF_RESOURCE_INSTANCE}X" = "Xnfs_stop-${short_host}X" ]; then -			if [ -L /var/lib/nfs -a -d /var/lib/nfs.backup ]; then -				rm -f /var/lib/nfs -				mv /var/lib/nfs.backup /var/lib/nfs -			fi +		mv /var/lib/nfs.backup /var/lib/nfs +		if [ $? -ne 0 ]; then +			ocf_log notice "mv /var/lib/nfs.backup /var/lib/nfs failed"  		fi  	fi -	return $OCF_SUCCESS +	return ${OCF_SUCCESS}  }  ganesha_nfsd_monitor()  { -	return $OCF_SUCCESS +	# pacemaker checks to see if RA is already running before starting it. +	# if we return success, then it's presumed it's already running and +	# doesn't need to be started, i.e. invoke the start action. +	# return something other than success to make pacemaker invoke the +	# start action +	if [[ -L /var/lib/nfs ]]; then +		return ${OCF_SUCCESS} +	fi +	return ${OCF_NOT_RUNNING}  }  ganesha_nfsd_validate()  { -	return $OCF_SUCCESS +	return ${OCF_SUCCESS}  }  ganesha_nfsd_validate -# logger "ganesha_nfsd ${OCF_RESOURCE_INSTANCE} $__OCF_ACTION" +# ocf_log notice "ganesha_nfsd ${OCF_RESOURCE_INSTANCE} $__OCF_ACTION"  # Translate each action into the appropriate function call  case $__OCF_ACTION in @@ -151,13 +156,13 @@ stop)           ganesha_nfsd_stop  status|monitor) ganesha_nfsd_monitor  		;;  *)              ganesha_nfsd_usage -                exit $OCF_ERR_UNIMPLEMENTED +                exit ${OCF_ERR_UNIMPLEMENTED}                  ;;  esac  rc=$?  
# The resource agent may optionally log a debug message -ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc" +ocf_log debug "${OCF_RESOURCE_INSTANCE} ${__OCF_ACTION} returned $rc"  exit $rc diff --git a/extras/ganesha/scripts/ganesha-ha.sh b/extras/ganesha/scripts/ganesha-ha.sh index 84397477e32..16ede52f8b2 100644 --- a/extras/ganesha/scripts/ganesha-ha.sh +++ b/extras/ganesha/scripts/ganesha-ha.sh @@ -170,8 +170,8 @@ setup_cluster()      logger "setting up cluster ${name} with the following ${servers}"      pcs cluster auth ${servers} -# fedora    pcs cluster setup ${name} ${servers} -# rhel6     pcs cluster setup --name ${name} ${servers} +    # fedora    pcs cluster setup ${name} ${servers} +    # rhel6     pcs cluster setup --name ${name} ${servers}      pcs cluster setup ${RHEL6_PCS_CNAME_OPTION} ${name} ${servers}      if [ $? -ne 0 ]; then          logger "pcs cluster setup ${RHEL6_PCS_CNAME_OPTION} ${name} ${servers} failed" @@ -204,7 +204,7 @@ setup_cluster()  } -setup_finalize() +setup_finalize_ha()  {      local cibfile=${1}      local stopped="" @@ -215,7 +215,7 @@ setup_finalize()           stopped=$(pcs status | grep -u "Stopped")      done -    pcs status | grep dead_ip-1 | sort > /var/run/ganesha/pcs_status +    # pcs resource cleanup  } @@ -292,7 +292,7 @@ string:\"EXPORT(Path=/$VOL)\" 2>&1")          exit 1      fi -#Run the same command on the localhost, +    # Run the same command on the localhost,          output=$(dbus-send --print-reply --system --dest=org.ganesha.nfsd \  /org/ganesha/nfsd/ExportMgr org.ganesha.nfsd.exportmgr.RemoveExport \  uint16:$removed_id 2>&1) @@ -373,13 +373,13 @@ teardown_cluster()          fi      done -# BZ 1193433 - pcs doesn't reload cluster.conf after modification -# after teardown completes, a subsequent setup will appear to have -# 'remembered' the deleted node. 
You can work around this by -# issuing another `pcs cluster node remove $node`, -# `crm_node -f -R $server`, or -# `cibadmin --delete --xml-text '<node id="$server" -# uname="$server"/>' +    # BZ 1193433 - pcs doesn't reload cluster.conf after modification +    # after teardown completes, a subsequent setup will appear to have +    # 'remembered' the deleted node. You can work around this by +    # issuing another `pcs cluster node remove $node`, +    # `crm_node -f -R $server`, or +    # `cibadmin --delete --xml-text '<node id="$server" +    # uname="$server"/>'      pcs cluster stop --all      if [ $? -ne 0 ]; then @@ -479,28 +479,26 @@ setup_create_resources()  {      local cibfile=$(mktemp -u) -    # mount the HA-state volume and start ganesha.nfsd on all nodes -    pcs resource create nfs_start ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone +    # fixup /var/lib/nfs +    logger "pcs resource create nfs_setup ocf:heartbeat:ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone" +    pcs resource create nfs_setup ocf:heartbeat:ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone      if [ $? -ne 0 ]; then -        logger "warning: pcs resource create nfs_start ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone failed" +        logger "warning: pcs resource create nfs_setup ocf:heartbeat:ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone failed"      fi -    sleep 1 -    # cloned resources seem to never have their start() invoked when they -    # are created, but stop() is invoked when they are destroyed. Why???. -    # No matter, we don't want this resource agent hanging around anyway -    pcs resource delete nfs_start-clone + +    pcs resource create nfs-mon ocf:heartbeat:ganesha_mon --clone      if [ $? 
-ne 0 ]; then -        logger "warning: pcs resource delete nfs_start-clone failed" +        logger "warning: pcs resource create nfs-mon ocf:heartbeat:ganesha_mon --clone failed"      fi -    pcs resource create nfs-mon ganesha_mon --clone +    pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone meta notify=true      if [ $? -ne 0 ]; then -        logger "warning: pcs resource create nfs-mon ganesha_mon --clone failed" +        logger "warning: pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone failed"      fi -    pcs resource create nfs-grace ganesha_grace --clone +    pcs constraint location nfs-grace-clone rule score=-INFINITY grace-active ne 1      if [ $? -ne 0 ]; then -        logger "warning: pcs resource create nfs-grace ganesha_grace --clone failed" +        logger "warning: pcs constraint location nfs-grace-clone rule score=-INFINITY grace-active ne 1"      fi      pcs cluster cib ${cibfile} @@ -530,21 +528,6 @@ setup_create_resources()              logger "warning pcs resource create ${1}-cluster_ip-1 ocf:heartbeat:IPaddr ip=${ipaddr} cidr_netmask=32 op monitor interval=15s failed"          fi -        pcs -f ${cibfile} resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy -        if [ $? -ne 0 ]; then -            logger "warning: pcs resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy failed" -        fi - -        pcs -f ${cibfile} constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 -        if [ $? -ne 0 ]; then -            logger "warning: pcs constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 failed" -        fi - -        pcs -f ${cibfile} constraint order ${1}-trigger_ip-1 then nfs-grace-clone -        if [ $? -ne 0 ]; then -            logger "warning: pcs constraint order ${1}-trigger_ip-1 then nfs-grace-clone failed" -        fi -          pcs -f ${cibfile} constraint order nfs-grace-clone then ${1}-cluster_ip-1          if [ $? 
-ne 0 ]; then              logger "warning: pcs constraint order nfs-grace-clone then ${1}-cluster_ip-1 failed" @@ -567,6 +550,13 @@ teardown_resources()  {      # local mntpt=$(grep ha-vol-mnt ${HA_CONFIG_FILE} | cut -d = -f 2) +    # restore /var/lib/nfs +    logger "notice: pcs resource delete nfs_setup-clone" +    pcs resource delete nfs_setup-clone +    if [ $? -ne 0 ]; then +        logger "warning: pcs resource delete nfs_setup-clone failed" +    fi +      # delete -clone resource agents      # in particular delete the ganesha monitor so we don't try to      # trigger anything when we shut down ganesha next. @@ -580,32 +570,11 @@ teardown_resources()          logger "warning: pcs resource delete nfs-grace-clone failed"      fi -    # unmount the HA-state volume and terminate ganesha.nfsd on all nodes -    pcs resource create nfs_stop ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone -    if [ $? -ne 0 ]; then -        logger "warning: pcs resource create nfs_stop ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} --clone failed" -    fi -    sleep 1 -    # cloned resources seem to never have their start() invoked when they -    # are created, but stop() is invoked when they are destroyed. Why???. -    pcs resource delete nfs_stop-clone -    if [ $? -ne 0 ]; then -        logger "warning: pcs resource delete nfs_stop-clone failed" -    fi -      while [[ ${1} ]]; do          pcs resource delete ${1}-cluster_ip-1          if [ $? -ne 0 ]; then              logger "warning: pcs resource delete ${1}-cluster_ip-1 failed"          fi -        pcs resource delete ${1}-trigger_ip-1 -        if [ $? -ne 0 ]; then -            logger "warning: pcs resource delete ${1}-trigger_ip-1 failed" -        fi -        pcs resource delete ${1}-dead_ip-1 -        if [ $? 
-ne 0 ]; then -            logger "info: pcs resource delete ${1}-dead_ip-1 failed" -        fi          shift      done @@ -632,21 +601,6 @@ recreate_resources()              logger "warning pcs resource create ${1}-cluster_ip-1 ocf:heartbeat:IPaddr ip=${ipaddr} cidr_netmask=32 op monitor interval=10s failed"          fi -        pcs -f ${cibfile} resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy -        if [ $? -ne 0 ]; then -            logger "warning: pcs resource create ${1}-trigger_ip-1 ocf:heartbeat:Dummy failed" -        fi - -        pcs -f ${cibfile} constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 -        if [ $? -ne 0 ]; then -            logger "warning: pcs constraint colocation add ${1}-cluster_ip-1 with ${1}-trigger_ip-1 failed" -        fi - -        pcs -f ${cibfile} constraint order ${1}-trigger_ip-1 then nfs-grace-clone -        if [ $? -ne 0 ]; then -            logger "warning: pcs constraint order ${1}-trigger_ip-1 then nfs-grace-clone failed" -        fi -          pcs -f ${cibfile} constraint order nfs-grace-clone then ${1}-cluster_ip-1          if [ $? -ne 0 ]; then              logger "warning: pcs constraint order nfs-grace-clone then ${1}-cluster_ip-1 failed" @@ -670,21 +624,6 @@ addnode_recreate_resources()          logger "warning pcs resource create ${add_node}-cluster_ip-1 ocf:heartbeat:IPaddr ip=${add_vip} cidr_netmask=32 op monitor interval=10s failed"      fi -    pcs -f ${cibfile} resource create ${add_node}-trigger_ip-1 ocf:heartbeat:Dummy -    if [ $? -ne 0 ]; then -        logger "warning: pcs resource create ${add_node}-trigger_ip-1 ocf:heartbeat:Dummy failed" -    fi - -    pcs -f ${cibfile} constraint colocation add ${add_node}-cluster_ip-1 with ${add_node}-trigger_ip-1 -    if [ $? 
-ne 0 ]; then -        logger "warning: pcs constraint colocation add ${add_node}-cluster_ip-1 with ${add_node}-trigger_ip-1 failed" -    fi - -    pcs -f ${cibfile} constraint order ${add_node}-trigger_ip-1 then nfs-grace-clone -    if [ $? -ne 0 ]; then -        logger "warning: pcs constraint order ${add_node}-trigger_ip-1 then nfs-grace-clone failed" -    fi -      pcs -f ${cibfile} constraint order nfs-grace-clone then ${add_node}-cluster_ip-1      if [ $? -ne 0 ]; then          logger "warning: pcs constraint order nfs-grace-clone then ${add_node}-cluster_ip-1 failed" @@ -702,11 +641,6 @@ clear_resources()              logger "warning: pcs -f ${cibfile} resource delete ${1}-cluster_ip-1"          fi -        pcs -f ${cibfile} resource delete ${1}-trigger_ip-1 -        if [ $? -ne 0 ]; then -            logger "warning: pcs -f ${cibfile} resource delete ${1}-trigger_ip-1" -        fi -          shift      done  } @@ -718,52 +652,19 @@ addnode_create_resources()      local add_vip=${1}; shift      local cibfile=$(mktemp -u) -    # mount the HA-state volume and start ganesha.nfsd on the new node -    pcs cluster cib ${cibfile} -    if [ $? -ne 0 ]; then -        logger "warning: pcs cluster cib ${cibfile} failed" -    fi - -    pcs -f ${cibfile} resource create nfs_start-${add_node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} -    if [ $? -ne 0 ]; then -        logger "warning: pcs -f ${cibfile} resource create nfs_start-${add_node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} failed" -    fi - -    pcs -f ${cibfile} constraint location nfs_start-${add_node} prefers ${add_node}=INFINITY -    if [ $? -ne 0 ]; then -        logger "warning: pcs -f ${cibfile} constraint location nfs_start-${add_node} prefers ${add_node}=INFINITY failed" -    fi - -    pcs -f ${cibfile} constraint order nfs_start-${add_node} then nfs-mon-clone -    if [ $? 
-ne 0 ]; then -        logger "warning: pcs -f ${cibfile} constraint order nfs_start-${add_node} then nfs-mon-clone failed" -    fi - -    pcs cluster cib-push ${cibfile} -    if [ $? -ne 0 ]; then -        logger "warning: pcs cluster cib-push ${cibfile} failed" -    fi -    rm -f ${cibfile} -      # start HA on the new node      pcs cluster start ${add_node}      if [ $? -ne 0 ]; then         logger "warning: pcs cluster start ${add_node} failed"      fi -    pcs resource delete nfs_start-${add_node} -    if [ $? -ne 0 ]; then -        logger "warning: pcs resource delete nfs_start-${add_node} failed" -    fi - -      pcs cluster cib ${cibfile}      if [ $? -ne 0 ]; then          logger "warning: pcs cluster cib ${cibfile} failed"      fi -    # delete all the -cluster_ip-1 and -trigger_ip-1 resources, -    # clearing their constraints, then create them again so we can +    # delete all the -cluster_ip-1 resources, clearing +    # their constraints, then create them again so we can      # recompute their constraints      clear_resources ${cibfile} ${HA_SERVERS}      addnode_recreate_resources ${cibfile} ${add_node} ${add_vip} @@ -805,31 +706,6 @@ deletenode_delete_resources()      fi      rm -f ${cibfile} -    pcs cluster cib ${cibfile} -    if [ $? -ne 0 ]; then -        logger "warning: pcs cluster cib ${cibfile} failed" -    fi - -    pcs -f ${cibfile} resource create nfs_stop-${node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} -    if [ $? -ne 0 ]; then -        logger "warning: pcs -f ${cibfile} resource create nfs_stop-${node} ganesha_nfsd ha_vol_mnt=${HA_VOL_MNT} failed" -    fi - -    pcs -f ${cibfile} constraint location nfs_stop-${node} prefers ${node}=INFINITY -    if [ $? -ne 0 ]; then -        logger "warning: pcs -f ${cibfile} constraint location nfs_stop-${node} prefers ${node}=INFINITY failed" -    fi - -    pcs cluster cib-push ${cibfile} -    if [ $? 
-ne 0 ]; then -        logger "warning: pcs cluster cib-push ${cibfile} failed" -    fi -    rm -f ${cibfile} - -    pcs resource delete nfs_stop-${node} -    if [ $? -ne 0 ]; then -        logger "warning: pcs resource delete nfs_stop-${node} failed" -    fi  } @@ -973,11 +849,12 @@ main()              setup_create_resources ${HA_SERVERS} +            setup_finalize_ha +              setup_state_volume ${HA_SERVERS}              setup_copy_config ${HA_SERVERS} -            setup_finalize          else              logger "insufficient servers for HA, aborting" @@ -1018,15 +895,15 @@ main()          fi          addnode_create_resources ${node} ${vip} -        #Subsequent add-node recreates resources for all the nodes -        #that already exist in the cluster. The nodes are picked up -        #from the entries in the ganesha-ha.conf file. Adding the -        #newly added node to the file so that the resources specfic -        #to this node is correctly recreated in the future. +        # Subsequent add-node recreates resources for all the nodes +        # that already exist in the cluster. The nodes are picked up +        # from the entries in the ganesha-ha.conf file. Adding the +        # newly added node to the file so that the resources specfic +        # to this node is correctly recreated in the future.          
clean_node=${node//[-.]/_}          echo "VIP_$clean_node=\"${vip}\"" >> ${HA_CONFDIR}/ganesha-ha.conf -        NEW_NODES="$HA_CLUSTER_NODES,$node" +        NEW_NODES="$HA_CLUSTER_NODES,${node}"          sed -i s/HA_CLUSTER_NODES.*/"HA_CLUSTER_NODES=\"$NEW_NODES\""/ \  $HA_CONFDIR/ganesha-ha.conf @@ -1053,7 +930,7 @@ $HA_CONFDIR/ganesha-ha.conf          setup_copy_config ${HA_SERVERS} -        rm -rf ${HA_VOL_MNT}/nfs-ganesha/{node} +        rm -rf ${HA_VOL_MNT}/nfs-ganesha/${node}          determine_service_manager @@ -1074,7 +951,7 @@ $HA_CONFDIR/ganesha-ha.conf          refresh_config ${VOL} ${HA_CONFDIR} ${HA_SERVERS}          ;; -      *) +    *)          # setup and teardown are not intended to be used by a          # casual user          usage  | 
