#!/bin/bash
#
# Script to help with recovering an unbootable drive on EC2.
#
# Looks for a host tagged with the provided hostname, shuts it down,
# detaches its drive, attaches that drive to any other instance you
# have, then drops to a shell where you can log into the other
# instance and fix up the drive; when you exit the shell it puts
# everything back in place and restarts the stopped instance.
#
set -eu

# Parse the command line into the globals the rest of the script reads.
#   -p aws_profile   stored in AWS_PROFILE
#   -s sick_host     stored in SICK_HOSTNAME
#   -h               print the usage line on stdout and exit 0
# Any other option (or a missing option argument) prints the usage line on
# stderr and exits 1.
parse_options() {
	# OPTIND is local so every call starts parsing at the first argument and
	# the caller's getopts state is left untouched.
	local arg OPTIND=1
	local usage="Usage: $0 -p aws_profile -s sick_host"

	while getopts hp:s: arg
	do
		case "$arg"
		in
			p)
				AWS_PROFILE="$OPTARG"
				;;
			s)
				SICK_HOSTNAME="$OPTARG"
				;;
			h)
				# Asking for help is not an error.
				echo "$usage"
				exit 0
				;;
			*)
				echo "$usage" >&2
				exit 1
				;;
		esac
	done
}

parse_options "$@"

# Both settings are mandatory. Only emptiness is tested here, so any non-empty
# value present when this check runs is accepted as-is.
[ -n "${AWS_PROFILE:-}" ] || {
	echo "$0: AWS profile not specified" >&2
	echo "Usage: $0 -p aws_profile -s sick_host" >&2
	exit 1
}

[ -n "${SICK_HOSTNAME:-}" ] || {
	echo "$0: sick hostname not specified" >&2
	echo "Usage: $0 -p aws_profile -s sick_host" >&2
	exit 1
}


# Poll EC2 until an instance reports the wanted state name.
#   $1 - instance ID to watch
#   $2 - state name to wait for (this script uses "stopped" and "running")
# Reads the global AWS_PROFILE.
# Prints a progress line on stdout: one dot per poll, then "done".
# Polls every 10 seconds with no upper bound. A failed describe-instances
# call counts as "not there yet" and is retried.
wait_for_status() {
	local instance_id=$1
	local wanted=$2
	local current

	printf " waiting for '%s' to reach status '%s': " "$instance_id" "$wanted"

	while true
	do
		printf "."
		# --output text yields the bare state name whatever output format the
		# profile is configured with, so there are no JSON quotes to strip.
		if current=$(aws --profile "$AWS_PROFILE" ec2 describe-instances \
			--instance-ids "$instance_id" \
			--query 'Reservations[0].Instances[0].State.Name' \
			--output text) && [ "$current" == "$wanted" ]
		then
			break
		fi
		sleep 10
	done
	echo "done"
}

# Poll EC2 until a volume's attachment state matches the wanted value.
#   $1 - volume ID to watch
#   $2 - attachment state to wait for; this script passes "attached", or ""
#        to wait until the volume has no attachment left
# Reads the global AWS_PROFILE.
# Prints a progress line on stdout: one dot per poll, then "done".
# Polls every 10 seconds with no upper bound.
wait_for_volume_status() {
	local volume_id=$1
	local wanted=$2
	local current

	printf " waiting for '%s' to reach status '%s': " "$volume_id" "$wanted"

	while true
	do
		printf "."
		# The comparison only runs when the query itself succeeded. Otherwise
		# the empty output of a failed call would be indistinguishable from
		# "no attachments" and a wait for "" would end too early.
		# --output text prints the bare state, and nothing at all for an
		# empty attachment list, independent of the profile's output format.
		if current=$(aws --profile "$AWS_PROFILE" ec2 describe-volumes \
			--volume-ids "$volume_id" \
			--query 'Volumes[0].Attachments[*].State' \
			--output text) && [ "$current" == "$wanted" ]
		then
			break
		fi
		sleep 10
	done
	echo "done"
}

# Find the instance that needs attention: the first instance returned whose
# Name tag equals the requested hostname.
# --output text pins the format regardless of the profile's configured
# default; in that format a query with no result prints the literal "None".
if ! SICK_INSTANCE=$(aws --profile "$AWS_PROFILE" ec2 describe-instances \
	--filters "Name=tag:Name,Values=$SICK_HOSTNAME" \
	--query 'Reservations[0].Instances[0].InstanceId' \
	--output text)
then
	echo "can't look up '$SICK_HOSTNAME'" >&2
	exit 1
fi

if [ -z "$SICK_INSTANCE" ] || [ "$SICK_INSTANCE" == "None" ]
then
	echo "can't find '$SICK_HOSTNAME'" >&2
	exit 1
fi
echo "$(date +%H:%M:%S) Collected sick instance ID '$SICK_INSTANCE' for '$SICK_HOSTNAME'"

# Collect information about the drive to be recovered. Assumes only a single
# drive: only the first block device mapping is looked at.
# With --output text the two selected fields arrive on one whitespace-separated
# line, so a single read splits them into device name and volume ID.
if ! drive_info=$(aws --profile "$AWS_PROFILE" ec2 describe-instances \
	--instance-ids "$SICK_INSTANCE" \
	--query 'Reservations[0].Instances[0].BlockDeviceMappings[0].[DeviceName,Ebs.VolumeId]' \
	--output text)
then
	echo "can't read the block device mapping of '$SICK_INSTANCE'" >&2
	exit 1
fi

# Start from known-empty values so the validation below reports a missing
# drive itself rather than tripping `set -u`.
SICK_DEVICE=""
SICK_VOLUME=""
read -r SICK_DEVICE SICK_VOLUME <<<"$drive_info" || true

# A missing field is printed as "None" in text output.
if [ -z "$SICK_DEVICE" ] || [ "$SICK_DEVICE" == "None" ] || [[ "$SICK_VOLUME" != vol-* ]]
then
	echo "can't determine the device and volume of '$SICK_INSTANCE'" >&2
	exit 1
fi
echo "$(date +%H:%M:%S) Sick device '$SICK_DEVICE' is '$SICK_VOLUME'"

# Find an instance to attach the sick drive to.
# EC2 only attaches a volume to an instance in the volume's own Availability
# Zone, and the operator has to log in to the chosen instance, so only running
# instances in the sick instance's zone are considered.
if ! SICK_ZONE=$(aws --profile "$AWS_PROFILE" ec2 describe-instances \
	--instance-ids "$SICK_INSTANCE" \
	--query 'Reservations[0].Instances[0].Placement.AvailabilityZone' \
	--output text)
then
	echo "Can't determine the availability zone of '$SICK_INSTANCE'" >&2
	exit 1
fi
if [ -z "$SICK_ZONE" ] || [ "$SICK_ZONE" == "None" ]
then
	echo "Can't determine the availability zone of '$SICK_INSTANCE'" >&2
	exit 1
fi

if ! CANDIDATES=$(aws --profile "$AWS_PROFILE" ec2 describe-instances \
	--filters "Name=availability-zone,Values=$SICK_ZONE" \
		"Name=instance-state-name,Values=running" \
	--query 'Reservations[].Instances[].InstanceId' \
	--output text)
then
	echo "Can't list candidate recovery instances" >&2
	exit 1
fi

# Take the first candidate that is a real instance ID and is not the sick
# instance itself. $CANDIDATES is deliberately unquoted: text output separates
# the IDs with whitespace and the loop relies on word splitting.
RECOVERY_INSTANCE=""
for candidate in $CANDIDATES
do
	if [[ "$candidate" == i-* ]] && [ "$candidate" != "$SICK_INSTANCE" ]
	then
		RECOVERY_INSTANCE=$candidate
		break
	fi
done

if [ "$RECOVERY_INSTANCE" == "" ]
then
	echo "Can't find a recovery instance" >&2
	exit 1
fi

echo "$(date +%H:%M:%S) Collected recovery instance ID '$RECOVERY_INSTANCE'"

# Find a free device name for the sick volume on the recovery instance.
# --output text lists the device names already in use, independent of the
# profile's configured output format.
if ! USED_DEVICES=$(aws --profile "$AWS_PROFILE" ec2 describe-instances \
	--instance-ids "$RECOVERY_INSTANCE" \
	--query 'Reservations[0].Instances[0].BlockDeviceMappings[*].[DeviceName]' \
	--output text)
then
	echo "Can't list the devices in use on '$RECOVERY_INSTANCE'" >&2
	exit 1
fi

# Start empty so the check after the loop can report failure itself under
# `set -u` when every candidate letter is already taken.
RECOVERY_DEVICE=""
for dev in a b c d e f g # more than enough thanks
do
	case "$USED_DEVICES" in
		*"/dev/xvd$dev"*)
			# name (or a partition of it) already in use, try the next letter
			;;
		*)
			RECOVERY_DEVICE=/dev/xvd$dev
			break
			;;
	esac
done
if [ "$RECOVERY_DEVICE" == "" ]
then
	echo "Couldn't allocate a recovery device" >&2
	exit 1
fi

echo "$(date +%H:%M:%S) Will attach sick volume as '$RECOVERY_DEVICE'"

# stop the sick instance
echo "$(date +%H:%M:%S) Stopping $SICK_INSTANCE:"
aws --profile "$AWS_PROFILE" ec2 stop-instances --instance-ids "$SICK_INSTANCE" > /dev/null
wait_for_status "$SICK_INSTANCE" "stopped"

echo "$(date +%H:%M:%S) Detaching sick volume $SICK_VOLUME:"
# detach the sick volume; an empty wanted status means "no attachment left"
aws --profile "$AWS_PROFILE" ec2 detach-volume --volume-id "$SICK_VOLUME" > /dev/null
wait_for_volume_status "$SICK_VOLUME" ""

# attach the volume to the recovery instance
echo "$(date +%H:%M:%S) Attaching $SICK_VOLUME to $RECOVERY_INSTANCE:"
aws --profile "$AWS_PROFILE" ec2 attach-volume --volume-id "$SICK_VOLUME" --instance-id "$RECOVERY_INSTANCE" --device "$RECOVERY_DEVICE" > /dev/null
wait_for_volume_status "$SICK_VOLUME" "attached"

echo "Sick volume is attached to recovery instance. Log in to $RECOVERY_INSTANCE and do your stuff, then exit the shell to continue"
# An interactive shell exits with the status of its last command. Under
# `set -e` a non-zero status here would abort the script and leave the volume
# on the recovery instance with the sick instance stopped, so the status is
# deliberately ignored. The fallback covers an unset SHELL under `set -u`.
PS1="\h:\W \u [drive recovery - exit when done] \$ " "${SHELL:-/bin/bash}" || true

# detach volume from rescue instance
echo "$(date +%H:%M:%S) Detaching $SICK_VOLUME from $RECOVERY_INSTANCE:"
aws --profile "$AWS_PROFILE" ec2 detach-volume --volume-id "$SICK_VOLUME" > /dev/null
wait_for_volume_status "$SICK_VOLUME" ""

# put the volume back on the sick instance under its original device name
echo "$(date +%H:%M:%S) Re-attaching $SICK_VOLUME to $SICK_INSTANCE:"
aws --profile "$AWS_PROFILE" ec2 attach-volume --volume-id "$SICK_VOLUME" --instance-id "$SICK_INSTANCE" --device "$SICK_DEVICE" >/dev/null
wait_for_volume_status "$SICK_VOLUME" "attached"

# restart stopped instance
echo "$(date +%H:%M:%S) Restarting $SICK_INSTANCE:"
aws --profile "$AWS_PROFILE" ec2 start-instances --instance-ids "$SICK_INSTANCE" > /dev/null
wait_for_status "$SICK_INSTANCE" "running"

echo "$(date +%H:%M:%S) All done."