freeleaps-ops/docs/add_k8s_node.sh

398 lines
12 KiB
Bash
Raw Normal View History

2025-09-03 23:59:04 +00:00
#!/bin/bash
# Azure Kubernetes Node Addition Script
# This script automates the process of adding new Azure VMs to an existing Kubernetes cluster
# Abort the script as soon as an unguarded command exits non-zero.
set -o errexit

# --- Paths and credentials -------------------------------------------------
# NOTE(review): INVENTORY_FILE and KUBESPRAY_DIR are relative, so they are
# resolved against the caller's working directory rather than SCRIPT_DIR.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ANSIBLE_USER="wwwadmin@mathmast.com"
KUBESPRAY_DIR="freeleaps-ops/3rd/kubespray"
INVENTORY_FILE="freeleaps-ops/cluster/ansible/manifests/inventory.ini"

# --- Terminal colours --------------------------------------------------------
# Stored as literal backslash sequences; they are expanded when printed.
NC='\033[0m' # reset / no colour
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'
# Function to print colored output
# Print an informational message on stdout behind a blue "[INFO]" tag.
# Arguments: $1 - message text (backslash escapes are expanded by echo -e)
print_status() {
  local tag="${BLUE}[INFO]${NC}"
  echo -e "${tag} $1"
}
# Print a success message on stdout behind a green "[SUCCESS]" tag.
# Arguments: $1 - message text (backslash escapes are expanded by echo -e)
print_success() {
  local tag="${GREEN}[SUCCESS]${NC}"
  echo -e "${tag} $1"
}
# Print a warning behind a yellow "[WARNING]" tag.
# Arguments: $1 - message text (backslash escapes are expanded by echo -e)
# Outputs:   the message on stderr, so it stays visible to the operator and is
#            not swallowed when the caller's stdout is captured with $(...).
print_warning() {
  echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}
# Print an error behind a red "[ERROR]" tag.
# Arguments: $1 - message text (backslash escapes are expanded by echo -e)
# Outputs:   the message on stderr, so it is neither mixed into data a caller
#            captures from stdout nor lost when stdout is redirected.
print_error() {
  echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Reject empty user input.
# Arguments: $1 - value to check
# Outputs:   "Input cannot be empty" via print_error when $1 is empty
# Returns:   0 when $1 is non-empty, 1 otherwise
validate_input() {
  local value="$1"

  [[ -n "$value" ]] && return 0

  print_error "Input cannot be empty"
  return 1
}
# Verify that the required CLI tools and repository paths are available.
# Globals:  INVENTORY_FILE (read), KUBESPRAY_DIR (read)
# Outputs:  progress, success and error messages via the print_* helpers
# Exits:    terminates the script with status 1 on the first failed check
check_prerequisites() {
  local spec tool label

  print_status "Checking prerequisites..."

  # Each entry is "<command>:<name used in the error message>"; the tools are
  # probed in this order and the first missing one aborts the script.
  for spec in "kubectl:kubectl" "ansible:ansible" "az:Azure CLI"; do
    tool="${spec%%:*}"
    label="${spec#*:}"
    if ! command -v "$tool" > /dev/null 2>&1; then
      print_error "$label is not installed"
      exit 1
    fi
  done

  # The inventory must be a regular file.
  if [[ ! -f "$INVENTORY_FILE" ]]; then
    print_error "Inventory file not found: $INVENTORY_FILE"
    exit 1
  fi

  # The Kubespray checkout must be a directory.
  if [[ ! -d "$KUBESPRAY_DIR" ]]; then
    print_error "Kubespray directory not found: $KUBESPRAY_DIR"
    exit 1
  fi

  print_success "All prerequisites are met"
}
# Look up an Azure VM's private IP, offering to start the VM if it is not
# running.
#
# Arguments:
#   $1 - VM name
#   $2 - Azure resource group that contains the VM
# Outputs:
#   stdout - the private IP address and nothing else (callers capture it
#            with $(...))
#   stderr - every progress, warning and error message
# Returns:
#   0 on success; 1 when no private IP is reported, when the VM is not running
#   and the operator declines to start it, or when starting the VM fails.
get_vm_details() {
  local vm_name="$1"
  local resource_group="$2"
  local private_ip power_state answer

  # All diagnostics are forced to stderr: stdout is the function's return
  # channel and must carry only the IP address.
  print_status "Getting VM details from Azure..." >&2

  # privateIps and powerState are only present in the `az vm show` output when
  # --show-details is passed; without it both queries come back empty.
  private_ip=$(az vm show --show-details --resource-group "$resource_group" \
    --name "$vm_name" --query "privateIps" -o tsv 2>/dev/null) || private_ip=""

  # privateIps is reported as a comma-separated list when the VM has more than
  # one address; use the first entry so the result is a single usable host.
  private_ip="${private_ip%%,*}"

  if [[ -z "$private_ip" ]]; then
    print_error "Failed to get private IP for VM: $vm_name" >&2
    return 1
  fi

  power_state=$(az vm show --show-details --resource-group "$resource_group" \
    --name "$vm_name" --query "powerState" -o tsv 2>/dev/null) || power_state=""

  if [[ "$power_state" != "VM running" ]]; then
    print_warning "VM is not running. Current state: $power_state" >&2
    read -p "Do you want to start the VM? (y/N): " -n 1 -r answer || answer=""
    echo >&2
    if [[ "$answer" =~ ^[Yy]$ ]]; then
      # Keep the CLI's own output away from the captured stdout.
      if ! az vm start --resource-group "$resource_group" --name "$vm_name" >&2; then
        print_error "Failed to start VM: $vm_name" >&2
        return 1
      fi
      print_status "Waiting for VM to start..." >&2
      sleep 30
    else
      print_error "VM must be running to proceed" >&2
      return 1
    fi
  fi

  printf '%s\n' "$private_ip"
}
# Probe a host over SSH as ANSIBLE_USER with a trivial remote command.
# Globals:   ANSIBLE_USER (read)
# Arguments: $1 - IP address of the host to probe
# Outputs:   status messages via the print_* helpers; ssh's stderr is discarded
# Returns:   0 when the remote command succeeds, 1 otherwise
test_ssh_connectivity() {
  local host_ip="$1"
  local target="$ANSIBLE_USER@$host_ip"
  local -a ssh_opts=(-o ConnectTimeout=5 -o StrictHostKeyChecking=no)

  print_status "Testing SSH connectivity to $host_ip..."

  # The whole attempt is capped at 10 seconds by timeout(1).
  if ! timeout 10 ssh "${ssh_opts[@]}" "$target" "echo 'SSH connection successful'" 2>/dev/null; then
    print_error "SSH connection failed to $host_ip"
    print_warning "Please ensure:"
    print_warning "1. VM is running"
    print_warning "2. Network security group allows SSH (port 22)"
    print_warning "3. SSH service is running on the VM"
    return 1
  fi

  print_success "SSH connectivity verified"
  return 0
}
# Append a host definition for the new node to the Ansible inventory.
#
# Globals:   INVENTORY_FILE (read, file is modified), ANSIBLE_USER (read)
# Arguments:
#   $1 - VM / node name
#   $2 - IP address written as ansible_host
#   $3 - node type: "worker" or "master" (master entries also get an
#        etcd_member_name of "<name>-etcd")
# Outputs:   status messages via the print_* helpers
# Returns:   0 when the entry was added or already existed; 1 on an invalid
#            node type or when the backup or append fails
#
# NOTE(review): the entry is appended at the very end of the file, so in an
# INI inventory it lands in whichever [group] section happens to be last.
# Confirm this matches the layout of the inventory in use.
update_inventory() {
  local vm_name="$1"
  local ip_address="$2"
  local node_type="$3"
  local entry backup_file

  print_status "Updating inventory file..."

  # Build the host line first so an invalid node type is rejected before the
  # inventory is touched or a pointless backup is created.
  case "$node_type" in
    worker)
      entry="$vm_name ansible_host=$ip_address ansible_user=$ANSIBLE_USER host_name=$vm_name"
      ;;
    master)
      entry="$vm_name ansible_host=$ip_address ansible_user=$ANSIBLE_USER etcd_member_name=${vm_name}-etcd host_name=$vm_name"
      ;;
    *)
      print_error "Invalid node type: $node_type"
      return 1
      ;;
  esac

  # Re-running the script (e.g. after a failed playbook) must not add a second
  # definition for the same host.
  if awk -v name="$vm_name" \
    '$1 == name && /ansible_host=/ { found = 1 } END { exit !found }' \
    "$INVENTORY_FILE" 2>/dev/null; then
    print_warning "Node $vm_name is already defined in inventory; leaving it unchanged"
    return 0
  fi

  # Callers invoke this function as `update_inventory ... || exit 1`, which
  # disables `set -e` inside it, so every failure is checked explicitly.
  backup_file="${INVENTORY_FILE}.backup.$(date +%Y%m%d_%H%M%S)"
  if ! cp -- "$INVENTORY_FILE" "$backup_file"; then
    print_error "Failed to back up inventory file: $INVENTORY_FILE"
    return 1
  fi

  # If the file does not end in a newline, add one so the new entry starts on
  # its own line instead of being glued onto the last existing entry.
  if [[ -s "$INVENTORY_FILE" && -n "$(tail -c 1 "$INVENTORY_FILE")" ]]; then
    printf '\n' >> "$INVENTORY_FILE" || return 1
  fi

  if ! printf '%s\n' "$entry" >> "$INVENTORY_FILE"; then
    print_error "Failed to write to inventory file: $INVENTORY_FILE"
    return 1
  fi

  print_success "Added $node_type node to inventory"
}
# Validate the inventory and ping every host listed in it.
# Globals:  INVENTORY_FILE (read)
# Outputs:  status messages via the print_* helpers; the ansible ping output
#           is passed through, the ansible-inventory output is discarded
# Returns:  1 if the inventory cannot be parsed or the ping run fails
verify_inventory() {
  print_status "Verifying inventory configuration..."

  # Step 1: the inventory must parse.
  if ! ansible-inventory -i "$INVENTORY_FILE" --list > /dev/null 2>&1; then
    print_error "Inventory syntax is invalid"
    return 1
  fi
  print_success "Inventory syntax is valid"

  # Step 2: every host must answer an Ansible ping (-kK prompts for the SSH
  # and privilege-escalation passwords).
  print_status "Testing connectivity to all nodes..."
  if ! ansible -i "$INVENTORY_FILE" all -m ping -kK; then
    print_error "Connectivity test failed"
    return 1
  fi
  print_success "Connectivity to all nodes verified"
}
# Run Kubespray's scale.yml playbook against the inventory.
#
# Globals:  INVENTORY_FILE (read), KUBESPRAY_DIR (read)
# Outputs:  status messages via the print_* helpers plus the playbook output
# Returns:  1 if the Kubespray directory cannot be resolved, the inventory
#           directory cannot be entered, or the playbook fails
run_scale_playbook() {
  local inventory_dir inventory_name kubespray_abs

  print_status "Running Kubespray scale playbook..."

  inventory_dir="$(dirname "$INVENTORY_FILE")"
  inventory_name="$(basename "$INVENTORY_FILE")"

  # KUBESPRAY_DIR may be relative to the current directory. Resolve it to an
  # absolute path BEFORE changing directory, otherwise the playbook path would
  # be looked up relative to the inventory directory and not be found.
  # (cd's stdout is discarded because it prints the target when CDPATH is set.)
  if ! kubespray_abs="$(cd "$KUBESPRAY_DIR" > /dev/null && pwd)"; then
    print_error "Kubespray directory not found: $KUBESPRAY_DIR"
    return 1
  fi

  # The playbook runs from the inventory directory, inside a subshell so the
  # directory change does not leak into the rest of the script.
  if (
    cd "$inventory_dir" \
      && ansible-playbook -i "$inventory_name" "$kubespray_abs/scale.yml" -kK -b
  ); then
    print_success "Scale playbook completed successfully"
  else
    print_error "Scale playbook failed"
    return 1
  fi
}
# Wait for the new node to register with the cluster and become Ready.
#
# Arguments: $1 - node name as it appears in `kubectl get nodes`
# Outputs:   progress messages via the print_* helpers; `kubectl describe
#            node` output when the node never becomes Ready
# Returns:   0 once the node reports Ready=True; 1 if it does not appear or
#            does not become Ready within 30 polls (10 s apart) per phase
verify_node_addition() {
  local vm_name="$1"
  local max_attempts=30
  local attempt node_status

  print_status "Verifying node addition..."

  # Phase 1: wait for the node object to exist. The node is queried by its
  # exact name; grepping the full node list would also match other nodes whose
  # names merely contain this one (e.g. "node1" vs "node10").
  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    if kubectl get nodes "$vm_name" > /dev/null 2>&1; then
      print_success "Node $vm_name found in cluster"
      break
    fi
    print_status "Waiting for node to appear... (attempt $attempt/$max_attempts)"
    sleep 10
  done

  # The counter only passes max_attempts when the loop ran out without a break.
  if (( attempt > max_attempts )); then
    print_error "Node $vm_name did not appear in cluster"
    return 1
  fi

  # Phase 2: wait for the Ready condition to become "True".
  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    node_status=$(kubectl get nodes "$vm_name" \
      -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) \
      || node_status=""
    if [[ "$node_status" == "True" ]]; then
      print_success "Node $vm_name is ready"
      break
    fi
    print_status "Waiting for node to be ready... (attempt $attempt/$max_attempts)"
    sleep 10
  done

  if (( attempt > max_attempts )); then
    print_error "Node $vm_name is not ready"
    kubectl describe node "$vm_name"
    return 1
  fi
}
# Schedule a throw-away nginx pod on the new node to prove it accepts work.
#
# Arguments: $1 - node name; used as the kubernetes.io/hostname nodeSelector
# Outputs:   progress messages via the print_* helpers; `kubectl describe
#            pod` output when the pod never reaches Running
# Returns:   0 when the pod reached Running; 1 if the pod could not be created
#            or did not reach Running within 30 polls (10 s apart).
#            The test pod is deleted on both paths.
test_pod_scheduling() {
  local vm_name="$1"
  local max_attempts=30
  local attempt pod_status test_pod_name overrides

  print_status "Testing pod scheduling on new node..."

  test_pod_name="test-pod-$(date +%s)"
  overrides="{\"spec\":{\"nodeSelector\":{\"kubernetes.io/hostname\":\"$vm_name\"}}}"

  # Fail fast if the pod cannot even be created instead of polling for it.
  if ! kubectl run "$test_pod_name" --image=nginx --restart=Never --overrides="$overrides"; then
    print_error "Failed to create test pod on node $vm_name"
    return 1
  fi

  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    pod_status=$(kubectl get pod "$test_pod_name" -o jsonpath='{.status.phase}' 2>/dev/null) \
      || pod_status=""
    if [[ "$pod_status" == "Running" ]]; then
      print_success "Test pod is running on node $vm_name"
      break
    fi
    print_status "Waiting for test pod to be ready... (attempt $attempt/$max_attempts)"
    sleep 10
  done

  if (( attempt > max_attempts )); then
    print_error "Test pod failed to run on node $vm_name"
    # Describe BEFORE deleting: once the pod is gone there is nothing left to
    # inspect, and the scheduling events are the useful diagnostic here.
    kubectl describe pod "$test_pod_name"
    kubectl delete pod "$test_pod_name"
    return 1
  fi

  # Success path: remove the test pod. A cleanup failure is reported but does
  # not turn a successful scheduling test into a failure.
  kubectl delete pod "$test_pod_name" \
    || print_warning "Could not delete test pod $test_pod_name"
  return 0
}
# Print the closing summary for a newly added node.
# Arguments: $1 - node name
# Outputs:   a success banner, the node's Ready status, InternalIP and CPU /
#            memory capacity (each fetched with `kubectl get nodes`), followed
#            by a fixed list of follow-up steps
display_final_status() {
  local vm_name="$1"
  local ready_path='{.status.conditions[?(@.type=="Ready")].status}'
  local ip_path='{.status.addresses[?(@.type=="InternalIP")].address}'

  print_success "Node addition completed successfully!"

  printf '\n%s\n' "=== Final Status ==="
  printf 'Node Name: %s\n' "$vm_name"
  printf 'Node Status: %s\n' "$(kubectl get nodes "$vm_name" -o jsonpath="$ready_path")"
  printf 'Node IP: %s\n' "$(kubectl get nodes "$vm_name" -o jsonpath="$ip_path")"
  printf 'Node Capacity: %s CPU, %s Memory\n' \
    "$(kubectl get nodes "$vm_name" -o jsonpath='{.status.capacity.cpu}')" \
    "$(kubectl get nodes "$vm_name" -o jsonpath='{.status.capacity.memory}')"

  printf '%s\n' \
    "" \
    "=== Next Steps ===" \
    "1. Monitor the node for any issues" \
    "2. Update monitoring and alerting if needed" \
    "3. Update documentation" \
    "4. Consider running node maintenance tasks"
}
# Interactive entry point: collect the VM name, resource group and node type,
# confirm with the operator, then run every node-addition step in order.
#
# Inputs:   reads the answers from stdin
# Outputs:  banner, prompts, summary and the output of each step
# Exits:    0 when the operator cancels; 1 on invalid input or as soon as any
#           step fails
main() {
  local vm_name resource_group node_type ip_address confirm
  local newline=$'\n'

  echo "=========================================="
  echo "Azure Kubernetes Node Addition Script"
  echo "=========================================="
  echo

  check_prerequisites

  echo "Please provide the following information:"
  echo

  # -r keeps backslashes in the answers literal.
  read -r -p "VM Name: " vm_name
  validate_input "$vm_name" || exit 1

  read -r -p "Resource Group: " resource_group
  validate_input "$resource_group" || exit 1

  read -r -p "Node Type (worker/master): " node_type
  if [[ "$node_type" != "worker" && "$node_type" != "master" ]]; then
    print_error "Node type must be 'worker' or 'master'"
    exit 1
  fi

  echo
  print_status "Summary:"
  echo " VM Name: $vm_name"
  echo " Resource Group: $resource_group"
  echo " Node Type: $node_type"
  echo

  read -p "Proceed with node addition? (y/N): " -n 1 -r confirm
  echo
  if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
    print_status "Operation cancelled"
    exit 0
  fi

  # Test the assignment directly: under `set -e` a separate `$?` check after a
  # failed assignment would never be reached.
  if ! ip_address=$(get_vm_details "$vm_name" "$resource_group"); then
    exit 1
  fi
  # get_vm_details hands the address back on stdout. If progress lines were
  # written to stdout as well, the address is the final line, so keep only that.
  ip_address="${ip_address##*"$newline"}"
  if [[ -z "$ip_address" ]]; then
    print_error "Failed to get private IP for VM: $vm_name"
    exit 1
  fi
  print_success "VM IP Address: $ip_address"

  # Each step aborts the run on failure.
  test_ssh_connectivity "$ip_address" || exit 1
  update_inventory "$vm_name" "$ip_address" "$node_type" || exit 1
  verify_inventory || exit 1
  run_scale_playbook || exit 1
  verify_node_addition "$vm_name" || exit 1
  test_pod_scheduling "$vm_name" || exit 1

  display_final_status "$vm_name"
}
# Command-line dispatch: with no arguments the interactive flow in main runs;
# otherwise only the first argument is inspected (help, version, or an error
# for anything else).
if (( $# > 0 )); then
  case "$1" in
    -h | --help)
      printf '%s\n' \
        "Usage: $0 [OPTIONS]" \
        "" \
        "Options:" \
        " --help, -h Show this help message" \
        " --version, -v Show version information" \
        "" \
        "This script automates the process of adding new Azure VMs to an existing Kubernetes cluster." \
        "It will prompt for necessary information and guide you through the process."
      exit 0
      ;;
    -v | --version)
      printf '%s\n' "Azure Kubernetes Node Addition Script v1.0"
      exit 0
      ;;
    *)
      print_error "Unknown option: $1"
      printf '%s\n' "Use --help for usage information"
      exit 1
      ;;
  esac
else
  main
fi