Health Checks and Pre-Flight Validation

Why Pre-Flight Matters

Pilots check instruments before takeoff. Network operators should validate before deployment.

Without Pre-Flight Checks:

❌ Deploy to a device that is down (wasted time)
❌ Deploy to an already-misconfigured device (adds complexity)
❌ Deploy when bandwidth is saturated (timeouts)
❌ Deploy to a device with low disk space (changes fail)
❌ Deploy to a device with high CPU (risk of crash)

With Pre-Flight Checks:

✅ Skip down devices, report status
✅ Know if device is in clean state
✅ Only deploy when safe
✅ Predictable success rate

Pre-flight checks are the first line of defense against failed deployments.

Pattern 1: Device Health Checks

The Implementation

# src/health_checks.py
import re
from dataclasses import dataclass
from enum import Enum
from typing import Optional

class HealthStatus(Enum):
    """Severity of a single health-check outcome.

    ``DeviceHealthChecker.is_healthy`` always rejects ``CRITICAL`` results and
    rejects ``WARNING`` results only when called with ``allow_warnings=False``.
    """

    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"


@dataclass
class HealthCheckResult:
    """Result of a health check.

    Attributes:
        check_name: Stable identifier of the check (e.g. ``"cpu_usage"``).
        status: Severity assigned by the check.
        message: Human-readable summary of the outcome.
        details: Optional supporting data (raw output, parsed figures);
            ``None`` when the check has nothing to add.
    """

    check_name: str
    status: HealthStatus
    message: str
    details: Optional[dict] = None


class DeviceHealthChecker:
    """Validate device health before deployment.

    Every ``check_*`` method sends one command through
    ``device.send_command`` and turns the outcome into a
    ``HealthCheckResult``; none of them raises. The parsers target Cisco
    IOS-style output and report ``WARNING`` ("Could not parse ...") instead
    of guessing when the output has a different shape.
    """

    # "<N>% packet loss" summary printed by Unix-style ping.
    _PACKET_LOSS_RE = re.compile(
        r"(\d+(?:\.\d+)?)%\s*packet loss", re.IGNORECASE
    )
    # "Success rate is <N> percent" summary printed by Cisco IOS ping.
    _SUCCESS_RATE_RE = re.compile(
        r"Success rate is (\d+) percent", re.IGNORECASE
    )
    # "<total> [K]bytes total (<free> [K]bytes free)" disk summary line.
    _DISK_RE = re.compile(
        r"(\d+)\s*K?\s*bytes total\s*\((\d+)\s*K?\s*bytes free\)",
        re.IGNORECASE,
    )
    # First percentage figure in a line, e.g. "5%" in "five seconds: 5%/0%".
    _PERCENT_RE = re.compile(r"(\d+(?:\.\d+)?)%")
    # "Processor Pool Total: <total> Used: <used> Free: <free>" summary line.
    _MEMORY_POOL_RE = re.compile(r"Total:\s*(\d+)\s+Used:\s*(\d+)")

    def __init__(self, device):
        """Store the connection used by the checks.

        Args:
            device: Connection object exposing ``send_command(command)``
                and returning the command output as text.
        """
        self.device = device
        # Not used by this class; kept so existing readers of it still work.
        self.checks = []

    def check_device_reachable(
        self, ping_command="ping -c 1 8.8.8.8"
    ) -> HealthCheckResult:
        """Check that the device answers commands and can ping out.

        The ping is executed *on* the device, so a HEALTHY result means both
        that the session works and that the ping reported no loss.

        Args:
            ping_command: Command sent to the device. The default is kept
                for backward compatibility.

        Returns:
            HEALTHY when the ping summary shows no loss, WARNING when it
            shows loss or cannot be recognised, CRITICAL when the command
            itself could not be sent.
        """
        try:
            output = self.device.send_command(ping_command)

            # Parse the loss figure itself: a plain substring test for
            # "0% packet loss" also matches "100% packet loss".
            loss = self._PACKET_LOSS_RE.search(output)
            rate = self._SUCCESS_RATE_RE.search(output)
            if loss is not None:
                ping_ok = float(loss.group(1)) == 0
            elif rate is not None:
                ping_ok = int(rate.group(1)) == 100
            else:
                ping_ok = False

            if ping_ok:
                return HealthCheckResult(
                    check_name="device_reachable",
                    status=HealthStatus.HEALTHY,
                    message="Device is reachable"
                )
            return HealthCheckResult(
                check_name="device_reachable",
                status=HealthStatus.WARNING,
                message="Device has connectivity issues",
                details={"ping_output": output[:100]}
            )
        except Exception as e:
            # Connection libraries raise many unrelated exception types;
            # any of them means the device could not be queried.
            return HealthCheckResult(
                check_name="device_reachable",
                status=HealthStatus.CRITICAL,
                message=f"Cannot reach device: {str(e)}"
            )

    def check_disk_space(self, min_percent_free=10) -> HealthCheckResult:
        """Check device has sufficient free disk space.

        Args:
            min_percent_free: Minimum percentage of the disk that must be
                free for the check to pass.

        Returns:
            HEALTHY when at least ``min_percent_free`` percent is free,
            WARNING when less is free or no
            "<total> bytes total (<free> bytes free)" line is found,
            CRITICAL when the command could not be sent.
        """
        try:
            output = self.device.send_command("show disk:")

            for line in output.split('\n'):
                match = self._DISK_RE.search(line)
                if match is None:
                    continue
                total = int(match.group(1))
                free = int(match.group(2))
                if total <= 0:
                    continue  # Cannot compute a percentage from this line.

                percent_free = free / total * 100
                details = {
                    "output": line,
                    "percent_free": round(percent_free, 1),
                }
                if percent_free >= min_percent_free:
                    return HealthCheckResult(
                        check_name="disk_space",
                        status=HealthStatus.HEALTHY,
                        message="Disk space available",
                        details=details
                    )
                return HealthCheckResult(
                    check_name="disk_space",
                    status=HealthStatus.WARNING,
                    message=(
                        f"Low disk space ({percent_free:.1f}% free, "
                        f"minimum {min_percent_free}%)"
                    ),
                    details=details
                )

            return HealthCheckResult(
                check_name="disk_space",
                status=HealthStatus.WARNING,
                message="Could not parse disk usage"
            )
        except Exception as e:
            return HealthCheckResult(
                check_name="disk_space",
                status=HealthStatus.CRITICAL,
                message=f"Cannot check disk space: {str(e)}"
            )

    def check_cpu_usage(self, max_percent=80) -> HealthCheckResult:
        """Check device CPU is not overloaded.

        Uses the first percentage on the "CPU utilization" line, which on
        Cisco IOS is the five-second figure.

        Args:
            max_percent: Utilisation at or above this value is reported as
                WARNING.

        Returns:
            HEALTHY below ``max_percent``, WARNING at or above it or when no
            figure can be parsed, CRITICAL when the command could not be
            sent.
        """
        try:
            output = self.device.send_command("show processes cpu")

            for line in output.split('\n'):
                if 'CPU utilization' not in line:
                    continue
                # Positional splitting picks up words such as "five" on IOS
                # output, so search for the percentage explicitly.
                match = self._PERCENT_RE.search(line)
                if match is None:
                    continue
                cpu_percent = float(match.group(1))

                if cpu_percent < max_percent:
                    return HealthCheckResult(
                        check_name="cpu_usage",
                        status=HealthStatus.HEALTHY,
                        message=f"CPU usage OK ({cpu_percent}%)"
                    )
                return HealthCheckResult(
                    check_name="cpu_usage",
                    status=HealthStatus.WARNING,
                    message=f"CPU usage high ({cpu_percent}%)"
                )

            return HealthCheckResult(
                check_name="cpu_usage",
                status=HealthStatus.WARNING,
                message="Could not parse CPU usage"
            )
        except Exception as e:
            return HealthCheckResult(
                check_name="cpu_usage",
                status=HealthStatus.CRITICAL,
                message=f"Cannot check CPU: {str(e)}"
            )

    def _parse_memory_line(self, line):
        """Return ``(total, used)`` from a Processor memory line, or None.

        Understands "Processor Pool Total: T Used: U Free: F" and the
        tabular "Processor <head> <total> <used> <free> ..." row.
        """
        pool = self._MEMORY_POOL_RE.search(line)
        if pool is not None:
            total, used = int(pool.group(1)), int(pool.group(2))
        else:
            parts = line.split()
            if (
                len(parts) < 4
                or parts[0] != "Processor"
                or not parts[2].isdigit()
                or not parts[3].isdigit()
            ):
                return None
            total, used = int(parts[2]), int(parts[3])
        if total <= 0:
            return None
        return total, used

    def check_memory_usage(self, max_percent=80) -> HealthCheckResult:
        """Check device memory is not exhausted.

        Args:
            max_percent: Processor-pool usage at or above this value is
                reported as WARNING.

        Returns:
            HEALTHY below ``max_percent``, WARNING at or above it or when
            the Processor figures cannot be parsed, CRITICAL when the
            command could not be sent.
        """
        try:
            output = self.device.send_command("show memory")

            for line in output.split('\n'):
                if 'Processor' not in line:
                    continue
                figures = self._parse_memory_line(line)
                if figures is None:
                    continue
                total, used = figures
                used_percent = used / total * 100
                details = {
                    "output": line,
                    "percent_used": round(used_percent, 1),
                }

                if used_percent < max_percent:
                    return HealthCheckResult(
                        check_name="memory_usage",
                        status=HealthStatus.HEALTHY,
                        message="Memory available",
                        details=details
                    )
                return HealthCheckResult(
                    check_name="memory_usage",
                    status=HealthStatus.WARNING,
                    message=f"Memory usage high ({used_percent:.1f}%)",
                    details=details
                )

            return HealthCheckResult(
                check_name="memory_usage",
                status=HealthStatus.WARNING,
                message="Could not parse memory usage"
            )
        except Exception as e:
            return HealthCheckResult(
                check_name="memory_usage",
                status=HealthStatus.CRITICAL,
                message=f"Cannot check memory: {str(e)}"
            )

    def check_running_config_valid(self) -> HealthCheckResult:
        """Check the running configuration contains no "ERROR" lines.

        Returns:
            HEALTHY when the filtered output is empty, CRITICAL when any
            line matched, WARNING when the command could not be sent
            (unlike the other checks, a failure here does not block
            deployment on its own).
        """
        try:
            output = self.device.send_command(
                "show running-config | include ERROR"
            )

            if output.strip() == "":
                return HealthCheckResult(
                    check_name="config_valid",
                    status=HealthStatus.HEALTHY,
                    message="Running configuration is valid"
                )
            return HealthCheckResult(
                check_name="config_valid",
                status=HealthStatus.CRITICAL,
                message="Running configuration has errors",
                details={"errors": output}
            )
        except Exception as e:
            return HealthCheckResult(
                check_name="config_valid",
                status=HealthStatus.WARNING,
                message=f"Cannot validate config: {str(e)}"
            )

    def run_all_checks(self) -> list:
        """Run all health checks.

        Returns:
            list: One ``HealthCheckResult`` per check, in a fixed order
            (reachability, disk, CPU, memory, configuration).
        """
        return [
            self.check_device_reachable(),
            self.check_disk_space(),
            self.check_cpu_usage(),
            self.check_memory_usage(),
            self.check_running_config_valid(),
        ]

    def is_healthy(self, results: list, allow_warnings=True) -> bool:
        """
        Determine if device is healthy enough for deployment.

        Args:
            results: List of HealthCheckResult
            allow_warnings: Allow deployment if only warnings

        Returns:
            bool: True if device is healthy (no CRITICAL result and, when
            ``allow_warnings`` is False, no WARNING result either)
        """
        statuses = [r.status for r in results]

        if HealthStatus.CRITICAL in statuses:
            return False

        if not allow_warnings:
            return HealthStatus.WARNING not in statuses

        return True

Usage

from netmiko import ConnectHandler
from health_checks import DeviceHealthChecker

# Open the SSH session in a context manager so it is closed even when a
# check or the deployment raises. (Example credentials only: load real ones
# from the environment or a secrets store.)
with ConnectHandler(
    device_type="cisco_ios",
    host="10.0.0.1",
    username="admin",
    password="password"
) as device:
    checker = DeviceHealthChecker(device)
    results = checker.run_all_checks()

    # Print results; any status other than healthy/warning is shown as ✗
    status_icons = {"healthy": "✓", "warning": "⚠"}
    for result in results:
        status_icon = status_icons.get(result.status.value, "✗")
        print(f"{status_icon} {result.check_name}: {result.message}")

    # Decide whether to deploy (still inside the session, which the
    # deployment itself needs)
    if checker.is_healthy(results, allow_warnings=True):
        print("\n✓ Device is healthy, proceeding with deployment")
        # deploy_to_device()
    else:
        print("\n✗ Device has critical issues, skipping deployment")
        # Log and skip

Pattern 2: Configuration Backup Before Deployment

The Implementation

# src/config_backup.py
import json
from datetime import datetime

class ConfigurationBackup:
    """Backup device configuration before changes.

    NOTE(review): the default ``backup_dir`` lives under ``/tmp``. Device
    configurations usually contain secrets, so pass a directory with
    restricted permissions in production.
    """

    def __init__(self, device, backup_dir="/tmp/config_backups"):
        """Remember the connection and where backups are written.

        Args:
            device: Connection object exposing ``send_command(command)``
                and a ``host`` attribute (used in the backup file name).
            backup_dir: Directory that receives the backup files; it is
                created on first use if it does not exist.
        """
        self.device = device
        self.backup_dir = backup_dir
        self.backup_file = None  # Path of the most recent successful backup.

    def backup_running_config(self):
        """Backup current running configuration.

        Returns:
            dict: ``{"status": "success", "filename": ..., "lines": ...}``
            on success, ``{"status": "failed", "error": ...}`` otherwise.
            This method never raises.
        """
        import os
        from datetime import timezone

        try:
            config = self.device.send_command("show running-config")

            # An empty file is not a recovery point; report it as a failure
            # instead of storing a useless "backup".
            if not config or not config.strip():
                return {
                    "status": "failed",
                    "error": "Device returned an empty running configuration"
                }

            # The directory does not exist on a fresh machine.
            os.makedirs(self.backup_dir, exist_ok=True)

            # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated);
            # the file-name format is unchanged.
            timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
            filename = f"{self.backup_dir}/{self.device.host}_{timestamp}.cfg"

            with open(filename, 'w', encoding="utf-8") as f:
                f.write(config)

            self.backup_file = filename

            return {
                "status": "success",
                "filename": filename,
                "lines": len(config.split('\n'))
            }

        except Exception as e:
            return {
                "status": "failed",
                "error": str(e)
            }

    def get_config_checksum(self):
        """Get MD5 checksum of running configuration.

        The digest is meant for change detection (comparing before/after),
        not for security purposes.

        Returns:
            str | None: Hex digest, or ``None`` when the configuration
            could not be read.
        """
        import hashlib

        try:
            config = self.device.send_command("show running-config")
            return hashlib.md5(config.encode()).hexdigest()
        except Exception:
            # Deliberate best effort: callers treat None as "unknown".
            return None

Pattern 3: Nornir Pre-Flight Tasks

# src/nornir_preflight_tasks.py
from nornir import InitNornir
from nornir.core.task import Task, Result
from health_checks import DeviceHealthChecker, HealthStatus
from config_backup import ConfigurationBackup

def preflight_validation(task: Task) -> Result:
    """
    Nornir task: Validate device before deployment.

    Runs every health check and, when none is CRITICAL, backs up the
    running configuration.

    Args:
        task: Nornir task bound to the host being validated.

    Returns:
        Result: ``result["preflight_status"]`` is ``"PASSED"`` (with the
        health checks and backup summary) or ``"FAILED"`` (with the
        critical issues, the backup error, or the unexpected exception).
        Failed pre-flights also set ``failed=True``.
    """
    try:
        # Host.get_connection() takes the Nornir configuration as its
        # second argument; omitting it raises TypeError, which the handler
        # below would turn into a failure for every host.
        device = task.host.get_connection("netmiko", task.nornir.config)
        checker = DeviceHealthChecker(device)
        backup = ConfigurationBackup(device)

        # Run health checks
        health_results = checker.run_all_checks()

        # Only CRITICAL results block the deployment; warnings are reported
        # in the PASSED result below.
        critical_issues = [
            r for r in health_results
            if r.status == HealthStatus.CRITICAL
        ]

        if critical_issues:
            return Result(
                host=task.host,
                failed=True,
                result={
                    "preflight_status": "FAILED",
                    "critical_issues": [
                        {
                            "check": r.check_name,
                            "message": r.message
                        }
                        for r in critical_issues
                    ]
                }
            )

        # Backup configuration (reports failure through its status field)
        backup_result = backup.backup_running_config()

        if backup_result["status"] == "failed":
            return Result(
                host=task.host,
                failed=True,
                result={
                    "preflight_status": "FAILED",
                    "reason": "Could not backup configuration",
                    "error": backup_result.get("error")
                }
            )

        return Result(
            host=task.host,
            result={
                "preflight_status": "PASSED",
                "health_checks": [
                    {
                        "check": r.check_name,
                        "status": r.status.value,
                        "message": r.message
                    }
                    for r in health_results
                ],
                "backup": backup_result
            }
        )

    except Exception as e:
        # Task boundary: report the host as failed instead of aborting the
        # run. The result keeps the same "preflight_status" key as every
        # other outcome so consumers can read it unconditionally.
        return Result(
            host=task.host,
            failed=True,
            result={
                "preflight_status": "FAILED",
                "error": str(e)
            }
        )

# Usage
nr = InitNornir(config_file="config.yaml")
results = nr.run(task=preflight_validation)

# Keep only the hosts whose top-level task result succeeded and reported
# a PASSED pre-flight; everything else is excluded from deployment.
passing_devices = [
    host_name
    for host_name, host_results in results.items()
    if not (
        host_results[0].failed
        or host_results[0].result["preflight_status"] != "PASSED"
    )
]

print(f"✓ {len(passing_devices)} devices passed pre-flight validation")
print(f"✗ {len(results) - len(passing_devices)} devices failed pre-flight")

# Deploy only to passing devices
# nr = InitNornir(config_file="config.yaml")
# nr.inventory.hosts = {h: nr.inventory.hosts[h] for h in passing_devices}
# nr.run(task=deploy_to_devices)

Pattern 4: Sanity Tests After Deployment

The Implementation

# src/sanity_tests.py
class SanityTestResult:
    """Outcome of one post-deployment sanity test."""

    def __init__(self, test_name: str, passed: bool, message: str):
        self.test_name = test_name
        self.passed = passed
        self.message = message

    def __repr__(self):
        return (
            f"{type(self).__name__}(test_name={self.test_name!r}, "
            f"passed={self.passed!r}, message={self.message!r})"
        )


class SanityTests:
    """Verify deployment had expected effect.

    Every test returns a ``SanityTestResult`` and never raises; a command
    that cannot be sent is reported as a failed test.
    """

    def __init__(self, device):
        """Store the connection (must expose ``send_command(command)``)."""
        self.device = device

    def test_interfaces_still_up(self, expected_up=2) -> SanityTestResult:
        """Verify interfaces didn't break.

        An interface counts as up when its "show ip interface brief" row
        contains an ``up`` column and no ``down`` column (case-insensitive),
        i.e. both status and protocol are up.

        Args:
            expected_up: Minimum number of up interfaces required to pass.
        """
        try:
            output = self.device.send_command("show ip interface brief")
        except Exception as e:
            # Same contract as the other tests: report, don't raise.
            return SanityTestResult(
                "interfaces_up",
                False,
                f"Cannot check interfaces: {str(e)}"
            )

        # Count rows, not substrings: IOS prints the state in lower case
        # and shows it twice per row (Status and Protocol columns).
        up_count = 0
        for line in output.splitlines():
            columns = [token.lower() for token in line.split()]
            if "up" in columns and "down" not in columns:
                up_count += 1

        if up_count >= expected_up:
            return SanityTestResult(
                "interfaces_up",
                True,
                f"{up_count} interfaces up (expected {expected_up})"
            )
        return SanityTestResult(
            "interfaces_up",
            False,
            f"Only {up_count} interfaces up (expected {expected_up})"
        )

    def test_config_saved(self) -> SanityTestResult:
        """Save the configuration and verify the device accepted the save.

        Sends ``write memory`` and fails when the command cannot be sent or
        when the device answers with an error line (IOS prefixes CLI errors
        with ``%``).
        """
        try:
            output = self.device.send_command("write memory")
        except Exception as e:
            return SanityTestResult(
                "config_saved",
                False,
                f"Failed to save configuration: {str(e)}"
            )

        # A rejected command does not raise; it is only visible in the
        # returned text.
        if isinstance(output, str):
            for line in output.splitlines():
                if line.strip().startswith("%"):
                    return SanityTestResult(
                        "config_saved",
                        False,
                        f"Failed to save configuration: {line.strip()}"
                    )

        return SanityTestResult(
            "config_saved",
            True,
            "Configuration saved successfully"
        )

    def test_device_responsive(self) -> SanityTestResult:
        """Verify device still responds to commands."""
        try:
            self.device.send_command("show version")
        except Exception as e:
            return SanityTestResult(
                "device_responsive",
                False,
                f"Device not responsive: {str(e)}"
            )
        return SanityTestResult(
            "device_responsive",
            True,
            "Device is responsive"
        )

Best Practices

1. Fail Fast on Critical Issues

# ✅ GOOD - deploy only when is_healthy() accepts the check results
# (fragment; the bare `return` assumes this sits inside a function)
if checker.is_healthy(results):
    deploy()
else:
    return False  # Stop immediately; nothing is pushed to this device

# ❌ BAD - Deploying despite critical issues
if has_critical_issues(results):
    log_warning("Issues detected but deploying anyway")
    deploy()  # 💥 Pushes changes to a device already known to be unhealthy

2. Backup Before Every Change

# ✅ GOOD - back up first, and deploy only if the backup succeeded.
# backup_running_config() reports failure through its "status" field
# instead of raising, so the result must be checked.
backup_result = backup.backup_running_config()
if backup_result["status"] == "success":
    deploy_changes()

# ❌ BAD - No backup
deploy_changes()
# Now if something breaks, no recovery point

3. Test Your Health Checks

def test_health_checker_detects_critical(mock_device):
    """A device whose every command raises must be reported as unhealthy."""
    # Simulate a dead session: each send_command call raises.
    mock_device.send_command.side_effect = Exception("SSH timeout")

    health_checker = DeviceHealthChecker(mock_device)
    check_results = health_checker.run_all_checks()
    verdict = health_checker.is_healthy(check_results)

    assert not verdict

Summary

Pre-flight validation is critical:

✅ Check device health
✅ Backup configuration
✅ Run sanity tests
✅ Only deploy if safe
✅ Verify after deployment

Safe deployment = Validate before + Backup before + Verify after

Next Steps

Circuit Breakers & Backpressure — Safety at scale