diff --git a/agents/aliyun/fence_aliyun.py b/agents/aliyun/fence_aliyun.py index 788ac647..e2b57b8e 100644 --- a/agents/aliyun/fence_aliyun.py +++ b/agents/aliyun/fence_aliyun.py @@ -1,235 +1,235 @@ #!@PYTHON@ -tt import sys import logging import atexit import json sys.path.append("@FENCEAGENTSLIBDIR@") from fencing import * from fencing import fail_usage, run_delay try: from aliyunsdkcore import client from aliyunsdkcore.auth.credentials import EcsRamRoleCredential from aliyunsdkcore.profile import region_provider except ImportError as e: - logging.warn("The 'aliyunsdkcore' module has been not installed or is unavailable, try to execute the command 'pip install aliyun-python-sdk-core --upgrade' to solve. error: %s" % e) + logging.warning("The 'aliyunsdkcore' module has been not installed or is unavailable, try to execute the command 'pip install aliyun-python-sdk-core --upgrade' to solve. error: %s" % e) try: from aliyunsdkecs.request.v20140526.DescribeInstancesRequest import DescribeInstancesRequest from aliyunsdkecs.request.v20140526.StartInstanceRequest import StartInstanceRequest from aliyunsdkecs.request.v20140526.StopInstanceRequest import StopInstanceRequest from aliyunsdkecs.request.v20140526.RebootInstanceRequest import RebootInstanceRequest except ImportError as e: - logging.warn("The 'aliyunsdkecs' module has been not installed or is unavailable, try to execute the command 'pip install aliyun-python-sdk-ecs --upgrade' to solve. error: %s" % e) + logging.warning("The 'aliyunsdkecs' module has been not installed or is unavailable, try to execute the command 'pip install aliyun-python-sdk-ecs --upgrade' to solve. 
def get_nodes_list(conn, options):
    """Return the ECS instances reported by DescribeInstances as a plug list.

    Key arguments:
    conn -- client object handed to _send_request
    options -- options dictionary; an optional "--filter" entry of the form
               key=value is added to the request's query parameters

    Return Value:
    dictionary mapping each instance id to a (instance name, None) tuple
    """
    logging.debug("start to get nodes list")
    result = {}
    request = DescribeInstancesRequest()
    # NOTE(review): only one page of 100 entries is requested and no paging
    # follows -- confirm this is enough for the accounts the agent runs in.
    request.set_PageSize(100)

    if "--filter" in options:
        # Split on the first "=" only, so a value that itself contains "="
        # is passed through whole instead of being cut at the second "=".
        filter_key, separator, filter_value = options["--filter"].partition("=")
        if not separator:
            fail_usage("Failed: --filter must be given as key=value, got: %s" % options["--filter"])
        params = request.get_query_params()
        params[filter_key.strip()] = filter_value.strip()
        request.set_query_params(params)

    response = _send_request(conn, request)
    if response is not None:
        for item in response.get('Instances').get('Instance'):
            result[item.get('InstanceId')] = (item.get('InstanceName'), None)

    logging.debug("get nodes list: %s" % result)
    return result
"required": "0", "default": "default", "order": 7 } all_opt["filter"] = { "getopt": ":", "longopt": "filter", "help": "--filter=[key=value] Filter (e.g. InstanceIds=[\"i-XXYYZZAA1\",\"i-XXYYZZAA2\"]", "shortdesc": "Filter for list-action.", "required": "0", "order": 8 } # Main agent method def main(): conn = None device_opt = ["port", "no_password", "region", "access_key", "secret_key", "ram_role", "credentials_file", "credentials_file_profile", "filter"] atexit.register(atexit_handler) define_new_opts() all_opt["power_timeout"]["default"] = "60" options = check_input(device_opt, process_input(device_opt)) docs = {} docs["shortdesc"] = "Fence agent for Aliyun (Aliyun Web Services)" docs["longdesc"] = "fence_aliyun is a Power Fencing agent for Aliyun." docs["vendorurl"] = "http://www.aliyun.com" show_docs(options, docs) run_delay(options) if "--region" in options: region = options["--region"] if "--access-key" in options and "--secret-key" in options: access_key = options["--access-key"] secret_key = options["--secret-key"] conn = client.AcsClient(access_key, secret_key, region) elif "--ram-role" in options: ram_role = options["--ram-role"] role = EcsRamRoleCredential(ram_role) conn = client.AcsClient(region_id=region, credential=role) elif "--credentials-file" in options and "--credentials-file-profile" in options: import os, configparser try: config = configparser.ConfigParser() config.read(os.path.expanduser(options["--credentials-file"])) access_key = config.get(options["--credentials-file-profile"], "aliyun_access_key_id") secret_key = config.get(options["--credentials-file-profile"], "aliyun_access_key_secret") conn = client.AcsClient(access_key, secret_key, region) except Exception as e: fail_usage("Failed: failed to read credentials file: %s" % e) else: fail_usage("Failed: User credentials are not set. 
def get_power_status(connection, options):
    """Report the state of nova-compute on the host named by options["--plug"].

    Key arguments:
    connection -- nova client connection (may be None / falsy)
    options -- options dictionary

    Return Value:
    the module-level override_status when it is non-empty, otherwise a status
    derived from the first service nova returns for the host:
      "on"      -- state up, status enabled
      "off"     -- state down, status disabled
      "failed"  -- state down (any other status)
      "running" -- state up (any other status)
      "<state> <status>" -- anything else
    "unknown" is returned when there is no connection, nova lists no service
    for the host, or the request fails with a connection error.
    """
    if override_status:
        logging.debug("Pretending we're " + override_status)
        return override_status

    status = "unknown"
    logging.debug("get action: " + options["--action"])

    if connection:
        try:
            services = connection.services.list(host=options["--plug"], binary="nova-compute")
            for service in services:
                logging.debug("Status of %s on %s is %s, %s" % (service.binary, options["--plug"], service.state, service.status))
                if service.state == "up" and service.status == "enabled":
                    # Up and operational
                    status = "on"

                elif service.state == "down" and service.status == "disabled":
                    # Down and fenced
                    status = "off"

                elif service.state == "down":
                    # Down and requires fencing
                    status = "failed"

                elif service.state == "up":
                    # Up and requires unfencing
                    status = "running"

                else:
                    logging.warning("Unknown status detected from nova for %s: %s, %s" % (options["--plug"], service.state, service.status))
                    status = "%s %s" % (service.state, service.status)
                # Only the first matching service is considered
                break
        # The exception classes live in requests.exceptions; the previous
        # spelling "requests.exception" raised AttributeError while Python
        # was matching this clause, hiding the original error.
        except requests.exceptions.ConnectionError as err:
            logging.warning("Nova connection failed: " + str(err))

    logging.debug("Final status of %s is %s" % (options["--plug"], status))
    return status
def set_power_status_off(connection, options):
    """Fence a compute host: force nova-compute down, disable it, flag it for evacuation.

    Key arguments:
    connection -- nova client connection
    options -- options dictionary; options["--plug"] names the host

    Nothing is done when the host already reports "off".
    """
    if get_power_status(connection, options) == "off":
        return

    target = options["--plug"]
    try:
        # Until 2.53
        connection.services.force_down(target, "nova-compute", force_down=True)
        connection.services.disable(target, 'nova-compute')
    except Exception as err:
        # Forcing the host down can fail because the API version is
        # incompatible (e.g. UnsupportedVersion, VersionNotFoundForAPIMethod)
        # or because novaclient is too old to offer force_down at all
        # (e.g. AttributeError). Fall back to waiting until nova notices
        # the state on its own.
        logging.error("Exception from attempt to force host down via nova API: "
                      "%s: %s" % (err.__class__.__name__, err))

        # host-evacuate cannot be called before nova has updated its
        # internal status, so poll until it reports "off". This may loop
        # forever; callers such as Pacemaker run their own timer and will
        # kill the agent if necessary.
        while get_power_status(connection, options) != "off":
            logging.debug("Waiting for nova to update its internal state for %s" % target)
            time.sleep(1)

    set_attrd_status(target, "yes", options)
def create_nova_connection(options):
    """Build a nova client authenticated through keystone password auth.

    Key arguments:
    options -- options dictionary; reads --auth-url, --username, --password,
               --tenant-name, --user-domain, --project-domain, --region-name,
               --endpoint-type and the --insecure / --verbose flags

    Return Value:
    the first client (API versions "2.11", then "2") whose
    hypervisors.list() call succeeds, or None when none of them works.
    fail_usage() is called when novaclient cannot be imported.
    """
    nova = None

    try:
        from novaclient import client
        from novaclient.exceptions import NotAcceptable
    except ImportError:
        fail_usage("Nova not found or not accessible")

    from keystoneauth1 import loading
    from keystoneauth1 import session
    from keystoneclient import discover

    # Prefer the oldest and strip the leading 'v'
    keystone_versions = discover.available_versions(options["--auth-url"])
    keystone_version = keystone_versions[0]['id'][1:]
    kwargs = dict(
        auth_url=options["--auth-url"],
        username=options["--username"],
        password=options["--password"]
        )

    if discover.version_match("2", keystone_version):
        kwargs["tenant_name"] = options["--tenant-name"]

    elif discover.version_match("3", keystone_version):
        kwargs["project_name"] = options["--tenant-name"]
        kwargs["user_domain_name"] = options["--user-domain"]
        kwargs["project_domain_name"] = options["--project-domain"]

    loader = loading.get_plugin_loader('password')
    keystone_auth = loader.load_from_options(**kwargs)
    keystone_session = session.Session(auth=keystone_auth, verify="--insecure" not in options)

    # Some versions of Openstack prior to Ocata only
    # supported positional arguments for username,
    # password, and tenant.
    #
    # Versions since Ocata only support named arguments.
    #
    # So we need to use introspection to figure out how to
    # create a Nova client.
    #
    # inspect.getargspec() was removed in Python 3.11, so prefer
    # getfullargspec() and fall back to the legacy name only on
    # interpreters that do not provide it. The signature does not depend
    # on the API version being tried, so it is inspected once.
    argspec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
    clientargs = argspec(client.Client).varargs

    nova_versions = [ "2.11", "2" ]
    for version in nova_versions:
        # NOTE(review): the sample ArgSpecs quoted below show varargs=None for
        # OSP < 11 and varargs='args' for OSP >= 11, which reads as the
        # opposite of this test -- confirm which branch is intended.
        if clientargs:
            # OSP < 11
            # ArgSpec(args=['version', 'username', 'password', 'project_id', 'auth_url'],
            #         varargs=None,
            #         keywords='kwargs', defaults=(None, None, None, None))
            nova = client.Client(version,
                                 None, # User
                                 None, # Password
                                 None, # Tenant
                                 None, # Auth URL
                                 insecure="--insecure" in options,
                                 region_name=options["--region-name"],
                                 endpoint_type=options["--endpoint-type"],
                                 session=keystone_session, auth=keystone_auth,
                                 http_log_debug="--verbose" in options)
        else:
            # OSP >= 11
            # ArgSpec(args=['version'], varargs='args', keywords='kwargs', defaults=None)
            nova = client.Client(version,
                                 region_name=options["--region-name"],
                                 endpoint_type=options["--endpoint-type"],
                                 session=keystone_session, auth=keystone_auth,
                                 http_log_debug="--verbose" in options)

        try:
            # Probe the connection; the first version that answers wins
            nova.hypervisors.list()
            return nova

        except NotAcceptable as e:
            logging.warning(e)

        except Exception as e:
            logging.warning("Nova connection failed. %s: %s" % (e.__class__.__name__, e))

    logging.warning("Couldn't obtain a supported connection to nova, tried: %s\n" % repr(nova_versions))
    return None
"shortdesc" : "DNS domain in which hosts live", "order": 5, } all_opt["record_only"] = { "getopt" : "r:", "longopt" : "record-only", "help" : "--record-only Record the target as needing evacuation but as yet do not intiate it", "required" : "0", "shortdesc" : "Only record the target as needing evacuation", "default" : "False", "order": 5, } all_opt["instance_filtering"] = { "getopt" : "", "longopt" : "instance-filtering", "help" : "--instance-filtering Allow instances created from images and flavors with evacuable=true to be evacuated (or all if no images/flavors have been tagged)", "required" : "0", "shortdesc" : "Allow instances to be evacuated", "default" : "True", "order": 5, } all_opt["no_shared_storage"] = { "getopt" : "", "longopt" : "no-shared-storage", "help" : "--no-shared-storage Disable functionality for shared storage", "required" : "0", "shortdesc" : "Disable functionality for dealing with shared storage", "default" : "False", "order": 5, } all_opt["compute-domain"] = { "getopt" : ":", "longopt" : "compute-domain", "help" : "--compute-domain=[string] Replaced by --domain", "required" : "0", "shortdesc" : "Replaced by domain", "order": 6, } def set_multi_power_fn(connection, options, set_power_fn, get_power_fn, retry_attempts=1): for _ in range(retry_attempts): set_power_fn(connection, options) time.sleep(int(options["--power-wait"])) for _ in range(int(options["--power-timeout"])): if get_power_fn(connection, options) != options["--action"]: time.sleep(1) else: return True return False def main(): global override_status atexit.register(atexit_handler) device_opt = ["login", "passwd", "tenant_name", "auth_url", "fabric_fencing", "no_login", "no_password", "port", "domain", "compute-domain", "project-domain", "user-domain", "no_shared_storage", "endpoint_type", "record_only", "instance_filtering", "insecure", "region_name"] define_new_opts() all_opt["shell_timeout"]["default"] = "180" options = check_input(device_opt, process_input(device_opt)) docs = 
{} docs["shortdesc"] = "Fence agent for the automatic resurrection of OpenStack compute instances" docs["longdesc"] = "Used to tell Nova that compute nodes are down and to reschedule flagged instances" docs["vendorurl"] = "" show_docs(options, docs) if options["--record-only"] in [ "2", "Disabled", "disabled" ]: sys.exit(0) run_delay(options) # workaround to avoid regressions if "--compute-domain" in options and options["--compute-domain"]: options["--domain"] = options["--compute-domain"] del options["--domain"] # Disable insecure-certificate-warning message if "--insecure" in options: import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) logging.debug("Running "+options["--action"]) connection = create_nova_connection(options) if options["--action"] in ["off", "on", "reboot", "status"]: fix_plug_name(connection, options) if options["--action"] in ["reboot"]: options["--action"]="off" if options["--action"] in ["off", "on"]: # No status first, call our own version result = not set_multi_power_fn(connection, options, set_power_status, get_power_status_simple, 1 + int(options["--retry-on"])) elif options["--action"] in ["monitor"]: result = 0 else: result = fence_action(connection, options, set_power_status, get_power_status_simple, get_plugs_list, None) logging.debug("Result for "+options["--action"]+": "+repr(result)) if result == None: result = 0 sys.exit(result) if __name__ == "__main__": main() diff --git a/agents/sbd/fence_sbd.py b/agents/sbd/fence_sbd.py index 5c498263..bf95bb72 100644 --- a/agents/sbd/fence_sbd.py +++ b/agents/sbd/fence_sbd.py @@ -1,435 +1,435 @@ #!@PYTHON@ -tt import sys, stat import logging import os import atexit sys.path.append("@FENCEAGENTSLIBDIR@") from fencing import fail_usage, run_commands, fence_action, all_opt from fencing import atexit_handler, check_input, process_input, show_docs from fencing import run_delay import itertools DEVICE_INIT = 1 DEVICE_NOT_INIT = -3 PATH_NOT_EXISTS = -1 PATH_NOT_BLOCK = 
def generate_sbd_command(options, command, arguments=None):
    """Generates a sbd command based on given arguments.

    Key arguments:
    options -- options dictionary (reads "--sbd-path" and the sbd devices)
    command -- sbd sub-command, e.g. "list", "dump" or "message"
    arguments -- optional argument string appended after the sub-command

    Return Value:
    generated list of sbd commands (strings): for "list" and "dump" one
    command per device, otherwise a single command naming every device
    """
    sbd_path = options["--sbd-path"]
    devices = parse_sbd_devices(options)

    # Only append the arguments when there are any; formatting the default
    # None with %s used to put a literal "None" at the end of the command.
    if arguments is None:
        action = command
    else:
        action = "%s %s" % (command, arguments)

    if command in ["list", "dump"]:
        return ["%s -d %s %s" % (sbd_path, device, action) for device in devices]

    # add "-d" for each sbd device
    device_args = "".join(" -d %s" % device for device in devices)
    return ["%s%s %s" % (sbd_path, device_args, action)]
def reboot_cycle(conn, options):
    """Trigger a reboot by sending the sbd "reset" message.

    Key arguments:
    conn -- connection structure
    options -- options dictionary

    Return Value:
    True when sending the message returned code 0, False otherwise (bool)
    """
    # Only the return code matters here; stdout/stderr are not used.
    return_code, _out, _err = send_sbd_message(conn, options, options["--plug"], "reset")
    return not bool(return_code)
def translate_status(sbd_status):
    """Map an sbd slot status onto a fencing status.

    Key arguments:
    sbd_status -- status reported by sbd (string)

    Return Value:
    "off" if the status contains "reset" or "off", otherwise "on" if it
    contains "clear", otherwise "UNKNOWN" (string)
    """
    # An offline marker wins over an online one, so look for it first.
    for offline_marker in ("reset", "off"):
        if offline_marker in sbd_status:
            return "off"

    # Currently only "clear" counts as online. Eventually "test" should
    # probably be accepted as well.
    if "clear" in sbd_status:
        return "on"

    return "UNKNOWN"
Key arguments: options -- options dictionary Return Value: devices -- array of device paths """ devices = [str.strip(dev) \ for dev in str.split(options["--devices"], ",")] return devices def define_new_opts(): """Defines the all opt list """ all_opt["devices"] = { "getopt" : ":", "longopt" : "devices", "help":"--devices=[device_a,device_b] \ Comma separated list of sbd devices", "required" : "1", "shortdesc" : "SBD Device", "order": 1 } all_opt["sbd_path"] = { "getopt" : ":", "longopt" : "sbd-path", "help" : "--sbd-path=[path] Path to SBD binary", "required" : "0", "default" : "@SBD_PATH@", "order": 200 } def main(): """Main function """ # We need to define "no_password" otherwise we will be ask about it if # we don't provide any password. device_opt = ["no_password", "devices", "port", "method", "sbd_path"] # close stdout if we get interrupted atexit.register(atexit_handler) define_new_opts() all_opt["method"]["default"] = "cycle" all_opt["method"]["help"] = "-m, --method=[method] Method to fence (onoff|cycle) (Default: cycle)" all_opt["power_timeout"]["default"] = "30" options = check_input(device_opt, process_input(device_opt)) # fill the needed variables to generate metadata and help text output docs = {} docs["shortdesc"] = "Fence agent for sbd" docs["longdesc"] = "fence_sbd is an I/O Fencing agent \ which can be used in environments where sbd can be used (shared storage)." docs["vendorurl"] = "" show_docs(options, docs) # We need to check if --devices is given and not empty. if "--devices" not in options: fail_usage("No SBD devices specified. \ At least one SBD device is required.") run_delay(options) # We need to check if the provided sbd_devices exists. We need to do # that for every given device. 
# Just for the case we are really rebooting / powering off a device # (pacemaker as well uses the list command to generate a dynamic list) # we leave it to sbd to try and decide if it was successful if not options["--action"] in ["reboot", "off", "list"]: for device_path in parse_sbd_devices(options): logging.debug("check device \"%s\"", device_path) return_code = check_sbd_device(options, device_path) if PATH_NOT_EXISTS == return_code: logging.error("\"%s\" does not exist", device_path) elif PATH_NOT_BLOCK == return_code: logging.error("\"%s\" is not a valid block device", device_path) elif DEVICE_NOT_INIT == return_code: logging.error("\"%s\" is not initialized", device_path) elif DEVICE_INIT != return_code: logging.error("UNKNOWN error while checking \"%s\"", device_path) # If we get any error while checking the device we need to exit at this # point. if DEVICE_INIT != return_code: exit(return_code) # we check against the defined timeouts. If the pacemaker timeout is smaller # then that defined within sbd we should report this. power_timeout = int(options["--power-timeout"]) sbd_msg_timeout = get_msg_timeout(options) if 0 < power_timeout <= sbd_msg_timeout: - logging.warn("power timeout needs to be \ + logging.warning("power timeout needs to be \ greater then sbd message timeout") result = fence_action(\ None, \ options, \ set_power_status, \ get_power_status, \ get_node_list, \ reboot_cycle) sys.exit(result) if __name__ == "__main__": main()