def run_on_fail(options):
    """Run the user supplied failure hook, if one was configured.

    When the ``--runonfail`` option is present, its value is handed to
    ``run_command`` together with the full option dictionary. Without the
    option this function does nothing.

    Args:
        options: fence agent option dictionary.
    """
    try:
        hook = options["--runonfail"]
    except KeyError:
        # No hook configured, nothing to run.
        return
    run_command(options, hook)
def replace_api_uri(options, http_request):
    """Rewrite the URI of a googleapis.com request when a replacement applies.

    Replacement rules are collected from two places, in this order:

    1. the ``FENCE_GCE_URI_REPLACEMENTS`` environment variable, which must
       hold a JSON list of objects with ``matchlength``, ``match`` and
       ``replace`` keys;
    2. a built-in rule, added when ``--baremetalsolution`` is in ``options``,
       that redirects instance ``reset`` calls to the Bare Metal Solution API.

    The first rule whose ``match`` regex matches the start of the request
    URI with exactly ``matchlength`` capture groups, and whose rewritten URI
    still points at a ``*.googleapis.com`` host, is applied. Later rules are
    then ignored. Malformed rules are logged and skipped.

    Args:
        options: fence agent option dictionary.
        http_request: request object exposing a writable ``uri`` attribute.

    Returns:
        The same ``http_request`` object, with ``uri`` possibly rewritten.
    """
    # Host check for rewritten URIs: at least one sub-domain label, then
    # exactly "googleapis.com", then a path/port/query delimiter or the end.
    # The dots are escaped and the host is bounded so that look-alike targets
    # (e.g. "evilgoogleapis.com" or "googleapis.com.evil.example") are refused.
    googleapis_uri = r"https://(?:[A-Za-z0-9-]+\.)+googleapis\.com(?=[/:?#]|$)"
    required_keys = ("matchlength", "match", "replace")

    uri_replacements = []

    # put any env var replacements first, then baremetalsolution if in options
    env_uri_replacements = os.environ.get("FENCE_GCE_URI_REPLACEMENTS")
    if env_uri_replacements is not None:
        logging.debug("FENCE_GCE_URI_REPLACEMENTS environment variable exists")
        try:
            uri_replacements_json = json.loads(env_uri_replacements)
        except ValueError:
            logging.warning("FENCE_GCE_URI_REPLACEMENTS exists but is not valid JSON")
        else:
            if isinstance(uri_replacements_json, list):
                uri_replacements = uri_replacements_json
            else:
                logging.warning("FENCE_GCE_URI_REPLACEMENTS exists, but is not a JSON List")

    if "--baremetalsolution" in options:
        uri_replacements.append(
            {
                "matchlength": 4,
                "match": "https://compute.googleapis.com/compute/v1/projects/(.*)/zones/(.*)/instances/(.*)/reset(.*)",
                "replace": "https://baremetalsolution.googleapis.com/v1/projects/\\1/locations/\\2/instances/\\3:resetInstance\\4"
            })

    for uri_replacement in uri_replacements:
        # each uri_replacement should have matchlength, match, and replace;
        # anything that is not a JSON object cannot carry them at all
        if not isinstance(uri_replacement, dict) or not all(
                key in uri_replacement for key in required_keys):
            logging.warning("FENCE_GCE_URI_REPLACEMENTS missing matchlength, match, or replace in %s", uri_replacement)
            continue

        try:
            match = re.match(uri_replacement["match"], http_request.uri)
            if match is None or len(match.groups()) != uri_replacement["matchlength"]:
                continue
            replaced_uri = re.sub(uri_replacement["match"], uri_replacement["replace"], http_request.uri)
        except (re.error, TypeError) as err:
            # A broken user supplied pattern must not abort the fence action.
            logging.warning("FENCE_GCE_URI_REPLACEMENTS has an unusable match or replace in %s: %s", uri_replacement, err)
            continue

        if re.match(googleapis_uri, replaced_uri) is None:
            logging.warning("FENCE_GCE_URI_REPLACEMENTS replace is not "
                "targeting googleapis.com, ignoring it: %s" % replaced_uri)
            continue

        logging.debug("Replacing googleapis uri %s with %s" % (http_request.uri, replaced_uri))
        http_request.uri = replaced_uri
        break

    return http_request
def get_nodes_list(conn, options):
    """Return the instances found in the configured zone(s).

    Zones are taken from ``--zone`` (comma separated). When that option is
    missing or empty, the zone comes from ``--plugzonemap`` for the current
    plug, or from ``get_zone`` if no map is present in ``options``.

    Args:
        conn: Compute API client exposing ``instances().list``.
        options: fence agent option dictionary; ``--project`` is read for
            every listing call.

    Returns:
        dict mapping instance id to a ``(name, status)`` tuple, where status
        is the value produced by ``translate_status``.
    """
    result = {}
    plug = options.get("--plug", "")
    zones = options.get("--zone", "")
    if not zones:
        if "--plugzonemap" in options:
            zones = options["--plugzonemap"][plug]
        else:
            zones = get_zone(conn, options, plug)
    try:
        for zone in zones.split(","):
            instance_list = retry_api_execute(options, conn.instances().list(
                project=options["--project"],
                zone=zone))
            # A zone without instances yields a response with no "items" key;
            # treat that as an empty zone instead of failing the whole listing
            # (get_zone and check_for_existing_operation guard the same key).
            for instance in instance_list.get("items", []):
                result[instance["id"]] = (instance["name"], translate_status(instance["status"]))
    except Exception as err:
        fail_fence_agent(options, "Failed: get_nodes_list: {}".format(str(err)))

    return result
def check_for_existing_operation(conn, options, instance, zone, operation_type):
    """Look for a RUNNING zone operation of the given type on an instance.

    Args:
        conn: Compute API client exposing ``zoneOperations().list``.
        options: fence agent option dictionary; ``--project`` is read unless
            ``--baremetalsolution`` is set.
        instance: instance name used to build the operation target link.
        zone: zone the instance lives in.
        operation_type: operation type to filter on (callers in this file
            pass "stop", "start" or "reset").

    Returns:
        ``False`` when ``--baremetalsolution`` is set, the first matching
        operation when one is reported, otherwise ``None``.
    """
    logging.debug("check_for_existing_operation")

    # There is no API for checking in progress operations
    if "--baremetalsolution" in options:
        return False

    project = options["--project"]
    query_filter = " AND ".join([
        '(targetLink = "{}")'.format(INSTANCE_LINK.format(project, zone, instance)),
        '(operationType = "{}")'.format(operation_type),
        '(status = "RUNNING")',
    ])

    # Only the first hit matters, so ask for a single result.
    request = conn.zoneOperations().list(
        project=project, zone=zone, filter=query_filter, maxResults=1)
    response = retry_api_execute(options, request)

    running = response["items"] if "items" in response else None
    if not running:
        return None

    logging.info("Existing %s operation found", operation_type)
    return running[0]
def set_instance_power_status(conn, options, instance, zone, action):
    """Stop ("off") or start ("on") a single instance and wait for completion.

    An already running operation of the matching type is reused instead of
    issuing a new request. With ``--earlyexit`` set, finding such an
    operation makes the function return immediately. Any other ``action``
    value is ignored. Every exception raised while talking to the API is
    reported through ``fail_fence_agent``.

    Args:
        conn: Compute API client exposing ``instances().stop`` / ``start``.
        options: fence agent option dictionary; ``--project`` is required.
        instance: name of the instance to act on.
        zone: zone the instance lives in.
        action: requested power action, "on" or "off".
    """
    logging.info("Setting power status of %s in zone %s", instance, zone)
    project = options["--project"]
    target = dict(project=project, zone=zone, instance=instance)

    try:
        if action == "off":
            logging.info("Issuing poweroff of %s in zone %s", instance, zone)
            pending = check_for_existing_operation(conn, options, instance, zone, "stop")
            if pending:
                if "--earlyexit" in options:
                    return
            else:
                pending = retry_api_execute(options, conn.instances().stop(**target))
            logging.info("Poweroff command completed, waiting for the operation to complete")
            if wait_for_operation(conn, options, zone, pending):
                logging.info("Poweroff of %s in zone %s complete", instance, zone)
        elif action == "on":
            logging.info("Issuing poweron of %s in zone %s", instance, zone)
            pending = check_for_existing_operation(conn, options, instance, zone, "start")
            if pending:
                if "--earlyexit" in options:
                    return
            else:
                pending = retry_api_execute(options, conn.instances().start(**target))
            if wait_for_operation(conn, options, zone, pending):
                logging.info("Poweron of %s in zone %s complete", instance, zone)
    except Exception as err:
        fail_fence_agent(options, "Failed: set_instance_power_status: {}".format(str(err)))
def power_cycle_instance(conn, options, instance, zone):
    """Reset a single instance and report success to the caller.

    A reset operation that is already running is reused instead of issuing
    a new one. With ``--earlyexit`` set the function returns as soon as a
    reset is known to be in flight, whether it was found or just issued,
    without waiting for it to finish.

    Args:
        conn: Compute API client exposing ``instances().reset``.
        options: fence agent option dictionary; ``--project`` is required.
        instance: name of the instance to reset.
        zone: zone the instance lives in.

    Returns:
        ``True`` once the reset was found, issued or waited for.

    Raises:
        Exception: any error from the API helpers is logged and re-raised.
    """
    logging.info("Issuing reset of %s in zone %s", instance, zone)
    project = options["--project"]

    try:
        pending = check_for_existing_operation(conn, options, instance, zone, "reset")
        if pending:
            # A reset is already in flight; optionally skip waiting for it.
            if "--earlyexit" in options:
                return True
        else:
            pending = retry_api_execute(
                options,
                conn.instances().reset(project=project, zone=zone, instance=instance))
            if pending and "--earlyexit" in options:
                logging.info("Reset command sent, returning early and not waiting for the operation to complete")
                return True

        logging.info("Reset command sent, waiting for the operation to complete")
        completed = wait_for_operation(conn, options, zone, pending)
        if completed:
            logging.info("Reset of %s in zone %s complete", instance, zone)
        return True
    except Exception as err:
        logging.exception("Failed: power_cycle")
        raise err
Raises: urlerror.HTTPError: raises when the GET request fails. """ logging.debug("get_metadata"); timeout = timeout or 60 metadata_url = os.path.join(METADATA_SERVER, metadata_key) params = urlparse.urlencode(params or {}) url = '%s?%s' % (metadata_url, params) request = urlrequest.Request(url, headers=METADATA_HEADERS) request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) return request_opener.open(request, timeout=timeout * 1.1).read().decode("utf-8") def define_new_opts(): all_opt["zone"] = { "getopt" : ":", "longopt" : "zone", "help" : "--zone=[name] Zone, e.g. us-central1-b", "shortdesc" : "Zone.", "required" : "0", "order" : 2 } all_opt["project"] = { "getopt" : ":", "longopt" : "project", "help" : "--project=[name] Project ID", "shortdesc" : "Project ID.", "required" : "0", "order" : 3 } all_opt["stackdriver-logging"] = { "getopt" : "", "longopt" : "stackdriver-logging", "help" : "--stackdriver-logging Enable Logging to Stackdriver", "shortdesc" : "Stackdriver-logging support.", "longdesc" : "If enabled IP failover logs will be posted to stackdriver logging.", "required" : "0", "order" : 4 } all_opt["baremetalsolution"] = { "getopt" : "", "longopt" : "baremetalsolution", "help" : "--baremetalsolution Enable on bare metal", "shortdesc" : "If enabled this is a bare metal offering from google.", "required" : "0", "order" : 5 } all_opt["apitimeout"] = { "getopt" : ":", "type" : "second", "longopt" : "apitimeout", "help" : "--apitimeout=[seconds] Timeout to use for API calls", "shortdesc" : "Timeout in seconds to use for API calls, default is 60.", "required" : "0", "default" : 60, "order" : 6 } all_opt["retries"] = { "getopt" : ":", "type" : "integer", "longopt" : "retries", "help" : "--retries=[retries] Number of retries on failure for API calls", "shortdesc" : "Number of retries on failure for API calls, default is 3.", "required" : "0", "default" : 3, "order" : 7 } all_opt["retrysleep"] = { "getopt" : ":", "type" : "second", "longopt" : 
"retrysleep", "help" : "--retrysleep=[seconds] Time to sleep between API retries", "shortdesc" : "Time to sleep in seconds between API retries, default is 5.", "required" : "0", "default" : 5, "order" : 8 } all_opt["serviceaccount"] = { "getopt" : ":", "longopt" : "serviceaccount", "help" : "--serviceaccount=[filename] Service account json file location e.g. serviceaccount=/somedir/service_account.json", "shortdesc" : "Service Account to use for authentication to the google cloud APIs.", "required" : "0", "order" : 9 } all_opt["plugzonemap"] = { "getopt" : ":", "longopt" : "plugzonemap", "help" : "--plugzonemap=[plugzonemap] Comma separated zone map when fencing multiple plugs", "shortdesc" : "Comma separated zone map when fencing multiple plugs.", "required" : "0", "order" : 10 } all_opt["proxyhost"] = { "getopt" : ":", "longopt" : "proxyhost", "help" : "--proxyhost=[proxy_host] The proxy host to use, if one is needed to access the internet (Example: 10.122.0.33)", "shortdesc" : "If a proxy is used for internet access, the proxy host should be specified.", "required" : "0", "order" : 11 } all_opt["proxyport"] = { "getopt" : ":", "type" : "integer", "longopt" : "proxyport", "help" : "--proxyport=[proxy_port] The proxy port to use, if one is needed to access the internet (Example: 3127)", "shortdesc" : "If a proxy is used for internet access, the proxy port should be specified.", "required" : "0", "order" : 12 } all_opt["earlyexit"] = { "getopt" : "", "longopt" : "earlyexit", - "help" : "--earlyexit Return early if reset is already in progress", - "shortdesc" : "If an existing reset operation is detected, the fence agent will return before the operation completes with a 0 return code.", + "help" : "--earlyexit Return early from set_power_status if reset is already in progress, if power_cycle then do not wait for the reset", + "shortdesc" : "If running set_power_status and existing reset operation is detected or runnning power_cycle, the fence agent will return 
before the operation completes with a 0 return code.", "required" : "0", "order" : 13 } all_opt["warntimeout"] = { "getopt" : ":", "type" : "second", "longopt" : "warntimeout", "help" : "--warntimeout=[warn_timeout] Timeout seconds before logging a warning and returning a 0 status code", "shortdesc" : "If the operation is not completed within the timeout, the cluster operations are allowed to continue.", "required" : "0", "order" : 14 } all_opt["errortimeout"] = { "getopt" : ":", "type" : "second", "longopt" : "errortimeout", "help" : "--errortimeout=[error_timeout] Timeout seconds before failing and returning a non-zero status code", "shortdesc" : "If the operation is not completed within the timeout, cluster is notified of the operation failure.", "required" : "0", "order" : 15 } all_opt["runonwarn"] = { "getopt" : ":", "longopt" : "runonwarn", "help" : "--runonwarn=[run_on_warn] If a timeout occurs and warning is generated, run the supplied command", "shortdesc" : "If a timeout would occur while running the agent, then the supplied command is run.", "required" : "0", "order" : 16 } all_opt["runonfail"] = { "getopt" : ":", "longopt" : "runonfail", "help" : "--runonfail=[run_on_fail] If a failure occurs, run the supplied command", "shortdesc" : "If a failure would occur while running the agent, then the supplied command is run.", "required" : "0", "order" : 17 } def main(): conn = None device_opt = ["port", "no_password", "zone", "project", "stackdriver-logging", "method", "baremetalsolution", "apitimeout", "retries", "retrysleep", "serviceaccount", "plugzonemap", "proxyhost", "proxyport", "earlyexit", "warntimeout", "errortimeout", "runonwarn", "runonfail"] atexit.register(atexit_handler) define_new_opts() all_opt["power_timeout"]["default"] = "60" all_opt["method"]["default"] = "cycle" all_opt["method"]["help"] = "-m, --method=[method] Method to fence (onoff|cycle) (Default: cycle)" options = check_input(device_opt, process_input(device_opt)) docs = {} 
docs["shortdesc"] = "Fence agent for GCE (Google Cloud Engine)" docs["longdesc"] = "fence_gce is an I/O Fencing agent for GCE (Google Cloud " \ "Engine). It uses the googleapiclient library to connect to GCE.\n" \ "googleapiclient can be configured with Google SDK CLI or by " \ "executing 'gcloud auth application-default login'.\n" \ "For instructions see: https://cloud.google.com/compute/docs/tutorials/python-guide" docs["vendorurl"] = "http://cloud.google.com" show_docs(options, docs) run_delay(options) # Prepare logging if options.get('--verbose') is None: logging.getLogger('googleapiclient').setLevel(logging.ERROR) logging.getLogger('oauth2client').setLevel(logging.ERROR) if options.get('--stackdriver-logging') is not None and options.get('--plug'): try: import google.cloud.logging.handlers client = google.cloud.logging.Client() handler = google.cloud.logging.handlers.CloudLoggingHandler(client, name=options['--plug']) handler.setLevel(logging.INFO) formatter = logging.Formatter('gcp:stonith "%(message)s"') handler.setFormatter(formatter) root_logger = logging.getLogger() if options.get('--verbose') is None: root_logger.setLevel(logging.INFO) root_logger.addHandler(handler) except ImportError: logging.error('Couldn\'t import google.cloud.logging, ' 'disabling Stackdriver-logging support') # if apitimeout is defined we set the socket timeout, if not we keep the # socket default which is 60s if options.get("--apitimeout"): socket.setdefaulttimeout(options["--apitimeout"]) # Prepare cli try: serviceaccount = options.get("--serviceaccount") if serviceaccount: scope = ['https://www.googleapis.com/auth/cloud-platform'] logging.debug("using credentials from service account") try: from google.oauth2.service_account import Credentials as ServiceAccountCredentials credentials = ServiceAccountCredentials.from_service_account_file(filename=serviceaccount, scopes=scope) except ImportError: from oauth2client.service_account import ServiceAccountCredentials credentials = 
ServiceAccountCredentials.from_json_keyfile_name(serviceaccount, scope) else: try: from googleapiclient import _auth credentials = _auth.default_credentials(); except: credentials = GoogleCredentials.get_application_default() logging.debug("using application default credentials") if options.get("--proxyhost") and options.get("--proxyport"): proxy_info = httplib2.ProxyInfo( proxy_type=socks.PROXY_TYPE_HTTP, proxy_host=options.get("--proxyhost"), proxy_port=int(options.get("--proxyport"))) http = credentials.authorize(httplib2.Http(proxy_info=proxy_info)) conn = googleapiclient.discovery.build( 'compute', 'v1', http=http, cache_discovery=False) else: conn = googleapiclient.discovery.build( 'compute', 'v1', credentials=credentials, cache_discovery=False) except SSLError as err: fail_fence_agent(options, "Failed: Create GCE compute v1 connection: {}\n\nThis might be caused by old versions of httplib2.".format(str(err))) except Exception as err: fail_fence_agent(options, "Failed: Create GCE compute v1 connection: {}".format(str(err))) # Get project and zone if not options.get("--project"): try: options["--project"] = get_metadata('project/project-id') except Exception as err: fail_fence_agent(options, "Failed retrieving GCE project. 
Please provide --project option: {}".format(str(err))) try: image = get_metadata('instance/image') options["image"] = image[image.rindex('/')+1:] except Exception as err: options["image"] = "unknown" if "--baremetalsolution" in options: options["--zone"] = "none" # Populates zone automatically if missing from the command zones = [] if not "--zone" in options else options["--zone"].split(",") options["--plugzonemap"] = {} if "--plug" in options: for i, instance in enumerate(options["--plug"].split(",")): if len(zones) == 1: # If only one zone is specified, use it across all plugs options["--plugzonemap"][instance] = zones[0] continue if len(zones) - 1 >= i: # If we have enough zones specified with the --zone flag use the zone at # the same index as the plug options["--plugzonemap"][instance] = zones[i] continue try: # In this case we do not have a zone specified so we attempt to detect it options["--plugzonemap"][instance] = get_zone(conn, options, instance) except Exception as err: fail_fence_agent(options, "Failed retrieving GCE zone. Please provide --zone option: {}".format(str(err))) # Operate the fencing device result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list, power_cycle) sys.exit(result) if __name__ == "__main__": main() diff --git a/tests/data/metadata/fence_gce.xml b/tests/data/metadata/fence_gce.xml index 2a89b16c..21ecaae9 100644 --- a/tests/data/metadata/fence_gce.xml +++ b/tests/data/metadata/fence_gce.xml @@ -1,213 +1,213 @@ fence_gce is an I/O Fencing agent for GCE (Google Cloud Engine). It uses the googleapiclient library to connect to GCE. googleapiclient can be configured with Google SDK CLI or by executing 'gcloud auth application-default login'. 
For instructions see: https://cloud.google.com/compute/docs/tutorials/python-guide http://cloud.google.com Fencing action Method to fence Physical plug number on device, UUID or identification of machine Physical plug number on device, UUID or identification of machine Zone. Project ID. Stackdriver-logging support. Stackdriver-logging support. If enabled this is a bare metal offering from google. Timeout in seconds to use for API calls, default is 60. Number of retries on failure for API calls, default is 3. Time to sleep in seconds between API retries, default is 5. Service Account to use for authentication to the google cloud APIs. Comma separated zone map when fencing multiple plugs. If a proxy is used for internet access, the proxy host should be specified. If a proxy is used for internet access, the proxy port should be specified. - If an existing reset operation is detected, the fence agent will return before the operation completes with a 0 return code. + If running set_power_status and existing reset operation is detected or runnning power_cycle, the fence agent will return before the operation completes with a 0 return code. If the operation is not completed within the timeout, the cluster operations are allowed to continue. If the operation is not completed within the timeout, cluster is notified of the operation failure. If a timeout would occur while running the agent, then the supplied command is run. If a failure would occur while running the agent, then the supplied command is run. Disable logging to stderr. Does not affect --verbose or --debug-file or logging to syslog. Verbose mode. Multiple -v flags can be stacked on the command line (e.g., -vvv) to increase verbosity. Level of debugging detail in output. Defaults to the number of --verbose flags specified on the command line, or to 1 if verbose=1 in a stonith device configuration (i.e., on stdin). 
Write debug information to given file Write debug information to given file Display version information and exit Display help and exit Separator for plug parameter when specifying more than 1 plug Separator for CSV created by 'list' operation Wait X seconds before fencing is started Disable timeout (true/false) (default: true when run from Pacemaker 2.0+) Wait X seconds for cmd prompt after login Test X seconds for status change after ON/OFF Wait X seconds after issuing ON/OFF Wait X seconds for cmd prompt after issuing command Sleep X seconds between status calls during a STONITH action Count of attempts to retry power on