diff --git a/agents/aws/fence_aws.py b/agents/aws/fence_aws.py index e2a2391f..8322df9e 100644 --- a/agents/aws/fence_aws.py +++ b/agents/aws/fence_aws.py @@ -1,204 +1,208 @@ #!@PYTHON@ -tt import sys, re import logging import atexit sys.path.append("@FENCEAGENTSLIBDIR@") from fencing import * from fencing import fail, fail_usage, run_delay, EC_STATUS, SyslogLibHandler import requests -import boto3 from requests import HTTPError -from botocore.exceptions import ConnectionError, ClientError, EndpointConnectionError, NoRegionError + +try: + import boto3 + from botocore.exceptions import ConnectionError, ClientError, EndpointConnectionError, NoRegionError +except ImportError: + pass logger = logging.getLogger("fence_aws") logger.propagate = False logger.setLevel(logging.INFO) logger.addHandler(SyslogLibHandler()) logging.getLogger('botocore.vendored').propagate = False def get_instance_id(): try: token = requests.put('http://169.254.169.254/latest/api/token', headers={"X-aws-ec2-metadata-token-ttl-seconds" : "21600"}).content.decode("UTF-8") r = requests.get('http://169.254.169.254/latest/meta-data/instance-id', headers={"X-aws-ec2-metadata-token" : token}).content.decode("UTF-8") return r except HTTPError as http_err: logger.error('HTTP error occurred while trying to access EC2 metadata server: %s', http_err) except Exception as err: logger.error('A fatal error occurred while trying to access EC2 metadata server: %s', err) return None def get_nodes_list(conn, options): logger.info("Starting monitor operation") result = {} try: for instance in conn.instances.all(): result[instance.id] = ("", None) except ClientError: fail_usage("Failed: Incorrect Access Key or Secret Key.") except EndpointConnectionError: fail_usage("Failed: Incorrect Region.") except ConnectionError as e: fail_usage("Failed: Unable to connect to AWS: " + str(e)) except Exception as e: logger.error("Failed to get node list: %s", e) logger.debug("Monitor operation OK: %s",result) return result def get_power_status(conn, options): logger.debug("Starting status operation") try: instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [options["--plug"]]}]) state = list(instance)[0].state["Name"] logger.info("Status operation for EC2 instance %s returned state: %s",options["--plug"],state.upper()) if state == "running": return "on" elif state == "stopped": return "off" else: return "unknown" except ClientError: fail_usage("Failed: Incorrect Access Key or Secret Key.") except EndpointConnectionError: fail_usage("Failed: Incorrect Region.") except IndexError: fail(EC_STATUS) except Exception as e: logging.error("Failed to get power status: %s", e) fail(EC_STATUS) def get_self_power_status(conn, instance_id): try: instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [instance_id]}]) state = list(instance)[0].state["Name"] if state == "running": logging.debug("Captured my (%s) state and it %s - returning OK - Proceeding with fencing",instance_id,state.upper()) return "ok" else: logging.debug("Captured my (%s) state it is %s - returning Alert - Unable to fence other nodes",instance_id,state.upper()) return "alert" except ClientError: fail_usage("Failed: Incorrect Access Key or Secret Key.") except EndpointConnectionError: fail_usage("Failed: Incorrect Region.") except IndexError: return "fail" def set_power_status(conn, options): my_instance = get_instance_id() try: if (options["--action"]=="off"): if (get_self_power_status(conn,my_instance) == "ok"): conn.instances.filter(InstanceIds=[options["--plug"]]).stop(Force=True) logger.info("Called StopInstance API call for %s", options["--plug"]) else: logger.info("Skipping fencing as instance is not in running status") elif (options["--action"]=="on"): conn.instances.filter(InstanceIds=[options["--plug"]]).start() except Exception as e: logger.error("Failed to power %s %s: %s", \ options["--action"], options["--plug"], e) def define_new_opts(): all_opt["region"] = { "getopt" : "r:", "longopt" : "region", "help" : "-r, --region=[region] Region, e.g. us-east-1", "shortdesc" : "Region.", "required" : "0", "order" : 2 } all_opt["access_key"] = { "getopt" : "a:", "longopt" : "access-key", "help" : "-a, --access-key=[key] Access Key", "shortdesc" : "Access Key.", "required" : "0", "order" : 3 } all_opt["secret_key"] = { "getopt" : "s:", "longopt" : "secret-key", "help" : "-s, --secret-key=[key] Secret Key", "shortdesc" : "Secret Key.", "required" : "0", "order" : 4 } all_opt["boto3_debug"] = { "getopt" : "b:", "longopt" : "boto3_debug", "help" : "-b, --boto3_debug=[option] Boto3 and Botocore library debug logging", "shortdesc": "Boto Lib debug", "required": "0", "default": "False", "order": 5 } # Main agent method def main(): conn = None device_opt = ["port", "no_password", "region", "access_key", "secret_key", "boto3_debug"] atexit.register(atexit_handler) define_new_opts() all_opt["power_timeout"]["default"] = "60" options = check_input(device_opt, process_input(device_opt)) docs = {} docs["shortdesc"] = "Fence agent for AWS (Amazon Web Services)" docs["longdesc"] = "fence_aws is an I/O Fencing agent for AWS (Amazon Web\ Services). It uses the boto3 library to connect to AWS.\ \n.P\n\ boto3 can be configured with AWS CLI or by creating ~/.aws/credentials.\n\ For instructions see: https://boto3.readthedocs.io/en/latest/guide/quickstart.html#configuration" docs["vendorurl"] = "http://www.amazon.com" show_docs(options, docs) run_delay(options) if options.get("--verbose") is not None: lh = logging.FileHandler('/var/log/fence_aws_debug.log') logger.addHandler(lh) lhf = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') lh.setFormatter(lhf) logger.setLevel(logging.DEBUG) if options["--boto3_debug"].lower() not in ["1", "yes", "on", "true"]: boto3.set_stream_logger('boto3',logging.INFO) boto3.set_stream_logger('botocore',logging.CRITICAL) logging.getLogger('botocore').propagate = False logging.getLogger('boto3').propagate = False else: log_format = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s') logging.getLogger('botocore').propagate = False logging.getLogger('boto3').propagate = False fdh = logging.FileHandler('/var/log/fence_aws_boto3.log') fdh.setFormatter(log_format) logging.getLogger('boto3').addHandler(fdh) logging.getLogger('botocore').addHandler(fdh) logging.debug("Boto debug level is %s and sending debug info to /var/log/fence_aws_boto3.log", options["--boto3_debug"]) region = options.get("--region") access_key = options.get("--access-key") secret_key = options.get("--secret-key") try: conn = boto3.resource('ec2', region_name=region, aws_access_key_id=access_key, aws_secret_access_key=secret_key) except Exception as e: fail_usage("Failed: Unable to connect to AWS: " + str(e)) # Operate the fencing device result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list) sys.exit(result) if __name__ == "__main__": main() diff --git a/agents/gce/fence_gce.py b/agents/gce/fence_gce.py index 84cf3634..d69acf4e 100644 --- a/agents/gce/fence_gce.py +++ b/agents/gce/fence_gce.py @@ -1,410 +1,410 @@ #!@PYTHON@ -tt # # Requires the googleapiclient and oauth2client # RHEL 7.x: google-api-python-client==1.6.7 python-gflags==2.0 pyasn1==0.4.8 rsa==3.4.2 # RHEL 8.x: nothing additional needed # SLES 12.x: python-google-api-python-client python-oauth2client python-oauth2client-gce # SLES 15.x: python3-google-api-python-client python3-oauth2client # import atexit import logging import json import re import os import socket import sys import time if sys.version_info >= (3, 0): # Python 3 imports. import urllib.parse as urlparse import urllib.request as urlrequest else: # Python 2 imports. import urllib as urlparse import urllib2 as urlrequest sys.path.append("@FENCEAGENTSLIBDIR@") -import googleapiclient.discovery from fencing import fail_usage, run_delay, all_opt, atexit_handler, check_input, process_input, show_docs, fence_action try: + import googleapiclient.discovery from oauth2client.client import GoogleCredentials from oauth2client.service_account import ServiceAccountCredentials except: pass METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/' METADATA_HEADERS = {'Metadata-Flavor': 'Google'} # # Will use baremetalsolution setting or the environment variable # FENCE_GCE_URI_REPLACEMENTS to replace the uri for calls to *.googleapis.com. # def replace_api_uri(options, http_request): uri_replacements = [] # put any env var replacements first, then baremetalsolution if in options if "FENCE_GCE_URI_REPLACEMENTS" in os.environ: logging.debug("FENCE_GCE_URI_REPLACEMENTS environment variable exists") env_uri_replacements = os.environ["FENCE_GCE_URI_REPLACEMENTS"] try: uri_replacements_json = json.loads(env_uri_replacements) if isinstance(uri_replacements_json, list): uri_replacements = uri_replacements_json else: logging.warning("FENCE_GCE_URI_REPLACEMENTS exists, but is not a JSON List") except ValueError as e: logging.warning("FENCE_GCE_URI_REPLACEMENTS exists but is not valid JSON") if "--baremetalsolution" in options: uri_replacements.append( { "matchlength": 4, "match": "https://compute.googleapis.com/compute/v1/projects/(.*)/zones/(.*)/instances/(.*)/reset(.*)", "replace": "https://baremetalsolution.googleapis.com/v1alpha1/projects/\\1/locations/\\2/instances/\\3:resetInstance\\4" }) for uri_replacement in uri_replacements: # each uri_replacement should have matchlength, match, and replace if "matchlength" not in uri_replacement or "match" not in uri_replacement or "replace" not in uri_replacement: logging.warning("FENCE_GCE_URI_REPLACEMENTS missing matchlength, match, or replace in %s" % uri_replacement) continue match = re.match(uri_replacement["match"], http_request.uri) if match is None or len(match.groups()) != uri_replacement["matchlength"]: continue replaced_uri = re.sub(uri_replacement["match"], uri_replacement["replace"], http_request.uri) match = re.match("https:\/\/.*.googleapis.com", replaced_uri) if match is None or match.start() != 0: logging.warning("FENCE_GCE_URI_REPLACEMENTS replace is not " "targeting googleapis.com, ignoring it: %s" % replaced_uri) continue logging.debug("Replacing googleapis uri %s with %s" % (http_request.uri, replaced_uri)) http_request.uri = replaced_uri break return http_request def retry_api_execute(options, http_request): replaced_http_request = replace_api_uri(options, http_request) retries = 3 if options.get("--retries"): retries = int(options.get("--retries")) retry_sleep = 5 if options.get("--retrysleep"): retry_sleep = int(options.get("--retrysleep")) retry = 0 current_err = None while retry <= retries: if retry > 0: time.sleep(retry_sleep) try: return replaced_http_request.execute() except Exception as err: current_err = err logging.warning("Could not execute api call to: %s, retry: %s, " "err: %s" % (replaced_http_request.uri, retry, str(err))) retry += 1 raise current_err def translate_status(instance_status): "Returns on | off | unknown." if instance_status == "RUNNING": return "on" elif instance_status == "TERMINATED": return "off" return "unknown" def get_nodes_list(conn, options): result = {} try: instanceList = retry_api_execute(options, conn.instances().list( project=options["--project"], zone=options["--zone"])) for instance in instanceList["items"]: result[instance["id"]] = (instance["name"], translate_status(instance["status"])) except Exception as err: fail_usage("Failed: get_nodes_list: {}".format(str(err))) return result def get_power_status(conn, options): logging.debug("get_power_status") # if this is bare metal we need to just send back the opposite of the # requested action: if on send off, if off send on if "--baremetalsolution" in options: if options.get("--action") == "on": return "off" else: return "on" try: instance = retry_api_execute(options, conn.instances().get( project=options["--project"], zone=options["--zone"], instance=options["--plug"])) return translate_status(instance["status"]) except Exception as err: fail_usage("Failed: get_power_status: {}".format(str(err))) def wait_for_operation(conn, options, operation): if 'name' not in operation: logging.warning('Cannot wait for operation to complete, the' ' requested operation will continue asynchronously') return project = options["--project"] zone = options["--zone"] while True: result = retry_api_execute(options, conn.zoneOperations().get( project=project, zone=zone, operation=operation['name'])) if result['status'] == 'DONE': if 'error' in result: raise Exception(result['error']) return time.sleep(1) def set_power_status(conn, options): logging.debug("set_power_status"); try: if options["--action"] == "off": logging.info("Issuing poweroff of %s in zone %s" % (options["--plug"], options["--zone"])) operation = retry_api_execute(options, conn.instances().stop( project=options["--project"], zone=options["--zone"], instance=options["--plug"])) logging.info("Poweroff command completed, waiting for the operation to complete") wait_for_operation(conn, options, operation) logging.info("Poweroff of %s in zone %s complete" % (options["--plug"], options["--zone"])) elif options["--action"] == "on": logging.info("Issuing poweron of %s in zone %s" % (options["--plug"], options["--zone"])) operation = retry_api_execute(options, conn.instances().start( project=options["--project"], zone=options["--zone"], instance=options["--plug"])) wait_for_operation(conn, options, operation) logging.info("Poweron of %s in zone %s complete" % (options["--plug"], options["--zone"])) except Exception as err: fail_usage("Failed: set_power_status: {}".format(str(err))) def power_cycle(conn, options): logging.debug("power_cycle"); try: logging.info('Issuing reset of %s in zone %s' % (options["--plug"], options["--zone"])) operation = retry_api_execute(options, conn.instances().reset( project=options["--project"], zone=options["--zone"], instance=options["--plug"])) logging.info("Reset command completed, waiting for the operation to complete") wait_for_operation(conn, options, operation) logging.info('Reset of %s in zone %s complete' % (options["--plug"], options["--zone"])) return True except Exception as err: logging.error("Failed: power_cycle: {}".format(str(err))) return False def get_zone(conn, options): logging.debug("get_zone"); project = options['--project'] instance = options['--plug'] fl = 'name="%s"' % instance request = replace_api_uri(options, conn.instances().aggregatedList(project=project, filter=fl)) while request is not None: response = request.execute() zones = response.get('items', {}) for zone in zones.values(): for inst in zone.get('instances', []): if inst['name'] == instance: return inst['zone'].split("/")[-1] request = replace_api_uri(options, conn.instances().aggregatedList_next( previous_request=request, previous_response=response)) raise Exception("Unable to find instance %s" % (instance)) def get_metadata(metadata_key, params=None, timeout=None): """Performs a GET request with the metadata headers. Args: metadata_key: string, the metadata to perform a GET request on. params: dictionary, the query parameters in the GET request. timeout: int, timeout in seconds for metadata requests. Returns: HTTP response from the GET request. Raises: urlerror.HTTPError: raises when the GET request fails. """ logging.debug("get_metadata"); timeout = timeout or 60 metadata_url = os.path.join(METADATA_SERVER, metadata_key) params = urlparse.urlencode(params or {}) url = '%s?%s' % (metadata_url, params) request = urlrequest.Request(url, headers=METADATA_HEADERS) request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) return request_opener.open(request, timeout=timeout * 1.1).read().decode("utf-8") def define_new_opts(): all_opt["zone"] = { "getopt" : ":", "longopt" : "zone", "help" : "--zone=[name] Zone, e.g. us-central1-b", "shortdesc" : "Zone.", "required" : "0", "order" : 2 } all_opt["project"] = { "getopt" : ":", "longopt" : "project", "help" : "--project=[name] Project ID", "shortdesc" : "Project ID.", "required" : "0", "order" : 3 } all_opt["stackdriver-logging"] = { "getopt" : "", "longopt" : "stackdriver-logging", "help" : "--stackdriver-logging Enable Logging to Stackdriver", "shortdesc" : "Stackdriver-logging support.", "longdesc" : "If enabled IP failover logs will be posted to stackdriver logging.", "required" : "0", "order" : 4 } all_opt["baremetalsolution"] = { "getopt" : "", "longopt" : "baremetalsolution", "help" : "--baremetalsolution Enable on bare metal", "shortdesc" : "If enabled this is a bare metal offering from google.", "required" : "0", "order" : 5 } all_opt["apitimeout"] = { "getopt" : ":", "type" : "second", "longopt" : "apitimeout", "help" : "--apitimeout=[seconds] Timeout to use for API calls", "shortdesc" : "Timeout in seconds to use for API calls, default is 60.", "required" : "0", "default" : 60, "order" : 6 } all_opt["retries"] = { "getopt" : ":", "type" : "integer", "longopt" : "retries", "help" : "--retries=[retries] Number of retries on failure for API calls", "shortdesc" : "Number of retries on failure for API calls, default is 3.", "required" : "0", "default" : 3, "order" : 7 } all_opt["retrysleep"] = { "getopt" : ":", "type" : "second", "longopt" : "retrysleep", "help" : "--retrysleep=[seconds] Time to sleep between API retries", "shortdesc" : "Time to sleep in seconds between API retries, default is 5.", "required" : "0", "default" : 5, "order" : 8 } all_opt["serviceaccount"] = { "getopt" : ":", "longopt" : "serviceaccount", "help" : "--serviceaccount=[filename] Service account json file location e.g. serviceaccount=/somedir/service_account.json", "shortdesc" : "Service Account to use for authentication to the google cloud APIs.", "required" : "0", "order" : 9 } def main(): conn = None device_opt = ["port", "no_password", "zone", "project", "stackdriver-logging", "method", "baremetalsolution", "apitimeout", "retries", "retrysleep", "serviceaccount"] atexit.register(atexit_handler) define_new_opts() all_opt["power_timeout"]["default"] = "60" options = check_input(device_opt, process_input(device_opt)) docs = {} docs["shortdesc"] = "Fence agent for GCE (Google Cloud Engine)" docs["longdesc"] = "fence_gce is an I/O Fencing agent for GCE (Google Cloud " \ "Engine). It uses the googleapiclient library to connect to GCE.\n" \ "googleapiclient can be configured with Google SDK CLI or by " \ "executing 'gcloud auth application-default login'.\n" \ "For instructions see: https://cloud.google.com/compute/docs/tutorials/python-guide" docs["vendorurl"] = "http://cloud.google.com" show_docs(options, docs) run_delay(options) # Prepare logging if options.get('--verbose') is None: logging.getLogger('googleapiclient').setLevel(logging.ERROR) logging.getLogger('oauth2client').setLevel(logging.ERROR) if options.get('--stackdriver-logging') is not None and options.get('--plug'): try: import google.cloud.logging.handlers client = google.cloud.logging.Client() handler = google.cloud.logging.handlers.CloudLoggingHandler(client, name=options['--plug']) handler.setLevel(logging.INFO) formatter = logging.Formatter('gcp:stonith "%(message)s"') handler.setFormatter(formatter) root_logger = logging.getLogger() if options.get('--verbose') is None: root_logger.setLevel(logging.INFO) root_logger.addHandler(handler) except ImportError: logging.error('Couldn\'t import google.cloud.logging, ' 'disabling Stackdriver-logging support') # if apitimeout is defined we set the socket timeout, if not we keep the # socket default which is 60s if options.get("--apitimeout"): socket.setdefaulttimeout(options["--apitimeout"]) # Prepare cli try: if options.get("--serviceaccount"): credentials = ServiceAccountCredentials.from_json_keyfile_name(options.get("--serviceaccount")) logging.debug("using credentials from service account") else: credentials = GoogleCredentials.get_application_default() logging.debug("using application default credentials") conn = googleapiclient.discovery.build( 'compute', 'v1', credentials=credentials, cache_discovery=False) except Exception as err: fail_usage("Failed: Create GCE compute v1 connection: {}".format(str(err))) # Get project and zone if not options.get("--project"): try: options["--project"] = get_metadata('project/project-id') except Exception as err: fail_usage("Failed retrieving GCE project. Please provide --project option: {}".format(str(err))) if "--baremetalsolution" in options: options["--zone"] = "none" if not options.get("--zone"): try: options["--zone"] = get_zone(conn, options) except Exception as err: fail_usage("Failed retrieving GCE zone. Please provide --zone option: {}".format(str(err))) # Operate the fencing device result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list, power_cycle) sys.exit(result) if __name__ == "__main__": main()