Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/agents/aws_vpc_net/fence_aws_vpc_net.py b/agents/aws_vpc_net/fence_aws_vpc_net.py
index e1415c52..cf61f2d7 100644
--- a/agents/aws_vpc_net/fence_aws_vpc_net.py
+++ b/agents/aws_vpc_net/fence_aws_vpc_net.py
@@ -1,864 +1,980 @@
#!@PYTHON@ -tt
import sys, re
import json
import atexit
import logging
import time
import requests
sys.path.append("@FENCEAGENTSLIBDIR@")
from fencing import *
from fencing import (
run_delay,
fail,
fail_usage,
EC_STATUS,
EC_GENERIC_ERROR,
SyslogLibHandler
)
try:
- import boto3
- from botocore.exceptions import ConnectionError, ClientError, EndpointConnectionError, NoRegionError
+ import boto3
+ from botocore.exceptions import ConnectionError, ClientError, EndpointConnectionError, NoRegionError
except ImportError:
- pass
+ logger.error("Unable to import boto3 module. Please install boto3: pip install boto3")
+ sys.exit(EC_GENERIC_ERROR)
# Logger configuration
logger = logging.getLogger()
logger.propagate = False
logger.setLevel(logging.INFO)
logger.addHandler(SyslogLibHandler())
logging.getLogger('botocore.vendored').propagate = False
status = {
- "running": "on",
- "stopped": "off",
- "pending": "unknown",
- "stopping": "unknown",
- "shutting-down": "unknown",
- "terminated": "unknown"
+ "running": "on",
+ "stopped": "off",
+ "pending": "unknown",
+ "stopping": "unknown",
+ "shutting-down": "unknown",
+ "terminated": "unknown"
}
def get_power_status(conn, options):
- logger.debug("Starting status operation")
- try:
- instance_id = options["--plug"]
- ec2_client = conn.meta.client
-
- # Get the lastfence tag first
- lastfence_response = ec2_client.describe_tags(
- Filters=[
- {"Name": "resource-id", "Values": [instance_id]},
- {"Name": "key", "Values": ["lastfence"]}
- ]
- )
-
- if not lastfence_response["Tags"]:
- logger.debug("No lastfence tag found for instance %s - instance is not fenced", instance_id)
- return "on"
-
- lastfence_timestamp = lastfence_response["Tags"][0]["Value"]
-
- # Check for backup tags with pattern Original_SG_Backup_{instance_id}_*
- response = ec2_client.describe_tags(
- Filters=[
- {"Name": "resource-id", "Values": [instance_id]},
- {"Name": "key", "Values": [f"Original_SG_Backup_{instance_id}*"]}
- ]
- )
-
- if not response["Tags"]:
- logger.debug("No backup tags found for instance %s - instance is not fenced", instance_id)
- return "on"
-
- # Loop through backup tags to find matching timestamp
- for tag in response["Tags"]:
- try:
- backup_data = json.loads(tag["Value"])
- backup_timestamp = backup_data.get("t") # Using shortened timestamp field
-
- if not backup_timestamp:
- logger.debug("No timestamp found in backup data for tag %s", tag["Key"])
- continue
-
- # Validate timestamps match
- if str(backup_timestamp) == str(lastfence_timestamp):
- logger.debug("Found matching backup tag %s - instance is fenced", tag["Key"])
- return "off"
-
- except (json.JSONDecodeError, KeyError) as e:
- logger.error(f"Failed to parse backup data for tag {tag['Key']}: {str(e)}")
- continue
-
- logger.debug("No backup tags with matching timestamp found - instance is not fenced")
- return "on"
-
- except ClientError:
- fail_usage("Failed: Incorrect Access Key or Secret Key.")
- except EndpointConnectionError:
- fail_usage("Failed: Incorrect Region.")
- except IndexError:
- fail(EC_STATUS)
- except Exception as e:
- logger.error("Failed to get power status: %s", e)
- fail(EC_STATUS)
+ logger.debug("Starting status operation")
+ try:
+ instance_id = options["--plug"]
+ ec2_client = conn.meta.client
+
+ # Get the lastfence tag first
+ lastfence_response = ec2_client.describe_tags(
+ Filters=[
+ {"Name": "resource-id", "Values": [instance_id]},
+ {"Name": "key", "Values": ["lastfence"]}
+ ]
+ )
+
+ # Helper function to check if security groups have been modified
+ def check_sg_modifications():
+ try:
+ state, _, interfaces = get_instance_details(ec2_client, instance_id)
+ if state == "running": # Only check SGs if instance is running
+ sg_to_remove = options.get("--secg", "").split(",") if options.get("--secg") else []
+ if sg_to_remove:
+ # Check if all interfaces have had their security groups modified
+ all_interfaces_fenced = True
+ for interface in interfaces:
+ current_sgs = interface["SecurityGroups"]
+ if "--invert-sg-removal" in options:
+ # In keep_only mode, check if interface only has the specified groups
+ if sorted(current_sgs) != sorted(sg_to_remove):
+ logger.debug(f"Interface {interface['NetworkInterfaceId']} still has different security groups")
+ all_interfaces_fenced = False
+ break
+ else:
+ # In remove mode, check if specified groups were removed
+ if any(sg in current_sgs for sg in sg_to_remove):
+ logger.debug(f"Interface {interface['NetworkInterfaceId']} still has security groups that should be removed")
+ all_interfaces_fenced = False
+ break
+
+ if all_interfaces_fenced:
+ logger.debug("All interfaces have had their security groups successfully modified - considering instance fenced")
+ return True
+ except Exception as e:
+ logger.debug("Failed to check security group modifications: %s", e)
+ return False
+
+ # If --ignore-tag-write-failure is set, prioritize checking SG modifications
+ if "--ignore-tag-write-failure" in options:
+ logger.debug("--ignore-tag-write-failure is set, checking security group modifications first")
+ if check_sg_modifications():
+ logger.info("All interfaces are properly fenced based on security group state, ignoring tag state")
+ return "off"
+ logger.debug("Not all interfaces are fenced, proceeding with tag checks")
+ # Only proceed with tag checks if we haven't determined state from SG modifications
+
+ try:
+ # If no lastfence tag exists, instance is not fenced
+ if not lastfence_response["Tags"]:
+ logger.debug("No lastfence tag found for instance %s - instance is not fenced", instance_id)
+ return "on"
+
+ lastfence_timestamp = lastfence_response["Tags"][0]["Value"]
+ except Exception as e:
+ if "--ignore-tag-write-failure" in options:
+ logger.warning(f"Failed to check lastfence tag but continuing due to ignore-tag-write-failure: {str(e)}")
+ # If we can't read tags but --ignore-tag-write-failure is set, rely on SG state
+ return "on" # Default to "on" to allow fence operation to proceed
+ raise
+
+ # Check for backup tags with pattern Original_SG_Backup_{instance_id}_*
+ response = ec2_client.describe_tags(
+ Filters=[
+ {"Name": "resource-id", "Values": [instance_id]},
+ {"Name": "key", "Values": [f"Original_SG_Backup_{instance_id}*"]}
+ ]
+ )
+
+ # If no backup tags exist, instance is not fenced (unless --ignore-tag-write-failure handled above)
+ if not response["Tags"]:
+ logger.debug("No backup tags found for instance %s - instance is not fenced", instance_id)
+ return "on"
+
+ # Loop through backup tags to find matching timestamp
+ for tag in response["Tags"]:
+ try:
+ backup_data = json.loads(tag["Value"])
+ backup_timestamp = backup_data.get("t") # Using shortened timestamp field
+
+ if not backup_timestamp:
+ logger.debug("No timestamp found in backup data for tag %s", tag["Key"])
+ continue
+
+ # Validate timestamps match
+ if str(backup_timestamp) == str(lastfence_timestamp):
+ # Check if security groups were actually modified to confirm fencing
+ try:
+ state, _, interfaces = get_instance_details(ec2_client, instance_id)
+ if state == "running": # Only check SGs if instance is running
+ sg_to_remove = options.get("--secg", "").split(",") if options.get("--secg") else []
+ if sg_to_remove:
+ # Check if all interfaces have had their security groups modified
+ all_interfaces_fenced = True
+ for interface in interfaces:
+ current_sgs = interface["SecurityGroups"]
+ if "--invert-sg-removal" in options:
+ # In keep_only mode, check if interface only has the specified groups
+ if sorted(current_sgs) != sorted(sg_to_remove):
+ logger.debug(f"Interface {interface['NetworkInterfaceId']} still has different security groups")
+ all_interfaces_fenced = False
+ break
+ else:
+ # In remove mode, check if specified groups were removed
+ if any(sg in current_sgs for sg in sg_to_remove):
+ logger.debug(f"Interface {interface['NetworkInterfaceId']} still has security groups that should be removed")
+ all_interfaces_fenced = False
+ break
+
+ if all_interfaces_fenced:
+ logger.debug("Found matching backup tag %s and verified all interfaces have SG changes - instance is fenced", tag["Key"])
+ return "off"
+ except Exception as e:
+ logger.debug("Failed to check security group modifications: %s", e)
+ # If we can't verify SG changes but have matching tags, assume fenced for backward compatibility
+ logger.debug("Found matching backup tag %s but couldn't verify SG changes - assuming instance is fenced", tag["Key"])
+ return "off"
+
+ except (json.JSONDecodeError, KeyError) as e:
+ logger.error(f"Failed to parse backup data for tag {tag['Key']}: {str(e)}")
+ continue
+
+ logger.debug("No backup tags with matching timestamp found - instance is not fenced")
+ return "on"
+
+ except ClientError:
+ fail_usage("Failed: Incorrect Access Key or Secret Key.")
+ except EndpointConnectionError:
+ fail_usage("Failed: Incorrect Region.")
+ except IndexError:
+ fail(EC_STATUS)
+ except Exception as e:
+ logger.error("Failed to get power status: %s", e)
+ fail(EC_STATUS)
# Retrieve instance ID for self-check
def get_instance_id():
"""Retrieve the instance ID of the current EC2 instance."""
try:
token = requests.put(
"http://169.254.169.254/latest/api/token",
headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
).content.decode("UTF-8")
instance_id = requests.get(
"http://169.254.169.254/latest/meta-data/instance-id",
headers={"X-aws-ec2-metadata-token": token},
).content.decode("UTF-8")
return instance_id
except Exception as err:
logger.error("Failed to retrieve instance ID for self-check: %s", err)
return None
# Retrieve instance details
def get_instance_details(ec2_client, instance_id):
"""Retrieve instance details including state, VPC, interfaces, and attached SGs."""
try:
response = ec2_client.describe_instances(InstanceIds=[instance_id])
instance = response["Reservations"][0]["Instances"][0]
instance_state = instance["State"]["Name"]
vpc_id = instance["VpcId"]
network_interfaces = instance["NetworkInterfaces"]
interfaces = []
for interface in network_interfaces:
try:
interfaces.append(
{
"NetworkInterfaceId": interface["NetworkInterfaceId"],
"SecurityGroups": [sg["GroupId"] for sg in interface["Groups"]],
}
)
except KeyError as e:
logger.error(f"Malformed interface data: {str(e)}")
continue
return instance_state, vpc_id, interfaces
except ClientError as e:
logger.error(f"AWS API error while retrieving instance details: {str(e)}")
raise
except IndexError as e:
logger.error(f"Instance {instance_id} not found or no instances returned: {str(e)}")
raise
except KeyError as e:
logger.error(f"Unexpected response format from AWS API: {str(e)}")
raise
except Exception as e:
logger.error(f"Unexpected error while retrieving instance details: {str(e)}")
raise
# Check if we are the self-fencing node
def get_self_power_status(conn, instance_id):
- try:
- instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [instance_id]}])
- state = list(instance)[0].state["Name"]
- if state == "running":
- logger.debug(f"Captured my ({instance_id}) state and it {state.upper()} - returning OK - Proceeding with fencing")
- return "ok"
- else:
- logger.debug(f"Captured my ({instance_id}) state it is {state.upper()} - returning Alert - Unable to fence other nodes")
- return "alert"
-
- except ClientError:
- fail_usage("Failed: Incorrect Access Key or Secret Key.")
- except EndpointConnectionError:
- fail_usage("Failed: Incorrect Region.")
- except IndexError:
- return "fail"
+ try:
+ instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [instance_id]}])
+ state = list(instance)[0].state["Name"]
+ if state == "running":
+ logger.debug(f"Captured my ({instance_id}) state and it {state.upper()} - returning OK - Proceeding with fencing")
+ return "ok"
+ else:
+ logger.debug(f"Captured my ({instance_id}) state it is {state.upper()} - returning Alert - Unable to fence other nodes")
+ return "alert"
+
+ except ClientError:
+ fail_usage("Failed: Incorrect Access Key or Secret Key.")
+ except EndpointConnectionError:
+ fail_usage("Failed: Incorrect Region.")
+ except IndexError:
+ return "fail"
# Create backup tags for each network interface
def create_backup_tag(ec2_client, instance_id, interfaces, timestamp):
"""Create tags on the instance to backup original security groups for each network interface.
If the security groups list is too long, it will be split across multiple tags."""
try:
# Create tags for each network interface
for idx, interface in enumerate(interfaces, 1):
interface_id = interface["NetworkInterfaceId"]
security_groups = interface["SecurityGroups"]
# Initialize variables for chunking
sg_chunks = []
current_chunk = []
# Strip 'sg-' prefix from all security groups first
stripped_sgs = [sg[3:] if sg.startswith('sg-') else sg for sg in security_groups]
for sg in stripped_sgs:
# Create a test chunk with the new security group
test_chunk = current_chunk + [sg]
# Create a test backup object with this chunk
test_backup = {
"n": {
"i": interface_id,
"s": test_chunk,
"c": {
"i": len(sg_chunks),
"t": 1 # Temporary value, will update later
}
},
"t": timestamp
}
# Check if adding this SG would exceed the character limit
if len(json.dumps(test_backup)) > 254:
# Current chunk is full, add it to chunks and start a new one
if current_chunk: # Only add if not empty
sg_chunks.append(current_chunk)
current_chunk = [sg]
else:
# Edge case: single SG exceeds limit (shouldn't happen with normal SG IDs)
logger.warning(f"Security group ID {sg} is unusually long")
sg_chunks.append([sg])
else:
# Add SG to current chunk
current_chunk = test_chunk
# Add the last chunk if it has any items
if current_chunk:
sg_chunks.append(current_chunk)
# Update total chunks count and create tags
for chunk_idx, sg_chunk in enumerate(sg_chunks):
sg_backup = {
"n": { # NetworkInterface shortened to n
"i": interface_id, # ni shortened to i
"s": sg_chunk, # sg shortened to s, with 'sg-' prefix stripped
"c": { # ci shortened to c
"i": chunk_idx,
"t": len(sg_chunks)
}
},
"t": timestamp # ts shortened to t
}
tag_value = json.dumps(sg_backup)
tag_key = f"Original_SG_Backup_{instance_id}_{timestamp}_{idx}_{chunk_idx}"
# Create the tag
ec2_client.create_tags(
Resources=[instance_id],
Tags=[{"Key": tag_key, "Value": tag_value}],
)
# Verify the tag was created
response = ec2_client.describe_tags(
Filters=[
{"Name": "resource-id", "Values": [instance_id]},
{"Name": "key", "Values": [tag_key]}
]
)
if not response["Tags"]:
logger.error(f"Failed to verify creation of backup tag '{tag_key}' for instance {instance_id}")
raise Exception("Backup tag creation could not be verified")
created_tag_value = response["Tags"][0]["Value"]
if created_tag_value != tag_value:
logger.error(f"Created tag value does not match expected value for instance {instance_id}")
raise Exception("Backup tag value mismatch")
logger.info(f"Backup tag '{tag_key}' chunk {chunk_idx + 1}/{len(sg_chunks)} created and verified for interface {interface_id}.")
except ClientError as e:
logger.error(f"AWS API error while creating/verifying backup tag: {str(e)}")
raise
except Exception as e:
logger.error(f"Unexpected error while creating/verifying backup tag: {str(e)}")
raise
-def modify_security_groups(ec2_client, instance_id, sg_list, timestamp, mode="remove"):
+def modify_security_groups(ec2_client, instance_id, sg_list, timestamp, mode="remove", options=None):
"""
Modifies security groups on network interfaces based on the specified mode.
In 'remove' mode: Removes all SGs in sg_list from each interface
In 'keep_only' mode: Keeps only the SGs in sg_list and removes all others
Args:
ec2_client: The boto3 EC2 client
instance_id: The ID of the EC2 instance
sg_list: List of security group IDs to remove or keep
timestamp: Unix timestamp for backup tag
mode: Either "remove" or "keep_only" to determine operation mode
Raises:
ClientError: If AWS API calls fail
Exception: For other unexpected errors
"""
try:
# Get instance details
state, _, interfaces = get_instance_details(ec2_client, instance_id)
+ # Create a backup tag before making any changes
try:
- # Create a backup tag before making changes
create_backup_tag(ec2_client, instance_id, interfaces, timestamp)
- except ClientError as e:
- logger.warning(f"Failed to create backup tag: {str(e)}")
- # Continue execution even if backup tag creation fails
+ try:
+ set_lastfence_tag(ec2_client, instance_id, timestamp)
+ except Exception as e:
+ if "--ignore-tag-write-failure" in options:
+ logger.warning(f"Failed to set lastfence tag but continuing due to --ignore-tag-write-failure: {str(e)}")
+ logger.info("Will rely on security group state for fencing status")
+ else:
+ logger.error(f"Failed to set lastfence tag: {str(e)}")
+ raise
+ except Exception as e:
+ if "--ignore-tag-write-failure" in options:
+ logger.warning(f"Failed to create backup tag but continuing due to --ignore-tag-write-failure: {str(e)}")
+ logger.info("Will rely on security group state for fencing status")
+ else:
+ logger.error(f"Failed to create backup tag: {str(e)}")
+ raise
changed_any = False
for interface in interfaces:
try:
original_sgs = interface["SecurityGroups"]
if mode == "remove":
# Exclude any SGs that are in sg_list
updated_sgs = [sg for sg in original_sgs if sg not in sg_list]
operation_desc = f"removing {sg_list}"
else: # keep_only mode
# Set interface to only use the specified security groups
updated_sgs = sg_list
operation_desc = f"keeping only {sg_list}"
# Skip if we'd end up with zero SGs (only in remove mode)
if mode == "remove" and not updated_sgs:
logger.info(
f"Skipping interface {interface['NetworkInterfaceId']}: "
f"removal of {sg_list} would leave 0 SGs."
)
continue
# Skip if no changes needed
if updated_sgs == original_sgs:
continue
logger.info(
f"Updating interface {interface['NetworkInterfaceId']} from {original_sgs} "
f"to {updated_sgs} ({operation_desc})"
)
try:
ec2_client.modify_network_interface_attribute(
NetworkInterfaceId=interface["NetworkInterfaceId"],
Groups=updated_sgs
)
changed_any = True
except ClientError as e:
logger.error(
f"Failed to modify security groups for interface "
f"{interface['NetworkInterfaceId']}: {str(e)}"
)
continue
except KeyError as e:
logger.error(f"Malformed interface data: {str(e)}")
continue
- # If we didn't modify anything, log appropriate error
+ # If we didn't modify anything, raise an error
if not changed_any:
if mode == "remove":
error_msg = f"Security Groups {sg_list} not removed from any interface. Either not found, or removal left 0 SGs."
else:
error_msg = f"Security Groups {sg_list} not found on any interface. No changes made."
logger.error(error_msg)
- sys.exit(EC_GENERIC_ERROR)
+ raise Exception("Failed to modify security groups: " + error_msg)
# Wait a bit for changes to propagate
time.sleep(5)
except ClientError as e:
logger.error(f"AWS API error: {str(e)}")
raise
except Exception as e:
logger.error(f"Unexpected error: {str(e)}")
raise
def restore_security_groups(ec2_client, instance_id):
"""
Restores the original security groups from backup tags to each network interface.
Each network interface's original security groups are stored in a separate backup tag.
All backup tags share the same timestamp as the lastfence tag for validation.
The process:
1. Get lastfence tag timestamp
2. Find all backup tags with matching timestamp
3. Create a map of interface IDs to their original security groups
4. Restore each interface's security groups from the map
5. Clean up matching backup tags and lastfence tag
Args:
ec2_client: The boto3 EC2 client
instance_id: The ID of the EC2 instance
Raises:
ClientError: If AWS API calls fail
Exception: For other unexpected errors
SystemExit: If required tags are missing or no changes were made
"""
try:
# Get the lastfence tag first
lastfence_response = ec2_client.describe_tags(
Filters=[
{"Name": "resource-id", "Values": [instance_id]},
{"Name": "key", "Values": ["lastfence"]}
]
)
if not lastfence_response["Tags"]:
logger.error(f"No lastfence tag found for instance {instance_id}")
sys.exit(EC_GENERIC_ERROR)
lastfence_timestamp = lastfence_response["Tags"][0]["Value"]
# Get all backup tags for this instance
backup_response = ec2_client.describe_tags(
Filters=[
{"Name": "resource-id", "Values": [instance_id]},
{"Name": "key", "Values": [f"Original_SG_Backup_{instance_id}*"]}
]
)
if not backup_response["Tags"]:
logger.error(f"No backup tags found for instance {instance_id}")
sys.exit(EC_GENERIC_ERROR)
# Find and combine backup tags with matching timestamp
matching_backups = {}
interface_chunks = {}
for tag in backup_response["Tags"]:
try:
backup_data = json.loads(tag["Value"])
backup_timestamp = backup_data.get("t") # Using shortened timestamp field
if not backup_timestamp or str(backup_timestamp) != str(lastfence_timestamp):
continue
logger.info(f"Found matching backup tag {tag['Key']}")
interface_data = backup_data.get("n") # Using shortened NetworkInterface field
if not interface_data or "i" not in interface_data: # Using shortened interface id field
continue
interface_id = interface_data["i"] # Using shortened interface id field
chunk_info = interface_data.get("c", {}) # Using shortened chunk info field
chunk_index = chunk_info.get("i", 0)
total_chunks = chunk_info.get("t", 1)
# Initialize tracking for this interface if needed
if interface_id not in interface_chunks:
interface_chunks[interface_id] = {
"total": total_chunks,
"chunks": {},
"security_groups": []
}
# Add this chunk's security groups
interface_chunks[interface_id]["chunks"][chunk_index] = interface_data["s"] # Using shortened security groups field
# If we have all chunks for this interface, combine them
if len(interface_chunks[interface_id]["chunks"]) == total_chunks:
# Combine chunks and restore 'sg-' prefix
combined_sgs = []
for i in range(total_chunks):
chunk_sgs = interface_chunks[interface_id]["chunks"][i]
# Add back 'sg-' prefix if not already present
restored_sgs = ['sg-' + sg if not sg.startswith('sg-') else sg for sg in chunk_sgs]
combined_sgs.extend(restored_sgs)
matching_backups[interface_id] = combined_sgs
except (json.JSONDecodeError, KeyError) as e:
logger.error(f"Failed to parse backup data for tag {tag['Key']}: {str(e)}")
continue
if not matching_backups:
logger.error("No complete backup data found with matching timestamp")
sys.exit(EC_GENERIC_ERROR)
# Get current interfaces
_, _, current_interfaces = get_instance_details(ec2_client, instance_id)
# Use the combined matching_backups as our backup_sg_map
backup_sg_map = matching_backups
changed_any = False
for interface in current_interfaces:
try:
interface_id = interface["NetworkInterfaceId"]
if interface_id not in backup_sg_map:
logger.warning(
f"No backup data found for interface {interface_id}. Skipping."
)
continue
original_sgs = backup_sg_map[interface_id]
current_sgs = interface["SecurityGroups"]
if original_sgs == current_sgs:
logger.info(
f"Interface {interface_id} already has original security groups. Skipping."
)
continue
logger.info(
f"Restoring interface {interface_id} from {current_sgs} "
f"to original security groups {original_sgs}"
)
try:
ec2_client.modify_network_interface_attribute(
NetworkInterfaceId=interface_id,
Groups=original_sgs
)
changed_any = True
except ClientError as e:
logger.error(
f"Failed to restore security groups for interface "
f"{interface_id}: {str(e)}"
)
continue
except KeyError as e:
logger.error(f"Malformed interface data: {str(e)}")
continue
if not changed_any:
logger.error("No security groups were restored. All interfaces skipped.")
sys.exit(EC_GENERIC_ERROR)
# Wait for changes to propagate
time.sleep(5)
# Clean up only the matching backup tags and lastfence tag after successful restore
try:
# Delete all backup tags that match the lastfence timestamp
tags_to_delete = [{"Key": "lastfence"}]
deleted_tag_keys = []
for tag in backup_response["Tags"]:
try:
backup_data = json.loads(tag["Value"])
if str(backup_data.get("t")) == str(lastfence_timestamp): # Using shortened timestamp field
tags_to_delete.append({"Key": tag["Key"]})
deleted_tag_keys.append(tag["Key"])
except (json.JSONDecodeError, KeyError):
continue
if len(tags_to_delete) > 1: # More than just the lastfence tag
ec2_client.delete_tags(
Resources=[instance_id],
Tags=tags_to_delete
)
logger.info(f"Removed matching backup tags {deleted_tag_keys} and lastfence tag from instance {instance_id}")
except ClientError as e:
logger.warning(f"Failed to remove tags: {str(e)}")
# Continue since the restore operation was successful
except ClientError as e:
logger.error(f"AWS API error: {str(e)}")
raise
except Exception as e:
logger.error(f"Unexpected error: {str(e)}")
raise
# Shutdown instance
def shutdown_instance(ec2_client, instance_id):
"""Shutdown the instance and confirm the state transition."""
try:
logger.info(f"Initiating shutdown for instance {instance_id}...")
ec2_client.stop_instances(InstanceIds=[instance_id], Force=True)
while True:
try:
state, _, _ = get_instance_details(ec2_client, instance_id)
logger.info(f"Current instance state: {state}")
if state == "stopping":
logger.info(
f"Instance {instance_id} is transitioning to 'stopping'. Proceeding without waiting further."
)
break
except ClientError as e:
logger.error(f"Failed to get instance state during shutdown: {str(e)}")
fail_usage(f"AWS API error while checking instance state: {str(e)}")
except Exception as e:
logger.error(f"Unexpected error while checking instance state: {str(e)}")
fail_usage(f"Failed to check instance state: {str(e)}")
except ClientError as e:
logger.error(f"AWS API error during instance shutdown: {str(e)}")
fail_usage(f"Failed to shutdown instance: {str(e)}")
except Exception as e:
logger.error(f"Unexpected error during instance shutdown: {str(e)}")
fail_usage(f"Failed to shutdown instance due to unexpected error: {str(e)}")
# Perform the fencing action
def get_nodes_list(conn, options):
"""Get list of nodes and their status."""
logger.debug("Starting monitor operation")
result = {}
try:
if "--filter" in options:
filter_key = options["--filter"].split("=")[0].strip()
filter_value = options["--filter"].split("=")[1].strip()
filter = [{"Name": filter_key, "Values": [filter_value]}]
logging.debug("Filter: {}".format(filter))
for instance in conn.instances.filter(Filters=filter if 'filter' in vars() else []):
instance_name = ""
for tag in instance.tags or []:
if tag.get("Key") == "Name":
instance_name = tag["Value"]
try:
result[instance.id] = (instance_name, status[instance.state["Name"]])
except KeyError as e:
if options.get("--original-action") == "list-status":
logger.error("Unknown status \"{}\" returned for {} ({})".format(instance.state["Name"], instance.id, instance_name))
result[instance.id] = (instance_name, "unknown")
except Exception as e:
logger.error("Failed to get node list: %s", e)
return result
def set_lastfence_tag(ec2_client, instance_id, timestamp):
"""Set a lastfence tag on the instance with the timestamp."""
try:
ec2_client.create_tags(
Resources=[instance_id],
Tags=[{"Key": "lastfence", "Value": str(timestamp)}]
)
logger.info(f"Set lastfence tag with timestamp {timestamp} on instance {instance_id}")
except Exception as e:
logger.error(f"Failed to set lastfence tag: {str(e)}")
raise
def set_power_status(conn, options):
"""Set power status of the instance."""
timestamp = int(time.time()) # Unix timestamp
ec2_client = conn.meta.client
instance_id = options["--plug"]
sg_to_remove = options.get("--secg", "").split(",") if options.get("--secg") else []
# Perform self-check if skip-race not set
if "--skip-race-check" not in options:
self_instance_id = get_instance_id()
if self_instance_id == instance_id:
fail_usage("Self-fencing detected. Exiting.")
try:
# Only verify instance is running for 'off' action
if options["--action"] == "off":
instance_state, _, _ = get_instance_details(ec2_client, instance_id)
if instance_state != "running":
fail_usage(f"Instance {instance_id} is not running. Exiting.")
if options["--action"] == "on":
if not "--unfence-ignore-restore" in options:
restore_security_groups(ec2_client, instance_id)
else:
logger.info("Ignored Restoring security groups as --unfence-ignore-restore is set")
elif options["--action"] == "off":
if sg_to_remove:
mode = "keep_only" if "--invert-sg-removal" in options else "remove"
- modify_security_groups(ec2_client, instance_id, sg_to_remove, timestamp, mode)
- set_lastfence_tag(ec2_client, instance_id, timestamp)
- if "--onfence-poweroff" in options:
- shutdown_instance(ec2_client, instance_id)
+ try:
+ modify_security_groups(ec2_client, instance_id, sg_to_remove, timestamp, mode, options)
+ if "--onfence-poweroff" in options:
+ shutdown_instance(ec2_client, instance_id)
+ except Exception as e:
+ if isinstance(e, ClientError):
+ logger.error("AWS API error: %s", e)
+ fail_usage(str(e))
+ elif "--ignore-tag-write-failure" in options:
+ # If we're ignoring tag failures, only fail if the security group modifications failed
+ if "Failed to modify security groups" in str(e):
+ logger.error("Failed to modify security groups: %s", e)
+ fail(EC_STATUS)
+ else:
+ logger.warning("Ignoring error due to ignore-tag-write-failure: %s", e)
+ else:
+ logger.error("Failed to set power status: %s", e)
+ fail(EC_STATUS)
except Exception as e:
- logger.error("Failed to set power status: %s", e)
+ logger.error("Unexpected error in set_power_status: %s", e)
fail(EC_STATUS)
# Define fencing agent options
def define_new_opts():
all_opt["port"]["help"] = "-n, --plug=[id] AWS Instance ID to perform action on "
all_opt["port"]["shortdesc"] = "AWS Instance ID to perform action on "
all_opt["region"] = {
"getopt": "r:",
"longopt": "region",
"help": "-r, --region=[region] AWS region (e.g., us-east-1)",
"shortdesc": "AWS Region.",
"required": "0",
"order": 1,
}
all_opt["access_key"] = {
"getopt": "a:",
"longopt": "access-key",
"help": "-a, --access-key=[key] AWS access key.",
"shortdesc": "AWS Access Key.",
"required": "0",
"order": 2,
}
all_opt["secret_key"] = {
"getopt": "s:",
"longopt": "secret-key",
"help": "-s, --secret-key=[key] AWS secret key.",
"shortdesc": "AWS Secret Key.",
"required": "0",
"order": 3,
}
all_opt["secg"] = {
"getopt": ":",
"longopt": "secg",
"help": "--secg=[sg1,sg2,...] Comma-separated list of Security Groups to remove.",
"shortdesc": "Security Groups to remove.",
"required": "0",
"order": 4,
}
all_opt["skip_race_check"] = {
"getopt": "",
"longopt": "skip-race-check",
"help": "--skip-race-check Skip race condition check.",
"shortdesc": "Skip race condition check.",
"required": "0",
"order": 6,
}
all_opt["invert-sg-removal"] = {
"getopt": "",
"longopt": "invert-sg-removal",
"help": "--invert-sg-removal Remove all security groups except the specified one(s).",
"shortdesc": "Remove all security groups except specified..",
"required": "0",
"order": 7,
}
all_opt["unfence-ignore-restore"] = {
"getopt": "",
"longopt": "unfence-ignore-restore",
"help": "--unfence-ignore-restore Do not restore security groups from tag when unfencing (off).",
"shortdesc": "Remove all security groups except specified..",
"required": "0",
"order": 8,
}
all_opt["filter"] = {
"getopt": ":",
"longopt": "filter",
"help": "--filter=[key=value] Filter (e.g. vpc-id=[vpc-XXYYZZAA])",
"shortdesc": "Filter for list-action",
"required": "0",
"order": 9
}
all_opt["boto3_debug"] = {
"getopt": "b:",
"longopt": "boto3_debug",
"help": "-b, --boto3_debug=[option] Boto3 and Botocore library debug logging",
"shortdesc": "Boto Lib debug",
"required": "0",
"default": "False",
"order": 10
}
all_opt["onfence-poweroff"] = {
"getopt": "",
"longopt": "onfence-poweroff",
"help": "--onfence-poweroff Power off the machine async upon fence (this is a network fencing agent...)",
"shortdesc": "Power off the machine async..",
"required": "0",
"order": 11
}
+ all_opt["ignore-tag-write-failure"] = {
+ "getopt": "",
+ "longopt": "ignore-tag-write-failure",
+ "help": "--ignore-tag-write-failure Continue to fence even if backup tag fails. This ensures prioriization of fencing over AWS backplane access",
+ "shortdesc": "Continue to fence even if backup tag fails..",
+ "required": "0",
+ "order": 12
+ }
def main():
conn = None
device_opt = [
"no_password",
"region",
"access_key",
"secret_key",
"secg",
"port",
"skip_race_check",
"invert-sg-removal",
"unfence-ignore-restore",
"filter",
"boto3_debug",
- "onfence-poweroff"
+ "onfence-poweroff",
+ "ignore-tag-write-failure"
]
atexit.register(atexit_handler)
define_new_opts()
try:
processed_input = process_input(device_opt)
options = check_input(device_opt, processed_input)
except Exception as e:
logger.error(f"Failed to process input options: {str(e)}")
sys.exit(EC_GENERIC_ERROR)
run_delay(options)
docs = {
"shortdesc": "Fence agent for AWS (Amazon Web Services) Net",
"longdesc": (
"fence_aws_vpc is a Network and Power Fencing agent for AWS VPC that works by "
"manipulating security groups. It uses the boto3 library to connect to AWS.\n\n"
"boto3 can be configured with AWS CLI or by creating ~/.aws/credentials.\n"
"For instructions see: https://boto3.readthedocs.io/en/latest/guide/quickstart.html#configuration"
" "
"NOTE: If onfence-poweroff is set, the agent won't be able to power on the node again, it will have to be powered on manually or with other automation."
),
"vendorurl": "http://www.amazon.com"
}
show_docs(options, docs)
if "--onfence-poweroff" not in options and options.get("--action", "") == "reboot":
options["--action"] = "off"
# Configure logging
if "--debug-file" in options:
for handler in logger.handlers:
if isinstance(handler, logging.FileHandler):
logger.removeHandler(handler)
lh = logging.FileHandler(options["--debug-file"])
logger.addHandler(lh)
lhf = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
lh.setFormatter(lhf)
lh.setLevel(logging.DEBUG)
# Configure boto3 logging
if options.get("--boto3_debug", "").lower() not in ["1", "yes", "on", "true"]:
boto3.set_stream_logger('boto3', logging.INFO)
boto3.set_stream_logger('botocore', logging.CRITICAL)
logging.getLogger('botocore').propagate = False
logging.getLogger('boto3').propagate = False
else:
log_format = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
logging.getLogger('botocore').propagate = False
logging.getLogger('boto3').propagate = False
fdh = logging.FileHandler('/var/log/fence_aws_vpc_boto3.log')
fdh.setFormatter(log_format)
logging.getLogger('boto3').addHandler(fdh)
logging.getLogger('botocore').addHandler(fdh)
logging.debug("Boto debug level is %s and sending debug info to /var/log/fence_aws_vpc_boto3.log",
options.get("--boto3_debug"))
# Establish AWS connection
region = options.get("--region")
access_key = options.get("--access-key")
secret_key = options.get("--secret-key")
try:
conn = boto3.resource(
"ec2",
region_name=region,
aws_access_key_id=access_key,
aws_secret_access_key=secret_key,
)
except Exception as e:
if not options.get("--action", "") in ["metadata", "manpage", "validate-all"]:
fail_usage("Failed: Unable to connect to AWS: " + str(e))
else:
pass
# Operate the fencing device using the fence library's fence_action
result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list)
sys.exit(result)
if __name__ == "__main__":
main()
+
diff --git a/agents/aws_vpc_net/readme.md b/agents/aws_vpc_net/readme.md
index 47f467ea..de1a12e5 100644
--- a/agents/aws_vpc_net/readme.md
+++ b/agents/aws_vpc_net/readme.md
@@ -1,328 +1,461 @@
# Fence AWS VPC Network Agent Design Document
## Overview
The fence_aws_vpc_net agent is a network and power fencing agent for AWS VPC that operates by manipulating security groups. This document outlines the design and architecture of the agent.
## Class Diagram
```mermaid
classDiagram
class FenceAWSVPCNet {
-logger: Logger
-conn: boto3.resource
-options: dict
+main()
+define_new_opts()
+process_input()
+check_input()
+ +get_power_status()
+ +set_power_status()
}
- class AWSConnection {
- -region: str
- -access_key: str
- -secret_key: str
- +establish_connection()
- +validate_credentials()
+ class InstanceOperations {
+ +get_instance_id()
+ +get_instance_details()
+ +shutdown_instance()
+ +get_nodes_list()
}
- class SecurityGroupManager {
+ class SecurityGroupOperations {
+modify_security_groups()
+create_backup_tag()
+restore_security_groups()
-validate_sg_changes()
}
- class InstanceManager {
- +get_instance_details()
- +shutdown_instance()
- +get_power_status()
- +set_power_status()
- -validate_instance_state()
- }
-
- class TagManager {
+ class TagOperations {
+set_lastfence_tag()
- +get_backup_tags()
- +cleanup_tags()
- -validate_tag_operations()
+ +create_backup_tag()
+ +restore_from_backup()
+ -handle_chunked_tags()
}
- FenceAWSVPCNet --> AWSConnection
- FenceAWSVPCNet --> SecurityGroupManager
- FenceAWSVPCNet --> InstanceManager
- SecurityGroupManager --> TagManager
- InstanceManager --> TagManager
+ class LoggingManager {
+ +configure_logging()
+ +configure_boto3_logging()
+ +handle_debug_file()
+ }
+ FenceAWSVPCNet --> InstanceOperations
+ FenceAWSVPCNet --> SecurityGroupOperations
+ FenceAWSVPCNet --> TagOperations
+ FenceAWSVPCNet --> LoggingManager
+ SecurityGroupOperations --> TagOperations
```
## Sequence Diagrams
### Fence Operation (Power Off)
```mermaid
sequenceDiagram
participant Client
participant FenceAgent
participant AWS
participant SecurityGroups
participant Tags
Client->>FenceAgent: Execute fence operation
FenceAgent->>AWS: Validate AWS credentials
AWS-->>FenceAgent: Credentials valid
+ opt skip-race-check not set
+ FenceAgent->>AWS: Get self instance ID
+ AWS-->>FenceAgent: Instance ID
+ FenceAgent->>FenceAgent: Check for self-fencing
+ end
+
FenceAgent->>AWS: Get instance details
AWS-->>FenceAgent: Instance details
alt Instance is running
FenceAgent->>SecurityGroups: Backup current security groups
SecurityGroups-->>FenceAgent: Backup created
- FenceAgent->>Tags: Create lastfence tag
- Tags-->>FenceAgent: Tag created
+ alt ignore-tag-write-failure not set
+ FenceAgent->>Tags: Create lastfence tag
+ Tags-->>FenceAgent: Tag created
+ end
FenceAgent->>SecurityGroups: Modify security groups
SecurityGroups-->>FenceAgent: Groups modified
opt onfence-poweroff enabled
FenceAgent->>AWS: Initiate shutdown
AWS-->>FenceAgent: Shutdown initiated
end
FenceAgent-->>Client: Success
else Instance not running
FenceAgent-->>Client: Fail - Instance not running
end
```
### Unfence Operation (Power On)
```mermaid
sequenceDiagram
participant Client
participant FenceAgent
participant AWS
participant SecurityGroups
participant Tags
Client->>FenceAgent: Execute unfence operation
FenceAgent->>AWS: Validate AWS credentials
AWS-->>FenceAgent: Credentials valid
- FenceAgent->>Tags: Get lastfence tag
- Tags-->>FenceAgent: Lastfence tag
+ alt unfence-ignore-restore not set
+ FenceAgent->>Tags: Get lastfence tag
+ Tags-->>FenceAgent: Lastfence tag
- FenceAgent->>Tags: Get backup tags
- Tags-->>FenceAgent: Backup tags
+ FenceAgent->>Tags: Get backup tags
+ Tags-->>FenceAgent: Backup tags
- alt Valid backup found
- FenceAgent->>SecurityGroups: Restore original security groups
- SecurityGroups-->>FenceAgent: Groups restored
+ alt Valid backup found
+ FenceAgent->>SecurityGroups: Restore original security groups
+ SecurityGroups-->>FenceAgent: Groups restored
- FenceAgent->>Tags: Cleanup backup tags
- Tags-->>FenceAgent: Tags cleaned
+ FenceAgent->>Tags: Cleanup backup tags
+ Tags-->>FenceAgent: Tags cleaned
- FenceAgent-->>Client: Success
- else No valid backup
- FenceAgent-->>Client: Fail - No valid backup found
+ FenceAgent-->>Client: Success
+ else No valid backup
+ FenceAgent-->>Client: Fail - No valid backup found
+ end
+ else
+ FenceAgent-->>Client: Success - Restore skipped
end
```
## Component Details
### 1. Main Controller (FenceAWSVPCNet)
- **Purpose**: Main entry point and orchestration
- **Key Responsibilities**:
- Process command line options
- Initialize AWS connection
- Execute fence operations
- Handle logging and errors
+ - Manage self-fencing prevention
+ - Support tag write failure handling
-### 2. AWS Connection Manager
-- **Purpose**: Handle AWS connectivity
+### 2. Instance Operations
+- **Purpose**: Handle EC2 instance operations
- **Key Responsibilities**:
- - Establish and maintain AWS connection
- - Handle credentials and regions
- - Manage API retries and timeouts
+ - Get instance details and metadata
+ - Handle instance power operations
+ - Validate instance states
+ - List and filter instances
+ - Handle instance shutdown
-### 3. Security Group Manager
+### 3. Security Group Operations
- **Purpose**: Manage security group operations
- **Key Responsibilities**:
- - Modify security groups
- - Create backups of security group configurations
+ - Modify security groups (remove or keep-only modes)
+ - Handle chunked backup operations
- Restore security groups from backups
- Validate security group changes
+ - Support partial success scenarios
-### 4. Instance Manager
-- **Purpose**: Handle EC2 instance operations
-- **Key Responsibilities**:
- - Get instance details and status
- - Handle instance power operations
- - Validate instance states
- - Manage self-fencing prevention
-
-### 5. Tag Manager
+### 4. Tag Operations
- **Purpose**: Manage AWS resource tagging
- **Key Responsibilities**:
- Create and manage backup tags
- - Handle lastfence tags
+ - Handle chunked tag data
+ - Manage lastfence tags
- Clean up tags after operations
- - Validate tag operations
+ - Support tag write failure scenarios
+
+### 5. Logging Manager
+- **Purpose**: Handle logging configuration
+- **Key Responsibilities**:
+ - Configure application logging
+ - Manage boto3 debug logging
+ - Handle debug file output
+ - Control log propagation
## Success and Failure Paths
### Success Paths
-1. **Normal Fence Operation**
+1. **Normal Fence Operation (Without ignore-tag-write-failure)**
```
Start
├── Validate AWS credentials
+├── Check for self-fencing (if enabled)
├── Check instance is running
-├── Backup security groups
+├── Backup security groups (with chunking)
+│ ├── Create backup tags for each interface
+│ └── Verify backup tag creation
├── Create lastfence tag
├── Modify security groups
+│ ├── Remove specified groups
+│ └── Verify modifications
├── [Optional] Shutdown instance
└── Success
```
-2. **Normal Unfence Operation**
+2. **Fence Operation (With ignore-tag-write-failure)**
```
Start
├── Validate AWS credentials
-├── Find lastfence tag
-├── Find backup tags
-├── Restore security groups
-├── Clean up tags
+├── Check for self-fencing (if enabled)
+├── Check instance is running
+├── Attempt backup tag creation
+│ ├── Success: Create backup tags
+│ └── Failure: Log warning and continue
+├── Attempt lastfence tag creation
+│ ├── Success: Create tag
+│ └── Failure: Log warning and continue
+├── Modify security groups
+│ ├── Remove specified groups
+│ ├── Verify modifications
+│ └── Check all interfaces modified
+│ ├── All modified: Success
+│ └── Partial: Fail with modification error
+├── [Optional] Shutdown instance
+└── Success (if security groups modified)
+```
+
+3. **Normal Unfence Operation**
+```
+Start
+├── Validate AWS credentials
+├── [Skip if unfence-ignore-restore]
+│ ├── Find lastfence tag
+│ ├── Find backup tags
+│ ├── Restore security groups
+│ └── Clean up tags
└── Success
```
### Failure Paths
1. **Authentication Failures**
```
Start
├── Invalid AWS credentials
+│ ├── Missing credentials
+│ ├── Invalid access key
+│ ├── Invalid secret key
+│ └── Invalid region
└── Fail with auth error
```
2. **Instance State Failures**
```
Start
+├── Instance not found
+│ └── Fail with instance error
├── Instance not in required state
-└── Fail with state error
+│ └── Fail with state error
+└── Self-fencing detected
+ └── Fail with self-fencing error
```
-3. **Security Group Operation Failures**
+3. **Security Group Operation Failures (Without ignore-tag-write-failure)**
```
Start
├── Backup creation fails
+│ ├── Tag size too large
+│ ├── API error
│ └── Fail with backup error
├── Security group modification fails
+│ ├── Permission denied
+│ ├── Invalid group ID
+│ ├── Rate limit exceeded
│ └── Fail with modification error
└── Restoration fails
+ ├── Missing backup data
+ ├── Invalid backup format
+ ├── Modification error
└── Fail with restore error
```
-4. **Tag Operation Failures**
+4. **Security Group Operation Failures (With ignore-tag-write-failure)**
+```
+Start
+├── Backup creation fails
+│ ├── Log warning
+│ └── Continue to modifications
+├── Security group modification attempt
+│ ├── Success: All interfaces modified
+│ │ └── Continue to completion
+│ ├── Partial success
+│ │ ├── Verify fencing state
+│ │ │ ├── Sufficient interfaces modified
+│ │ │ │ └── Continue to completion
+│ │ │ └── Insufficient modifications
+│ │ │ └── Fail with partial error
+│ │ └── Log warning
+│ └── Complete failure
+│ └── Fail with modification error
+├── [Optional] Shutdown attempt
+│ ├── Success
+│ │ └── Continue to completion
+│ └── Failure
+│ └── Log warning (non-fatal)
+└── Final state determined by SG modifications
+```
+
+5. **Tag Operation Failures (Without ignore-tag-write-failure)**
```
Start
├── Tag creation fails
+│ ├── Size limit exceeded
+│ ├── API error
│ └── Fail with tag error
├── Tag retrieval fails
+│ ├── Missing tags
+│ ├── Invalid format
│ └── Fail with retrieval error
└── Tag cleanup fails
└── Warning (non-fatal)
```
+6. **Tag Operation Failures (With ignore-tag-write-failure)**
+```
+Start
+├── Backup tag creation fails
+│ ├── Log warning
+│ └── Continue operation
+├── Lastfence tag creation fails
+│ ├── Log warning
+│ └── Continue operation
+├── Tag retrieval fails
+│ ├── Check security group state
+│ │ ├── Groups properly modified
+│ │ │ └── Continue operation
+│ │ └── Groups not modified
+│ │ └── Fail with SG error
+│ └── Log warning
+└── Tag cleanup fails
+ └── Warning (non-fatal)
+```
+
## Error Handling
### Error Categories
1. **AWS API Errors**
- ConnectionError
- ClientError
- EndpointConnectionError
- NoRegionError
+ - Tag size limitations
+ - API rate limiting
2. **Validation Errors**
- Invalid parameters
- Missing required options
- Invalid security group configurations
+ - Malformed tag data
3. **State Errors**
- Instance state conflicts
- Security group conflicts
- Self-fencing detection
+ - Partial operation completion
### Error Recovery
- Automatic retries for transient AWS API errors
+- Chunked tag handling for large security group lists
+- Support for continuing operation despite tag failures
- Rollback of security group changes on partial failures
- Preservation of backup tags for manual recovery
- Detailed logging for troubleshooting
## Configuration Options
### Required Options
- `--plug`: AWS Instance ID
- AWS credentials (via options or environment)
### Optional Options
- `--region`: AWS region
-- `--secg`: Security groups to remove
+- `--access-key`: AWS access key
+- `--secret-key`: AWS secret key
+- `--secg`: Security groups to remove/keep
- `--skip-race-check`: Skip self-fencing check
-- `--invert-sg-removal`: Invert security group removal
+- `--invert-sg-removal`: Keep only specified security groups
- `--unfence-ignore-restore`: Skip restore on unfence
- `--onfence-poweroff`: Power off on fence
+- `--ignore-tag-write-failure`: Continue despite tag failures
+- `--filter`: Filter instances for list operation
+- `--boto3_debug`: Enable boto3 debug logging
## Logging and Monitoring
### Log Levels
-- ERROR: Operation failures
-- WARNING: Non-critical issues
-- INFO: Operation progress
-- DEBUG: Detailed operation data
+- ERROR: Operation failures and AWS API errors
+- WARNING: Non-critical issues and tag operation failures
+- INFO: Operation progress and success
+- DEBUG: Detailed operation data and API responses
### Key Metrics
- Operation success/failure rates
-- Operation duration
+- Tag operation success rates
+- Security group modification status
- AWS API call latency
- Error frequency and types
+- Tag size and chunking metrics
## Security Considerations
### Authentication
- AWS credential management
- IAM role requirements
- Access key security
+- Instance metadata security
### Operation Safety
- Self-fencing prevention
- Backup verification
- Security group validation
- State verification
+- Tag operation integrity
+- Partial success handling
## Best Practices
1. **Operation Safety**
- Always verify instance state
+ - Use self-fencing prevention
- Validate security group changes
- Maintain accurate backups
- - Prevent self-fencing
+ - Handle tag operation failures gracefully
2. **Error Handling**
- Implement proper rollbacks
+ - Use chunked tag operations
- Maintain detailed logs
- Preserve recovery data
- Handle edge cases
+ - Support partial success scenarios
3. **Performance**
- Minimize API calls
- Implement retries
- Handle rate limiting
- - Optimize operations
+ - Optimize tag operations
+ - Use efficient security group modifications
4. **Maintenance**
- Regular backup cleanup
- Log rotation
- Configuration updates
- Security updates
+ - Monitor tag usage
+ - Clean up orphaned tags
+
diff --git a/tests/data/metadata/fence_aws_vpc_net.xml b/tests/data/metadata/fence_aws_vpc_net.xml
index a252f28e..63607ce6 100644
--- a/tests/data/metadata/fence_aws_vpc_net.xml
+++ b/tests/data/metadata/fence_aws_vpc_net.xml
@@ -1,186 +1,196 @@
<?xml version="1.0" ?>
<resource-agent name="fence_aws_vpc_net" shortdesc="Fence agent for AWS (Amazon Web Services) Net" >
<longdesc>fence_aws_vpc is a Network and Power Fencing agent for AWS VPC that works by manipulating security groups. It uses the boto3 library to connect to AWS.
boto3 can be configured with AWS CLI or by creating ~/.aws/credentials.
For instructions see: https://boto3.readthedocs.io/en/latest/guide/quickstart.html#configuration NOTE: If onfence-poweroff is set, the agent won't be able to power on the node again, it will have to be powered on manually or with other automation.</longdesc>
<vendor-url>http://www.amazon.com</vendor-url>
<parameters>
<parameter name="action" unique="0" required="1">
<getopt mixed="-o, --action=[action]" />
<content type="string" default="reboot" />
<shortdesc lang="en">Fencing action</shortdesc>
</parameter>
<parameter name="plug" unique="0" required="1" obsoletes="port">
<getopt mixed="-n, --plug=[id]" />
<content type="string" />
<shortdesc lang="en">AWS Instance ID to perform action on </shortdesc>
</parameter>
<parameter name="port" unique="0" required="1" deprecated="1">
<getopt mixed="-n, --plug=[id]" />
<content type="string" />
<shortdesc lang="en">AWS Instance ID to perform action on </shortdesc>
</parameter>
<parameter name="region" unique="0" required="0">
<getopt mixed="-r, --region=[region]" />
<content type="string" />
<shortdesc lang="en">AWS Region.</shortdesc>
</parameter>
<parameter name="access_key" unique="0" required="0">
<getopt mixed="-a, --access-key=[key]" />
<content type="string" />
<shortdesc lang="en">AWS Access Key.</shortdesc>
</parameter>
<parameter name="secret_key" unique="0" required="0">
<getopt mixed="-s, --secret-key=[key]" />
<content type="string" />
<shortdesc lang="en">AWS Secret Key.</shortdesc>
</parameter>
<parameter name="secg" unique="0" required="0">
<getopt mixed="--secg=[sg1,sg2,...]" />
<content type="string" />
<shortdesc lang="en">Security Groups to remove.</shortdesc>
</parameter>
<parameter name="skip_race_check" unique="0" required="0">
<getopt mixed="--skip-race-check" />
<content type="boolean" />
<shortdesc lang="en">Skip race condition check.</shortdesc>
</parameter>
<parameter name="invert-sg-removal" unique="0" required="0" deprecated="1">
<getopt mixed="--invert-sg-removal" />
<content type="boolean" />
<shortdesc lang="en">Remove all security groups except specified..</shortdesc>
</parameter>
<parameter name="invert_sg_removal" unique="0" required="0" obsoletes="invert-sg-removal">
<getopt mixed="--invert-sg-removal" />
<content type="boolean" />
<shortdesc lang="en">Remove all security groups except specified..</shortdesc>
</parameter>
<parameter name="unfence-ignore-restore" unique="0" required="0" deprecated="1">
<getopt mixed="--unfence-ignore-restore" />
<content type="boolean" />
<shortdesc lang="en">Remove all security groups except specified..</shortdesc>
</parameter>
<parameter name="unfence_ignore_restore" unique="0" required="0" obsoletes="unfence-ignore-restore">
<getopt mixed="--unfence-ignore-restore" />
<content type="boolean" />
<shortdesc lang="en">Remove all security groups except specified..</shortdesc>
</parameter>
<parameter name="filter" unique="0" required="0">
<getopt mixed="--filter=[key=value]" />
<content type="string" />
<shortdesc lang="en">Filter for list-action</shortdesc>
</parameter>
<parameter name="boto3_debug" unique="0" required="0">
<getopt mixed="-b, --boto3_debug=[option]" />
<content type="string" default="False" />
<shortdesc lang="en">Boto Lib debug</shortdesc>
</parameter>
<parameter name="onfence-poweroff" unique="0" required="0" deprecated="1">
<getopt mixed="--onfence-poweroff" />
<content type="boolean" />
<shortdesc lang="en">Power off the machine async..</shortdesc>
</parameter>
<parameter name="onfence_poweroff" unique="0" required="0" obsoletes="onfence-poweroff">
<getopt mixed="--onfence-poweroff" />
<content type="boolean" />
<shortdesc lang="en">Power off the machine async..</shortdesc>
</parameter>
+ <parameter name="ignore-tag-write-failure" unique="0" required="0" deprecated="1">
+ <getopt mixed="--ignore-tag-write-failure" />
+ <content type="boolean" />
+ <shortdesc lang="en">Continue to fence even if backup tag fails..</shortdesc>
+ </parameter>
+ <parameter name="ignore_tag_write_failure" unique="0" required="0" obsoletes="ignore-tag-write-failure">
+ <getopt mixed="--ignore-tag-write-failure" />
+ <content type="boolean" />
+ <shortdesc lang="en">Continue to fence even if backup tag fails..</shortdesc>
+ </parameter>
<parameter name="quiet" unique="0" required="0">
<getopt mixed="-q, --quiet" />
<content type="boolean" />
<shortdesc lang="en">Disable logging to stderr. Does not affect --verbose or --debug-file or logging to syslog.</shortdesc>
</parameter>
<parameter name="verbose" unique="0" required="0">
<getopt mixed="-v, --verbose" />
<content type="boolean" />
<shortdesc lang="en">Verbose mode. Multiple -v flags can be stacked on the command line (e.g., -vvv) to increase verbosity.</shortdesc>
</parameter>
<parameter name="verbose_level" unique="0" required="0">
<getopt mixed="--verbose-level" />
<content type="integer" />
<shortdesc lang="en">Level of debugging detail in output. Defaults to the number of --verbose flags specified on the command line, or to 1 if verbose=1 in a stonith device configuration (i.e., on stdin).</shortdesc>
</parameter>
<parameter name="debug" unique="0" required="0" deprecated="1">
<getopt mixed="-D, --debug-file=[debugfile]" />
<content type="string" />
<shortdesc lang="en">Write debug information to given file</shortdesc>
</parameter>
<parameter name="debug_file" unique="0" required="0" obsoletes="debug">
<getopt mixed="-D, --debug-file=[debugfile]" />
<shortdesc lang="en">Write debug information to given file</shortdesc>
</parameter>
<parameter name="version" unique="0" required="0">
<getopt mixed="-V, --version" />
<content type="boolean" />
<shortdesc lang="en">Display version information and exit</shortdesc>
</parameter>
<parameter name="help" unique="0" required="0">
<getopt mixed="-h, --help" />
<content type="boolean" />
<shortdesc lang="en">Display help and exit</shortdesc>
</parameter>
<parameter name="plug_separator" unique="0" required="0">
<getopt mixed="--plug-separator=[char]" />
<content type="string" default="," />
<shortdesc lang="en">Separator for plug parameter when specifying more than 1 plug</shortdesc>
</parameter>
<parameter name="separator" unique="0" required="0">
<getopt mixed="-C, --separator=[char]" />
<content type="string" default="," />
<shortdesc lang="en">Separator for CSV created by 'list' operation</shortdesc>
</parameter>
<parameter name="delay" unique="0" required="0">
<getopt mixed="--delay=[seconds]" />
<content type="second" default="0" />
<shortdesc lang="en">Wait X seconds before fencing is started</shortdesc>
</parameter>
<parameter name="disable_timeout" unique="0" required="0">
<getopt mixed="--disable-timeout=[true/false]" />
<content type="string" />
<shortdesc lang="en">Disable timeout (true/false) (default: true when run from Pacemaker 2.0+)</shortdesc>
</parameter>
<parameter name="login_timeout" unique="0" required="0">
<getopt mixed="--login-timeout=[seconds]" />
<content type="second" default="5" />
<shortdesc lang="en">Wait X seconds for cmd prompt after login</shortdesc>
</parameter>
<parameter name="power_timeout" unique="0" required="0">
<getopt mixed="--power-timeout=[seconds]" />
<content type="second" default="20" />
<shortdesc lang="en">Test X seconds for status change after ON/OFF</shortdesc>
</parameter>
<parameter name="power_wait" unique="0" required="0">
<getopt mixed="--power-wait=[seconds]" />
<content type="second" default="0" />
<shortdesc lang="en">Wait X seconds after issuing ON/OFF</shortdesc>
</parameter>
<parameter name="shell_timeout" unique="0" required="0">
<getopt mixed="--shell-timeout=[seconds]" />
<content type="second" default="3" />
<shortdesc lang="en">Wait X seconds for cmd prompt after issuing command</shortdesc>
</parameter>
<parameter name="stonith_status_sleep" unique="0" required="0">
<getopt mixed="--stonith-status-sleep=[seconds]" />
<content type="second" default="1" />
<shortdesc lang="en">Sleep X seconds between status calls during a STONITH action</shortdesc>
</parameter>
<parameter name="retry_on" unique="0" required="0">
<getopt mixed="--retry-on=[attempts]" />
<content type="integer" default="1" />
<shortdesc lang="en">Count of attempts to retry power on</shortdesc>
</parameter>
</parameters>
<actions>
<action name="on" automatic="0"/>
<action name="off" />
<action name="reboot" />
<action name="status" />
<action name="list" />
<action name="list-status" />
<action name="monitor" />
<action name="metadata" />
<action name="manpage" />
<action name="validate-all" />
</actions>
</resource-agent>

File Metadata

Mime Type
text/x-diff
Expires
Mon, Feb 24, 10:57 PM (17 h, 24 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1464507
Default Alt Text
(70 KB)

Event Timeline