No OneTemporary
Actions

Size

70 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/agents/aws_vpc_net/fence_aws_vpc_net.py b/agents/aws_vpc_net/fence_aws_vpc_net.py
	index e1415c52..cf61f2d7 100644
	--- a/agents/aws_vpc_net/fence_aws_vpc_net.py
	+++ b/agents/aws_vpc_net/fence_aws_vpc_net.py
	@@ -1,864 +1,980 @@
	#!@PYTHON@ -tt

	import sys, re
	import json
	import atexit
	import logging
	import time
	import requests

	sys.path.append("@FENCEAGENTSLIBDIR@")

	from fencing import *
	from fencing import (
	run_delay,
	fail,
	fail_usage,
	EC_STATUS,
	EC_GENERIC_ERROR,
	SyslogLibHandler
	)

	# Logger configuration.
	# This must run before the boto3 import guard below, because the guard
	# reports a missing module through `logger`.
	logger = logging.getLogger()
	logger.propagate = False
	logger.setLevel(logging.INFO)
	logger.addHandler(SyslogLibHandler())
	logging.getLogger('botocore.vendored').propagate = False

	try:
	    import boto3
	    from botocore.exceptions import ConnectionError, ClientError, EndpointConnectionError, NoRegionError
	except ImportError:
	    # NOTE(review): exiting here also stops every action that does not need
	    # boto3 (presumably including metadata output) - confirm this is intended.
	    logger.error("Unable to import boto3 module. Please install boto3: pip install boto3")
	    sys.exit(EC_GENERIC_ERROR)

	# Maps EC2 instance state names to the power states reported by the agent.
	status = {
	    "running": "on",
	    "stopped": "off",
	    "pending": "unknown",
	    "stopping": "unknown",
	    "shutting-down": "unknown",
	    "terminated": "unknown",
	}

	def _sg_fence_state(ec2_client, instance_id, options):
	    """Report whether the instance's interfaces are in the fenced security-group state.

	    Returns:
	        True  - every interface matches the fenced state for the given --secg list
	        False - at least one interface is not in the fenced state
	        None  - the state cannot be judged (instance not running, or no --secg given)

	    Raises:
	        Whatever get_instance_details raises; callers decide how to treat it.
	    """
	    state, _, interfaces = get_instance_details(ec2_client, instance_id)
	    sg_list = options.get("--secg", "").split(",") if options.get("--secg") else []
	    if state != "running" or not sg_list:
	        return None

	    keep_only = "--invert-sg-removal" in options
	    for interface in interfaces:
	        current_sgs = interface["SecurityGroups"]
	        if keep_only:
	            # keep_only mode: the interface must hold exactly the specified groups
	            if sorted(current_sgs) != sorted(sg_list):
	                logger.debug(f"Interface {interface['NetworkInterfaceId']} still has different security groups")
	                return False
	        elif any(sg in current_sgs for sg in sg_list):
	            # remove mode: none of the specified groups may still be attached
	            logger.debug(f"Interface {interface['NetworkInterfaceId']} still has security groups that should be removed")
	            return False
	    return True


	def get_power_status(conn, options):
	    """Return "off" when the instance is fenced, otherwise "on".

	    An instance counts as fenced when a backup tag carries the same timestamp
	    as the lastfence tag and the security groups are either verified as
	    modified or cannot be verified (instance not running, no --secg, or the
	    lookup failed). With --ignore-tag-write-failure the security-group state
	    alone is enough to report "off".

	    Args:
	        conn: boto3 EC2 resource; its low-level client is taken from conn.meta.client
	        options: parsed agent options; reads --plug, --secg, --invert-sg-removal
	            and --ignore-tag-write-failure

	    API and unexpected errors end the agent through fail_usage()/fail().
	    """
	    logger.debug("Starting status operation")
	    ignore_tag_failure = "--ignore-tag-write-failure" in options
	    try:
	        instance_id = options["--plug"]
	        ec2_client = conn.meta.client

	        # Get the lastfence tag first
	        lastfence_response = ec2_client.describe_tags(
	            Filters=[
	                {"Name": "resource-id", "Values": [instance_id]},
	                {"Name": "key", "Values": ["lastfence"]},
	            ]
	        )

	        # If --ignore-tag-write-failure is set, prioritize checking SG modifications
	        if ignore_tag_failure:
	            logger.debug("--ignore-tag-write-failure is set, checking security group modifications first")
	            try:
	                fenced_by_sg = _sg_fence_state(ec2_client, instance_id, options) is True
	            except Exception as e:
	                logger.debug("Failed to check security group modifications: %s", e)
	                fenced_by_sg = False
	            if fenced_by_sg:
	                logger.debug("All interfaces have had their security groups successfully modified - considering instance fenced")
	                logger.info("All interfaces are properly fenced based on security group state, ignoring tag state")
	                return "off"
	            logger.debug("Not all interfaces are fenced, proceeding with tag checks")

	        try:
	            # If no lastfence tag exists, instance is not fenced
	            if not lastfence_response["Tags"]:
	                logger.debug("No lastfence tag found for instance %s - instance is not fenced", instance_id)
	                return "on"
	            lastfence_timestamp = lastfence_response["Tags"][0]["Value"]
	        except Exception as e:
	            if ignore_tag_failure:
	                logger.warning(f"Failed to check lastfence tag but continuing due to ignore-tag-write-failure: {str(e)}")
	                return "on"  # Default to "on" to allow fence operation to proceed
	            raise

	        # Check for backup tags with pattern Original_SG_Backup_{instance_id}_*
	        response = ec2_client.describe_tags(
	            Filters=[
	                {"Name": "resource-id", "Values": [instance_id]},
	                {"Name": "key", "Values": [f"Original_SG_Backup_{instance_id}*"]},
	            ]
	        )

	        if not response["Tags"]:
	            logger.debug("No backup tags found for instance %s - instance is not fenced", instance_id)
	            return "on"

	        # Loop through backup tags to find matching timestamp
	        for tag in response["Tags"]:
	            try:
	                backup_data = json.loads(tag["Value"])
	                backup_timestamp = backup_data.get("t")  # Using shortened timestamp field

	                if not backup_timestamp:
	                    logger.debug("No timestamp found in backup data for tag %s", tag["Key"])
	                    continue

	                if str(backup_timestamp) != str(lastfence_timestamp):
	                    continue

	                try:
	                    sg_state = _sg_fence_state(ec2_client, instance_id, options)
	                except Exception as e:
	                    logger.debug("Failed to check security group modifications: %s", e)
	                    sg_state = None

	                if sg_state is True:
	                    logger.debug("Found matching backup tag %s and verified all interfaces have SG changes - instance is fenced", tag["Key"])
	                    return "off"
	                if sg_state is None:
	                    # The tags match but the SG state cannot be verified (lookup
	                    # failed, instance not running, or no --secg). Trust the tags,
	                    # otherwise an instance stopped by --onfence-poweroff would be
	                    # reported as "on".
	                    logger.debug("Found matching backup tag %s but couldn't verify SG changes - assuming instance is fenced", tag["Key"])
	                    return "off"
	                # sg_state is False: the interfaces are verifiably not fenced; keep looking

	            except (json.JSONDecodeError, KeyError) as e:
	                logger.error(f"Failed to parse backup data for tag {tag['Key']}: {str(e)}")
	                continue

	        logger.debug("No backup tags with matching timestamp found - instance is not fenced")
	        return "on"

	    except ClientError:
	        fail_usage("Failed: Incorrect Access Key or Secret Key.")
	    except EndpointConnectionError:
	        fail_usage("Failed: Incorrect Region.")
	    except IndexError:
	        fail(EC_STATUS)
	    except Exception as e:
	        logger.error("Failed to get power status: %s", e)
	        fail(EC_STATUS)

	# Retrieve instance ID for self-check
	def get_instance_id(timeout=5):
	    """Retrieve the instance ID of the current EC2 instance from the metadata service.

	    Args:
	        timeout: seconds to wait for each metadata request, so a non-responding
	            endpoint cannot block the agent indefinitely

	    Returns:
	        The instance ID string, or None when it cannot be retrieved.
	    """
	    try:
	        token_response = requests.put(
	            "http://169.254.169.254/latest/api/token",
	            headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
	            timeout=timeout,
	        )
	        # Reject HTTP errors so an error body is never mistaken for a token
	        token_response.raise_for_status()
	        token = token_response.content.decode("UTF-8")

	        id_response = requests.get(
	            "http://169.254.169.254/latest/meta-data/instance-id",
	            headers={"X-aws-ec2-metadata-token": token},
	            timeout=timeout,
	        )
	        id_response.raise_for_status()
	        return id_response.content.decode("UTF-8")
	    except Exception as err:
	        logger.error("Failed to retrieve instance ID for self-check: %s", err)
	        return None


	# Retrieve instance details
	def get_instance_details(ec2_client, instance_id):
	    """Retrieve the instance state, VPC ID and per-interface security groups.

	    Args:
	        ec2_client: boto3 EC2 client
	        instance_id: ID of the EC2 instance to describe

	    Returns:
	        Tuple (state_name, vpc_id, interfaces); each interface is a dict with
	        "NetworkInterfaceId" and "SecurityGroups" (a list of group IDs).
	        Interfaces missing an expected key are logged and left out.

	    Raises:
	        ClientError, IndexError, KeyError or any other exception from the
	        lookup; each is logged and then re-raised.
	    """
	    try:
	        response = ec2_client.describe_instances(InstanceIds=[instance_id])
	        instance = response["Reservations"][0]["Instances"][0]

	        instance_state = instance["State"]["Name"]
	        vpc_id = instance["VpcId"]

	        interfaces = []
	        for interface in instance["NetworkInterfaces"]:
	            try:
	                interfaces.append(
	                    {
	                        "NetworkInterfaceId": interface["NetworkInterfaceId"],
	                        "SecurityGroups": [sg["GroupId"] for sg in interface["Groups"]],
	                    }
	                )
	            except KeyError as e:
	                logger.error(f"Malformed interface data: {str(e)}")
	                continue

	        return instance_state, vpc_id, interfaces

	    except ClientError as e:
	        logger.error(f"AWS API error while retrieving instance details: {str(e)}")
	        raise
	    except IndexError as e:
	        logger.error(f"Instance {instance_id} not found or no instances returned: {str(e)}")
	        raise
	    except KeyError as e:
	        logger.error(f"Unexpected response format from AWS API: {str(e)}")
	        raise
	    except Exception as e:
	        logger.error(f"Unexpected error while retrieving instance details: {str(e)}")
	        raise

	# Check if we are the self-fencing node
	def get_self_power_status(conn, instance_id):
	    """Check whether this node's own instance is running.

	    Args:
	        conn: boto3 EC2 resource
	        instance_id: ID of the instance the agent itself runs on

	    Returns:
	        "ok" when the instance is running, "alert" for any other state,
	        "fail" when the instance is not returned by the lookup.
	    """
	    try:
	        instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [instance_id]}])
	        state = list(instance)[0].state["Name"]
	        if state == "running":
	            logger.debug(f"Captured my ({instance_id}) state and it {state.upper()} - returning OK - Proceeding with fencing")
	            return "ok"
	        logger.debug(f"Captured my ({instance_id}) state it is {state.upper()} - returning Alert - Unable to fence other nodes")
	        return "alert"

	    except ClientError:
	        fail_usage("Failed: Incorrect Access Key or Secret Key.")
	    except EndpointConnectionError:
	        fail_usage("Failed: Incorrect Region.")
	    except IndexError:
	        return "fail"

	# Create backup tags for each network interface
	def _chunk_security_groups(interface_id, security_groups, timestamp, max_len=254):
	    """Split security group IDs into chunks whose serialized backup fits in one tag value.

	    Each candidate chunk is measured by serializing a backup object shaped like
	    the real one, using the upcoming chunk index and a placeholder total of 1.
	    """
	    chunks = []
	    current = []
	    for sg in security_groups:
	        candidate = current + [sg]
	        probe = {
	            "n": {
	                "i": interface_id,
	                "s": candidate,
	                "c": {"i": len(chunks), "t": 1},  # placeholder total, real value set when tagging
	            },
	            "t": timestamp,
	        }
	        if len(json.dumps(probe)) <= max_len:
	            current = candidate
	        elif current:
	            # Current chunk is full: close it and start a new one with this SG
	            chunks.append(current)
	            current = [sg]
	        else:
	            # Edge case: a single SG exceeds the limit on its own
	            logger.warning(f"Security group ID {sg} is unusually long")
	            chunks.append([sg])
	    if current:
	        chunks.append(current)
	    return chunks


	def create_backup_tag(ec2_client, instance_id, interfaces, timestamp):
	    """Create tags on the instance that back up each interface's original security groups.

	    If an interface's security group list is too long for one tag value, it is
	    split across several tags. Every tag is read back after creation and its
	    value compared with what was written.

	    Args:
	        ec2_client: boto3 EC2 client
	        instance_id: ID of the EC2 instance that receives the tags
	        interfaces: list of dicts with "NetworkInterfaceId" and "SecurityGroups"
	        timestamp: value stored in each backup and embedded in each tag key

	    Raises:
	        ClientError: if an AWS API call fails
	        Exception: if a tag cannot be read back or its value differs
	    """
	    try:
	        for idx, interface in enumerate(interfaces, 1):
	            interface_id = interface["NetworkInterfaceId"]

	            # Strip the 'sg-' prefix to save space in the tag value
	            stripped_sgs = [sg[3:] if sg.startswith('sg-') else sg for sg in interface["SecurityGroups"]]
	            sg_chunks = _chunk_security_groups(interface_id, stripped_sgs, timestamp)

	            for chunk_idx, sg_chunk in enumerate(sg_chunks):
	                sg_backup = {
	                    "n": {  # NetworkInterface shortened to n
	                        "i": interface_id,  # ni shortened to i
	                        "s": sg_chunk,  # sg shortened to s, with 'sg-' prefix stripped
	                        "c": {  # ci shortened to c
	                            "i": chunk_idx,
	                            "t": len(sg_chunks),
	                        },
	                    },
	                    "t": timestamp,  # ts shortened to t
	                }
	                tag_value = json.dumps(sg_backup)
	                tag_key = f"Original_SG_Backup_{instance_id}_{timestamp}_{idx}_{chunk_idx}"

	                ec2_client.create_tags(
	                    Resources=[instance_id],
	                    Tags=[{"Key": tag_key, "Value": tag_value}],
	                )

	                # Verify the tag was created with the expected value
	                response = ec2_client.describe_tags(
	                    Filters=[
	                        {"Name": "resource-id", "Values": [instance_id]},
	                        {"Name": "key", "Values": [tag_key]},
	                    ]
	                )

	                if not response["Tags"]:
	                    logger.error(f"Failed to verify creation of backup tag '{tag_key}' for instance {instance_id}")
	                    raise Exception("Backup tag creation could not be verified")

	                if response["Tags"][0]["Value"] != tag_value:
	                    logger.error(f"Created tag value does not match expected value for instance {instance_id}")
	                    raise Exception("Backup tag value mismatch")

	                logger.info(f"Backup tag '{tag_key}' chunk {chunk_idx + 1}/{len(sg_chunks)} created and verified for interface {interface_id}.")
	    except ClientError as e:
	        logger.error(f"AWS API error while creating/verifying backup tag: {str(e)}")
	        raise
	    except Exception as e:
	        logger.error(f"Unexpected error while creating/verifying backup tag: {str(e)}")
	        raise


	def modify_security_groups(ec2_client, instance_id, sg_list, timestamp, mode="remove", options=None):
	    """
	    Modifies security groups on network interfaces based on the specified mode.
	    In 'remove' mode: Removes all SGs in sg_list from each interface
	    In 'keep_only' mode: Keeps only the SGs in sg_list and removes all others

	    Before any change, a backup tag and then the lastfence tag are written.
	    A tag failure aborts the operation unless --ignore-tag-write-failure is
	    present in options.

	    Args:
	        ec2_client: The boto3 EC2 client
	        instance_id: The ID of the EC2 instance
	        sg_list: List of security group IDs to remove or keep
	        timestamp: Unix timestamp for backup tag
	        mode: Either "remove" or "keep_only" to determine operation mode
	        options: Parsed agent options, or None (treated as no options set)

	    Raises:
	        ClientError: If AWS API calls fail
	        Exception: For other errors, including when no interface was modified
	    """
	    # options defaults to None; guard the membership test so None cannot raise TypeError
	    ignore_tag_failure = "--ignore-tag-write-failure" in (options or {})

	    try:
	        # Get instance details
	        _, _, interfaces = get_instance_details(ec2_client, instance_id)

	        # Create a backup tag before making any changes, then record the fence time.
	        # The lastfence tag is only written when the backup tag was written.
	        try:
	            create_backup_tag(ec2_client, instance_id, interfaces, timestamp)
	        except Exception as e:
	            if not ignore_tag_failure:
	                logger.error(f"Failed to create backup tag: {str(e)}")
	                raise
	            logger.warning(f"Failed to create backup tag but continuing due to --ignore-tag-write-failure: {str(e)}")
	            logger.info("Will rely on security group state for fencing status")
	        else:
	            try:
	                set_lastfence_tag(ec2_client, instance_id, timestamp)
	            except Exception as e:
	                if not ignore_tag_failure:
	                    logger.error(f"Failed to set lastfence tag: {str(e)}")
	                    raise
	                logger.warning(f"Failed to set lastfence tag but continuing due to --ignore-tag-write-failure: {str(e)}")
	                logger.info("Will rely on security group state for fencing status")

	        changed_any = False
	        for interface in interfaces:
	            try:
	                original_sgs = interface["SecurityGroups"]

	                if mode == "remove":
	                    # Exclude any SGs that are in sg_list
	                    updated_sgs = [sg for sg in original_sgs if sg not in sg_list]
	                    operation_desc = f"removing {sg_list}"
	                else:  # keep_only mode
	                    # Set interface to only use the specified security groups
	                    updated_sgs = sg_list
	                    operation_desc = f"keeping only {sg_list}"

	                # Skip if we'd end up with zero SGs (only in remove mode)
	                if mode == "remove" and not updated_sgs:
	                    logger.info(
	                        f"Skipping interface {interface['NetworkInterfaceId']}: "
	                        f"removal of {sg_list} would leave 0 SGs."
	                    )
	                    continue

	                # Skip if no changes needed
	                if updated_sgs == original_sgs:
	                    continue

	                logger.info(
	                    f"Updating interface {interface['NetworkInterfaceId']} from {original_sgs} "
	                    f"to {updated_sgs} ({operation_desc})"
	                )

	                try:
	                    ec2_client.modify_network_interface_attribute(
	                        NetworkInterfaceId=interface["NetworkInterfaceId"],
	                        Groups=updated_sgs
	                    )
	                    changed_any = True
	                except ClientError as e:
	                    logger.error(
	                        f"Failed to modify security groups for interface "
	                        f"{interface['NetworkInterfaceId']}: {str(e)}"
	                    )
	                    continue

	            except KeyError as e:
	                logger.error(f"Malformed interface data: {str(e)}")
	                continue

	        # If we didn't modify anything, raise an error
	        if not changed_any:
	            if mode == "remove":
	                error_msg = f"Security Groups {sg_list} not removed from any interface. Either not found, or removal left 0 SGs."
	            else:
	                error_msg = f"Security Groups {sg_list} not found on any interface. No changes made."
	            logger.error(error_msg)
	            raise Exception("Failed to modify security groups: " + error_msg)

	        # Wait a bit for changes to propagate
	        time.sleep(5)

	    except ClientError as e:
	        logger.error(f"AWS API error: {str(e)}")
	        raise
	    except Exception as e:
	        logger.error(f"Unexpected error: {str(e)}")
	        raise

	def _collect_matching_backups(tags, lastfence_timestamp):
	    """Reassemble per-interface security groups from backup tags stamped with lastfence_timestamp.

	    Returns:
	        Tuple (backups, matching_keys): backups maps interface ID to its full
	        list of security group IDs (only interfaces with all chunks present);
	        matching_keys lists the keys of every tag carrying the timestamp.
	    """
	    backups = {}
	    chunks_by_interface = {}
	    matching_keys = []

	    for tag in tags:
	        try:
	            backup_data = json.loads(tag["Value"])
	            backup_timestamp = backup_data.get("t")  # Using shortened timestamp field

	            stamp_matches = str(backup_timestamp) == str(lastfence_timestamp)
	            if stamp_matches:
	                matching_keys.append(tag["Key"])
	            if not backup_timestamp or not stamp_matches:
	                continue

	            logger.info(f"Found matching backup tag {tag['Key']}")
	            interface_data = backup_data.get("n")  # Using shortened NetworkInterface field
	            if not interface_data or "i" not in interface_data:
	                continue

	            interface_id = interface_data["i"]
	            chunk_info = interface_data.get("c", {})
	            chunk_index = chunk_info.get("i", 0)
	            total_chunks = chunk_info.get("t", 1)

	            chunks = chunks_by_interface.setdefault(interface_id, {})
	            chunks[chunk_index] = interface_data["s"]

	            # Once every chunk of this interface is present, combine them in order
	            if len(chunks) == total_chunks:
	                combined_sgs = []
	                for i in range(total_chunks):
	                    # Add back 'sg-' prefix if not already present
	                    combined_sgs.extend(
	                        'sg-' + sg if not sg.startswith('sg-') else sg for sg in chunks[i]
	                    )
	                backups[interface_id] = combined_sgs

	        except (json.JSONDecodeError, KeyError) as e:
	            logger.error(f"Failed to parse backup data for tag {tag['Key']}: {str(e)}")
	            continue

	    return backups, matching_keys


	def restore_security_groups(ec2_client, instance_id):
	    """
	    Restores the original security groups from backup tags to each network interface.
	    Each network interface's original security groups are stored in backup tags
	    that share the lastfence tag's timestamp.

	    The process:
	    1. Get lastfence tag timestamp
	    2. Find all backup tags with matching timestamp
	    3. Build a map of interface IDs to their original security groups
	    4. Restore each interface's security groups from the map
	    5. Remove the matching backup tags and the lastfence tag, but only when no
	       interface failed to restore, so that a failed restore can be retried

	    Args:
	        ec2_client: The boto3 EC2 client
	        instance_id: The ID of the EC2 instance

	    Raises:
	        ClientError: If AWS API calls fail
	        Exception: For other unexpected errors
	        SystemExit: If required tags are missing or no changes were made
	    """
	    try:
	        # Get the lastfence tag first
	        lastfence_response = ec2_client.describe_tags(
	            Filters=[
	                {"Name": "resource-id", "Values": [instance_id]},
	                {"Name": "key", "Values": ["lastfence"]},
	            ]
	        )

	        if not lastfence_response["Tags"]:
	            logger.error(f"No lastfence tag found for instance {instance_id}")
	            sys.exit(EC_GENERIC_ERROR)

	        lastfence_timestamp = lastfence_response["Tags"][0]["Value"]

	        # Get all backup tags for this instance
	        backup_response = ec2_client.describe_tags(
	            Filters=[
	                {"Name": "resource-id", "Values": [instance_id]},
	                {"Name": "key", "Values": [f"Original_SG_Backup_{instance_id}*"]},
	            ]
	        )

	        if not backup_response["Tags"]:
	            logger.error(f"No backup tags found for instance {instance_id}")
	            sys.exit(EC_GENERIC_ERROR)

	        backup_sg_map, matching_tag_keys = _collect_matching_backups(
	            backup_response["Tags"], lastfence_timestamp
	        )

	        if not backup_sg_map:
	            logger.error("No complete backup data found with matching timestamp")
	            sys.exit(EC_GENERIC_ERROR)

	        # Get current interfaces
	        _, _, current_interfaces = get_instance_details(ec2_client, instance_id)

	        changed_any = False
	        failed_interfaces = []
	        for interface in current_interfaces:
	            try:
	                interface_id = interface["NetworkInterfaceId"]
	                if interface_id not in backup_sg_map:
	                    logger.warning(
	                        f"No backup data found for interface {interface_id}. Skipping."
	                    )
	                    continue

	                original_sgs = backup_sg_map[interface_id]
	                current_sgs = interface["SecurityGroups"]

	                if original_sgs == current_sgs:
	                    logger.info(
	                        f"Interface {interface_id} already has original security groups. Skipping."
	                    )
	                    continue

	                logger.info(
	                    f"Restoring interface {interface_id} from {current_sgs} "
	                    f"to original security groups {original_sgs}"
	                )

	                try:
	                    ec2_client.modify_network_interface_attribute(
	                        NetworkInterfaceId=interface_id,
	                        Groups=original_sgs
	                    )
	                    changed_any = True
	                except ClientError as e:
	                    logger.error(
	                        f"Failed to restore security groups for interface "
	                        f"{interface_id}: {str(e)}"
	                    )
	                    failed_interfaces.append(interface_id)
	                    continue

	            except KeyError as e:
	                logger.error(f"Malformed interface data: {str(e)}")
	                continue

	        if not changed_any:
	            logger.error("No security groups were restored. All interfaces skipped.")
	            sys.exit(EC_GENERIC_ERROR)

	        # Wait for changes to propagate
	        time.sleep(5)

	        if failed_interfaces:
	            # Deleting the tags now would destroy the only copy of the original
	            # security groups for the interfaces that could not be restored.
	            logger.error(
	                f"Keeping backup tags on instance {instance_id}: "
	                f"restore failed for interfaces {failed_interfaces}"
	            )
	            return

	        # Clean up only the matching backup tags and lastfence tag after successful restore
	        try:
	            tags_to_delete = [{"Key": "lastfence"}] + [{"Key": key} for key in matching_tag_keys]
	            if len(tags_to_delete) > 1:  # More than just the lastfence tag
	                ec2_client.delete_tags(
	                    Resources=[instance_id],
	                    Tags=tags_to_delete
	                )
	                logger.info(f"Removed matching backup tags {matching_tag_keys} and lastfence tag from instance {instance_id}")
	        except ClientError as e:
	            logger.warning(f"Failed to remove tags: {str(e)}")
	            # Continue since the restore operation was successful

	    except ClientError as e:
	        logger.error(f"AWS API error: {str(e)}")
	        raise
	    except Exception as e:
	        logger.error(f"Unexpected error: {str(e)}")
	        raise

	# Shutdown instance
	def shutdown_instance(ec2_client, instance_id, poll_interval=1, timeout=60):
	    """Request a forced stop of the instance and wait until it leaves the running state.

	    Args:
	        ec2_client: boto3 EC2 client
	        instance_id: ID of the EC2 instance to stop
	        poll_interval: seconds to sleep between state checks
	        timeout: maximum seconds to wait for the state change; when exceeded a
	            warning is logged and the function returns, since the stop request
	            itself was accepted

	    Errors from the stop request or the state checks end the agent via fail_usage().
	    """
	    # Any of these states means the instance has left "running"
	    leaving_states = ("stopping", "stopped", "shutting-down", "terminated")
	    try:
	        logger.info(f"Initiating shutdown for instance {instance_id}...")
	        ec2_client.stop_instances(InstanceIds=[instance_id], Force=True)

	        deadline = time.monotonic() + timeout
	        while True:
	            try:
	                state, _, _ = get_instance_details(ec2_client, instance_id)
	                logger.info(f"Current instance state: {state}")
	                if state in leaving_states:
	                    logger.info(
	                        f"Instance {instance_id} is transitioning to '{state}'. Proceeding without waiting further."
	                    )
	                    break
	            except ClientError as e:
	                logger.error(f"Failed to get instance state during shutdown: {str(e)}")
	                fail_usage(f"AWS API error while checking instance state: {str(e)}")
	            except Exception as e:
	                logger.error(f"Unexpected error while checking instance state: {str(e)}")
	                fail_usage(f"Failed to check instance state: {str(e)}")

	            if time.monotonic() >= deadline:
	                logger.warning(
	                    f"Instance {instance_id} did not leave the running state within {timeout} seconds"
	                )
	                break
	            # Pause between polls instead of calling the API in a tight loop
	            time.sleep(poll_interval)

	    except ClientError as e:
	        logger.error(f"AWS API error during instance shutdown: {str(e)}")
	        fail_usage(f"Failed to shutdown instance: {str(e)}")
	    except Exception as e:
	        logger.error(f"Unexpected error during instance shutdown: {str(e)}")
	        fail_usage(f"Failed to shutdown instance due to unexpected error: {str(e)}")


	# Perform the fencing action
	def get_nodes_list(conn, options):
	    """Get list of nodes and their status.

	    Args:
	        conn: boto3 EC2 resource
	        options: parsed agent options; "--filter" ("key=value") narrows the
	            listing, and "--original-action" controls logging of unknown states

	    Returns:
	        Dict mapping instance ID to (Name tag value or "", power status).
	        An empty or partial dict is returned if the listing fails.
	    """
	    logger.debug("Starting monitor operation")
	    result = {}
	    try:
	        filters = []
	        if "--filter" in options:
	            # Split on the first "=" only, so values containing "=" stay intact
	            filter_key, filter_value = (part.strip() for part in options["--filter"].split("=", 1))
	            filters = [{"Name": filter_key, "Values": [filter_value]}]
	            logger.debug("Filter: {}".format(filters))

	        for instance in conn.instances.filter(Filters=filters):
	            instance_name = ""
	            for tag in instance.tags or []:
	                if tag.get("Key") == "Name":
	                    instance_name = tag["Value"]

	            state_name = instance.state["Name"]
	            if state_name in status:
	                result[instance.id] = (instance_name, status[state_name])
	            else:
	                if options.get("--original-action") == "list-status":
	                    logger.error("Unknown status \"{}\" returned for {} ({})".format(state_name, instance.id, instance_name))
	                result[instance.id] = (instance_name, "unknown")
	    except Exception as e:
	        logger.error("Failed to get node list: %s", e)
	    return result

	def set_lastfence_tag(ec2_client, instance_id, timestamp):
	    """Set a lastfence tag on the instance with the timestamp.

	    Args:
	        ec2_client: boto3 EC2 client
	        instance_id: ID of the EC2 instance to tag
	        timestamp: fence timestamp; stored as its string form

	    Raises:
	        Exception: any error from the tagging call, logged and re-raised
	    """
	    try:
	        ec2_client.create_tags(
	            Resources=[instance_id],
	            Tags=[{"Key": "lastfence", "Value": str(timestamp)}],
	        )
	        logger.info(f"Set lastfence tag with timestamp {timestamp} on instance {instance_id}")
	    except Exception as e:
	        logger.error(f"Failed to set lastfence tag: {str(e)}")
	        raise

	def set_power_status(conn, options):
	    """Set power status of the instance.

	    Action "off" fences the instance by modifying its security groups (and
	    optionally powering it off); action "on" restores the security groups
	    from the backup tags unless --unfence-ignore-restore is set.

	    Args:
	        conn: boto3 EC2 resource; its low-level client is taken from conn.meta.client
	        options: parsed agent options; reads --plug, --action, --secg,
	            --skip-race-check, --invert-sg-removal, --unfence-ignore-restore
	            and --onfence-poweroff

	    Failures end the agent through fail_usage()/fail().
	    """
	    timestamp = int(time.time())  # Unix timestamp
	    ec2_client = conn.meta.client
	    instance_id = options["--plug"]
	    sg_to_remove = options.get("--secg", "").split(",") if options.get("--secg") else []

	    # Perform self-check if skip-race not set
	    if "--skip-race-check" not in options:
	        self_instance_id = get_instance_id()
	        if self_instance_id == instance_id:
	            fail_usage("Self-fencing detected. Exiting.")

	    try:
	        action = options["--action"]

	        # Only verify instance is running for 'off' action
	        if action == "off":
	            instance_state, _, _ = get_instance_details(ec2_client, instance_id)
	            if instance_state != "running":
	                fail_usage(f"Instance {instance_id} is not running. Exiting.")

	        if action == "on":
	            if "--unfence-ignore-restore" not in options:
	                restore_security_groups(ec2_client, instance_id)
	            else:
	                logger.info("Ignored Restoring security groups as --unfence-ignore-restore is set")
	        elif action == "off" and sg_to_remove:
	            mode = "keep_only" if "--invert-sg-removal" in options else "remove"
	            try:
	                modify_security_groups(ec2_client, instance_id, sg_to_remove, timestamp, mode, options)
	                if "--onfence-poweroff" in options:
	                    shutdown_instance(ec2_client, instance_id)
	            except Exception as e:
	                # modify_security_groups already tolerates tag-write failures
	                # when --ignore-tag-write-failure is set, so any exception that
	                # reaches this point means the security groups were not changed
	                # and must never be reported as a successful fence.
	                if isinstance(e, ClientError):
	                    logger.error("AWS API error: %s", e)
	                    fail_usage(str(e))
	                else:
	                    logger.error("Failed to set power status: %s", e)
	                    fail(EC_STATUS)
	    except Exception as e:
	        logger.error("Unexpected error in set_power_status: %s", e)
	        fail(EC_STATUS)


	# Define fencing agent options
	def define_new_opts():
	all_opt["port"]["help"] = "-n, --plug=[id] AWS Instance ID to perform action on "
	all_opt["port"]["shortdesc"] = "AWS Instance ID to perform action on "

	all_opt["region"] = {
	"getopt": "r:",
	"longopt": "region",
	"help": "-r, --region=[region] AWS region (e.g., us-east-1)",
	"shortdesc": "AWS Region.",
	"required": "0",
	"order": 1,
	}
	all_opt["access_key"] = {
	"getopt": "a:",
	"longopt": "access-key",
	"help": "-a, --access-key=[key] AWS access key.",
	"shortdesc": "AWS Access Key.",
	"required": "0",
	"order": 2,
	}
	all_opt["secret_key"] = {
	"getopt": "s:",
	"longopt": "secret-key",
	"help": "-s, --secret-key=[key] AWS secret key.",
	"shortdesc": "AWS Secret Key.",
	"required": "0",
	"order": 3,
	}
	all_opt["secg"] = {
	"getopt": ":",
	"longopt": "secg",
	"help": "--secg=[sg1,sg2,...] Comma-separated list of Security Groups to remove.",
	"shortdesc": "Security Groups to remove.",
	"required": "0",
	"order": 4,
	}
	all_opt["skip_race_check"] = {
	"getopt": "",
	"longopt": "skip-race-check",
	"help": "--skip-race-check Skip race condition check.",
	"shortdesc": "Skip race condition check.",
	"required": "0",
	"order": 6,
	}
	all_opt["invert-sg-removal"] = {
	"getopt": "",
	"longopt": "invert-sg-removal",
	"help": "--invert-sg-removal Remove all security groups except the specified one(s).",
	"shortdesc": "Remove all security groups except specified..",
	"required": "0",
	"order": 7,
	}
	all_opt["unfence-ignore-restore"] = {
	"getopt": "",
	"longopt": "unfence-ignore-restore",
	"help": "--unfence-ignore-restore Do not restore security groups from tag when unfencing (off).",
	"shortdesc": "Remove all security groups except specified..",
	"required": "0",
	"order": 8,

	}
	all_opt["filter"] = {
	"getopt": ":",
	"longopt": "filter",
	"help": "--filter=[key=value] Filter (e.g. vpc-id=[vpc-XXYYZZAA])",
	"shortdesc": "Filter for list-action",
	"required": "0",
	"order": 9
	}
	all_opt["boto3_debug"] = {
	"getopt": "b:",
	"longopt": "boto3_debug",
	"help": "-b, --boto3_debug=[option] Boto3 and Botocore library debug logging",
	"shortdesc": "Boto Lib debug",
	"required": "0",
	"default": "False",
	"order": 10
	}
	all_opt["onfence-poweroff"] = {
	"getopt": "",
	"longopt": "onfence-poweroff",
	"help": "--onfence-poweroff Power off the machine async upon fence (this is a network fencing agent...)",
	"shortdesc": "Power off the machine async..",
	"required": "0",
	"order": 11
	}
	+ all_opt["ignore-tag-write-failure"] = {
	+ "getopt": "",
	+ "longopt": "ignore-tag-write-failure",
	+ "help": "--ignore-tag-write-failure Continue to fence even if backup tag fails. This ensures prioritization of fencing over AWS backplane access",
	+ "shortdesc": "Continue to fence even if backup tag fails..",
	+ "required": "0",
	+ "order": 12
	+ }


	def main():
	conn = None

	device_opt = [
	"no_password",
	"region",
	"access_key",
	"secret_key",
	"secg",
	"port",
	"skip_race_check",
	"invert-sg-removal",
	"unfence-ignore-restore",
	"filter",
	"boto3_debug",
	- "onfence-poweroff"
	+ "onfence-poweroff",
	+ "ignore-tag-write-failure"
	]

	atexit.register(atexit_handler)

	define_new_opts()

	try:
	processed_input = process_input(device_opt)
	options = check_input(device_opt, processed_input)
	except Exception as e:
	logger.error(f"Failed to process input options: {str(e)}")
	sys.exit(EC_GENERIC_ERROR)

	run_delay(options)

	docs = {
	"shortdesc": "Fence agent for AWS (Amazon Web Services) Net",
	"longdesc": (
	"fence_aws_vpc is a Network and Power Fencing agent for AWS VPC that works by "
	"manipulating security groups. It uses the boto3 library to connect to AWS.\n\n"
	"boto3 can be configured with AWS CLI or by creating ~/.aws/credentials.\n"
	"For instructions see: https://boto3.readthedocs.io/en/latest/guide/quickstart.html#configuration"
	" "
	"NOTE: If onfence-poweroff is set, the agent won't be able to power on the node again, it will have to be powered on manually or with other automation."
	),
	"vendorurl": "http://www.amazon.com"
	}
	show_docs(options, docs)

	if "--onfence-poweroff" not in options and options.get("--action", "") == "reboot":
	options["--action"] = "off"

	# Configure logging
	if "--debug-file" in options:
	for handler in logger.handlers:
	if isinstance(handler, logging.FileHandler):
	logger.removeHandler(handler)
	lh = logging.FileHandler(options["--debug-file"])
	logger.addHandler(lh)
	lhf = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
	lh.setFormatter(lhf)
	lh.setLevel(logging.DEBUG)

	# Configure boto3 logging
	if options.get("--boto3_debug", "").lower() not in ["1", "yes", "on", "true"]:
	boto3.set_stream_logger('boto3', logging.INFO)
	boto3.set_stream_logger('botocore', logging.CRITICAL)
	logging.getLogger('botocore').propagate = False
	logging.getLogger('boto3').propagate = False
	else:
	log_format = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
	logging.getLogger('botocore').propagate = False
	logging.getLogger('boto3').propagate = False
	fdh = logging.FileHandler('/var/log/fence_aws_vpc_boto3.log')
	fdh.setFormatter(log_format)
	logging.getLogger('boto3').addHandler(fdh)
	logging.getLogger('botocore').addHandler(fdh)
	logging.debug("Boto debug level is %s and sending debug info to /var/log/fence_aws_vpc_boto3.log",
	options.get("--boto3_debug"))

	# Establish AWS connection
	region = options.get("--region")
	access_key = options.get("--access-key")
	secret_key = options.get("--secret-key")

	try:
	conn = boto3.resource(
	"ec2",
	region_name=region,
	aws_access_key_id=access_key,
	aws_secret_access_key=secret_key,
	)
	except Exception as e:
	if not options.get("--action", "") in ["metadata", "manpage", "validate-all"]:
	fail_usage("Failed: Unable to connect to AWS: " + str(e))
	else:
	pass

	# Operate the fencing device using the fence library's fence_action
	result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list)
	sys.exit(result)


	if __name__ == "__main__":
	main()
	+
	diff --git a/agents/aws_vpc_net/readme.md b/agents/aws_vpc_net/readme.md
	index 47f467ea..de1a12e5 100644
	--- a/agents/aws_vpc_net/readme.md
	+++ b/agents/aws_vpc_net/readme.md
	@@ -1,328 +1,461 @@
	# Fence AWS VPC Network Agent Design Document

	## Overview

	The fence_aws_vpc_net agent is a network and power fencing agent for AWS VPC that operates by manipulating security groups. This document outlines the design and architecture of the agent.

	## Class Diagram

	```mermaid
	classDiagram
	class FenceAWSVPCNet {
	-logger: Logger
	-conn: boto3.resource
	-options: dict
	+main()
	+define_new_opts()
	+process_input()
	+check_input()
	+ +get_power_status()
	+ +set_power_status()
	}

	- class AWSConnection {
	- -region: str
	- -access_key: str
	- -secret_key: str
	- +establish_connection()
	- +validate_credentials()
	+ class InstanceOperations {
	+ +get_instance_id()
	+ +get_instance_details()
	+ +shutdown_instance()
	+ +get_nodes_list()
	}

	- class SecurityGroupManager {
	+ class SecurityGroupOperations {
	+modify_security_groups()
	+create_backup_tag()
	+restore_security_groups()
	-validate_sg_changes()
	}

	- class InstanceManager {
	- +get_instance_details()
	- +shutdown_instance()
	- +get_power_status()
	- +set_power_status()
	- -validate_instance_state()
	- }
	-
	- class TagManager {
	+ class TagOperations {
	+set_lastfence_tag()
	- +get_backup_tags()
	- +cleanup_tags()
	- -validate_tag_operations()
	+ +create_backup_tag()
	+ +restore_from_backup()
	+ -handle_chunked_tags()
	}

	- FenceAWSVPCNet --> AWSConnection
	- FenceAWSVPCNet --> SecurityGroupManager
	- FenceAWSVPCNet --> InstanceManager
	- SecurityGroupManager --> TagManager
	- InstanceManager --> TagManager
	+ class LoggingManager {
	+ +configure_logging()
	+ +configure_boto3_logging()
	+ +handle_debug_file()
	+ }

	+ FenceAWSVPCNet --> InstanceOperations
	+ FenceAWSVPCNet --> SecurityGroupOperations
	+ FenceAWSVPCNet --> TagOperations
	+ FenceAWSVPCNet --> LoggingManager
	+ SecurityGroupOperations --> TagOperations
	```

	## Sequence Diagrams

	### Fence Operation (Power Off)

	```mermaid
	sequenceDiagram
	participant Client
	participant FenceAgent
	participant AWS
	participant SecurityGroups
	participant Tags

	Client->>FenceAgent: Execute fence operation
	FenceAgent->>AWS: Validate AWS credentials
	AWS-->>FenceAgent: Credentials valid

	+ opt skip-race-check not set
	+ FenceAgent->>AWS: Get self instance ID
	+ AWS-->>FenceAgent: Instance ID
	+ FenceAgent->>FenceAgent: Check for self-fencing
	+ end
	+
	FenceAgent->>AWS: Get instance details
	AWS-->>FenceAgent: Instance details

	alt Instance is running
	FenceAgent->>SecurityGroups: Backup current security groups
	SecurityGroups-->>FenceAgent: Backup created

	- FenceAgent->>Tags: Create lastfence tag
	- Tags-->>FenceAgent: Tag created
	+ alt ignore-tag-write-failure not set
	+ FenceAgent->>Tags: Create lastfence tag
	+ Tags-->>FenceAgent: Tag created
	+ end

	FenceAgent->>SecurityGroups: Modify security groups
	SecurityGroups-->>FenceAgent: Groups modified

	opt onfence-poweroff enabled
	FenceAgent->>AWS: Initiate shutdown
	AWS-->>FenceAgent: Shutdown initiated
	end

	FenceAgent-->>Client: Success
	else Instance not running
	FenceAgent-->>Client: Fail - Instance not running
	end
	```

	### Unfence Operation (Power On)

	```mermaid
	sequenceDiagram
	participant Client
	participant FenceAgent
	participant AWS
	participant SecurityGroups
	participant Tags

	Client->>FenceAgent: Execute unfence operation
	FenceAgent->>AWS: Validate AWS credentials
	AWS-->>FenceAgent: Credentials valid

	- FenceAgent->>Tags: Get lastfence tag
	- Tags-->>FenceAgent: Lastfence tag
	+ alt unfence-ignore-restore not set
	+ FenceAgent->>Tags: Get lastfence tag
	+ Tags-->>FenceAgent: Lastfence tag

	- FenceAgent->>Tags: Get backup tags
	- Tags-->>FenceAgent: Backup tags
	+ FenceAgent->>Tags: Get backup tags
	+ Tags-->>FenceAgent: Backup tags

	- alt Valid backup found
	- FenceAgent->>SecurityGroups: Restore original security groups
	- SecurityGroups-->>FenceAgent: Groups restored
	+ alt Valid backup found
	+ FenceAgent->>SecurityGroups: Restore original security groups
	+ SecurityGroups-->>FenceAgent: Groups restored

	- FenceAgent->>Tags: Cleanup backup tags
	- Tags-->>FenceAgent: Tags cleaned
	+ FenceAgent->>Tags: Cleanup backup tags
	+ Tags-->>FenceAgent: Tags cleaned

	- FenceAgent-->>Client: Success
	- else No valid backup
	- FenceAgent-->>Client: Fail - No valid backup found
	+ FenceAgent-->>Client: Success
	+ else No valid backup
	+ FenceAgent-->>Client: Fail - No valid backup found
	+ end
	+ else
	+ FenceAgent-->>Client: Success - Restore skipped
	end
	```

	## Component Details

	### 1. Main Controller (FenceAWSVPCNet)
	- Purpose: Main entry point and orchestration
	- Key Responsibilities:
	- Process command line options
	- Initialize AWS connection
	- Execute fence operations
	- Handle logging and errors
	+ - Manage self-fencing prevention
	+ - Support tag write failure handling

	-### 2. AWS Connection Manager
	-- Purpose: Handle AWS connectivity
	+### 2. Instance Operations
	+- Purpose: Handle EC2 instance operations
	- Key Responsibilities:
	- - Establish and maintain AWS connection
	- - Handle credentials and regions
	- - Manage API retries and timeouts
	+ - Get instance details and metadata
	+ - Handle instance power operations
	+ - Validate instance states
	+ - List and filter instances
	+ - Handle instance shutdown

	-### 3. Security Group Manager
	+### 3. Security Group Operations
	- Purpose: Manage security group operations
	- Key Responsibilities:
	- - Modify security groups
	- - Create backups of security group configurations
	+ - Modify security groups (remove or keep-only modes)
	+ - Handle chunked backup operations
	- Restore security groups from backups
	- Validate security group changes
	+ - Support partial success scenarios

	-### 4. Instance Manager
	-- Purpose: Handle EC2 instance operations
	-- Key Responsibilities:
	- - Get instance details and status
	- - Handle instance power operations
	- - Validate instance states
	- - Manage self-fencing prevention
	-
	-### 5. Tag Manager
	+### 4. Tag Operations
	- Purpose: Manage AWS resource tagging
	- Key Responsibilities:
	- Create and manage backup tags
	- - Handle lastfence tags
	+ - Handle chunked tag data
	+ - Manage lastfence tags
	- Clean up tags after operations
	- - Validate tag operations
	+ - Support tag write failure scenarios
	+
	+### 5. Logging Manager
	+- Purpose: Handle logging configuration
	+- Key Responsibilities:
	+ - Configure application logging
	+ - Manage boto3 debug logging
	+ - Handle debug file output
	+ - Control log propagation

	## Success and Failure Paths

	### Success Paths

	-1. Normal Fence Operation
	+1. Normal Fence Operation (Without ignore-tag-write-failure)
	```
	Start
	├── Validate AWS credentials
	+├── Check for self-fencing (if enabled)
	├── Check instance is running
	-├── Backup security groups
	+├── Backup security groups (with chunking)
	+│ ├── Create backup tags for each interface
	+│ └── Verify backup tag creation
	├── Create lastfence tag
	├── Modify security groups
	+│ ├── Remove specified groups
	+│ └── Verify modifications
	├── [Optional] Shutdown instance
	└── Success
	```

	-2. Normal Unfence Operation
	+2. Fence Operation (With ignore-tag-write-failure)
	```
	Start
	├── Validate AWS credentials
	-├── Find lastfence tag
	-├── Find backup tags
	-├── Restore security groups
	-├── Clean up tags
	+├── Check for self-fencing (if enabled)
	+├── Check instance is running
	+├── Attempt backup tag creation
	+│ ├── Success: Create backup tags
	+│ └── Failure: Log warning and continue
	+├── Attempt lastfence tag creation
	+│ ├── Success: Create tag
	+│ └── Failure: Log warning and continue
	+├── Modify security groups
	+│ ├── Remove specified groups
	+│ ├── Verify modifications
	+│ └── Check all interfaces modified
	+│ ├── All modified: Success
	+│ └── Partial: Fail with modification error
	+├── [Optional] Shutdown instance
	+└── Success (if security groups modified)
	+```
	+
	+3. Normal Unfence Operation
	+```
	+Start
	+├── Validate AWS credentials
	+├── [Skip if unfence-ignore-restore]
	+│ ├── Find lastfence tag
	+│ ├── Find backup tags
	+│ ├── Restore security groups
	+│ └── Clean up tags
	└── Success
	```

	### Failure Paths

	1. Authentication Failures
	```
	Start
	├── Invalid AWS credentials
	+│ ├── Missing credentials
	+│ ├── Invalid access key
	+│ ├── Invalid secret key
	+│ └── Invalid region
	└── Fail with auth error
	```

	2. Instance State Failures
	```
	Start
	+├── Instance not found
	+│ └── Fail with instance error
	├── Instance not in required state
	-└── Fail with state error
	+│ └── Fail with state error
	+└── Self-fencing detected
	+ └── Fail with self-fencing error
	```

	-3. Security Group Operation Failures
	+3. Security Group Operation Failures (Without ignore-tag-write-failure)
	```
	Start
	├── Backup creation fails
	+│ ├── Tag size too large
	+│ ├── API error
	│ └── Fail with backup error
	├── Security group modification fails
	+│ ├── Permission denied
	+│ ├── Invalid group ID
	+│ ├── Rate limit exceeded
	│ └── Fail with modification error
	└── Restoration fails
	+ ├── Missing backup data
	+ ├── Invalid backup format
	+ ├── Modification error
	└── Fail with restore error
	```

	-4. Tag Operation Failures
	+4. Security Group Operation Failures (With ignore-tag-write-failure)
	+```
	+Start
	+├── Backup creation fails
	+│ ├── Log warning
	+│ └── Continue to modifications
	+├── Security group modification attempt
	+│ ├── Success: All interfaces modified
	+│ │ └── Continue to completion
	+│ ├── Partial success
	+│ │ ├── Verify fencing state
	+│ │ │ ├── Sufficient interfaces modified
	+│ │ │ │ └── Continue to completion
	+│ │ │ └── Insufficient modifications
	+│ │ │ └── Fail with partial error
	+│ │ └── Log warning
	+│ └── Complete failure
	+│ └── Fail with modification error
	+├── [Optional] Shutdown attempt
	+│ ├── Success
	+│ │ └── Continue to completion
	+│ └── Failure
	+│ └── Log warning (non-fatal)
	+└── Final state determined by SG modifications
	+```
	+
	+5. Tag Operation Failures (Without ignore-tag-write-failure)
	```
	Start
	├── Tag creation fails
	+│ ├── Size limit exceeded
	+│ ├── API error
	│ └── Fail with tag error
	├── Tag retrieval fails
	+│ ├── Missing tags
	+│ ├── Invalid format
	│ └── Fail with retrieval error
	└── Tag cleanup fails
	└── Warning (non-fatal)
	```

	+6. Tag Operation Failures (With ignore-tag-write-failure)
	+```
	+Start
	+├── Backup tag creation fails
	+│ ├── Log warning
	+│ └── Continue operation
	+├── Lastfence tag creation fails
	+│ ├── Log warning
	+│ └── Continue operation
	+├── Tag retrieval fails
	+│ ├── Check security group state
	+│ │ ├── Groups properly modified
	+│ │ │ └── Continue operation
	+│ │ └── Groups not modified
	+│ │ └── Fail with SG error
	+│ └── Log warning
	+└── Tag cleanup fails
	+ └── Warning (non-fatal)
	+```
	+
	## Error Handling

	### Error Categories
	1. AWS API Errors
	- ConnectionError
	- ClientError
	- EndpointConnectionError
	- NoRegionError
	+ - Tag size limitations
	+ - API rate limiting

	2. Validation Errors
	- Invalid parameters
	- Missing required options
	- Invalid security group configurations
	+ - Malformed tag data

	3. State Errors
	- Instance state conflicts
	- Security group conflicts
	- Self-fencing detection
	+ - Partial operation completion

	### Error Recovery
	- Automatic retries for transient AWS API errors
	+- Chunked tag handling for large security group lists
	+- Support for continuing operation despite tag failures
	- Rollback of security group changes on partial failures
	- Preservation of backup tags for manual recovery
	- Detailed logging for troubleshooting

	## Configuration Options

	### Required Options
	- `--plug`: AWS Instance ID
	- AWS credentials (via options or environment)

	### Optional Options
	- `--region`: AWS region
	-- `--secg`: Security groups to remove
	+- `--access-key`: AWS access key
	+- `--secret-key`: AWS secret key
	+- `--secg`: Security groups to remove/keep
	- `--skip-race-check`: Skip self-fencing check
	-- `--invert-sg-removal`: Invert security group removal
	+- `--invert-sg-removal`: Keep only specified security groups
	- `--unfence-ignore-restore`: Skip restore on unfence
	- `--onfence-poweroff`: Power off on fence
	+- `--ignore-tag-write-failure`: Continue despite tag failures
	+- `--filter`: Filter instances for list operation
	+- `--boto3_debug`: Enable boto3 debug logging

	## Logging and Monitoring

	### Log Levels
	-- ERROR: Operation failures
	-- WARNING: Non-critical issues
	-- INFO: Operation progress
	-- DEBUG: Detailed operation data
	+- ERROR: Operation failures and AWS API errors
	+- WARNING: Non-critical issues and tag operation failures
	+- INFO: Operation progress and success
	+- DEBUG: Detailed operation data and API responses

	### Key Metrics
	- Operation success/failure rates
	-- Operation duration
	+- Tag operation success rates
	+- Security group modification status
	- AWS API call latency
	- Error frequency and types
	+- Tag size and chunking metrics

	## Security Considerations

	### Authentication
	- AWS credential management
	- IAM role requirements
	- Access key security
	+- Instance metadata security

	### Operation Safety
	- Self-fencing prevention
	- Backup verification
	- Security group validation
	- State verification
	+- Tag operation integrity
	+- Partial success handling

	## Best Practices

	1. Operation Safety
	- Always verify instance state
	+ - Use self-fencing prevention
	- Validate security group changes
	- Maintain accurate backups
	- - Prevent self-fencing
	+ - Handle tag operation failures gracefully

	2. Error Handling
	- Implement proper rollbacks
	+ - Use chunked tag operations
	- Maintain detailed logs
	- Preserve recovery data
	- Handle edge cases
	+ - Support partial success scenarios

	3. Performance
	- Minimize API calls
	- Implement retries
	- Handle rate limiting
	- - Optimize operations
	+ - Optimize tag operations
	+ - Use efficient security group modifications

	4. Maintenance
	- Regular backup cleanup
	- Log rotation
	- Configuration updates
	- Security updates
	+ - Monitor tag usage
	+ - Clean up orphaned tags
	+
	diff --git a/tests/data/metadata/fence_aws_vpc_net.xml b/tests/data/metadata/fence_aws_vpc_net.xml
	index a252f28e..63607ce6 100644
	--- a/tests/data/metadata/fence_aws_vpc_net.xml
	+++ b/tests/data/metadata/fence_aws_vpc_net.xml
	@@ -1,186 +1,196 @@
	<?xml version="1.0" ?>
	<resource-agent name="fence_aws_vpc_net" shortdesc="Fence agent for AWS (Amazon Web Services) Net" >
	<longdesc>fence_aws_vpc is a Network and Power Fencing agent for AWS VPC that works by manipulating security groups. It uses the boto3 library to connect to AWS.

	boto3 can be configured with AWS CLI or by creating ~/.aws/credentials.
	For instructions see: https://boto3.readthedocs.io/en/latest/guide/quickstart.html#configuration NOTE: If onfence-poweroff is set, the agent won't be able to power on the node again, it will have to be powered on manually or with other automation.</longdesc>
	<vendor-url>http://www.amazon.com</vendor-url>
	<parameters>
	<parameter name="action" unique="0" required="1">
	<getopt mixed="-o, --action=[action]" />
	<content type="string" default="reboot" />
	<shortdesc lang="en">Fencing action</shortdesc>
	</parameter>
	<parameter name="plug" unique="0" required="1" obsoletes="port">
	<getopt mixed="-n, --plug=[id]" />
	<content type="string" />
	<shortdesc lang="en">AWS Instance ID to perform action on </shortdesc>
	</parameter>
	<parameter name="port" unique="0" required="1" deprecated="1">
	<getopt mixed="-n, --plug=[id]" />
	<content type="string" />
	<shortdesc lang="en">AWS Instance ID to perform action on </shortdesc>
	</parameter>
	<parameter name="region" unique="0" required="0">
	<getopt mixed="-r, --region=[region]" />
	<content type="string" />
	<shortdesc lang="en">AWS Region.</shortdesc>
	</parameter>
	<parameter name="access_key" unique="0" required="0">
	<getopt mixed="-a, --access-key=[key]" />
	<content type="string" />
	<shortdesc lang="en">AWS Access Key.</shortdesc>
	</parameter>
	<parameter name="secret_key" unique="0" required="0">
	<getopt mixed="-s, --secret-key=[key]" />
	<content type="string" />
	<shortdesc lang="en">AWS Secret Key.</shortdesc>
	</parameter>
	<parameter name="secg" unique="0" required="0">
	<getopt mixed="--secg=[sg1,sg2,...]" />
	<content type="string" />
	<shortdesc lang="en">Security Groups to remove.</shortdesc>
	</parameter>
	<parameter name="skip_race_check" unique="0" required="0">
	<getopt mixed="--skip-race-check" />
	<content type="boolean" />
	<shortdesc lang="en">Skip race condition check.</shortdesc>
	</parameter>
	<parameter name="invert-sg-removal" unique="0" required="0" deprecated="1">
	<getopt mixed="--invert-sg-removal" />
	<content type="boolean" />
	<shortdesc lang="en">Remove all security groups except specified..</shortdesc>
	</parameter>
	<parameter name="invert_sg_removal" unique="0" required="0" obsoletes="invert-sg-removal">
	<getopt mixed="--invert-sg-removal" />
	<content type="boolean" />
	<shortdesc lang="en">Remove all security groups except specified..</shortdesc>
	</parameter>
	<parameter name="unfence-ignore-restore" unique="0" required="0" deprecated="1">
	<getopt mixed="--unfence-ignore-restore" />
	<content type="boolean" />
	<shortdesc lang="en">Remove all security groups except specified..</shortdesc>
	</parameter>
	<parameter name="unfence_ignore_restore" unique="0" required="0" obsoletes="unfence-ignore-restore">
	<getopt mixed="--unfence-ignore-restore" />
	<content type="boolean" />
	<shortdesc lang="en">Remove all security groups except specified..</shortdesc>
	</parameter>
	<parameter name="filter" unique="0" required="0">
	<getopt mixed="--filter=[key=value]" />
	<content type="string" />
	<shortdesc lang="en">Filter for list-action</shortdesc>
	</parameter>
	<parameter name="boto3_debug" unique="0" required="0">
	<getopt mixed="-b, --boto3_debug=[option]" />
	<content type="string" default="False" />
	<shortdesc lang="en">Boto Lib debug</shortdesc>
	</parameter>
	<parameter name="onfence-poweroff" unique="0" required="0" deprecated="1">
	<getopt mixed="--onfence-poweroff" />
	<content type="boolean" />
	<shortdesc lang="en">Power off the machine async..</shortdesc>
	</parameter>
	<parameter name="onfence_poweroff" unique="0" required="0" obsoletes="onfence-poweroff">
	<getopt mixed="--onfence-poweroff" />
	<content type="boolean" />
	<shortdesc lang="en">Power off the machine async..</shortdesc>
	</parameter>
	+ <parameter name="ignore-tag-write-failure" unique="0" required="0" deprecated="1">
	+ <getopt mixed="--ignore-tag-write-failure" />
	+ <content type="boolean" />
	+ <shortdesc lang="en">Continue to fence even if backup tag fails..</shortdesc>
	+ </parameter>
	+ <parameter name="ignore_tag_write_failure" unique="0" required="0" obsoletes="ignore-tag-write-failure">
	+ <getopt mixed="--ignore-tag-write-failure" />
	+ <content type="boolean" />
	+ <shortdesc lang="en">Continue to fence even if backup tag fails..</shortdesc>
	+ </parameter>
	<parameter name="quiet" unique="0" required="0">
	<getopt mixed="-q, --quiet" />
	<content type="boolean" />
	<shortdesc lang="en">Disable logging to stderr. Does not affect --verbose or --debug-file or logging to syslog.</shortdesc>
	</parameter>
	<parameter name="verbose" unique="0" required="0">
	<getopt mixed="-v, --verbose" />
	<content type="boolean" />
	<shortdesc lang="en">Verbose mode. Multiple -v flags can be stacked on the command line (e.g., -vvv) to increase verbosity.</shortdesc>
	</parameter>
	<parameter name="verbose_level" unique="0" required="0">
	<getopt mixed="--verbose-level" />
	<content type="integer" />
	<shortdesc lang="en">Level of debugging detail in output. Defaults to the number of --verbose flags specified on the command line, or to 1 if verbose=1 in a stonith device configuration (i.e., on stdin).</shortdesc>
	</parameter>
	<parameter name="debug" unique="0" required="0" deprecated="1">
	<getopt mixed="-D, --debug-file=[debugfile]" />
	<content type="string" />
	<shortdesc lang="en">Write debug information to given file</shortdesc>
	</parameter>
	<parameter name="debug_file" unique="0" required="0" obsoletes="debug">
	<getopt mixed="-D, --debug-file=[debugfile]" />
	<shortdesc lang="en">Write debug information to given file</shortdesc>
	</parameter>
	<parameter name="version" unique="0" required="0">
	<getopt mixed="-V, --version" />
	<content type="boolean" />
	<shortdesc lang="en">Display version information and exit</shortdesc>
	</parameter>
	<parameter name="help" unique="0" required="0">
	<getopt mixed="-h, --help" />
	<content type="boolean" />
	<shortdesc lang="en">Display help and exit</shortdesc>
	</parameter>
	<parameter name="plug_separator" unique="0" required="0">
	<getopt mixed="--plug-separator=[char]" />
	<content type="string" default="," />
	<shortdesc lang="en">Separator for plug parameter when specifying more than 1 plug</shortdesc>
	</parameter>
	<parameter name="separator" unique="0" required="0">
	<getopt mixed="-C, --separator=[char]" />
	<content type="string" default="," />
	<shortdesc lang="en">Separator for CSV created by 'list' operation</shortdesc>
	</parameter>
	<parameter name="delay" unique="0" required="0">
	<getopt mixed="--delay=[seconds]" />
	<content type="second" default="0" />
	<shortdesc lang="en">Wait X seconds before fencing is started</shortdesc>
	</parameter>
	<parameter name="disable_timeout" unique="0" required="0">
	<getopt mixed="--disable-timeout=[true/false]" />
	<content type="string" />
	<shortdesc lang="en">Disable timeout (true/false) (default: true when run from Pacemaker 2.0+)</shortdesc>
	</parameter>
	<parameter name="login_timeout" unique="0" required="0">
	<getopt mixed="--login-timeout=[seconds]" />
	<content type="second" default="5" />
	<shortdesc lang="en">Wait X seconds for cmd prompt after login</shortdesc>
	</parameter>
	<parameter name="power_timeout" unique="0" required="0">
	<getopt mixed="--power-timeout=[seconds]" />
	<content type="second" default="20" />
	<shortdesc lang="en">Test X seconds for status change after ON/OFF</shortdesc>
	</parameter>
	<parameter name="power_wait" unique="0" required="0">
	<getopt mixed="--power-wait=[seconds]" />
	<content type="second" default="0" />
	<shortdesc lang="en">Wait X seconds after issuing ON/OFF</shortdesc>
	</parameter>
	<parameter name="shell_timeout" unique="0" required="0">
	<getopt mixed="--shell-timeout=[seconds]" />
	<content type="second" default="3" />
	<shortdesc lang="en">Wait X seconds for cmd prompt after issuing command</shortdesc>
	</parameter>
	<parameter name="stonith_status_sleep" unique="0" required="0">
	<getopt mixed="--stonith-status-sleep=[seconds]" />
	<content type="second" default="1" />
	<shortdesc lang="en">Sleep X seconds between status calls during a STONITH action</shortdesc>
	</parameter>
	<parameter name="retry_on" unique="0" required="0">
	<getopt mixed="--retry-on=[attempts]" />
	<content type="integer" default="1" />
	<shortdesc lang="en">Count of attempts to retry power on</shortdesc>
	</parameter>
	</parameters>
	<actions>
	<action name="on" automatic="0"/>
	<action name="off" />
	<action name="reboot" />
	<action name="status" />
	<action name="list" />
	<action name="list-status" />
	<action name="monitor" />
	<action name="metadata" />
	<action name="manpage" />
	<action name="validate-all" />
	</actions>
	</resource-agent>

File Metadata

Mime Type: text/x-diff
Expires: Mon, Feb 24, 10:57 PM (17 h, 24 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 1464507
Default Alt Text: (70 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions