diff --git a/include/corosync/corotypes.h b/include/corosync/corotypes.h
index 57f8b476..dcb83258 100644
--- a/include/corosync/corotypes.h
+++ b/include/corosync/corotypes.h
@@ -1,180 +1,221 @@
 /*
  * Copyright (c) 2008 Allied Telesis Labs.
  *
  * All rights reserved.
  *
  * Author: Angus Salkeld (ahsalkeld@gmail.com)
  *
  * This software licensed under BSD license, the text of which follows:
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * - Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  * - Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  * - Neither the name of the MontaVista Software, Inc. nor the names of its
  * contributors may be used to endorse or promote products derived from this
  * software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef COROTYPES_H_DEFINED
 #define COROTYPES_H_DEFINED
 
 #ifndef COROSYNC_SOLARIS
 #include
 #else
 #include
 #endif
+#include
+#include
 
 typedef int64_t cs_time_t;
 
 #define CS_FALSE 0
 #define CS_TRUE !CS_FALSE
 #define CS_MAX_NAME_LENGTH 256
 #define CS_TIME_END ((cs_time_t)0x7FFFFFFFFFFFFFFFULL)
+#define CS_MAX(x, y) (((x) > (y)) ? (x) : (y))
 
 typedef struct {
 	uint16_t length;
 	uint8_t value[CS_MAX_NAME_LENGTH];
 } cs_name_t;
 
 typedef struct {
 	char releaseCode;
 	unsigned char majorVersion;
 	unsigned char minorVersion;
 } cs_version_t;
 
 typedef enum {
 	CS_DISPATCH_ONE = 1,
 	CS_DISPATCH_ALL = 2,
 	CS_DISPATCH_BLOCKING = 3
 } cs_dispatch_flags_t;
 
 #define CS_TRACK_CURRENT 0x01
 #define CS_TRACK_CHANGES 0x02
 #define CS_TRACK_CHANGES_ONLY 0x04
 
 typedef enum {
 	CS_OK = 1,
 	CS_ERR_LIBRARY = 2,
 	CS_ERR_VERSION = 3,
 	CS_ERR_INIT = 4,
 	CS_ERR_TIMEOUT = 5,
 	CS_ERR_TRY_AGAIN = 6,
 	CS_ERR_INVALID_PARAM = 7,
 	CS_ERR_NO_MEMORY = 8,
 	CS_ERR_BAD_HANDLE = 9,
 	CS_ERR_BUSY = 10,
 	CS_ERR_ACCESS = 11,
 	CS_ERR_NOT_EXIST = 12,
 	CS_ERR_NAME_TOO_LONG = 13,
 	CS_ERR_EXIST = 14,
 	CS_ERR_NO_SPACE = 15,
 	CS_ERR_INTERRUPT = 16,
 	CS_ERR_NAME_NOT_FOUND = 17,
 	CS_ERR_NO_RESOURCES = 18,
 	CS_ERR_NOT_SUPPORTED = 19,
 	CS_ERR_BAD_OPERATION = 20,
 	CS_ERR_FAILED_OPERATION = 21,
 	CS_ERR_MESSAGE_ERROR = 22,
 	CS_ERR_QUEUE_FULL = 23,
 	CS_ERR_QUEUE_NOT_AVAILABLE = 24,
 	CS_ERR_BAD_FLAGS = 25,
 	CS_ERR_TOO_BIG = 26,
 	CS_ERR_NO_SECTIONS = 27,
 	CS_ERR_CONTEXT_NOT_FOUND = 28,
 	CS_ERR_TOO_MANY_GROUPS = 30,
 	CS_ERR_SECURITY = 100
 } cs_error_t;
 
+#define CS_TIME_MS_IN_SEC 1000ULL
+#define CS_TIME_US_IN_SEC 1000000ULL
+#define CS_TIME_NS_IN_SEC 1000000000ULL
+#define CS_TIME_US_IN_MSEC 1000ULL
+#define CS_TIME_NS_IN_MSEC 1000000ULL
+#define CS_TIME_NS_IN_USEC 1000ULL
+/*
+ * Return a timestamp in nanoseconds, or 0 when the clock cannot be read.
+ * CLOCK_MONOTONIC is used when _POSIX_MONOTONIC_CLOCK advertises it at
+ * compile time; otherwise the value is derived from gettimeofday ().
+ */
+static inline uint64_t cs_timestamp_get(void)
+{
+	uint64_t result;
+
+#if defined _POSIX_MONOTONIC_CLOCK && _POSIX_MONOTONIC_CLOCK >= 0
+	struct timespec ts;
+
+	/* An option value of 0 does not guarantee run-time support. */
+	if (clock_gettime (CLOCK_MONOTONIC, &ts) != 0) {
+		return 0;
+	}
+	result = ((uint64_t)ts.tv_sec * CS_TIME_NS_IN_SEC) + (uint64_t)ts.tv_nsec;
+#else
+	struct timeval time_from_epoch;
+
+	if (gettimeofday (&time_from_epoch, 0) != 0) {
+		return 0;
+	}
+	result = (((uint64_t)time_from_epoch.tv_sec * CS_TIME_NS_IN_SEC) +
+		((uint64_t)time_from_epoch.tv_usec * CS_TIME_NS_IN_USEC));
+#endif
+
+	return result;
+}
+
+
 /*
  * DEPRECATED
  */
 #define EVS_DISPATCH_ONE CS_DISPATCH_ONE
 #define EVS_DISPATCH_ALL CS_DISPATCH_ALL
 #define EVS_DISPATCH_BLOCKING CS_DISPATCH_BLOCKING
 #define EVS_OK CS_OK
 #define EVS_ERR_LIBRARY CS_ERR_LIBRARY
 #define EVS_ERR_TIMEOUT CS_ERR_TIMEOUT
 #define EVS_ERR_TRY_AGAIN CS_ERR_TRY_AGAIN
 #define EVS_ERR_INVALID_PARAM CS_ERR_INVALID_PARAM
 #define EVS_ERR_NO_MEMORY CS_ERR_NO_MEMORY
 #define EVS_ERR_BAD_HANDLE CS_ERR_BAD_HANDLE
 #define EVS_ERR_ACCESS CS_ERR_ACCESS
 #define EVS_ERR_NOT_EXIST CS_ERR_NOT_EXIST
 #define EVS_ERR_EXIST CS_ERR_EXIST
 #define EVS_ERR_NOT_SUPPORTED CS_ERR_NOT_SUPPORTED
 #define EVS_ERR_SECURITY CS_ERR_SECURITY
 #define EVS_ERR_TOO_MANY_GROUPS CS_ERR_TOO_MANY_GROUPS
 #define evs_error_t cs_error_t
 
 #define CPG_DISPATCH_ONE CS_DISPATCH_ONE
 #define CPG_DISPATCH_ALL CS_DISPATCH_ALL
 #define CPG_DISPATCH_BLOCKING CS_DISPATCH_BLOCKING
 #define CPG_OK CS_OK
 #define CPG_ERR_LIBRARY CS_ERR_LIBRARY
 #define CPG_ERR_TIMEOUT CS_ERR_TIMEOUT
 #define CPG_ERR_TRY_AGAIN CS_ERR_TRY_AGAIN
 #define CPG_ERR_INVALID_PARAM CS_ERR_INVALID_PARAM
 #define CPG_ERR_NO_MEMORY CS_ERR_NO_MEMORY
 #define CPG_ERR_BAD_HANDLE CS_ERR_BAD_HANDLE
 #define CPG_ERR_ACCESS CS_ERR_ACCESS
 #define CPG_ERR_BUSY CS_ERR_BUSY
 #define CPG_ERR_NOT_EXIST CS_ERR_NOT_EXIST
 #define CPG_ERR_EXIST CS_ERR_EXIST
 #define CPG_ERR_NOT_SUPPORTED CS_ERR_NOT_SUPPORTED
 #define CPG_ERR_SECURITY CS_ERR_SECURITY
 #define CPG_ERR_TOO_MANY_GROUPS CS_ERR_TOO_MANY_GROUPS
 #define cpg_error_t cs_error_t
 
 #define CONFDB_DISPATCH_ONE CS_DISPATCH_ONE
 #define CONFDB_DISPATCH_ALL CS_DISPATCH_ALL
 #define CONFDB_DISPATCH_BLOCKING CS_DISPATCH_BLOCKING
 #define CONFDB_OK CS_OK
 #define CONFDB_ERR_LIBRARY CS_ERR_LIBRARY
 #define CONFDB_ERR_TIMEOUT CS_ERR_TIMEOUT
 #define CONFDB_ERR_TRY_AGAIN CS_ERR_TRY_AGAIN
 #define CONFDB_ERR_INVALID_PARAM CS_ERR_INVALID_PARAM
#define CONFDB_ERR_NO_MEMORY CS_ERR_NO_MEMORY #define CONFDB_ERR_BAD_HANDLE CS_ERR_BAD_HANDLE #define CONFDB_ERR_ACCESS CS_ERR_ACCESS #define CONFDB_ERR_NOT_EXIST CS_ERR_NOT_EXIST #define CONFDB_ERR_EXIST CS_ERR_EXIST #define CONFDB_ERR_NOT_SUPPORTED CS_ERR_NOT_SUPPORTED #define CONFDB_ERR_SECURITY CS_ERR_SECURITY #define confdb_error_t cs_error_t #define QUORUM_DISPATCH_ONE CS_DISPATCH_ONE #define QUORUM_DISPATCH_ALL CS_DISPATCH_ALL #define QUORUM_DISPATCH_BLOCKING CS_DISPATCH_BLOCKING #define QUORUM_OK CS_OK #define QUORUM_ERR_LIBRARY CS_ERR_LIBRARY #define QUORUM_ERR_TIMEOUT CS_ERR_TIMEOUT #define QUORUM_ERR_TRY_AGAIN CS_ERR_TRY_AGAIN #define QUORUM_ERR_INVALID_PARAM CS_ERR_INVALID_PARAM #define QUORUM_ERR_NO_MEMORY CS_ERR_NO_MEMORY #define QUORUM_ERR_BAD_HANDLE CS_ERR_BAD_HANDLE #define QUORUM_ERR_ACCESS CS_ERR_ACCESS #define QUORUM_ERR_NOT_EXIST CS_ERR_NOT_EXIST #define QUORUM_ERR_EXIST CS_ERR_EXIST #define QUORUM_ERR_NOT_SUPPORTED CS_ERR_NOT_SUPPORTED #define QUORUM_ERR_SECURITY CS_ERR_SECURITY #define quorum_error_t cs_error_t -#endif +#endif /* COROTYPES_H_DEFINED */ + diff --git a/lib/sam.c b/lib/sam.c index 53020ac0..35bb7eeb 100644 --- a/lib/sam.c +++ b/lib/sam.c @@ -1,1506 +1,1501 @@ /* * Copyright (c) 2009-2010 Red Hat, Inc. * * All rights reserved. * * Author: Jan Friesse (jfriesse@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the Red Hat, Inc. 
nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * Provides a SAM API */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" #include #include #include #define SAM_CONFDB_S_FAILED "failed" -#define SAM_CONFDB_S_REGISTERED "registered" -#define SAM_CONFDB_S_STARTED "started" +#define SAM_CONFDB_S_REGISTERED "stopped" +#define SAM_CONFDB_S_STARTED "running" #define SAM_CONFDB_S_Q_WAIT "waiting for quorum" #define SAM_RP_MASK_Q(pol) (pol & (~SAM_RECOVERY_POLICY_QUORUM)) #define SAM_RP_MASK_C(pol) (pol & (~SAM_RECOVERY_POLICY_CONFDB)) #define SAM_RP_MASK(pol) (pol & (~(SAM_RECOVERY_POLICY_QUORUM | SAM_RECOVERY_POLICY_CONFDB))) enum sam_internal_status_t { SAM_INTERNAL_STATUS_NOT_INITIALIZED = 0, SAM_INTERNAL_STATUS_INITIALIZED, SAM_INTERNAL_STATUS_REGISTERED, SAM_INTERNAL_STATUS_STARTED, SAM_INTERNAL_STATUS_FINALIZED }; enum sam_command_t { SAM_COMMAND_START, SAM_COMMAND_STOP, SAM_COMMAND_HB, SAM_COMMAND_DATA_STORE, SAM_COMMAND_WARN_SIGNAL_SET, SAM_COMMAND_MARK_FAILED, }; enum 
sam_reply_t { SAM_REPLY_OK, SAM_REPLY_ERROR, }; enum sam_parent_action_t { SAM_PARENT_ACTION_ERROR, SAM_PARENT_ACTION_RECOVERY, SAM_PARENT_ACTION_QUIT, SAM_PARENT_ACTION_CONTINUE }; enum sam_confdb_key_t { SAM_CONFDB_KEY_RECOVERY, SAM_CONFDB_KEY_HC_PERIOD, SAM_CONFDB_KEY_LAST_HC, SAM_CONFDB_KEY_STATE, }; static struct { int time_interval; sam_recovery_policy_t recovery_policy; enum sam_internal_status_t internal_status; unsigned int instance_id; int child_fd_out; int child_fd_in; int term_send; int warn_signal; int am_i_child; sam_hc_callback_t hc_callback; pthread_t cb_thread; int cb_rpipe_fd, cb_wpipe_fd; int cb_registered; void *user_data; size_t user_data_size; size_t user_data_allocated; pthread_mutex_t lock; quorum_handle_t quorum_handle; uint32_t quorate; int quorum_fd; confdb_handle_t confdb_handle; hdb_handle_t confdb_pid_handle; } sam_internal_data; extern const char *__progname; static cs_error_t sam_confdb_update_key (enum sam_confdb_key_t key, const char *value) { cs_error_t err; const char *svalue; uint64_t hc_period, last_hc; - struct timeval tv; const char *ssvalue[] = { [SAM_RECOVERY_POLICY_QUIT] = "quit", [SAM_RECOVERY_POLICY_RESTART] = "restart" }; switch (key) { case SAM_CONFDB_KEY_RECOVERY: svalue = ssvalue[SAM_RP_MASK (sam_internal_data.recovery_policy)]; if ((err = confdb_key_create_typed (sam_internal_data.confdb_handle, sam_internal_data.confdb_pid_handle, "recovery", svalue, strlen ((const char *)svalue), CONFDB_VALUETYPE_STRING)) != CS_OK) { goto exit_error; } break; case SAM_CONFDB_KEY_HC_PERIOD: hc_period = sam_internal_data.time_interval; if ((err = confdb_key_create_typed (sam_internal_data.confdb_handle, sam_internal_data.confdb_pid_handle, - "hc_period", &hc_period, sizeof (uint64_t), CONFDB_VALUETYPE_UINT64)) != CS_OK) { + "poll_period", &hc_period, sizeof (hc_period), CONFDB_VALUETYPE_UINT64)) != CS_OK) { goto exit_error; } break; case SAM_CONFDB_KEY_LAST_HC: - if (gettimeofday (&tv, NULL) == -1) { - last_hc = 0; - } else { - 
last_hc = ((uint64_t)tv.tv_sec * 1000) + ((uint64_t)tv.tv_usec / 1000); - } + last_hc = cs_timestamp_get(); if ((err = confdb_key_create_typed (sam_internal_data.confdb_handle, sam_internal_data.confdb_pid_handle, - "hc_last", &last_hc, sizeof (uint64_t), CONFDB_VALUETYPE_UINT64)) != CS_OK) { + "last_updated", &last_hc, sizeof (last_hc), CONFDB_VALUETYPE_UINT64)) != CS_OK) { goto exit_error; } break; case SAM_CONFDB_KEY_STATE: svalue = value; if ((err = confdb_key_create_typed (sam_internal_data.confdb_handle, sam_internal_data.confdb_pid_handle, "state", svalue, strlen ((const char *)svalue), CONFDB_VALUETYPE_STRING)) != CS_OK) { goto exit_error; } break; } return (CS_OK); exit_error: return (err); } static cs_error_t sam_confdb_destroy_pid_obj (void) { return (confdb_object_destroy (sam_internal_data.confdb_handle, sam_internal_data.confdb_pid_handle)); } static cs_error_t sam_confdb_register (void) { const char *obj_name; cs_error_t err; confdb_handle_t confdb_handle; hdb_handle_t resource_handle, process_handle, pid_handle, obj_handle; hdb_handle_t *res_handle; char tmp_obj[PATH_MAX]; int i; if ((err = confdb_initialize (&confdb_handle, NULL)) != CS_OK) { return (err); } for (i = 0; i < 3; i++) { switch (i) { case 0: obj_name = "resources"; obj_handle = OBJECT_PARENT_HANDLE; res_handle = &resource_handle; break; case 1: obj_name = "process"; obj_handle = resource_handle; res_handle = &process_handle; break; case 2: if (snprintf (tmp_obj, sizeof (tmp_obj), "%s:%d", __progname, getpid ()) >= sizeof (tmp_obj)) { snprintf (tmp_obj, sizeof (tmp_obj), "%d", getpid ()); } obj_name = tmp_obj; obj_handle = process_handle; res_handle = &pid_handle; break; } if ((err = confdb_object_find_start (confdb_handle, obj_handle)) != CS_OK) { goto finalize_error; } if ((err = confdb_object_find (confdb_handle, obj_handle, obj_name, strlen (obj_name), res_handle)) != CS_OK) { if (err == CONFDB_ERR_ACCESS) { /* * Try to create object */ if ((err = confdb_object_create 
(confdb_handle, obj_handle, obj_name, strlen (obj_name), res_handle)) != CS_OK) { goto finalize_error; } } else { goto finalize_error; } } else { if ((err = confdb_object_find_destroy (confdb_handle, obj_handle)) != CS_OK) { goto finalize_error; } } } sam_internal_data.confdb_pid_handle = pid_handle; sam_internal_data.confdb_handle = confdb_handle; if ((err = sam_confdb_update_key (SAM_CONFDB_KEY_RECOVERY, NULL)) != CS_OK) { goto destroy_finalize_error; } if ((err = sam_confdb_update_key (SAM_CONFDB_KEY_HC_PERIOD, NULL)) != CS_OK) { goto destroy_finalize_error; } return (CS_OK); destroy_finalize_error: sam_confdb_destroy_pid_obj (); finalize_error: confdb_finalize (confdb_handle); return (err); } static void quorum_notification_fn ( quorum_handle_t handle, uint32_t quorate, uint64_t ring_id, uint32_t view_list_entries, uint32_t *view_list) { sam_internal_data.quorate = quorate; } cs_error_t sam_initialize ( int time_interval, sam_recovery_policy_t recovery_policy) { quorum_callbacks_t quorum_callbacks; cs_error_t err; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_NOT_INITIALIZED) { return (CS_ERR_BAD_HANDLE); } if (SAM_RP_MASK (recovery_policy) != SAM_RECOVERY_POLICY_QUIT && SAM_RP_MASK (recovery_policy) != SAM_RECOVERY_POLICY_RESTART) { return (CS_ERR_INVALID_PARAM); } if (recovery_policy & SAM_RECOVERY_POLICY_QUORUM) { /* * Initialize quorum */ quorum_callbacks.quorum_notify_fn = quorum_notification_fn; if ((err = quorum_initialize (&sam_internal_data.quorum_handle, &quorum_callbacks)) != CS_OK) { goto exit_error; } if ((err = quorum_trackstart (sam_internal_data.quorum_handle, CS_TRACK_CHANGES)) != CS_OK) { goto exit_error_quorum; } if ((err = quorum_fd_get (sam_internal_data.quorum_handle, &sam_internal_data.quorum_fd)) != CS_OK) { goto exit_error_quorum; } /* * Dispatch initial quorate state */ if ((err = quorum_dispatch (sam_internal_data.quorum_handle, CS_DISPATCH_ONE)) != CS_OK) { goto exit_error_quorum; } } sam_internal_data.recovery_policy 
= recovery_policy; sam_internal_data.time_interval = time_interval; sam_internal_data.internal_status = SAM_INTERNAL_STATUS_INITIALIZED; sam_internal_data.warn_signal = SIGTERM; sam_internal_data.am_i_child = 0; sam_internal_data.user_data = NULL; sam_internal_data.user_data_size = 0; sam_internal_data.user_data_allocated = 0; pthread_mutex_init (&sam_internal_data.lock, NULL); return (CS_OK); exit_error_quorum: quorum_finalize (sam_internal_data.quorum_handle); exit_error: return (err); } /* * Wrapper on top of write(2) function. It handles EAGAIN and EINTR states and sends whole buffer if possible. */ static size_t sam_safe_write ( int d, const void *buf, size_t nbyte) { ssize_t bytes_write; ssize_t tmp_bytes_write; bytes_write = 0; do { tmp_bytes_write = write (d, (const char *)buf + bytes_write, (nbyte - bytes_write > SSIZE_MAX) ? SSIZE_MAX : nbyte - bytes_write); if (tmp_bytes_write == -1) { if (!(errno == EAGAIN || errno == EINTR)) return -1; } else { bytes_write += tmp_bytes_write; } } while (bytes_write != nbyte); return (bytes_write); } /* * Wrapper on top of read(2) function. It handles EAGAIN and EINTR states and reads whole buffer if possible. */ static size_t sam_safe_read ( int d, void *buf, size_t nbyte) { ssize_t bytes_read; ssize_t tmp_bytes_read; bytes_read = 0; do { tmp_bytes_read = read (d, (char *)buf + bytes_read, (nbyte - bytes_read > SSIZE_MAX) ? 
SSIZE_MAX : nbyte - bytes_read); if (tmp_bytes_read == -1) { if (!(errno == EAGAIN || errno == EINTR)) return -1; } else { bytes_read += tmp_bytes_read; } } while (bytes_read != nbyte && tmp_bytes_read != 0); return (bytes_read); } static cs_error_t sam_read_reply ( int child_fd_in) { char reply; cs_error_t err; if (sam_safe_read (sam_internal_data.child_fd_in, &reply, sizeof (reply)) != sizeof (reply)) { return (CS_ERR_LIBRARY); } switch (reply) { case SAM_REPLY_ERROR: /* * Read error and return that */ if (sam_safe_read (sam_internal_data.child_fd_in, &err, sizeof (err)) != sizeof (err)) { return (CS_ERR_LIBRARY); } return (err); break; case SAM_REPLY_OK: /* * Everything correct */ break; default: return (CS_ERR_LIBRARY); break; } return (CS_OK); } cs_error_t sam_data_getsize (size_t *size) { if (size == NULL) { return (CS_ERR_INVALID_PARAM); } if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_INITIALIZED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_REGISTERED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_STARTED) { return (CS_ERR_BAD_HANDLE); } pthread_mutex_lock (&sam_internal_data.lock); *size = sam_internal_data.user_data_size; pthread_mutex_unlock (&sam_internal_data.lock); return (CS_OK); } cs_error_t sam_data_restore ( void *data, size_t size) { cs_error_t err; err = CS_OK; if (data == NULL) { return (CS_ERR_INVALID_PARAM); } if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_INITIALIZED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_REGISTERED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_STARTED) { return (CS_ERR_BAD_HANDLE); } pthread_mutex_lock (&sam_internal_data.lock); if (sam_internal_data.user_data_size == 0) { err = CS_OK; goto error_unlock; } if (size < sam_internal_data.user_data_size) { err = CS_ERR_INVALID_PARAM; goto error_unlock; } memcpy (data, sam_internal_data.user_data, sam_internal_data.user_data_size); pthread_mutex_unlock (&sam_internal_data.lock); return 
(CS_OK); error_unlock: pthread_mutex_unlock (&sam_internal_data.lock); return (err); } cs_error_t sam_data_store ( const void *data, size_t size) { cs_error_t err; char command; char *new_data; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_INITIALIZED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_REGISTERED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_STARTED) { return (CS_ERR_BAD_HANDLE); } if (data == NULL) { size = 0; } pthread_mutex_lock (&sam_internal_data.lock); if (sam_internal_data.am_i_child) { /* * We are child so we must send data to parent */ command = SAM_COMMAND_DATA_STORE; if (sam_safe_write (sam_internal_data.child_fd_out, &command, sizeof (command)) != sizeof (command)) { err = CS_ERR_LIBRARY; goto error_unlock; } if (sam_safe_write (sam_internal_data.child_fd_out, &size, sizeof (size)) != sizeof (size)) { err = CS_ERR_LIBRARY; goto error_unlock; } if (data != NULL && sam_safe_write (sam_internal_data.child_fd_out, data, size) != size) { err = CS_ERR_LIBRARY; goto error_unlock; } /* * And wait for reply */ if ((err = sam_read_reply (sam_internal_data.child_fd_in)) != CS_OK) { goto error_unlock; } } /* * We are parent or we received OK reply from parent -> do required action */ if (data == NULL) { free (sam_internal_data.user_data); sam_internal_data.user_data = NULL; sam_internal_data.user_data_allocated = 0; sam_internal_data.user_data_size = 0; } else { if (sam_internal_data.user_data_allocated < size) { if ((new_data = realloc (sam_internal_data.user_data, size)) == NULL) { err = CS_ERR_NO_MEMORY; goto error_unlock; } sam_internal_data.user_data_allocated = size; } else { new_data = sam_internal_data.user_data; } sam_internal_data.user_data = new_data; sam_internal_data.user_data_size = size; memcpy (sam_internal_data.user_data, data, size); } pthread_mutex_unlock (&sam_internal_data.lock); return (CS_OK); error_unlock: pthread_mutex_unlock (&sam_internal_data.lock); return (err); } cs_error_t 
sam_start (void) { char command; cs_error_t err; sam_recovery_policy_t recpol; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_REGISTERED) { return (CS_ERR_BAD_HANDLE); } recpol = sam_internal_data.recovery_policy; if (recpol & SAM_RECOVERY_POLICY_QUORUM || recpol & SAM_RECOVERY_POLICY_CONFDB) { pthread_mutex_lock (&sam_internal_data.lock); } command = SAM_COMMAND_START; if (sam_safe_write (sam_internal_data.child_fd_out, &command, sizeof (command)) != sizeof (command)) { if (recpol & SAM_RECOVERY_POLICY_QUORUM || recpol & SAM_RECOVERY_POLICY_CONFDB) { pthread_mutex_unlock (&sam_internal_data.lock); } return (CS_ERR_LIBRARY); } if (recpol & SAM_RECOVERY_POLICY_QUORUM || recpol & SAM_RECOVERY_POLICY_CONFDB) { /* * Wait for parent reply */ if ((err = sam_read_reply (sam_internal_data.child_fd_in)) != CS_OK) { pthread_mutex_unlock (&sam_internal_data.lock); return (err); } pthread_mutex_unlock (&sam_internal_data.lock); } if (sam_internal_data.hc_callback) if (sam_safe_write (sam_internal_data.cb_wpipe_fd, &command, sizeof (command)) != sizeof (command)) return (CS_ERR_LIBRARY); sam_internal_data.internal_status = SAM_INTERNAL_STATUS_STARTED; return (CS_OK); } cs_error_t sam_stop (void) { char command; cs_error_t err; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_STARTED) { return (CS_ERR_BAD_HANDLE); } command = SAM_COMMAND_STOP; if (sam_internal_data.recovery_policy & SAM_RECOVERY_POLICY_CONFDB) { pthread_mutex_lock (&sam_internal_data.lock); } if (sam_safe_write (sam_internal_data.child_fd_out, &command, sizeof (command)) != sizeof (command)) { if (sam_internal_data.recovery_policy & SAM_RECOVERY_POLICY_CONFDB) { pthread_mutex_unlock (&sam_internal_data.lock); } return (CS_ERR_LIBRARY); } if (sam_internal_data.recovery_policy & SAM_RECOVERY_POLICY_CONFDB) { /* * Wait for parent reply */ if ((err = sam_read_reply (sam_internal_data.child_fd_in)) != CS_OK) { pthread_mutex_unlock (&sam_internal_data.lock); return (err); } 
pthread_mutex_unlock (&sam_internal_data.lock); } if (sam_internal_data.hc_callback) if (sam_safe_write (sam_internal_data.cb_wpipe_fd, &command, sizeof (command)) != sizeof (command)) return (CS_ERR_LIBRARY); sam_internal_data.internal_status = SAM_INTERNAL_STATUS_REGISTERED; return (CS_OK); } cs_error_t sam_hc_send (void) { char command; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_STARTED) { return (CS_ERR_BAD_HANDLE); } command = SAM_COMMAND_HB; if (sam_safe_write (sam_internal_data.child_fd_out, &command, sizeof (command)) != sizeof (command)) return (CS_ERR_LIBRARY); return (CS_OK); } cs_error_t sam_finalize (void) { cs_error_t error; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_INITIALIZED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_REGISTERED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_STARTED) { return (CS_ERR_BAD_HANDLE); } if (sam_internal_data.internal_status == SAM_INTERNAL_STATUS_STARTED) { error = sam_stop (); if (error != CS_OK) goto exit_error; } sam_internal_data.internal_status = SAM_INTERNAL_STATUS_FINALIZED; free (sam_internal_data.user_data); exit_error: return (CS_OK); } cs_error_t sam_mark_failed (void) { char command; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_STARTED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_REGISTERED) { return (CS_ERR_BAD_HANDLE); } if (!(sam_internal_data.recovery_policy & SAM_RECOVERY_POLICY_CONFDB)) { return (CS_ERR_INVALID_PARAM); } command = SAM_COMMAND_MARK_FAILED; if (sam_safe_write (sam_internal_data.child_fd_out, &command, sizeof (command)) != sizeof (command)) return (CS_ERR_LIBRARY); return (CS_OK); } cs_error_t sam_warn_signal_set (int warn_signal) { char command; cs_error_t err; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_INITIALIZED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_REGISTERED && sam_internal_data.internal_status != SAM_INTERNAL_STATUS_STARTED) { return 
(CS_ERR_BAD_HANDLE); } pthread_mutex_lock (&sam_internal_data.lock); if (sam_internal_data.am_i_child) { /* * We are child so we must send data to parent */ command = SAM_COMMAND_WARN_SIGNAL_SET; if (sam_safe_write (sam_internal_data.child_fd_out, &command, sizeof (command)) != sizeof (command)) { err = CS_ERR_LIBRARY; goto error_unlock; } if (sam_safe_write (sam_internal_data.child_fd_out, &warn_signal, sizeof (warn_signal)) != sizeof (warn_signal)) { err = CS_ERR_LIBRARY; goto error_unlock; } /* * And wait for reply */ if ((err = sam_read_reply (sam_internal_data.child_fd_in)) != CS_OK) { goto error_unlock; } } /* * We are parent or we received OK reply from parent -> do required action */ sam_internal_data.warn_signal = warn_signal; pthread_mutex_unlock (&sam_internal_data.lock); return (CS_OK); error_unlock: pthread_mutex_unlock (&sam_internal_data.lock); return (err); } static cs_error_t sam_parent_reply_send ( cs_error_t err, int parent_fd_in, int parent_fd_out) { char reply; if (err == CS_OK) { reply = SAM_REPLY_OK; if (sam_safe_write (parent_fd_out, &reply, sizeof (reply)) != sizeof (reply)) { err = CS_ERR_LIBRARY; goto error_reply; } return (CS_OK); } error_reply: reply = SAM_REPLY_ERROR; if (sam_safe_write (parent_fd_out, &reply, sizeof (reply)) != sizeof (reply)) { return (CS_ERR_LIBRARY); } if (sam_safe_write (parent_fd_out, &err, sizeof (err)) != sizeof (err)) { return (CS_ERR_LIBRARY); } return (err); } static cs_error_t sam_parent_warn_signal_set ( int parent_fd_in, int parent_fd_out) { char *user_data; int warn_signal; cs_error_t err; err = CS_OK; user_data = NULL; if (sam_safe_read (parent_fd_in, &warn_signal, sizeof (warn_signal)) != sizeof (warn_signal)) { err = CS_ERR_LIBRARY; goto error_reply; } err = sam_warn_signal_set (warn_signal); if (err != CS_OK) { goto error_reply; } return (sam_parent_reply_send (CS_OK, parent_fd_in, parent_fd_out)); error_reply: return (sam_parent_reply_send (err, parent_fd_in, parent_fd_out)); } static cs_error_t 
sam_parent_wait_for_quorum ( int parent_fd_in, int parent_fd_out) { cs_error_t err; struct pollfd pfds[2]; int poll_err; if (sam_internal_data.recovery_policy & SAM_RECOVERY_POLICY_CONFDB) { if ((err = sam_confdb_update_key (SAM_CONFDB_KEY_STATE, SAM_CONFDB_S_Q_WAIT)) != CS_OK) { goto error_reply; } } /* * Update current quorum */ if ((err = quorum_dispatch (sam_internal_data.quorum_handle, CS_DISPATCH_ALL)) != CS_OK) { goto error_reply; } /* * Wait for quorum */ while (!sam_internal_data.quorate) { pfds[0].fd = parent_fd_in; pfds[0].events = 0; pfds[0].revents = 0; pfds[1].fd = sam_internal_data.quorum_fd; pfds[1].events = POLLIN; pfds[1].revents = 0; poll_err = poll (pfds, 2, -1); if (poll_err == -1) { /* * Error in poll * If it is EINTR, continue, otherwise QUIT */ if (errno != EINTR) { err = CS_ERR_LIBRARY; goto error_reply; } } if (pfds[0].revents != 0) { if (pfds[0].revents == POLLERR || pfds[0].revents == POLLHUP ||pfds[0].revents == POLLNVAL) { /* * Child has exited */ return (CS_OK); } } if (pfds[1].revents != 0) { if ((err = quorum_dispatch (sam_internal_data.quorum_handle, CS_DISPATCH_ONE)) != CS_OK) { goto error_reply; } } } if (sam_internal_data.recovery_policy & SAM_RECOVERY_POLICY_CONFDB) { if ((err = sam_confdb_update_key (SAM_CONFDB_KEY_STATE, SAM_CONFDB_S_STARTED)) != CS_OK) { goto error_reply; } } return (sam_parent_reply_send (CS_OK, parent_fd_in, parent_fd_out)); error_reply: if (sam_internal_data.recovery_policy & SAM_RECOVERY_POLICY_CONFDB) { sam_confdb_update_key (SAM_CONFDB_KEY_STATE, SAM_CONFDB_S_REGISTERED); } return (sam_parent_reply_send (err, parent_fd_in, parent_fd_out)); } static cs_error_t sam_parent_confdb_state_set ( int parent_fd_in, int parent_fd_out, int state) { cs_error_t err; const char *state_s; if (state == 1) { state_s = SAM_CONFDB_S_STARTED; } else { state_s = SAM_CONFDB_S_REGISTERED; } if ((err = sam_confdb_update_key (SAM_CONFDB_KEY_STATE, state_s)) != CS_OK) { goto error_reply; } return (sam_parent_reply_send (CS_OK, 
parent_fd_in, parent_fd_out)); error_reply: return (sam_parent_reply_send (err, parent_fd_in, parent_fd_out)); } static cs_error_t sam_parent_kill_child ( int *action, pid_t child_pid) { /* * Kill child process */ if (!sam_internal_data.term_send) { /* * We didn't send warn_signal yet. */ kill (child_pid, sam_internal_data.warn_signal); sam_internal_data.term_send = 1; } else { /* * We sent child warning. Now, we will not be so nice */ kill (child_pid, SIGKILL); *action = SAM_PARENT_ACTION_RECOVERY; } return (CS_OK); } static cs_error_t sam_parent_mark_child_failed ( int *action, pid_t child_pid) { sam_recovery_policy_t recpol; recpol = sam_internal_data.recovery_policy; sam_internal_data.term_send = 1; sam_internal_data.recovery_policy = SAM_RECOVERY_POLICY_QUIT | (SAM_RP_MASK_C (recpol) ? SAM_RECOVERY_POLICY_CONFDB : 0) | (SAM_RP_MASK_Q (recpol) ? SAM_RECOVERY_POLICY_QUORUM : 0); return (sam_parent_kill_child (action, child_pid)); } static cs_error_t sam_parent_data_store ( int parent_fd_in, int parent_fd_out) { char *user_data; ssize_t size; cs_error_t err; err = CS_OK; user_data = NULL; if (sam_safe_read (parent_fd_in, &size, sizeof (size)) != sizeof (size)) { err = CS_ERR_LIBRARY; goto error_reply; } if (size > 0) { user_data = malloc (size); if (user_data == NULL) { err = CS_ERR_NO_MEMORY; goto error_reply; } if (sam_safe_read (parent_fd_in, user_data, size) != size) { err = CS_ERR_LIBRARY; goto free_error_reply; } } err = sam_data_store (user_data, size); if (err != CS_OK) { goto free_error_reply; } free (user_data); return (sam_parent_reply_send (CS_OK, parent_fd_in, parent_fd_out)); free_error_reply: free (user_data); error_reply: return (sam_parent_reply_send (err, parent_fd_in, parent_fd_out)); } static enum sam_parent_action_t sam_parent_handler ( int parent_fd_in, int parent_fd_out, pid_t child_pid) { int poll_error; int action; int status; ssize_t bytes_read; char command; int time_interval; struct pollfd pfds[2]; nfds_t nfds; cs_error_t err; 
sam_recovery_policy_t recpol; status = 0; action = SAM_PARENT_ACTION_CONTINUE; recpol = sam_internal_data.recovery_policy; while (action == SAM_PARENT_ACTION_CONTINUE) { pfds[0].fd = parent_fd_in; pfds[0].events = POLLIN; pfds[0].revents = 0; nfds = 1; if (status == 1 && sam_internal_data.time_interval != 0) { time_interval = sam_internal_data.time_interval; } else { time_interval = -1; } if (recpol & SAM_RECOVERY_POLICY_QUORUM) { pfds[nfds].fd = sam_internal_data.quorum_fd; pfds[nfds].events = POLLIN; pfds[nfds].revents = 0; nfds++; } poll_error = poll (pfds, nfds, time_interval); if (poll_error == -1) { /* * Error in poll * If it is EINTR, continue, otherwise QUIT */ if (errno != EINTR) { action = SAM_PARENT_ACTION_ERROR; } } if (poll_error == 0) { /* * Time limit expires */ if (status == 0) { action = SAM_PARENT_ACTION_QUIT; } else { sam_parent_kill_child (&action, child_pid); } } if (poll_error > 0) { if (pfds[0].revents != 0) { /* * We have EOF or command in pipe */ bytes_read = sam_safe_read (parent_fd_in, &command, 1); if (bytes_read == 0) { /* * Handle EOF -> Take recovery action or quit if sam_start wasn't called */ if (status == 0) action = SAM_PARENT_ACTION_QUIT; else action = SAM_PARENT_ACTION_RECOVERY; continue; } if (bytes_read == -1) { action = SAM_PARENT_ACTION_ERROR; goto action_exit; } if (recpol & SAM_RECOVERY_POLICY_CONFDB) { sam_confdb_update_key (SAM_CONFDB_KEY_LAST_HC, NULL); } /* * We have read command */ switch (command) { case SAM_COMMAND_START: if (status == 0) { /* * Not started yet */ if (recpol & SAM_RECOVERY_POLICY_QUORUM) { if (sam_parent_wait_for_quorum (parent_fd_in, parent_fd_out) != CS_OK) { continue; } } if (recpol & SAM_RECOVERY_POLICY_CONFDB) { if (sam_parent_confdb_state_set (parent_fd_in, parent_fd_out, 1) != CS_OK) { continue; } } status = 1; } break; case SAM_COMMAND_STOP: if (status == 1) { /* * Started */ if (recpol & SAM_RECOVERY_POLICY_CONFDB) { if (sam_parent_confdb_state_set (parent_fd_in, parent_fd_out, 0) != CS_OK) 
{ continue; } } status = 0; } break; case SAM_COMMAND_DATA_STORE: sam_parent_data_store (parent_fd_in, parent_fd_out); break; case SAM_COMMAND_WARN_SIGNAL_SET: sam_parent_warn_signal_set (parent_fd_in, parent_fd_out); break; case SAM_COMMAND_MARK_FAILED: status = 1; sam_parent_mark_child_failed (&action, child_pid); break; } } /* if (pfds[0].revents != 0) */ if ((sam_internal_data.recovery_policy & SAM_RECOVERY_POLICY_QUORUM) && pfds[1].revents != 0) { /* * Handle quorum change */ err = quorum_dispatch (sam_internal_data.quorum_handle, CS_DISPATCH_ALL); if (status == 1 && (!sam_internal_data.quorate || (err != CS_ERR_TRY_AGAIN && err != CS_OK))) { sam_parent_kill_child (&action, child_pid); } } } /* select_error > 0 */ } /* action == SAM_PARENT_ACTION_CONTINUE */ action_exit: return action; } cs_error_t sam_register ( unsigned int *instance_id) { cs_error_t error; pid_t pid; int pipe_error; int pipe_fd_out[2], pipe_fd_in[2]; enum sam_parent_action_t action, old_action; int child_status; sam_recovery_policy_t recpol; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_INITIALIZED) { return (CS_ERR_BAD_HANDLE); } recpol = sam_internal_data.recovery_policy; if (recpol & SAM_RECOVERY_POLICY_CONFDB) { /* * Register to objdb */ if ((error = sam_confdb_register ()) != CS_OK) { goto error_exit; } } error = CS_OK; while (1) { if ((pipe_error = pipe (pipe_fd_out)) != 0) { error = CS_ERR_LIBRARY; goto error_exit; } if ((pipe_error = pipe (pipe_fd_in)) != 0) { close (pipe_fd_out[0]); close (pipe_fd_out[1]); error = CS_ERR_LIBRARY; goto error_exit; } if (recpol & SAM_RECOVERY_POLICY_CONFDB) { if ((error = sam_confdb_update_key (SAM_CONFDB_KEY_STATE, SAM_CONFDB_S_REGISTERED)) != CS_OK) { goto error_exit; } } sam_internal_data.instance_id++; sam_internal_data.term_send = 0; pid = fork (); if (pid == -1) { /* * Fork error */ sam_internal_data.instance_id--; error = CS_ERR_LIBRARY; goto error_exit; } if (pid == 0) { /* * Child process */ close (pipe_fd_out[0]); close 
(pipe_fd_in[1]); sam_internal_data.child_fd_out = pipe_fd_out[1]; sam_internal_data.child_fd_in = pipe_fd_in[0]; if (instance_id) *instance_id = sam_internal_data.instance_id; sam_internal_data.am_i_child = 1; sam_internal_data.internal_status = SAM_INTERNAL_STATUS_REGISTERED; pthread_mutex_init (&sam_internal_data.lock, NULL); goto error_exit; } else { /* * Parent process */ close (pipe_fd_out[1]); close (pipe_fd_in[0]); action = sam_parent_handler (pipe_fd_out[0], pipe_fd_in[1], pid); close (pipe_fd_out[0]); close (pipe_fd_in[1]); if (action == SAM_PARENT_ACTION_ERROR) { error = CS_ERR_LIBRARY; goto error_exit; } /* * We really don't like zombies */ while (waitpid (pid, &child_status, 0) == -1 && errno == EINTR) ; old_action = action; if (action == SAM_PARENT_ACTION_RECOVERY) { if (SAM_RP_MASK (sam_internal_data.recovery_policy) == SAM_RECOVERY_POLICY_QUIT) action = SAM_PARENT_ACTION_QUIT; } if (action == SAM_PARENT_ACTION_QUIT) { if (recpol & SAM_RECOVERY_POLICY_QUORUM) { quorum_finalize (sam_internal_data.quorum_handle); } if (recpol & SAM_RECOVERY_POLICY_CONFDB) { if (old_action == SAM_PARENT_ACTION_RECOVERY) { /* * Mark as failed */ sam_confdb_update_key (SAM_CONFDB_KEY_STATE, SAM_CONFDB_S_FAILED); } else { sam_confdb_destroy_pid_obj (); } } exit (WEXITSTATUS (child_status)); } } } error_exit: return (error); } static void *hc_callback_thread (void *unused_param) { int poll_error; int status; ssize_t bytes_readed; char command; int time_interval, tmp_time_interval; int counter; struct pollfd pfds; status = 0; counter = 0; time_interval = sam_internal_data.time_interval >> 2; while (1) { pfds.fd = sam_internal_data.cb_rpipe_fd; pfds.events = POLLIN; pfds.revents = 0; if (status == 1) { tmp_time_interval = time_interval; } else { tmp_time_interval = -1; } poll_error = poll (&pfds, 1, tmp_time_interval); if (poll_error == 0) { if (sam_hc_send () == CS_OK) { counter++; } if (counter >= 4) { if (sam_internal_data.hc_callback () != 0) { status = 3; } counter = 0; } 
} if (poll_error > 0) { bytes_readed = sam_safe_read (sam_internal_data.cb_rpipe_fd, &command, 1); if (bytes_readed > 0) { if (status == 0 && command == SAM_COMMAND_START) status = 1; if (status == 1 && command == SAM_COMMAND_STOP) status = 0; } } } /* * This makes compiler happy, it's same as return (NULL); */ return (unused_param); } cs_error_t sam_hc_callback_register (sam_hc_callback_t cb) { cs_error_t error = CS_OK; pthread_attr_t thread_attr; int pipe_error; int pipe_fd[2]; if (sam_internal_data.internal_status != SAM_INTERNAL_STATUS_REGISTERED) { return (CS_ERR_BAD_HANDLE); } if (sam_internal_data.time_interval == 0) { return (CS_ERR_INVALID_PARAM); } if (sam_internal_data.cb_registered) { sam_internal_data.hc_callback = cb; return (CS_OK); } /* * We know, this is first registration */ if (cb == NULL) { return (CS_ERR_INVALID_PARAM); } pipe_error = pipe (pipe_fd); if (pipe_error != 0) { /* * Pipe creation error */ error = CS_ERR_LIBRARY; goto error_exit; } sam_internal_data.cb_rpipe_fd = pipe_fd[0]; sam_internal_data.cb_wpipe_fd = pipe_fd[1]; /* * Create thread attributes */ error = pthread_attr_init (&thread_attr); if (error != 0) { error = CS_ERR_LIBRARY; goto error_close_fd_exit; } pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED); pthread_attr_setstacksize (&thread_attr, 32768); /* * Create thread */ error = pthread_create (&sam_internal_data.cb_thread, &thread_attr, hc_callback_thread, NULL); if (error != 0) { error = CS_ERR_LIBRARY; goto error_attr_destroy_exit; } /* * Cleanup */ pthread_attr_destroy(&thread_attr); sam_internal_data.cb_registered = 1; sam_internal_data.hc_callback = cb; return (CS_OK); error_attr_destroy_exit: pthread_attr_destroy(&thread_attr); error_close_fd_exit: sam_internal_data.cb_rpipe_fd = sam_internal_data.cb_wpipe_fd = 0; close (pipe_fd[0]); close (pipe_fd[1]); error_exit: return (error); } diff --git a/man/sam_overview.8 b/man/sam_overview.8 index a5807cff..e00d2e83 100644 --- a/man/sam_overview.8 +++ 
b/man/sam_overview.8 @@ -1,181 +1,181 @@ .\"/* .\" * Copyright (c) 2009-2010 Red Hat, Inc. .\" * .\" * All rights reserved. .\" * .\" * Author: Jan Friesse (jfriesse@redhat.com) .\" * Author: Steven Dake (sdake@redhat.com) .\" * .\" * This software licensed under BSD license, the text of which follows: .\" * .\" * Redistribution and use in source and binary forms, with or without .\" * modification, are permitted provided that the following conditions are met: .\" * .\" * - Redistributions of source code must retain the above copyright notice, .\" * this list of conditions and the following disclaimer. .\" * - Redistributions in binary form must reproduce the above copyright notice, .\" * this list of conditions and the following disclaimer in the documentation .\" * and/or other materials provided with the distribution. .\" * - Neither the name of the Red Hat, Inc. nor the names of its .\" * contributors may be used to endorse or promote products derived from this .\" * software without specific prior written permission. .\" * .\" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" .\" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE .\" * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF .\" * THE POSSIBILITY OF SUCH DAMAGE. 
.\" */ .TH "SAM_OVERVIEW" 8 "21/05/2010" "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" .SH NAME .P sam_overview \- Overview of the Simple Availability Manager .SH OVERVIEW .P The SAM library provides a tool to check the health of an application. The main purpose of SAM is to restart a local process when it fails to respond to a healthcheck request in a configured time interval. .P During \fBsam_initialize(3)\fR, a duplicate copy of the process is created using the \fBfork(3)\fR system call. This duplicate process copy contains the logic for executing the SAM server. The SAM server is responsible for requesting healthchecks from the active process, and controlling the lifecycle of the active process when it fails. If the active process fails to respond to the healthcheck request sent by the SAM server, it will be sent a user configurable signal (default SIGTERM) to request shutdown of the application. After a configured time interval, the process will be forcibly killed by being sent a SIGKILL signal. Once the active process terminates, the SAM server will create a new active process. .P The Simple Availability Manager is meant to be used in conjunction with the cpg service. Used together, it is possible to restart a cpg process that fails healthchecking during operation. .P The main features of SAM include: .RS .IP \(bu 3 A configurable recovery policy. .IP \(bu 3 A configurable time interval for health check operations. .IP \(bu 3 A notification via signal before recovery action is taken. .IP \(bu 3 A mechanism to indicate to the application the number of times an active process has been created by the SAM server. .IP \(bu 3 Both application driven health checking and event driven health checking. .RE .SH Initializing SAM .P The SAM library is initialized by \fBsam_initialize(3)\fR. \fBsam_initialize(3)\fR may only be called once per process. Calling it more than once has undefined results and is not recommended or tested.
.SH Setting warning callback .P A user configurable signal (default \fISIGTERM\fR) is sent to the application when a recovery action is planned. The application can use the \fBsignal(3)\fR system call to monitor for this signal. .P There are no special constraints on what SAM APIs may be called in a warning callback. After \fItime_interval\fR expires, a SIGKILL signal is sent to the active process to force its termination. .SH Registering the active process .P The active process is registered with SAM by calling \fBsam_register(3)\fR. This function should only be called one time in a process. After a recovery action is taken, the new active process will begin execution at the next line of code in a user process after \fBsam_register(3)\fR. .SH Enabling event driven healthchecking .P Two types of healthchecking are available to the user. The first model is one where the user application healthchecks during its normal operation. It is never requested to healthcheck, and if the active process doesn't respond within the time interval, the process will be restarted. .P A more useful mechanism for healthchecking is event driven healthchecking. Because this model is directed by the SAM server, it isn't necessary to guess or add timers to the active process to signal a healthcheck operation is successful. To use event driven healthchecking, the \fBsam_hc_callback_register(3)\fR function should be executed. .SH Quorum integration .P SAM has special policies (\fISAM_RECOVERY_POLICY_QUIT\fR and \fISAM_RECOVERY_POLICY_RESTART\fR) for integration with quorum service. These policies change SAM behaviour in two respects. .RS .IP \(bu 3 A call to \fBsam_start(3)\fR blocks until corosync becomes quorate. .IP \(bu 3 The user selected recovery action is taken immediately after loss of quorum. .RE .SH Storing user data .P Sometimes there is a need to store some data which survives between instances. In such a case one can use files, databases, ...
or the much simpler in-memory solution provided by the \fBsam_data_store(3)\fR, \fBsam_data_restore(3)\fR and \fBsam_data_getsize(3)\fR functions. .SH Confdb integration .P SAM has a policy flag used for confdb system integration (\fISAM_RECOVERY_POLICY_CONFDB\fR). If a process is registered with this flag, a new confdb object PROCESS_NAME:PID is created with the following keys: .RS .IP \(bu 3 \fIrecovery\fR - will be quit or restart depending on policy .IP \(bu 3 -\fIhc_period\fR - period of health checking in milliseconds +\fIpoll_period\fR - period of health checking in milliseconds .IP \(bu 3 -\fIhc_last\fR - last known GMT time in milliseconds when health check was received +\fIlast_updated\fR - Timestamp (in nanoseconds) of the last health check. .IP \(bu 3 \fIstate\fR - state of process (can be one of registered, started, failed, waiting for quorum) .RE .P The object is automatically deleted if the process exits while health checking is stopped. .P Confdb integration with the corosync watchdog can be used in an implicit or an explicit way. .P The implicit way is achieved by setting the recovery policy to QUIT and letting the process exit while health checking is started. If this happens, the object is not deleted and the corosync watchdog will take the required action. .P The explicit way is useful for situations where the developer can deal with a non-fatal failure of the application. This mode is achieved by setting the policy to RESTART and using SAM the same way as without Confdb integration. If a real failure is needed (for example after too many restarts in total or per second), it's possible to use \fBsam_mark_failed(3)\fR and let the corosync watchdog take the required action.
.SH BUGS .SH "SEE ALSO" .BR sam_initialize (3), .BR sam_data_getsize (3), .BR sam_data_restore (3), .BR sam_data_store (3), .BR sam_finalize (3), .BR sam_mark_failed (3), .BR sam_start (3), .BR sam_stop (3), .BR sam_register (3), .BR sam_warn_signal_set (3), .BR sam_hc_send (3), .BR sam_hc_callback_register (3) diff --git a/services/mon.c b/services/mon.c index 3e475a17..d07254d5 100644 --- a/services/mon.c +++ b/services/mon.c @@ -1,635 +1,637 @@ /* * Copyright (c) 2010 Red Hat, Inc. * * All rights reserved. * * Author: Angus Salkeld * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #if defined(HAVE_LIBSTATGRAB) #include #endif #include #include #include #include #include -#include #include #include "../exec/fsm.h" LOGSYS_DECLARE_SUBSYS ("MON"); -#undef ENTER -#define ENTER() log_printf (LOGSYS_LEVEL_INFO, "%s", __func__) - /* * Service Interfaces required by service_message_handler struct */ static int mon_exec_init_fn ( struct corosync_api_v1 *corosync_api); -hdb_handle_t mon_poll = 0; static struct corosync_api_v1 *api; static hdb_handle_t resources_obj; -static pthread_t mon_poll_thread; -#define MON_DEFAULT_PERIOD 3 +#define MON_DEFAULT_PERIOD 3000 +#define MON_MIN_PERIOD 500 +#define MON_MAX_PERIOD (120 * CS_TIME_MS_IN_SEC) struct corosync_service_engine mon_service_engine = { .name = "corosync resource monitoring service", .id = MON_SERVICE, .priority = 1, .private_data_size = 0, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED, .lib_init_fn = NULL, .lib_exit_fn = NULL, .lib_engine = NULL, .lib_engine_count = 0, .exec_engine = NULL, .exec_engine_count = 0, .confchg_fn = NULL, .exec_init_fn = mon_exec_init_fn, .exec_dump_fn = NULL, .sync_mode = CS_SYNC_V2 }; static DECLARE_LIST_INIT (confchg_notify); struct resource_instance { hdb_handle_t handle; const char *name; - poll_timer_handle timer_handle; + corosync_timer_handle_t timer_handle; void (*update_stats_fn) (void *data); struct cs_fsm fsm; - int32_t period; + uint64_t period; objdb_value_types_t max_type; union { int32_t int32; double dbl; } 
max; }; static void mem_update_stats_fn (void *data); static void load_update_stats_fn (void *data); static struct resource_instance memory_used_inst = { .name = "memory_used", .update_stats_fn = mem_update_stats_fn, .max_type = OBJDB_VALUETYPE_INT32, .max.int32 = INT32_MAX, .period = MON_DEFAULT_PERIOD, }; static struct resource_instance load_15min_inst = { .name = "load_15min", .update_stats_fn = load_update_stats_fn, .max_type = OBJDB_VALUETYPE_DOUBLE, .max.dbl = INT32_MAX, .period = MON_DEFAULT_PERIOD, }; /* * F S M */ static void mon_config_changed (struct cs_fsm* fsm, int32_t event, void * data); static void mon_resource_failed (struct cs_fsm* fsm, int32_t event, void * data); -const char * mon_ok_str = "ok"; +const char * mon_running_str = "running"; const char * mon_failed_str = "failed"; const char * mon_failure_str = "failure"; -const char * mon_disabled_str = "disabled"; +const char * mon_stopped_str = "stopped"; const char * mon_config_changed_str = "config_changed"; enum mon_resource_state { - MON_S_DISABLED, - MON_S_OK, + MON_S_STOPPED, + MON_S_RUNNING, MON_S_FAILED }; enum mon_resource_event { MON_E_CONFIG_CHANGED, MON_E_FAILURE }; struct cs_fsm_entry mon_fsm_table[] = { - { MON_S_DISABLED, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_DISABLED, MON_S_OK, -1} }, - { MON_S_DISABLED, MON_E_FAILURE, NULL, {-1} }, - { MON_S_OK, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_OK, MON_S_DISABLED, -1} }, - { MON_S_OK, MON_E_FAILURE, mon_resource_failed, {MON_S_FAILED, -1} }, - { MON_S_FAILED, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_OK, MON_S_DISABLED, -1} }, - { MON_S_FAILED, MON_E_FAILURE, NULL, {-1} }, + { MON_S_STOPPED, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_STOPPED, MON_S_RUNNING, -1} }, + { MON_S_STOPPED, MON_E_FAILURE, NULL, {-1} }, + { MON_S_RUNNING, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_RUNNING, MON_S_STOPPED, -1} }, + { MON_S_RUNNING, MON_E_FAILURE, mon_resource_failed, {MON_S_FAILED, -1} }, + { MON_S_FAILED, 
MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_RUNNING, MON_S_STOPPED, -1} }, + { MON_S_FAILED, MON_E_FAILURE, NULL, {-1} }, }; /* * Dynamic loading descriptor */ static struct corosync_service_engine *mon_get_service_engine_ver0 (void); static struct corosync_service_engine_iface_ver0 mon_service_engine_iface = { .corosync_get_service_engine_ver0 = mon_get_service_engine_ver0 }; static struct lcr_iface corosync_mon_ver0[1] = { { .name = "corosync_mon", .version = 0, .versions_replace = 0, .versions_replace_count = 0, .dependencies = 0, .dependency_count = 0, .constructor = NULL, .destructor = NULL, .interfaces = NULL, } }; static struct lcr_comp mon_comp_ver0 = { .iface_count = 1, .ifaces = corosync_mon_ver0 }; static struct corosync_service_engine *mon_get_service_engine_ver0 (void) { return (&mon_service_engine); } #ifdef COROSYNC_SOLARIS void corosync_lcr_component_register (void); void corosync_lcr_component_register (void) { #else __attribute__ ((constructor)) static void corosync_lcr_component_register (void) { #endif lcr_interfaces_set (&corosync_mon_ver0[0], &mon_service_engine_iface); lcr_component_register (&mon_comp_ver0); } static const char * mon_res_state_to_str(struct cs_fsm* fsm, int32_t state) { switch (state) { - case MON_S_DISABLED: - return mon_disabled_str; + case MON_S_STOPPED: + return mon_stopped_str; break; - case MON_S_OK: - return mon_ok_str; + case MON_S_RUNNING: + return mon_running_str; break; case MON_S_FAILED: return mon_failed_str; break; } return NULL; } static const char * mon_res_event_to_str(struct cs_fsm* fsm, int32_t event) { switch (event) { case MON_E_CONFIG_CHANGED: return mon_config_changed_str; break; case MON_E_FAILURE: return mon_failure_str; break; } return NULL; } +static cs_error_t str_to_uint64_t(const char* str, uint64_t *out_value, uint64_t min, uint64_t max) +{ + char *endptr; + + errno = 0; + *out_value = strtol(str, &endptr, 0); + + /* Check for various possible errors */ + if (errno != 0 || endptr == str) { 
+ return CS_ERR_INVALID_PARAM; + } + + if (*out_value > max || *out_value < min) { + return CS_ERR_INVALID_PARAM; + } + return CS_OK; +} + static void mon_fsm_state_set (struct cs_fsm* fsm, enum mon_resource_state next_state, struct resource_instance* inst) { enum mon_resource_state prev_state = fsm->curr_state; const char *state_str; ENTER(); cs_fsm_state_set(fsm, next_state, inst); if (prev_state == fsm->curr_state) { return; } state_str = mon_res_state_to_str(fsm, fsm->curr_state); api->object_key_replace (inst->handle, "state", strlen ("state"), state_str, strlen (state_str)); } static void mon_config_changed (struct cs_fsm* fsm, int32_t event, void * data) { struct resource_instance * inst = (struct resource_instance *)data; char *str; size_t str_len; objdb_value_types_t type; - int32_t tmp_value; + uint64_t tmp_value; int32_t res; ENTER(); res = api->object_key_get_typed (inst->handle, "poll_period", (void**)&str, &str_len, &type); if (res == 0) { - tmp_value = strtol (str, NULL, 0); - if (tmp_value > 0 && tmp_value < 120) { - if (inst->period != tmp_value) { - inst->period = tmp_value; - } + if (str_to_uint64_t(str, &tmp_value, MON_MIN_PERIOD, MON_MAX_PERIOD) == CS_OK) { + log_printf (LOGSYS_LEVEL_DEBUG, + "poll_period changing from:%"PRIu64" to %"PRIu64".", + inst->period, tmp_value); + inst->period = tmp_value; + } else { + log_printf (LOGSYS_LEVEL_WARNING, + "Could NOT use poll_period:%s ms for resource %s", + str, inst->name); } } + if (inst->timer_handle) { + api->timer_delete(inst->timer_handle); + inst->timer_handle = 0; + } res = api->object_key_get_typed (inst->handle, "max", (void**)&str, &str_len, &type); if (res != 0) { if (inst->max_type == OBJDB_VALUETYPE_INT32) { inst->max.int32 = INT32_MAX; } else if (inst->max_type == OBJDB_VALUETYPE_DOUBLE) { inst->max.dbl = INT32_MAX; } - mon_fsm_state_set (fsm, MON_S_DISABLED, inst); + mon_fsm_state_set (fsm, MON_S_STOPPED, inst); } else { if (inst->max_type == OBJDB_VALUETYPE_INT32) { inst->max.int32 = 
strtol (str, NULL, 0); } else if (inst->max_type == OBJDB_VALUETYPE_DOUBLE) { inst->max.dbl = strtod (str, NULL); } - mon_fsm_state_set (fsm, MON_S_OK, inst); - } - - if (mon_poll == 0) { - return; + mon_fsm_state_set (fsm, MON_S_RUNNING, inst); + /* + * run the updater, incase the period has shortened + * and to start the timer. + */ + inst->update_stats_fn (inst); } - poll_timer_delete (mon_poll, inst->timer_handle); - /* - * run the updater, incase the period has shortened - */ - inst->update_stats_fn (inst); - poll_timer_add (mon_poll, - inst->period * 1000, NULL, - inst->update_stats_fn, - &inst->timer_handle); } void mon_resource_failed (struct cs_fsm* fsm, int32_t event, void * data) { struct resource_instance * inst = (struct resource_instance *)data; ENTER(); mon_fsm_state_set (fsm, MON_S_FAILED, inst); } static int32_t percent_mem_used_get(void) { #if defined(HAVE_LIBSTATGRAB) sg_mem_stats *mem_stats; sg_swap_stats *swap_stats; long long total, freemem; mem_stats = sg_get_mem_stats(); swap_stats = sg_get_swap_stats(); if (mem_stats == NULL || swap_stats != NULL) { log_printf (LOGSYS_LEVEL_ERROR, "Unable to get memory stats: %s\n", sg_str_error(sg_get_error())); return -1; } total = mem_stats->total + swap_stats->total; freemem = mem_stats->free + swap_stats->free; return ((total - freemem) * 100) / total; #else #if defined(COROSYNC_LINUX) char *line_ptr; char line[512]; unsigned long long value; FILE *f; long long total = 0; long long freemem = 0; if ((f = fopen("/proc/meminfo", "r")) == NULL) { return -1; } while ((line_ptr = fgets(line, sizeof(line), f)) != NULL) { if (sscanf(line_ptr, "%*s %llu kB", &value) != 1) { continue; } value *= 1024; if (strncmp(line_ptr, "MemTotal:", 9) == 0) { total += value; } else if (strncmp(line_ptr, "MemFree:", 8) == 0) { freemem += value; } else if (strncmp(line_ptr, "SwapTotal:", 10) == 0) { total += value; } else if (strncmp(line_ptr, "SwapFree:", 9) == 0) { freemem += value; } } fclose(f); return ((total - freemem) * 
100) / total; #else #error need libstatgrab or linux. #endif /* COROSYNC_LINUX */ #endif /* HAVE_LIBSTATGRAB */ } static void mem_update_stats_fn (void *data) { struct resource_instance * inst = (struct resource_instance *)data; int32_t new_value; uint64_t timestamp; new_value = percent_mem_used_get(); if (new_value > 0) { api->object_key_replace (inst->handle, "current", strlen("current"), &new_value, sizeof(new_value)); - timestamp = time (NULL); + timestamp = cs_timestamp_get(); api->object_key_replace (inst->handle, "last_updated", strlen("last_updated"), - ×tamp, sizeof(time_t)); + ×tamp, sizeof(uint64_t)); - if (new_value > inst->max.int32) { + if (new_value > inst->max.int32 && inst->fsm.curr_state != MON_S_FAILED) { cs_fsm_process (&inst->fsm, MON_E_FAILURE, inst); } } - poll_timer_add (mon_poll, - inst->period * 1000, inst, - inst->update_stats_fn, - &inst->timer_handle); + api->timer_add_duration(inst->period * MILLI_2_NANO_SECONDS, + inst, inst->update_stats_fn, &inst->timer_handle); } static double min15_loadavg_get(void) { #if defined(HAVE_LIBSTATGRAB) sg_load_stats *load_stats; load_stats = sg_get_load_stats (); if (load_stats == NULL) { log_printf (LOGSYS_LEVEL_ERROR, "Unable to get load stats: %s\n", sg_str_error (sg_get_error())); return -1; } return load_stats->min15; #else #if defined(COROSYNC_LINUX) double loadav[3]; if (getloadavg(loadav,3) < 0) { return -1; } return loadav[2]; #else #error need libstatgrab or linux. 
#endif /* COROSYNC_LINUX */ #endif /* HAVE_LIBSTATGRAB */ } static void load_update_stats_fn (void *data) { struct resource_instance * inst = (struct resource_instance *)data; uint64_t timestamp; int32_t res = 0; double min15 = min15_loadavg_get(); - if (min15 < 0) { - } - res = api->object_key_replace (inst->handle, - "current", strlen("current"), - &min15, sizeof (min15)); - if (res != 0) - log_printf (LOGSYS_LEVEL_ERROR, "replace current failed: %d", res); - - timestamp = cs_timestamp_get(); - - res = api->object_key_replace (inst->handle, - "last_updated", strlen("last_updated"), - ×tamp, sizeof(uint64_t)); - if (res != 0) - log_printf (LOGSYS_LEVEL_ERROR, "replace last_updated failed: %d", res); - - if (min15 > inst->max.dbl) { - cs_fsm_process (&inst->fsm, MON_E_FAILURE, &inst); - } - - poll_timer_add (mon_poll, - inst->period * 1000, inst, - inst->update_stats_fn, - &inst->timer_handle); -} - -static void *mon_thread_handler (void * unused) -{ -#ifdef HAVE_LIBSTATGRAB - sg_init(); -#endif /* HAVE_LIBSTATGRAB */ - mon_poll = poll_create (); - - poll_timer_add (mon_poll, - memory_used_inst.period * 1000, - &memory_used_inst, - memory_used_inst.update_stats_fn, - &memory_used_inst.timer_handle); + if (min15 > 0) { + res = api->object_key_replace (inst->handle, + "current", strlen("current"), + &min15, sizeof (min15)); + if (res != 0) { + log_printf (LOGSYS_LEVEL_ERROR, "replace current failed: %d", res); + } + timestamp = cs_timestamp_get(); - poll_timer_add (mon_poll, - load_15min_inst.period * 1000, - &load_15min_inst, - load_15min_inst.update_stats_fn, - &load_15min_inst.timer_handle); - poll_run (mon_poll); + res = api->object_key_replace (inst->handle, + "last_updated", strlen("last_updated"), + ×tamp, sizeof(uint64_t)); + if (res != 0) { + log_printf (LOGSYS_LEVEL_ERROR, "replace last_updated failed: %d", res); + } + if (min15 > inst->max.dbl && inst->fsm.curr_state != MON_S_FAILED) { + cs_fsm_process (&inst->fsm, MON_E_FAILURE, &inst); + } + } - return 
NULL; + api->timer_add_duration(inst->period * MILLI_2_NANO_SECONDS, + inst, inst->update_stats_fn, &inst->timer_handle); } static int object_find_or_create ( hdb_handle_t parent_object_handle, hdb_handle_t *object_handle, const void *object_name, size_t object_name_len) { hdb_handle_t obj_finder; hdb_handle_t obj; int ret = -1; api->object_find_create ( parent_object_handle, object_name, object_name_len, &obj_finder); if (api->object_find_next (obj_finder, &obj) == 0) { /* found it */ *object_handle = obj; ret = 0; } else { ret = api->object_create (parent_object_handle, object_handle, object_name, object_name_len); } api->object_find_destroy (obj_finder); return ret; } +static void mon_object_destroyed( + hdb_handle_t parent_object_handle, + const void *name_pt, size_t name_len, + void *priv_data_pt) +{ + struct resource_instance* inst = (struct resource_instance*)priv_data_pt; + + if (inst) { + log_printf (LOGSYS_LEVEL_WARNING, + "resource \"%s\" deleted from objdb!", + inst->name); + + cs_fsm_process (&inst->fsm, MON_E_CONFIG_CHANGED, inst); + } +} + + static void mon_key_change_notify (object_change_type_t change_type, hdb_handle_t parent_object_handle, hdb_handle_t object_handle, const void *object_name_pt, size_t object_name_len, const void *key_name_pt, size_t key_len, const void *key_value_pt, size_t key_value_len, void *priv_data_pt) { struct resource_instance* inst = (struct resource_instance*)priv_data_pt; - if ((strcmp ((char*)key_name_pt, "max") == 0) || - (strcmp ((char*)key_name_pt, "poll_period") == 0)) { + if ((strncmp ((char*)key_name_pt, "max", key_len) == 0) || + (strncmp ((char*)key_name_pt, "poll_period", key_len) == 0)) { ENTER(); cs_fsm_process (&inst->fsm, MON_E_CONFIG_CHANGED, inst); } } static void mon_instance_init (hdb_handle_t parent, struct resource_instance* inst) { int32_t res; char mon_period_str[32]; + char *str; size_t mon_period_len; objdb_value_types_t mon_period_type; - int32_t tmp_value; + uint64_t tmp_value; int32_t zero_32 
= 0; time_t zero_64 = 0; double zero_double = 0; - ENTER(); - object_find_or_create (parent, &inst->handle, inst->name, strlen (inst->name)); if (inst->max_type == OBJDB_VALUETYPE_INT32) { api->object_key_create_typed (inst->handle, "current", &zero_32, sizeof (zero_32), inst->max_type); } else { api->object_key_create_typed (inst->handle, "current", &zero_double, sizeof (zero_double), inst->max_type); } api->object_key_create_typed (inst->handle, "last_updated", &zero_64, - sizeof (time_t), OBJDB_VALUETYPE_INT64); + sizeof (uint64_t), OBJDB_VALUETYPE_UINT64); api->object_key_create_typed (inst->handle, - "state", mon_disabled_str, strlen (mon_disabled_str), + "state", mon_stopped_str, strlen (mon_stopped_str), OBJDB_VALUETYPE_STRING); inst->fsm.name = inst->name; inst->fsm.curr_entry = 0; - inst->fsm.curr_state = MON_S_DISABLED; + inst->fsm.curr_state = MON_S_STOPPED; inst->fsm.table = mon_fsm_table; inst->fsm.entries = sizeof(mon_fsm_table) / sizeof(struct cs_fsm_entry); inst->fsm.state_to_str = mon_res_state_to_str; inst->fsm.event_to_str = mon_res_event_to_str; res = api->object_key_get_typed (inst->handle, "poll_period", - (void**)&mon_period_str, &mon_period_len, + (void**)&str, &mon_period_len, &mon_period_type); if (res != 0) { - mon_period_len = snprintf (mon_period_str, 32, "%d", + mon_period_len = snprintf (mon_period_str, 32, "%"PRIu64"", inst->period); api->object_key_create_typed (inst->handle, "poll_period", &mon_period_str, mon_period_len, OBJDB_VALUETYPE_STRING); } else { - tmp_value = strtol (mon_period_str, NULL, 0); - if (tmp_value > 0 && tmp_value < 120) + if (str_to_uint64_t(str, &tmp_value, MON_MIN_PERIOD, MON_MAX_PERIOD) == CS_OK) { inst->period = tmp_value; + } else { + log_printf (LOGSYS_LEVEL_WARNING, + "Could NOT use poll_period:%s ms for resource %s", + str, inst->name); + } } cs_fsm_process (&inst->fsm, MON_E_CONFIG_CHANGED, inst); - poll_timer_add (mon_poll, - inst->period * 1000, inst, - inst->update_stats_fn, - &inst->timer_handle); 
- - api->object_track_start (inst->handle, OBJECT_TRACK_DEPTH_ONE, + api->object_track_start (inst->handle, OBJECT_TRACK_DEPTH_RECURSIVE, mon_key_change_notify, - NULL, NULL, NULL, NULL); + NULL, mon_object_destroyed, NULL, inst); } static int mon_exec_init_fn ( struct corosync_api_v1 *corosync_api) { hdb_handle_t obj; hdb_handle_t parent; +#ifdef HAVE_LIBSTATGRAB + sg_init(); +#endif /* HAVE_LIBSTATGRAB */ + #ifdef COROSYNC_SOLARIS logsys_subsys_init(); #endif api = corosync_api; - ENTER(); object_find_or_create (OBJECT_PARENT_HANDLE, &resources_obj, "resources", strlen ("resources")); object_find_or_create (resources_obj, &obj, "system", strlen ("system")); parent = obj; mon_instance_init (parent, &memory_used_inst); mon_instance_init (parent, &load_15min_inst); - - pthread_create (&mon_poll_thread, NULL, mon_thread_handler, NULL); - return 0; } diff --git a/services/wd.c b/services/wd.c index 9c9ad97d..8ceecdea 100644 --- a/services/wd.c +++ b/services/wd.c @@ -1,755 +1,829 @@ /* * Copyright (c) 2010 Red Hat, Inc. * * All rights reserved. * * Author: Angus Salkeld * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include "../exec/fsm.h" typedef enum { WD_RESOURCE_GOOD, WD_RESOURCE_FAILED, WD_RESOURCE_STATE_UNKNOWN, WD_RESOURCE_NOT_MONITORED } wd_resource_state_t; struct resource { hdb_handle_t handle; char *recovery; - char name[128]; + char name[CS_MAX_NAME_LENGTH]; time_t last_updated; struct cs_fsm fsm; corosync_timer_handle_t check_timer; - uint32_t check_timeout; + uint64_t check_timeout; }; LOGSYS_DECLARE_SUBSYS("WD"); /* * Service Interfaces required by service_message_handler struct */ static int wd_exec_init_fn ( struct corosync_api_v1 *corosync_api); static int wd_exec_exit_fn (void); static void wd_resource_check_fn (void* resource_ref); static struct corosync_api_v1 *api; -#define WD_DEFAULT_TIMEOUT 6 -static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT; -static uint32_t tickle_timeout = (WD_DEFAULT_TIMEOUT / 2); +#define WD_DEFAULT_TIMEOUT_SEC 6 +#define WD_DEFAULT_TIMEOUT_MS (WD_DEFAULT_TIMEOUT_SEC * CS_TIME_MS_IN_SEC) +#define WD_MIN_TIMEOUT_MS 500 +#define WD_MAX_TIMEOUT_MS (120 * CS_TIME_MS_IN_SEC) +static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT_SEC; 
+static uint64_t tickle_timeout = (WD_DEFAULT_TIMEOUT_MS / 2); static int dog = -1; static corosync_timer_handle_t wd_timer; static hdb_handle_t resources_obj; static int watchdog_ok = 1; struct corosync_service_engine wd_service_engine = { - .name = "corosync self-fencing service", + .name = "corosync watchdog service", .id = WD_SERVICE, .priority = 1, .private_data_size = 0, - .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED, + .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED, .lib_init_fn = NULL, .lib_exit_fn = NULL, .lib_engine = NULL, .lib_engine_count = 0, .exec_engine = NULL, .exec_engine_count = 0, .confchg_fn = NULL, .exec_init_fn = wd_exec_init_fn, .exec_exit_fn = wd_exec_exit_fn, .exec_dump_fn = NULL, .sync_mode = CS_SYNC_V2 }; static DECLARE_LIST_INIT (confchg_notify); /* * F S M */ static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data); static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data); enum wd_resource_state { - WD_S_GOOD, + WD_S_RUNNING, WD_S_FAILED, - WD_S_DISABLED + WD_S_STOPPED }; enum wd_resource_event { WD_E_FAILURE, WD_E_CONFIG_CHANGED }; -const char * wd_ok_str = "ok"; +const char * wd_running_str = "running"; const char * wd_failed_str = "failed"; const char * wd_failure_str = "failure"; -const char * wd_disabled_str = "disabled"; +const char * wd_stopped_str = "stopped"; const char * wd_config_changed_str = "config_changed"; struct cs_fsm_entry wd_fsm_table[] = { - { WD_S_DISABLED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_DISABLED, WD_S_GOOD, -1} }, - { WD_S_DISABLED, WD_E_FAILURE, NULL, {-1} }, - { WD_S_GOOD, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_GOOD, WD_S_DISABLED, -1} }, - { WD_S_GOOD, WD_E_FAILURE, wd_resource_failed, {WD_S_FAILED, -1} }, - { WD_S_FAILED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_GOOD, WD_S_DISABLED, -1} }, - { WD_S_FAILED, WD_E_FAILURE, NULL, {-1} }, + { WD_S_STOPPED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_STOPPED, WD_S_RUNNING, -1} }, + { 
WD_S_STOPPED, WD_E_FAILURE, NULL, {-1} }, + { WD_S_RUNNING, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} }, + { WD_S_RUNNING, WD_E_FAILURE, wd_resource_failed, {WD_S_FAILED, -1} }, + { WD_S_FAILED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} }, + { WD_S_FAILED, WD_E_FAILURE, NULL, {-1} }, }; /* * Dynamic loading descriptor */ static struct corosync_service_engine *wd_get_service_engine_ver0 (void); static struct corosync_service_engine_iface_ver0 wd_service_engine_iface = { .corosync_get_service_engine_ver0 = wd_get_service_engine_ver0 }; static struct lcr_iface corosync_wd_ver0[1] = { { .name = "corosync_wd", .version = 0, .versions_replace = 0, .versions_replace_count = 0, .dependencies = 0, .dependency_count = 0, .constructor = NULL, .destructor = NULL, .interfaces = NULL, } }; static struct lcr_comp wd_comp_ver0 = { .iface_count = 1, .ifaces = corosync_wd_ver0 }; static struct corosync_service_engine *wd_get_service_engine_ver0 (void) { return (&wd_service_engine); } #ifdef COROSYNC_SOLARIS void corosync_lcr_component_register (void); void corosync_lcr_component_register (void) { #else __attribute__ ((constructor)) static void corosync_lcr_component_register (void) { #endif lcr_interfaces_set (&corosync_wd_ver0[0], &wd_service_engine_iface); lcr_component_register (&wd_comp_ver0); } static int object_find_or_create ( hdb_handle_t parent_object_handle, hdb_handle_t *object_handle, const void *object_name, size_t object_name_len) { hdb_handle_t obj_finder; hdb_handle_t obj; int ret = -1; api->object_find_create ( parent_object_handle, object_name, object_name_len, &obj_finder); if (api->object_find_next (obj_finder, &obj) == 0) { /* found it */ *object_handle = obj; ret = 0; } else { ret = api->object_create (parent_object_handle, object_handle, object_name, object_name_len); } api->object_find_destroy (obj_finder); return ret; } +static cs_error_t str_to_uint64_t(const char* str, uint64_t *out_value, 
uint64_t min, uint64_t max) +{ + char *endptr; + + errno = 0; + *out_value = strtol(str, &endptr, 0); + + /* Check for various possible errors */ + if (errno != 0 || endptr == str) { + return CS_ERR_INVALID_PARAM; + } + + if (*out_value > max || *out_value < min) { + return CS_ERR_INVALID_PARAM; + } + return CS_OK; +} + static const char * wd_res_state_to_str(struct cs_fsm* fsm, int32_t state) { switch (state) { - case WD_S_DISABLED: - return wd_disabled_str; + case WD_S_STOPPED: + return wd_stopped_str; break; - case WD_S_GOOD: - return wd_ok_str; + case WD_S_RUNNING: + return wd_running_str; break; case WD_S_FAILED: return wd_failed_str; break; } return NULL; } static const char * wd_res_event_to_str(struct cs_fsm* fsm, int32_t event) { switch (event) { case WD_E_CONFIG_CHANGED: return wd_config_changed_str; break; case WD_E_FAILURE: return wd_failure_str; break; } return NULL; } /* - * returns (0 == OK, 1 == failed) + * returns (CS_TRUE == OK, CS_FALSE == failed) */ -static int32_t wd_resource_has_failed (struct resource *ref) +static int32_t wd_resource_state_is_ok (struct resource *ref) { hdb_handle_t resource = ref->handle; int res; char* state; size_t state_len; objdb_value_types_t type; - time_t *last_updated; - time_t my_time; + uint64_t *last_updated; + uint64_t my_time; + uint64_t allowed_period; size_t last_updated_len; res = api->object_key_get_typed (resource, "last_updated", (void*)&last_updated, &last_updated_len, &type); if (res != 0) { /* key does not exist. */ - return 1; + return CS_FALSE; } res = api->object_key_get_typed (resource, "state", (void**)&state, &state_len, &type); if (res != 0 || strncmp (state, "disabled", strlen ("disabled")) == 0) { /* key does not exist. 
*/ - return 1; + return CS_FALSE; + } + if (*last_updated == 0) { + /* initial value */ + return CS_TRUE; } - my_time = time (NULL); + my_time = cs_timestamp_get(); - if ((*last_updated + ref->check_timeout) < my_time) { - log_printf (LOGSYS_LEVEL_INFO, "delayed %ld + %d < %ld", - *last_updated, ref->check_timeout, my_time); - return 1; + /* + * Here we check that the monitor has written a timestamp within the poll_period + * plus a grace factor of (0.5 * poll_period). + */ + allowed_period = (ref->check_timeout * MILLI_2_NANO_SECONDS * 3) / 2; + if ((*last_updated + allowed_period) < my_time) { + log_printf (LOGSYS_LEVEL_ERROR, + "last_updated %"PRIu64" ms too late, period:%"PRIu64".", + (uint64_t)(my_time/MILLI_2_NANO_SECONDS - ((*last_updated + allowed_period) / MILLI_2_NANO_SECONDS)), + ref->check_timeout); + return CS_FALSE; } - if ((*last_updated + ref->check_timeout) < my_time || - strcmp (state, "bad") == 0) { - return 1; + if (strcmp (state, wd_failed_str) == 0) { + return CS_FALSE; } - return 0; + return CS_TRUE; } static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data) { int res; size_t len; char *state; objdb_value_types_t type; - char mon_period_str[32]; - int32_t tmp_value; + char *str; + uint64_t tmp_value; + uint64_t next_timeout; struct resource *ref = (struct resource*)data; + next_timeout = ref->check_timeout; + res = api->object_key_get_typed (ref->handle, "poll_period", - (void**)&mon_period_str, &len, + (void**)&str, &len, &type); if (res == 0) { - tmp_value = strtol (mon_period_str, NULL, 0); - if (tmp_value > 0 && tmp_value < 120) - ref->check_timeout = (tmp_value * 5)/4; + if (str_to_uint64_t(str, &tmp_value, WD_MIN_TIMEOUT_MS, WD_MAX_TIMEOUT_MS) == CS_OK) { + log_printf (LOGSYS_LEVEL_DEBUG, + "poll_period changing from:%"PRIu64" to %"PRIu64".", + ref->check_timeout, tmp_value); + /* + * To easy in the transition between poll_period's we are going + * to make the first timeout the bigger of the new and old value. 
+ * This is to give the monitoring system time to adjust. + */ + next_timeout = CS_MAX(tmp_value, ref->check_timeout); + ref->check_timeout = tmp_value; + } else { + log_printf (LOGSYS_LEVEL_WARNING, + "Could NOT use poll_period:%s ms for resource %s", + str, ref->name); + } } res = api->object_key_get_typed (ref->handle, "recovery", (void*)&ref->recovery, &len, &type); if (res != 0) { /* key does not exist. */ log_printf (LOGSYS_LEVEL_WARNING, "resource %s missing a recovery key.", ref->name); - cs_fsm_state_set(&ref->fsm, WD_S_DISABLED, ref); + cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref); return; } res = api->object_key_get_typed (ref->handle, "state", (void*)&state, &len, &type); if (res != 0) { /* key does not exist. */ log_printf (LOGSYS_LEVEL_WARNING, "resource %s missing a state key.", ref->name); - cs_fsm_state_set(&ref->fsm, WD_S_DISABLED, ref); + cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref); return; } - - cs_fsm_state_set(&ref->fsm, WD_S_GOOD, ref); - if (ref->check_timer) { api->timer_delete(ref->check_timer); + ref->check_timer = NULL; } - api->timer_add_duration((unsigned long long)ref->check_timeout*1000000000, - ref, - wd_resource_check_fn, &ref->check_timer); + if (strcmp(wd_stopped_str, state) == 0) { + cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref); + } else { + api->timer_add_duration(next_timeout * MILLI_2_NANO_SECONDS, + ref, wd_resource_check_fn, &ref->check_timer); + cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref); + } } static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data) { struct resource* ref = (struct resource*)data; if (ref->check_timer) { api->timer_delete(ref->check_timer); + ref->check_timer = NULL; } log_printf (LOGSYS_LEVEL_CRIT, "%s resource \"%s\" failed!", ref->recovery, (char*)ref->name); if (strcmp (ref->recovery, "watchdog") == 0 || strcmp (ref->recovery, "quit") == 0) { watchdog_ok = 0; } else if (strcmp (ref->recovery, "reboot") == 0) { - //reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, 
LINUX_REBOOT_CMD_RESTART, NULL); + reboot(RB_AUTOBOOT); } else if (strcmp (ref->recovery, "shutdown") == 0) { - //reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_POWER_OFF, NULL); + reboot(RB_POWER_OFF); } cs_fsm_state_set(fsm, WD_S_FAILED, data); } static void wd_key_changed(object_change_type_t change_type, hdb_handle_t parent_object_handle, hdb_handle_t object_handle, const void *object_name_pt, size_t object_name_len, const void *key_name_pt, size_t key_len, const void *key_value_pt, size_t key_value_len, void *priv_data_pt) { struct resource* ref = (struct resource*)priv_data_pt; - if (strcmp(key_name_pt, "last_updated") == 0 || - strcmp(key_name_pt, "current") == 0) { + if (strncmp(key_name_pt, "last_updated", key_len) == 0 || + strncmp(key_name_pt, "current", key_len) == 0) { return; } -// log_printf (LOGSYS_LEVEL_WARNING, -// "watchdog resource key changed: %s.%s=%s ref=%p.", -// (char*)object_name_pt, (char*)key_name_pt, (char*)key_value_pt, ref); if (ref == NULL) { return; } cs_fsm_process(&ref->fsm, WD_E_CONFIG_CHANGED, ref); } static void wd_object_destroyed( hdb_handle_t parent_object_handle, const void *name_pt, size_t name_len, void *priv_data_pt) { struct resource* ref = (struct resource*)priv_data_pt; - log_printf (LOGSYS_LEVEL_WARNING, - "watchdog resource \"%s\" deleted from objdb!", - (char*)name_pt); - if (ref) { + log_printf (LOGSYS_LEVEL_WARNING, + "resource \"%s\" deleted from objdb!", + ref->name); + api->timer_delete(ref->check_timer); ref->check_timer = NULL; + free(ref); } } static void wd_resource_check_fn (void* resource_ref) { struct resource* ref = (struct resource*)resource_ref; - log_printf (LOGSYS_LEVEL_INFO, - "checking watchdog resource \"%s\".", - ref->name); - if (wd_resource_has_failed (ref) ) { + if (wd_resource_state_is_ok (ref) == CS_FALSE) { cs_fsm_process(&ref->fsm, WD_E_FAILURE, ref); - log_printf (LOGSYS_LEVEL_CRIT, - "watchdog resource \"%s\" failed!", - (char*)ref->name); return; } - 
api->timer_add_duration((unsigned long long)ref->check_timeout*1000000000, + api->timer_add_duration(ref->check_timeout*MILLI_2_NANO_SECONDS, ref, wd_resource_check_fn, &ref->check_timer); } - -static void wd_resource_create (hdb_handle_t resource_obj) +/* + * return 0 - fully configured + * return -1 - partially configured + */ +static int32_t wd_resource_create (hdb_handle_t resource_obj) { int res; size_t len; char *state; objdb_value_types_t type; - char mon_period_str[32]; - int32_t tmp_value; + char period_str[32]; + char *str; + uint64_t tmp_value; struct resource *ref = malloc (sizeof (struct resource)); ref->handle = resource_obj; - ref->check_timeout = WD_DEFAULT_TIMEOUT; + ref->check_timeout = WD_DEFAULT_TIMEOUT_MS; ref->check_timer = NULL; api->object_name_get (resource_obj, ref->name, &len); ref->name[len] = '\0'; ref->fsm.name = ref->name; ref->fsm.table = wd_fsm_table; ref->fsm.entries = sizeof(wd_fsm_table) / sizeof(struct cs_fsm_entry); ref->fsm.curr_entry = 0; - ref->fsm.curr_state = WD_S_DISABLED; + ref->fsm.curr_state = WD_S_STOPPED; ref->fsm.state_to_str = wd_res_state_to_str; ref->fsm.event_to_str = wd_res_event_to_str; api->object_priv_set (resource_obj, NULL); res = api->object_key_get_typed (resource_obj, "poll_period", - (void**)&mon_period_str, &len, + (void**)&str, &len, &type); if (res != 0) { - log_printf (LOGSYS_LEVEL_ERROR, "%s : %d",__func__, res); - len = snprintf (mon_period_str, 32, "%d", ref->check_timeout); + len = snprintf (period_str, 32, "%"PRIu64"", ref->check_timeout); api->object_key_create_typed (resource_obj, - "poll_period", &mon_period_str, + "poll_period", &period_str, len, OBJDB_VALUETYPE_STRING); } else { - tmp_value = strtol (mon_period_str, NULL, 0); - if (tmp_value > 0 && tmp_value < 120) - ref->check_timeout = (tmp_value * 5)/4; + if (str_to_uint64_t(str, &tmp_value, WD_MIN_TIMEOUT_MS, WD_MAX_TIMEOUT_MS) == CS_OK) { + ref->check_timeout = tmp_value; + } else { + log_printf (LOGSYS_LEVEL_WARNING, + "Could NOT 
use poll_period:%s ms for resource %s", + str, ref->name); + } } - api->object_track_start (resource_obj, OBJECT_TRACK_DEPTH_ONE, + api->object_track_start (resource_obj, OBJECT_TRACK_DEPTH_RECURSIVE, wd_key_changed, NULL, wd_object_destroyed, NULL, ref); res = api->object_key_get_typed (resource_obj, "recovery", (void*)&ref->recovery, &len, &type); if (res != 0) { /* key does not exist. */ log_printf (LOGSYS_LEVEL_WARNING, "resource %s missing a recovery key.", ref->name); - return; + return -1; } res = api->object_key_get_typed (resource_obj, "state", (void*)&state, &len, &type); if (res != 0) { /* key does not exist. */ log_printf (LOGSYS_LEVEL_WARNING, "resource %s missing a state key.", ref->name); - return; + return -1; } res = api->object_key_get_typed (resource_obj, "last_updated", (void*)&ref->last_updated, &len, &type); if (res != 0) { /* key does not exist. */ ref->last_updated = 0; } - api->timer_add_duration((unsigned long long)ref->check_timeout*1000000000, + /* + * delay the first check to give the monitor time to start working. 
+ */ + tmp_value = CS_MAX(ref->check_timeout * 2, WD_DEFAULT_TIMEOUT_MS); + api->timer_add_duration(tmp_value * MILLI_2_NANO_SECONDS, ref, wd_resource_check_fn, &ref->check_timer); - cs_fsm_state_set(&ref->fsm, WD_S_GOOD, ref); + cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref); + return 0; } static void wd_tickle_fn (void* arg) { ENTER(); if (watchdog_ok) { - if (dog > 0) + if (dog > 0) { ioctl(dog, WDIOC_KEEPALIVE, &watchdog_ok); + } + api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL, + wd_tickle_fn, &wd_timer); } else { log_printf (LOGSYS_LEVEL_ALERT, "NOT tickling the watchdog!"); } - api->timer_add_duration((unsigned long long)tickle_timeout*1000000000, NULL, - wd_tickle_fn, &wd_timer); } static void wd_resource_object_created(hdb_handle_t parent_object_handle, hdb_handle_t object_handle, const void *name_pt, size_t name_len, void *priv_data_pt) { wd_resource_create (object_handle); } static void wd_scan_resources (void) { hdb_handle_t obj_finder; hdb_handle_t obj_finder2; hdb_handle_t resource_type; hdb_handle_t resource; - int res; + int res_count = 0; ENTER(); api->object_find_create ( OBJECT_PARENT_HANDLE, "resources", strlen ("resources"), &obj_finder); - res = api->object_find_next (obj_finder, &resources_obj); + api->object_find_next (obj_finder, &resources_obj); api->object_find_destroy (obj_finder); - if (res != 0) { - log_printf (LOGSYS_LEVEL_INFO, "no resources."); - return; - } /* this will be the system or process level */ api->object_find_create ( resources_obj, NULL, 0, &obj_finder); while (api->object_find_next (obj_finder, &resource_type) == 0) { api->object_find_create ( resource_type, NULL, 0, &obj_finder2); while (api->object_find_next (obj_finder2, &resource) == 0) { - wd_resource_create (resource); + if (wd_resource_create (resource) == 0) { + res_count++; + } } api->object_find_destroy (obj_finder2); api->object_track_start (resource_type, OBJECT_TRACK_DEPTH_ONE, NULL, wd_resource_object_created, NULL, NULL, NULL); } 
api->object_find_destroy (obj_finder); + if (res_count == 0) { + log_printf (LOGSYS_LEVEL_INFO, "no resources configured."); + } } static void watchdog_timeout_apply (uint32_t new) { struct watchdog_info ident; + uint32_t original_timeout = watchdog_timeout; - if (new < 2) { - watchdog_timeout = 2; - } - else if (new > 120) { - watchdog_timeout = 120; - } - else { - watchdog_timeout = new; + if (new == original_timeout) { + return; } + watchdog_timeout = new; + if (dog > 0) { ioctl(dog, WDIOC_GETSUPPORT, &ident); if (ident.options & WDIOF_SETTIMEOUT) { /* yay! the dog is trained. */ ioctl(dog, WDIOC_SETTIMEOUT, &watchdog_timeout); } ioctl(dog, WDIOC_GETTIMEOUT, &watchdog_timeout); } - tickle_timeout = watchdog_timeout / 2; - log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds\n", watchdog_timeout); - log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %d seconds\n", tickle_timeout); + if (watchdog_timeout == new) { + tickle_timeout = (watchdog_timeout * CS_TIME_MS_IN_SEC)/ 2; + + /* reset the tickle timer in case it was reduced. + */ + api->timer_delete (wd_timer); + api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL, + wd_tickle_fn, &wd_timer); + + log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds\n", watchdog_timeout); + log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %"PRIu64" ms\n", tickle_timeout); + } else { + log_printf (LOGSYS_LEVEL_WARNING, + "Could not change the Watchdog timeout from %d to %d seconds\n", + original_timeout, new); + } + } static int setup_watchdog(void) { struct watchdog_info ident; ENTER(); if (access ("/dev/watchdog", W_OK) != 0) { log_printf (LOGSYS_LEVEL_WARNING, "No Watchdog, try modprobe "); dog = -1; return -1; } /* here goes, lets hope they have "Magic Close" */ dog = open("/dev/watchdog", O_WRONLY); if (dog == -1) { log_printf (LOGSYS_LEVEL_WARNING, "Watchdog exists but couldn't be opened."); dog = -1; return -1; } /* Right we have the dog. * Lets see what breed it is. 
*/ ioctl(dog, WDIOC_GETSUPPORT, &ident); log_printf (LOGSYS_LEVEL_INFO, "Watchdog is now been tickled by corosync."); log_printf (LOGSYS_LEVEL_DEBUG, "%s", ident.identity); watchdog_timeout_apply (watchdog_timeout); ioctl(dog, WDIOC_SETOPTIONS, WDIOS_ENABLECARD); return 0; } static void wd_top_level_key_changed(object_change_type_t change_type, hdb_handle_t parent_object_handle, hdb_handle_t object_handle, const void *object_name_pt, size_t object_name_len, const void *key_name_pt, size_t key_len, const void *key_value_pt, size_t key_value_len, void *priv_data_pt) { - uint32_t tmp_value; + uint64_t tmp_value; + int32_t tmp_value_32; ENTER(); if (change_type != OBJECT_KEY_DELETED && strncmp ((char*)key_name_pt, "watchdog_timeout", key_value_len) == 0) { - tmp_value = strtol (key_value_pt, NULL, 0); - watchdog_timeout_apply (tmp_value); + if (str_to_uint64_t(key_value_pt, &tmp_value, 2, 120) == CS_OK) { + tmp_value_32 = tmp_value; + watchdog_timeout_apply (tmp_value_32); + } } else { - watchdog_timeout_apply (WD_DEFAULT_TIMEOUT); + watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC); } - log_printf (LOGSYS_LEVEL_INFO, "new(%d) tickle_timeout: %d", change_type, tickle_timeout); } - static void watchdog_timeout_get_initial (void) { int32_t res; char watchdog_timeout_str[32]; size_t watchdog_timeout_len; objdb_value_types_t watchdog_timeout_type; - uint32_t tmp_value; + uint32_t tmp_value_32; + uint64_t tmp_value; ENTER(); res = api->object_key_get_typed (resources_obj, "watchdog_timeout", (void**)&watchdog_timeout_str, &watchdog_timeout_len, &watchdog_timeout_type); if (res != 0) { - watchdog_timeout_apply (WD_DEFAULT_TIMEOUT); + watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC); watchdog_timeout_len = snprintf (watchdog_timeout_str, 32, "%d", watchdog_timeout); api->object_key_create_typed (resources_obj, "watchdog_timeout", &watchdog_timeout_str, watchdog_timeout_len, OBJDB_VALUETYPE_STRING); } else { - tmp_value = strtol (watchdog_timeout_str, NULL, 0); - 
watchdog_timeout_apply (tmp_value); + if (str_to_uint64_t(watchdog_timeout_str, &tmp_value, 2, 120) == CS_OK) { + tmp_value_32 = tmp_value; + watchdog_timeout_apply (tmp_value_32); + } else { + watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC); + } } api->object_track_start (resources_obj, OBJECT_TRACK_DEPTH_ONE, wd_top_level_key_changed, NULL, NULL, NULL, NULL); } static int wd_exec_init_fn ( struct corosync_api_v1 *corosync_api) { hdb_handle_t obj; ENTER(); #ifdef COROSYNC_SOLARIS logsys_subsys_init(); #endif api = corosync_api; object_find_or_create (OBJECT_PARENT_HANDLE, &resources_obj, "resources", strlen ("resources")); object_find_or_create (resources_obj, &obj, "system", strlen ("system")); object_find_or_create (resources_obj, &obj, "process", strlen ("process")); watchdog_timeout_get_initial(); setup_watchdog(); wd_scan_resources(); - api->timer_add_duration((unsigned long long)tickle_timeout*1000000000, NULL, + api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL, wd_tickle_fn, &wd_timer); return 0; } static int wd_exec_exit_fn (void) { char magic = 'V'; ENTER(); if (dog > 0) { log_printf (LOGSYS_LEVEL_INFO, "magically closing the watchdog."); write (dog, &magic, 1); } return 0; } diff --git a/test/testsam.c b/test/testsam.c index 1972d9ee..d29605ab 100644 --- a/test/testsam.c +++ b/test/testsam.c @@ -1,1653 +1,1656 @@ /* * Copyright (c) 2009 Red Hat, Inc. * * All rights reserved. * * Author: Jan Friesse (jfriesse@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. 
* - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the Red Hat, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * Provides test of SAM API */ #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *__progname; static int test2_sig_delivered = 0; static int test5_hc_cb_count = 0; static int test6_sig_delivered = 0; /* * First test will just register SAM, with policy restart. First instance will * sleep one second, send hc and sleep another 3 seconds. This should force restart. * Second instance will sleep one second, send hc, stop hc and sleep 3 seconds. * Then start hc again and sleep 3 seconds. This should force restart again. * Last instance just calls initialize again. This should end with error. * Then call start, followed by stop and start again. Finally, we will call finalize * twice. One should succeed, second should fail. 
After this, we will call every function * (none should succeed). */ static int test1 (void) { cs_error_t error; unsigned int instance_id; int i; printf ("%s: initialize\n", __FUNCTION__); error = sam_initialize (2000, SAM_RECOVERY_POLICY_RESTART); if (error != CS_OK) { fprintf (stderr, "Can't initialize SAM API. Error %d\n", error); return 1; } printf ("%s: register\n", __FUNCTION__); error = sam_register (&instance_id); if (error != CS_OK) { fprintf (stderr, "Can't register. Error %d\n", error); return 1; } if (instance_id == 1 || instance_id == 2) { printf ("%s iid %d: start\n", __FUNCTION__, instance_id); error = sam_start (); if (error != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", error); return 1; } for (i = 0; i < 10; i++) { printf ("%s iid %d: sleep 1\n", __FUNCTION__, instance_id); sleep (1); printf ("%s iid %d: hc send\n", __FUNCTION__, instance_id); error = sam_hc_send (); if (error != CS_OK) { fprintf (stderr, "Can't send hc. Error %d\n", error); return 1; } } if (instance_id == 2) { printf ("%s iid %d: stop\n", __FUNCTION__, instance_id); error = sam_stop (); if (error != CS_OK) { fprintf (stderr, "Can't send hc. Error %d\n", error); return 1; } } printf ("%s iid %d: sleep 3\n", __FUNCTION__, instance_id); sleep (3); printf ("%s iid %d: start\n", __FUNCTION__, instance_id); error = sam_start (); if (error != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", error); return 1; } printf ("%s iid %d: sleep 3\n", __FUNCTION__, instance_id); sleep (3); return 0; } if (instance_id == 3) { error = sam_initialize (2000, SAM_RECOVERY_POLICY_RESTART); if (error == CS_OK) { fprintf (stderr, "Can initialize SAM API after initialization"); return 1; } error = sam_start (); if (error != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", error); return 1; } error = sam_stop (); if (error != CS_OK) { fprintf (stderr, "Can't stop hc. 
Error %d\n", error); return 1; } error = sam_finalize (); if (error != CS_OK) { fprintf (stderr, "Can't finalize sam. Error %d\n", error); return 1; } error = sam_finalize (); if (error == CS_OK) { fprintf (stderr, "Can finalize sam after finalization!\n"); return 1; } if (sam_initialize (2, SAM_RECOVERY_POLICY_RESTART) == CS_OK || sam_start () == CS_OK || sam_stop () == CS_OK || sam_register (NULL) == CS_OK || sam_hc_send () == CS_OK || sam_hc_callback_register (NULL) == CS_OK) { fprintf (stderr, "Can call one of function after finalization!\n"); return 1; } return 0; } return 1; } static void test2_signal (int sig) { printf ("%s\n", __FUNCTION__); test2_sig_delivered = 1; } /* * This tests recovery policy quit and callback. */ static int test2 (void) { cs_error_t error; unsigned int instance_id; printf ("%s: initialize\n", __FUNCTION__); error = sam_initialize (2000, SAM_RECOVERY_POLICY_QUIT); if (error != CS_OK) { fprintf (stderr, "Can't initialize SAM API. Error %d\n", error); return 1; } printf ("%s: register\n", __FUNCTION__); error = sam_register (&instance_id); if (error != CS_OK) { fprintf (stderr, "Can't register. Error %d\n", error); return 1; } if (instance_id == 1) { signal (SIGTERM, test2_signal); printf ("%s iid %d: start\n", __FUNCTION__, instance_id); error = sam_start (); if (error != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", error); return 1; } printf ("%s iid %d: sleep 1\n", __FUNCTION__, instance_id); sleep (1); printf ("%s iid %d: hc send\n", __FUNCTION__, instance_id); error = sam_hc_send (); if (error != CS_OK) { fprintf (stderr, "Can't send hc. Error %d\n", error); return 1; } printf ("%s iid %d: wait for delivery of signal\n", __FUNCTION__, instance_id); while (!test2_sig_delivered) { sleep (1); } printf ("%s iid %d: wait for real kill\n", __FUNCTION__, instance_id); sleep (3); } return 1; } /* * Smoke test. Better to turn off coredump ;) This has no time limit, just restart process * when it dies. 
*/ static int test3 (void) { cs_error_t error; unsigned int instance_id; int tmp1, tmp2, tmp3; printf ("%s: initialize\n", __FUNCTION__); error = sam_initialize (0, SAM_RECOVERY_POLICY_RESTART); if (error != CS_OK) { fprintf (stderr, "Can't initialize SAM API. Error %d\n", error); return 1; } printf ("%s: register\n", __FUNCTION__); error = sam_register (&instance_id); if (error != CS_OK) { fprintf (stderr, "Can't register. Error %d\n", error); return 1; } if (instance_id < 100) { printf ("%s iid %d: start\n", __FUNCTION__, instance_id); error = sam_start (); if (error != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", error); return 1; } printf ("%s iid %d: divide by zero\n", __FUNCTION__, instance_id); tmp2 = rand (); tmp3 = 0; tmp1 = tmp2 / tmp3; return 1; } return 0; } /* * Test sam_data_store, sam_data_restore and sam_data_getsize */ static int test4 (void) { size_t size; cs_error_t err; int i; unsigned int instance_id; char saved_data[128]; char saved_data2[128]; printf ("%s: sam_data_getsize 1\n", __FUNCTION__); err = sam_data_getsize (&size); if (err != CS_ERR_BAD_HANDLE) { fprintf (stderr, "Test should return CS_ERR_BAD_HANDLE. Error returned %d\n", err); return 1; } printf ("%s: sam_data_getsize 2\n", __FUNCTION__); err = sam_data_getsize (NULL); if (err != CS_ERR_INVALID_PARAM) { fprintf (stderr, "Test should return CS_ERR_INVALID_PARAM. Error returned %d\n", err); return 1; } printf ("%s: sam_data_store 1\n", __FUNCTION__); err = sam_data_store (NULL, 0); if (err != CS_ERR_BAD_HANDLE) { fprintf (stderr, "Test should return CS_ERR_BAD_HANDLE. Error returned %d\n", err); return 1; } printf ("%s: sam_data_restore 1\n", __FUNCTION__); err = sam_data_restore (saved_data, sizeof (saved_data)); if (err != CS_ERR_BAD_HANDLE) { fprintf (stderr, "Test should return CS_ERR_BAD_HANDLE. 
Error returned %d\n", err); return 1; } printf ("%s: sam_initialize\n", __FUNCTION__); err = sam_initialize (0, SAM_RECOVERY_POLICY_RESTART); if (err != CS_OK) { fprintf (stderr, "Can't initialize SAM API. Error %d\n", err); return 1; } printf ("%s: sam_data_getsize 3\n", __FUNCTION__); err = sam_data_getsize (&size); if (err != CS_OK) { fprintf (stderr, "Test should return CS_ERR_BAD_HANDLE. Error returned %d\n", err); return 1; } if (size != 0) { fprintf (stderr, "Test should return size of 0. Returned %zx\n", size); return 1; } printf ("%s: sam_data_restore 2\n", __FUNCTION__); err = sam_data_restore (NULL, sizeof (saved_data)); if (err != CS_ERR_INVALID_PARAM) { fprintf (stderr, "Test should return CS_ERR_INVALID_PARAM. Error returned %d\n", err); return 1; } /* * Store some real data */ for (i = 0; i < sizeof (saved_data); i++) { saved_data[i] = (char)(i + 5); } printf ("%s: sam_data_store 2\n", __FUNCTION__); err = sam_data_store (saved_data, sizeof (saved_data)); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } printf ("%s: sam_data_getsize 4\n", __FUNCTION__); err = sam_data_getsize (&size); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } if (size != sizeof (saved_data)) { fprintf (stderr, "Test should return size of 0. Returned %zx\n", size); return 1; } printf ("%s: sam_data_restore 3\n", __FUNCTION__); err = sam_data_restore (saved_data2, sizeof (saved_data2) - 1); if (err != CS_ERR_INVALID_PARAM) { fprintf (stderr, "Test should return CS_ERR_INVALID_PARAM. Error returned %d\n", err); return 1; } printf ("%s: sam_data_restore 4\n", __FUNCTION__); err = sam_data_restore (saved_data2, sizeof (saved_data2)); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. 
Error returned %d\n", err); return 1; } if (memcmp (saved_data, saved_data2, sizeof (saved_data2)) != 0) { fprintf (stderr, "Retored data are not same\n"); return 1; } memset (saved_data2, 0, sizeof (saved_data2)); printf ("%s: sam_data_store 3\n", __FUNCTION__); err = sam_data_store (NULL, 1); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } printf ("%s: sam_data_getsize 5\n", __FUNCTION__); err = sam_data_getsize (&size); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } if (size != 0) { fprintf (stderr, "Test should return size of 0. Returned %zx\n", size); return 1; } printf ("%s: sam_data_store 4\n", __FUNCTION__); err = sam_data_store (saved_data, sizeof (saved_data)); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } printf ("%s: register\n", __FUNCTION__); err = sam_register (&instance_id); if (err != CS_OK) { fprintf (stderr, "Can't register. Error %d\n", err); return 1; } if (instance_id == 1) { printf ("%s iid %d: sam_start\n", __FUNCTION__, instance_id); err = sam_start (); if (err != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", err); return 1; } printf ("%s iid %d: sam_data_getsize 6\n", __FUNCTION__, instance_id); err = sam_data_getsize (&size); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } if (size != sizeof (saved_data2)) { fprintf (stderr, "Test should return size of 0. Returned %zx\n", size); return 1; } printf ("%s iid %d: sam_data_restore 5\n", __FUNCTION__, instance_id); err = sam_data_restore (saved_data2, sizeof (saved_data2)); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. 
Error returned %d\n", err); return 1; } if (memcmp (saved_data, saved_data2, sizeof (saved_data2)) != 0) { fprintf (stderr, "Retored data are not same\n"); return 1; } for (i = 0; i < sizeof (saved_data); i++) { saved_data[i] = (char)(i - 5); } printf ("%s iid %d: sam_data_store 5\n", __FUNCTION__, instance_id); err = sam_data_store (saved_data, sizeof (saved_data) - 7); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } exit (1); } if (instance_id == 2) { printf ("%s iid %d: sam_start\n", __FUNCTION__, instance_id); err = sam_start (); if (err != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", err); return 1; } printf ("%s iid %d: sam_data_getsize 7\n", __FUNCTION__, instance_id); err = sam_data_getsize (&size); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } if (size != sizeof (saved_data2) - 7) { fprintf (stderr, "Test should return size of 0. Returned %zx\n", size); return 1; } printf ("%s iid %d: sam_data_restore 6\n", __FUNCTION__, instance_id); err = sam_data_restore (saved_data2, sizeof (saved_data2)); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } for (i = 0; i < sizeof (saved_data); i++) { saved_data[i] = (char)(i - 5); } if (memcmp (saved_data, saved_data2, sizeof (saved_data2) - 7) != 0) { fprintf (stderr, "Retored data are not same\n"); return 1; } printf ("%s iid %d: sam_data_store 6\n", __FUNCTION__, instance_id); err = sam_data_store (NULL, 0); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } exit (1); } if (instance_id == 3) { printf ("%s iid %d: sam_data_getsize 8\n", __FUNCTION__, instance_id); err = sam_data_getsize (&size); if (err != CS_OK) { fprintf (stderr, "Test should return CS_OK. Error returned %d\n", err); return 1; } if (size != 0) { fprintf (stderr, "Test should return size of 0. 
Returned %zx\n", size); return 1; } } return (0); } static int test5_hc_cb (void) { printf ("%s %d\n", __FUNCTION__, ++test5_hc_cb_count); sam_data_store (&test5_hc_cb_count, sizeof (test5_hc_cb_count)); if (test5_hc_cb_count > 10) return 1; return 0; } /* * Test event driven healtchecking. */ static int test5 (void) { cs_error_t error; unsigned int instance_id; int hc_cb_count; printf ("%s: initialize\n", __FUNCTION__); error = sam_initialize (100, SAM_RECOVERY_POLICY_RESTART); if (error != CS_OK) { fprintf (stderr, "Can't initialize SAM API. Error %d\n", error); return 1; } printf ("%s: register\n", __FUNCTION__); error = sam_register (&instance_id); if (error != CS_OK) { fprintf (stderr, "Can't register. Error %d\n", error); return 1; } if (instance_id == 1) { printf ("%s iid %d: hc callback register\n", __FUNCTION__, instance_id); error = sam_hc_callback_register (test5_hc_cb); if (error != CS_OK) { fprintf (stderr, "Can't register hc cb. Error %d\n", error); return 1; } printf ("%s iid %d: start\n", __FUNCTION__, instance_id); error = sam_start (); if (error != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", error); return 1; } sleep (2); printf ("%s iid %d: Failed. Wasn't killed.\n", __FUNCTION__, instance_id); return 1; } if (instance_id == 2) { error = sam_data_restore (&hc_cb_count, sizeof (hc_cb_count)); if (error != CS_OK) { fprintf (stderr, "sam_data_restore should return CS_OK. Error returned %d\n", error); return 1; } if (hc_cb_count != 11) { fprintf (stderr, "%s iid %d: Premature killed. hc_cb_count should be 11 and it is %d\n", __FUNCTION__, instance_id - 1, hc_cb_count); return 1; } return 0; } return 1; } static void test6_signal (int sig) { cs_error_t error; printf ("%s\n", __FUNCTION__); test6_sig_delivered++; if ((error = sam_data_store (&test6_sig_delivered, sizeof (test6_sig_delivered))) != CS_OK) { fprintf (stderr, "Can't store data! Error : %d\n", error); } } /* * Test warn signal set. 
*/ static int test6 (void) { cs_error_t error; unsigned int instance_id; int test6_sig_del; printf ("%s: initialize\n", __FUNCTION__); error = sam_initialize (2000, SAM_RECOVERY_POLICY_RESTART); if (error != CS_OK) { fprintf (stderr, "Can't initialize SAM API. Error %d\n", error); return 1; } printf ("%s: register\n", __FUNCTION__); error = sam_register (&instance_id); if (error != CS_OK) { fprintf (stderr, "Can't register. Error %d\n", error); return 1; } if (instance_id == 1) { error = sam_warn_signal_set (SIGUSR1); if (error != CS_OK) { fprintf (stderr, "Can't set warn signal. Error %d\n", error); return 1; } signal (SIGUSR1, test6_signal); printf ("%s iid %d: start\n", __FUNCTION__, instance_id); error = sam_start (); if (error != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", error); return 1; } printf ("%s iid %d: sleep 1\n", __FUNCTION__, instance_id); sleep (1); printf ("%s iid %d: hc send\n", __FUNCTION__, instance_id); error = sam_hc_send (); if (error != CS_OK) { fprintf (stderr, "Can't send hc. Error %d\n", error); return 1; } printf ("%s iid %d: wait for delivery of signal\n", __FUNCTION__, instance_id); while (!test6_sig_delivered) { sleep (1); } printf ("%s iid %d: wait for real kill\n", __FUNCTION__, instance_id); sleep (3); printf ("%s iid %d: wasn't killed\n", __FUNCTION__, instance_id); return (1); } if (instance_id == 2) { error = sam_data_restore (&test6_sig_del, sizeof (test6_sig_del)); if (error != CS_OK) { fprintf (stderr, "Can't restore data. Error %d\n", error); return 1; } if (test6_sig_del != 1) { fprintf (stderr, "Previous test failed. Signal was not delivered\n"); return 1; } error = sam_warn_signal_set (SIGKILL); if (error != CS_OK) { fprintf (stderr, "Can't set warn signal. Error %d\n", error); return 1; } signal (SIGUSR1, test6_signal); printf ("%s iid %d: start\n", __FUNCTION__, instance_id); error = sam_start (); if (error != CS_OK) { fprintf (stderr, "Can't start hc. 
Error %d\n", error); return 1; } printf ("%s iid %d: sleep 1\n", __FUNCTION__, instance_id); sleep (1); printf ("%s iid %d: hc send\n", __FUNCTION__, instance_id); error = sam_hc_send (); if (error != CS_OK) { fprintf (stderr, "Can't send hc. Error %d\n", error); return 1; } printf ("%s iid %d: wait for delivery of signal\n", __FUNCTION__, instance_id); while (!test6_sig_delivered) { sleep (1); } printf ("%s iid %d: wasn't killed\n", __FUNCTION__, instance_id); return (1); } if (instance_id == 3) { error = sam_data_restore (&test6_sig_del, sizeof (test6_sig_del)); if (error != CS_OK) { fprintf (stderr, "Can't restore data. Error %d\n", error); return 1; } if (test6_sig_del != 1) { fprintf (stderr, "Previous test failed. Signal WAS delivered\n"); return 1; } return (0); } return 1; } static void *test7_thread (void *arg) { /* Wait 5s */ sleep (5); exit (0); } /* * Test quorum */ static int test7 (void) { confdb_handle_t cdb_handle; cs_error_t err; hdb_handle_t quorum_handle; size_t value_len; char key_value[256]; unsigned int instance_id; pthread_t kill_thread; err = confdb_initialize (&cdb_handle, NULL); if (err != CS_OK) { printf ("Could not initialize Cluster Configuration Database API instance error %d. Test skipped\n", err); return (1); } err = confdb_object_find_start(cdb_handle, OBJECT_PARENT_HANDLE); if (err != CS_OK) { printf ("Could not start object_find %d. Test skipped\n", err); return (1); } err = confdb_object_find(cdb_handle, OBJECT_PARENT_HANDLE, "quorum", strlen("quorum"), &quorum_handle); if (err != CS_OK) { printf ("Could not object_find \"quorum\": %d. Test skipped\n", err); return (1); } err = confdb_key_get(cdb_handle, quorum_handle, "provider", strlen("provider"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"provider\" key: %d. Test skipped\n", err); return (1); } if (!(value_len - 1 == strlen ("testquorum") && memcmp (key_value, "testquorum", value_len - 1) == 0)) { printf ("Provider is not testquorum. 
Test skipped\n"); return (1); } /* * Set to not quorate */ err = confdb_key_create(cdb_handle, quorum_handle, "quorate", strlen("quorate"), "0", strlen("0")); if (err != CS_OK) { printf ("Can't create confdb key. Error %d\n", err); return (2); } printf ("%s: initialize\n", __FUNCTION__); err = sam_initialize (2000, SAM_RECOVERY_POLICY_QUORUM_RESTART); if (err != CS_OK) { fprintf (stderr, "Can't initialize SAM API. Error %d\n", err); return 2; } printf ("%s: register\n", __FUNCTION__); err = sam_register (&instance_id); if (err != CS_OK) { fprintf (stderr, "Can't register. Error %d\n", err); return 2; } if (instance_id == 1) { /* * Sam start should block forever, but 10s for us should be enough */ pthread_create (&kill_thread, NULL, test7_thread, NULL); printf ("%s iid %d: start - should block forever (waiting 5s)\n", __FUNCTION__, instance_id); err = sam_start (); if (err != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", err); return 2; } printf ("%s iid %d: wasn't killed\n", __FUNCTION__, instance_id); return (2); } if (instance_id == 2) { /* * Set to quorate */ err = confdb_key_create(cdb_handle, quorum_handle, "quorate", strlen("quorate"), "1", strlen("1")); if (err != CS_OK) { printf ("Can't create confdb key. Error %d\n", err); return (2); } printf ("%s iid %d: start\n", __FUNCTION__, instance_id); err = sam_start (); if (err != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", err); return 2; } /* * Set corosync unquorate */ err = confdb_key_create(cdb_handle, quorum_handle, "quorate", strlen("quorate"), "0", strlen("0")); if (err != CS_OK) { printf ("Can't create confdb key. 
Error %d\n", err); return (2); } printf ("%s iid %d: sleep 3\n", __FUNCTION__, instance_id); sleep (3); printf ("%s iid %d: wasn't killed\n", __FUNCTION__, instance_id); return (2); } if (instance_id == 3) { return (0); } return (2); } /* * Test confdb integration + quit policy */ static int test8 (pid_t pid, pid_t old_pid, int test_n) { confdb_handle_t cdb_handle; cs_error_t err; hdb_handle_t res_handle, proc_handle, pid_handle; size_t value_len; uint64_t tstamp1, tstamp2; + int32_t msec_diff; char key_value[256]; unsigned int instance_id; char tmp_obj[PATH_MAX]; confdb_value_types_t cdbtype; err = confdb_initialize (&cdb_handle, NULL); if (err != CS_OK) { printf ("Could not initialize Cluster Configuration Database API instance error %d. Test skipped\n", err); return (1); } printf ("%s test %d\n", __FUNCTION__, test_n); if (test_n == 2) { /* * Object should not exist */ printf ("%s Testing if object exists (it shouldn't)\n", __FUNCTION__); err = confdb_object_find_start(cdb_handle, OBJECT_PARENT_HANDLE); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, OBJECT_PARENT_HANDLE, "resources", strlen("resources"), &res_handle); if (err != CS_OK) { printf ("Could not object_find \"resources\": %d.\n", err); return (2); } err = confdb_object_find_start(cdb_handle, res_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, res_handle, "process", strlen("process"), &proc_handle); if (err != CS_OK) { printf ("Could not object_find \"process\": %d.\n", err); return (2); } if (snprintf (tmp_obj, sizeof (tmp_obj), "%s:%d", __progname, pid) >= sizeof (tmp_obj)) { snprintf (tmp_obj, sizeof (tmp_obj), "%d", pid); } err = confdb_object_find_start(cdb_handle, proc_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, proc_handle, tmp_obj, strlen(tmp_obj), 
&pid_handle); if (err == CS_OK) { printf ("Could find object \"%s\": %d.\n", tmp_obj, err); return (2); } } if (test_n == 1 || test_n == 2) { printf ("%s: initialize\n", __FUNCTION__); err = sam_initialize (2000, SAM_RECOVERY_POLICY_QUIT | SAM_RECOVERY_POLICY_CONFDB); if (err != CS_OK) { fprintf (stderr, "Can't initialize SAM API. Error %d\n", err); return 2; } printf ("%s: register\n", __FUNCTION__); err = sam_register (&instance_id); if (err != CS_OK) { fprintf (stderr, "Can't register. Error %d\n", err); return 2; } err = confdb_object_find_start(cdb_handle, OBJECT_PARENT_HANDLE); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, OBJECT_PARENT_HANDLE, "resources", strlen("resources"), &res_handle); if (err != CS_OK) { printf ("Could not object_find \"resources\": %d.\n", err); return (2); } err = confdb_object_find_start(cdb_handle, res_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, res_handle, "process", strlen("process"), &proc_handle); if (err != CS_OK) { printf ("Could not object_find \"process\": %d.\n", err); return (2); } if (snprintf (tmp_obj, sizeof (tmp_obj), "%s:%d", __progname, pid) >= sizeof (tmp_obj)) { snprintf (tmp_obj, sizeof (tmp_obj), "%d", pid); } err = confdb_object_find_start(cdb_handle, proc_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, proc_handle, tmp_obj, strlen(tmp_obj), &pid_handle); if (err != CS_OK) { printf ("Could not object_find \"%s\": %d.\n", tmp_obj, err); return (2); } err = confdb_key_get(cdb_handle, pid_handle, "recovery", strlen("recovery"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"recovery\" key: %d.\n", err); return (2); } if (value_len != strlen ("quit") || memcmp (key_value, "quit", value_len) != 0) { printf ("Recovery key \"%s\" is not \"watchdog\".\n", 
key_value); return (2); } err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } - if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) { - printf ("State key is not \"registered\".\n"); + if (value_len != strlen ("stopped") || memcmp (key_value, "stopped", value_len) != 0) { + printf ("State key is not \"stopped\".\n"); return (2); } printf ("%s iid %d: start\n", __FUNCTION__, instance_id); err = sam_start (); if (err != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", err); return 2; } err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } - if (value_len != strlen ("started") || memcmp (key_value, "started", value_len) != 0) { - printf ("State key is not \"started\".\n"); + if (value_len != strlen ("running") || memcmp (key_value, "running", value_len) != 0) { + printf ("State key is not \"running\".\n"); return (2); } printf ("%s iid %d: stop\n", __FUNCTION__, instance_id); err = sam_stop (); if (err != CS_OK) { fprintf (stderr, "Can't stop hc. 
Error %d\n", err); return 2; } err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } - if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) { - printf ("State key is not \"registered\".\n"); + if (value_len != strlen ("stopped") || memcmp (key_value, "stopped", value_len) != 0) { + printf ("State key is not \"stopped\".\n"); return (2); } printf ("%s iid %d: sleeping 5\n", __FUNCTION__, instance_id); sleep (5); err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } - if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) { - printf ("State key is not \"registered\".\n"); + if (value_len != strlen ("stopped") || memcmp (key_value, "stopped", value_len) != 0) { + printf ("State key is not \"stopped\".\n"); return (2); } printf ("%s iid %d: start 2\n", __FUNCTION__, instance_id); err = sam_start (); if (err != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", err); return 2; } err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } - if (value_len != strlen ("started") || memcmp (key_value, "started", value_len) != 0) { - printf ("State key is not \"started\".\n"); + if (value_len != strlen ("running") || memcmp (key_value, "running", value_len) != 0) { + printf ("State key is not \"running\".\n"); return (2); } if (test_n == 2) { printf ("%s iid %d: sleeping 5. Should be killed\n", __FUNCTION__, instance_id); sleep (5); return (2); } else { printf ("%s iid %d: Test HC\n", __FUNCTION__, instance_id); err = sam_hc_send (); if (err != CS_OK) { fprintf (stderr, "Can't send hc. 
Error %d\n", err); return 2; } - err = confdb_key_get_typed (cdb_handle, pid_handle, "hc_last", &tstamp1, &value_len, &cdbtype); + err = confdb_key_get_typed (cdb_handle, pid_handle, "last_updated", &tstamp1, &value_len, &cdbtype); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } printf ("%s iid %d: Sleep 1\n", __FUNCTION__, instance_id); sleep (1); err = sam_hc_send (); if (err != CS_OK) { fprintf (stderr, "Can't send hc. Error %d\n", err); return 2; } sleep (1); - err = confdb_key_get_typed (cdb_handle, pid_handle, "hc_last", &tstamp2, &value_len, &cdbtype); + err = confdb_key_get_typed (cdb_handle, pid_handle, "last_updated", &tstamp2, &value_len, &cdbtype); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } - if (tstamp2 - tstamp1 < 500 || tstamp2 - tstamp1 > 2000) { - printf ("Difference %d is not within <500, 2000> interval.\n", (int)(tstamp2 - tstamp1)); + msec_diff = (tstamp2 - tstamp1)/CS_TIME_NS_IN_MSEC; + + if (msec_diff < 500 || msec_diff > 2000) { + printf ("Difference %d is not within <500, 2000> interval.\n", msec_diff); return (2); } printf ("%s iid %d: stop 2\n", __FUNCTION__, instance_id); err = sam_stop (); if (err != CS_OK) { fprintf (stderr, "Can't stop hc. 
Error %d\n", err); return 2; } err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } - if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) { - printf ("State key is not \"registered\".\n"); + if (value_len != strlen ("stopped") || memcmp (key_value, "stopped", value_len) != 0) { + printf ("State key is not \"stopped\".\n"); return (2); } printf ("%s iid %d: exiting\n", __FUNCTION__, instance_id); return (0); } } if (test_n == 3) { printf ("%s Testing if status is failed\n", __FUNCTION__); /* * Previous should be FAILED */ err = confdb_object_find_start(cdb_handle, OBJECT_PARENT_HANDLE); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, OBJECT_PARENT_HANDLE, "resources", strlen("resources"), &res_handle); if (err != CS_OK) { printf ("Could not object_find \"resources\": %d.\n", err); return (2); } err = confdb_object_find_start(cdb_handle, res_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, res_handle, "process", strlen("process"), &proc_handle); if (err != CS_OK) { printf ("Could not object_find \"process\": %d.\n", err); return (2); } if (snprintf (tmp_obj, sizeof (tmp_obj), "%s:%d", __progname, pid) >= sizeof (tmp_obj)) { snprintf (tmp_obj, sizeof (tmp_obj), "%d", pid); } err = confdb_object_find_start(cdb_handle, proc_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, proc_handle, tmp_obj, strlen(tmp_obj), &pid_handle); if (err != CS_OK) { printf ("Could not object_find \"%s\": %d.\n", tmp_obj, err); return (2); } err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: 
%d.\n", err); return (2); } if (value_len != strlen ("failed") || memcmp (key_value, "failed", value_len) != 0) { printf ("State key is not \"failed\".\n"); return (2); } return (0); } return (2); } /* * Test confdb integration + restart policy */ static int test9 (pid_t pid, pid_t old_pid, int test_n) { confdb_handle_t cdb_handle; cs_error_t err; hdb_handle_t res_handle, proc_handle, pid_handle; size_t value_len; char key_value[256]; unsigned int instance_id; char tmp_obj[PATH_MAX]; err = confdb_initialize (&cdb_handle, NULL); if (err != CS_OK) { printf ("Could not initialize Cluster Configuration Database API instance error %d. Test skipped\n", err); return (1); } printf ("%s test %d\n", __FUNCTION__, test_n); if (test_n == 1) { printf ("%s: initialize\n", __FUNCTION__); err = sam_initialize (2000, SAM_RECOVERY_POLICY_RESTART | SAM_RECOVERY_POLICY_CONFDB); if (err != CS_OK) { fprintf (stderr, "Can't initialize SAM API. Error %d\n", err); return 2; } printf ("%s: register\n", __FUNCTION__); err = sam_register (&instance_id); if (err != CS_OK) { fprintf (stderr, "Can't register. 
Error %d\n", err); return 2; } printf ("%s: iid %d\n", __FUNCTION__, instance_id); if (instance_id < 3) { err = confdb_object_find_start(cdb_handle, OBJECT_PARENT_HANDLE); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, OBJECT_PARENT_HANDLE, "resources", strlen("resources"), &res_handle); if (err != CS_OK) { printf ("Could not object_find \"resources\": %d.\n", err); return (2); } err = confdb_object_find_start(cdb_handle, res_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, res_handle, "process", strlen("process"), &proc_handle); if (err != CS_OK) { printf ("Could not object_find \"process\": %d.\n", err); return (2); } if (snprintf (tmp_obj, sizeof (tmp_obj), "%s:%d", __progname, pid) >= sizeof (tmp_obj)) { snprintf (tmp_obj, sizeof (tmp_obj), "%d", pid); } err = confdb_object_find_start(cdb_handle, proc_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, proc_handle, tmp_obj, strlen(tmp_obj), &pid_handle); if (err != CS_OK) { printf ("Could not object_find \"%s\": %d.\n", tmp_obj, err); return (2); } err = confdb_key_get(cdb_handle, pid_handle, "recovery", strlen("recovery"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"recovery\" key: %d.\n", err); return (2); } if (value_len != strlen ("restart") || memcmp (key_value, "restart", value_len) != 0) { printf ("Recovery key \"%s\" is not \"restart\".\n", key_value); return (2); } err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } - if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) { - printf ("State key is not \"registered\".\n"); + if (value_len != strlen ("stopped") || memcmp (key_value, 
"stopped", value_len) != 0) { + printf ("State key is not \"stopped\".\n"); return (2); } printf ("%s iid %d: start\n", __FUNCTION__, instance_id); err = sam_start (); if (err != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", err); return 2; } err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } - if (value_len != strlen ("started") || memcmp (key_value, "started", value_len) != 0) { - printf ("State key is not \"started\".\n"); + if (value_len != strlen ("running") || memcmp (key_value, "running", value_len) != 0) { + printf ("State key is not \"running\".\n"); return (2); } printf ("%s iid %d: waiting for kill\n", __FUNCTION__, instance_id); sleep (10); return (2); } if (instance_id == 3) { printf ("%s iid %d: mark failed\n", __FUNCTION__, instance_id); if (err != CS_OK) { fprintf (stderr, "Can't start hc. Error %d\n", err); return 2; } err = sam_mark_failed (); if (err != CS_OK) { fprintf (stderr, "Can't mark failed. 
Error %d\n", err); return 2; } sleep (10); return (2); } return (2); } if (test_n == 2) { printf ("%s Testing if status is failed\n", __FUNCTION__); /* * Previous should be FAILED */ err = confdb_object_find_start(cdb_handle, OBJECT_PARENT_HANDLE); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, OBJECT_PARENT_HANDLE, "resources", strlen("resources"), &res_handle); if (err != CS_OK) { printf ("Could not object_find \"resources\": %d.\n", err); return (2); } err = confdb_object_find_start(cdb_handle, res_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, res_handle, "process", strlen("process"), &proc_handle); if (err != CS_OK) { printf ("Could not object_find \"process\": %d.\n", err); return (2); } if (snprintf (tmp_obj, sizeof (tmp_obj), "%s:%d", __progname, pid) >= sizeof (tmp_obj)) { snprintf (tmp_obj, sizeof (tmp_obj), "%d", pid); } err = confdb_object_find_start(cdb_handle, proc_handle); if (err != CS_OK) { printf ("Could not start object_find %d.\n", err); return (2); } err = confdb_object_find(cdb_handle, proc_handle, tmp_obj, strlen(tmp_obj), &pid_handle); if (err != CS_OK) { printf ("Could not object_find \"%s\": %d.\n", tmp_obj, err); return (2); } err = confdb_key_get(cdb_handle, pid_handle, "state", strlen("state"), key_value, &value_len); if (err != CS_OK) { printf ("Could not get \"state\" key: %d.\n", err); return (2); } if (value_len != strlen ("failed") || memcmp (key_value, "failed", value_len) != 0) { printf ("State key is not \"failed\".\n"); return (2); } return (0); } return (2); } int main(int argc, char *argv[]) { pid_t pid, old_pid; int err; int stat; int all_passed = 1; int no_skipped = 0; pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 1; } if (pid == 0) { err = test1 (); sam_finalize (); return err; } waitpid (pid, &stat, 0); fprintf (stderr, "test1 %s\n", 
(WEXITSTATUS (stat) == 0 ? "passed" : "failed")); if (WEXITSTATUS (stat) != 0) all_passed = 0; pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 1; } if (pid == 0) { err = test2 (); sam_finalize (); return (err); } waitpid (pid, &stat, 0); fprintf (stderr, "test2 %s\n", (WEXITSTATUS (stat) == 0 ? "passed" : "failed")); if (WEXITSTATUS (stat) != 0) all_passed = 0; pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 1; } if (pid == 0) { err = test3 (); sam_finalize (); return (err); } waitpid (pid, &stat, 0); fprintf (stderr, "test3 %s\n", (WEXITSTATUS (stat) == 0 ? "passed" : "failed")); if (WEXITSTATUS (stat) != 0) all_passed = 0; pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 1; } if (pid == 0) { err = test4 (); sam_finalize (); return (err); } waitpid (pid, &stat, 0); fprintf (stderr, "test4 %s\n", (WEXITSTATUS (stat) == 0 ? "passed" : "failed")); if (WEXITSTATUS (stat) != 0) all_passed = 0; pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 1; } if (pid == 0) { err = test5 (); sam_finalize (); return (err); } waitpid (pid, &stat, 0); fprintf (stderr, "test5 %s\n", (WEXITSTATUS (stat) == 0 ? "passed" : "failed")); if (WEXITSTATUS (stat) != 0) all_passed = 0; pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 1; } if (pid == 0) { err = test6 (); sam_finalize (); return (err); } waitpid (pid, &stat, 0); fprintf (stderr, "test6 %s\n", (WEXITSTATUS (stat) == 0 ? "passed" : "failed")); if (WEXITSTATUS (stat) != 0) all_passed = 0; pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 2; } if (pid == 0) { err = test7 (); sam_finalize (); return (err); } waitpid (pid, &stat, 0); fprintf (stderr, "test7 %s\n", (WEXITSTATUS (stat) == 0 ? "passed" : (WEXITSTATUS (stat) == 1 ? 
"skipped" : "failed"))); if (WEXITSTATUS (stat) == 1) no_skipped++; if (WEXITSTATUS (stat) > 1) all_passed = 0; pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 2; } if (pid == 0) { err = test8 (getpid (), 0, 1); sam_finalize (); return (err); } waitpid (pid, &stat, 0); old_pid = pid; if (WEXITSTATUS (stat) == 0) { pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 2; } if (pid == 0) { err = test8 (getpid (), old_pid, 2); sam_finalize (); return (err); } waitpid (pid, &stat, 0); old_pid = pid; if (WEXITSTATUS (stat) == 0) { pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 2; } if (pid == 0) { err = test8 (old_pid, 0, 3); sam_finalize (); return (err); } waitpid (pid, &stat, 0); } } if (WEXITSTATUS (stat) == 1) no_skipped++; if (WEXITSTATUS (stat) > 1) all_passed = 0; pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 2; } if (pid == 0) { err = test9 (getpid (), 0, 1); sam_finalize (); return (err); } waitpid (pid, &stat, 0); old_pid = pid; if (WEXITSTATUS (stat) == 0) { pid = fork (); if (pid == -1) { fprintf (stderr, "Can't fork\n"); return 2; } if (pid == 0) { err = test9 (old_pid, 0, 2); sam_finalize (); return (err); } waitpid (pid, &stat, 0); } fprintf (stderr, "test9 %s\n", (WEXITSTATUS (stat) == 0 ? "passed" : (WEXITSTATUS (stat) == 1 ? "skipped" : "failed"))); if (WEXITSTATUS (stat) == 1) no_skipped++; if (WEXITSTATUS (stat) > 1) all_passed = 0; if (all_passed) fprintf (stderr, "All tests passed (%d skipped)\n", no_skipped); return (all_passed ? 0 : 1); }