diff --git a/daemons/controld/controld_throttle.c b/daemons/controld/controld_throttle.c index 9adfde6840..d2ad9355e9 100644 --- a/daemons/controld/controld_throttle.c +++ b/daemons/controld/controld_throttle.c @@ -1,595 +1,407 @@ /* * Copyright 2013-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include /* These values don't need to be bits, but these particular values must be kept * for backward compatibility during rolling upgrades. */ enum throttle_state_e { throttle_none = 0x0000, throttle_low = 0x0001, throttle_med = 0x0010, throttle_high = 0x0100, throttle_extreme = 0x1000, }; struct throttle_record_s { int max; enum throttle_state_e mode; char *node; }; static int throttle_job_max = 0; static float throttle_load_target = 0.0; #define THROTTLE_FACTOR_LOW 1.2 #define THROTTLE_FACTOR_MEDIUM 1.6 #define THROTTLE_FACTOR_HIGH 2.0 static GHashTable *throttle_records = NULL; static mainloop_timer_t *throttle_timer = NULL; static const char * load2str(enum throttle_state_e mode) { switch (mode) { case throttle_extreme: return "extreme"; case throttle_high: return "high"; case throttle_med: return "medium"; case throttle_low: return "low"; case throttle_none: return "negligible"; default: return "undetermined"; } } -#if HAVE_LINUX_PROCFS -/*! - * \internal - * \brief Return name of /proc file containing the CIB daemon's load statistics - * - * \return Newly allocated memory with file name on success, NULL otherwise - * - * \note It is the caller's responsibility to free the return value. - * This will return NULL if the daemon is being run via valgrind. - * This should be called only on Linux systems. 
- */ -static char * -find_cib_loadfile(const char *server) -{ - pid_t pid = pcmk__procfs_pid_of(server); - - return pid? crm_strdup_printf("/proc/%lld/stat", (long long) pid) : NULL; -} - -static bool -throttle_cib_load(const char *server, float *load) -{ -/* /proc/[pid]/stat - * - * Status information about the process. This is used by ps(1). It is defined - * in /usr/src/linux/fs/proc/array.c. - * - * The fields, in order, with their proper scanf(3) format specifiers, are: - * - * pid %d (1) The process ID. - * comm %s (2) The filename of the executable, in parentheses. This is - * visible whether or not the executable is swapped out. - * state %c (3) One character from the string "RSDZTW" where R is running, - * S is sleeping in an interruptible wait, D is waiting in - * uninterruptible disk sleep, Z is zombie, T is traced or - * stopped (on a signal), and W is paging. - * ppid %d (4) The PID of the parent. - * pgrp %d (5) The process group ID of the process. - * session %d (6) The session ID of the process. - * tty_nr %d (7) The controlling terminal of the process. (The minor device - * number is contained in the combination of bits 31 to 20 and - * 7 to 0; the major device number is in bits 15 to 8.) - * tpgid %d (8) The ID of the foreground process group of the controlling - * terminal of the process. - * flags %u (9) The kernel flags word of the process. For bit meanings, see - * the PF_* defines in the Linux kernel source file include/linux/sched.h. - * Details depend on the kernel version. - * minflt %lu (10) The number of minor faults the process has made which have - * not required loading a memory page from disk. - * cminflt %lu (11) The number of minor faults that the process's waited-for - * children have made. - * majflt %lu (12) The number of major faults the process has made which have - * required loading a memory page from disk. - * cmajflt %lu (13) The number of major faults that the process's waited-for - * children have made. 
- * utime %lu (14) Amount of time that this process has been scheduled in user - * mode, measured in clock ticks (divide by sysconf(_SC_CLK_TCK)). - * This includes guest time, guest_time (time spent running a - * virtual CPU, see below), so that applications that are not - * aware of the guest time field do not lose that time from - * their calculations. - * stime %lu (15) Amount of time that this process has been scheduled in - * kernel mode, measured in clock ticks (divide by sysconf(_SC_CLK_TCK)). - */ - - static char *loadfile = NULL; - static time_t last_call = 0; - static long ticks_per_s = 0; - static unsigned long last_utime, last_stime; - - char buffer[64*1024]; - FILE *stream = NULL; - time_t now = time(NULL); - - if (load == NULL) { - return false; - } else { - *load = 0.0; - } - - if (loadfile == NULL) { - last_call = 0; - last_utime = 0; - last_stime = 0; - - loadfile = find_cib_loadfile(server); - if (loadfile == NULL) { - crm_warn("Couldn't find CIB load file"); - return false; - } - - ticks_per_s = sysconf(_SC_CLK_TCK); - crm_trace("Found %s", loadfile); - } - - stream = fopen(loadfile, "r"); - if (stream == NULL) { - int rc = errno; - - crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc); - free(loadfile); - loadfile = NULL; - return false; - } - - if (fgets(buffer, sizeof(buffer), stream) != NULL) { - char *comm = pcmk__assert_alloc(1, 256); - char state = 0; - int rc = 0, pid = 0, ppid = 0, pgrp = 0, session = 0, tty_nr = 0, tpgid = 0; - unsigned long flags = 0, minflt = 0, cminflt = 0, majflt = 0, cmajflt = 0, utime = 0, stime = 0; - - rc = sscanf(buffer, "%d %[^ ] %c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu", - &pid, comm, &state, &ppid, &pgrp, &session, &tty_nr, &tpgid, - &flags, &minflt, &cminflt, &majflt, &cmajflt, &utime, &stime); - free(comm); - - if (rc != 15) { - crm_err("Only %d of 15 fields found in %s", rc, loadfile); - fclose(stream); - return false; - - } else if ((last_call > 0) && (last_call < now) && 
(last_utime <= utime) && - (last_stime <= stime)) { - time_t elapsed = now - last_call; - unsigned long delta_utime = utime - last_utime; - unsigned long delta_stime = stime - last_stime; - - *load = delta_utime + delta_stime; /* Cast to a float before division */ - *load /= ticks_per_s; - *load /= elapsed; - crm_debug("cib load: %f (%lu ticks in %lds)", *load, - delta_utime + delta_stime, (long) elapsed); - - } else { - crm_debug("Init %lu + %lu ticks at %ld (%lu tps)", utime, stime, - (long) now, ticks_per_s); - } - - last_call = now; - last_utime = utime; - last_stime = stime; - - fclose(stream); - return true; - } - - fclose(stream); - return false; -} - -static bool -throttle_load_avg(float *load) -{ - char buffer[256]; - FILE *stream = NULL; - const char *loadfile = "/proc/loadavg"; - - if (load == NULL) { - return false; - } - - stream = fopen(loadfile, "r"); - if (stream == NULL) { - int rc = errno; - crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc); - return false; - } - - if (fgets(buffer, sizeof(buffer), stream) != NULL) { - char *nl = strstr(buffer, "\n"); - - /* Grab the 1-minute average, ignore the rest */ - *load = strtof(buffer, NULL); - if (nl != NULL) { - nl[0] = 0; - } - - fclose(stream); - return true; - } - - fclose(stream); - return false; -} - /*! 
* \internal * \brief Check a load value against throttling thresholds * * \param[in] load Load value to check * \param[in] desc Description of metric (for logging) * \param[in] thresholds Low/medium/high/extreme thresholds * * \return Throttle mode corresponding to load value */ static enum throttle_state_e throttle_check_thresholds(float load, const char *desc, const float thresholds[4]) { if (load > thresholds[3]) { crm_notice("Extreme %s detected: %f", desc, load); return throttle_extreme; } else if (load > thresholds[2]) { crm_notice("High %s detected: %f", desc, load); return throttle_high; } else if (load > thresholds[1]) { crm_info("Moderate %s detected: %f", desc, load); return throttle_med; } else if (load > thresholds[0]) { crm_debug("Noticeable %s detected: %f", desc, load); return throttle_low; } crm_trace("Negligible %s detected: %f", desc, load); return throttle_none; } static enum throttle_state_e throttle_handle_load(float load, const char *desc, int cores) { float normalize; float thresholds[4]; if (cores == 1) { /* On a single core machine, a load of 1.0 is already too high */ normalize = 0.6; } else { /* Normalize the load to be per-core */ normalize = cores; } thresholds[0] = throttle_load_target * normalize * THROTTLE_FACTOR_LOW; thresholds[1] = throttle_load_target * normalize * THROTTLE_FACTOR_MEDIUM; thresholds[2] = throttle_load_target * normalize * THROTTLE_FACTOR_HIGH; thresholds[3] = load + 1.0; /* never extreme */ return throttle_check_thresholds(load, desc, thresholds); } -#endif // HAVE_LINUX_PROCFS static enum throttle_state_e throttle_mode(void) { enum throttle_state_e mode = throttle_none; -#if HAVE_LINUX_PROCFS - unsigned int cores; + unsigned int cores = pcmk__procfs_num_cores(); float load; float thresholds[4]; - cores = pcmk__procfs_num_cores(); - if (throttle_cib_load(PCMK__SERVER_BASED, &load)) { + if (pcmk__throttle_cib_load(PCMK__SERVER_BASED, &load)) { float cib_max_cpu = 0.95; /* The CIB is a single-threaded task and thus 
cannot consume more * than 100% of a CPU (and 1/cores of the overall system load). * * On a many-cored system, the CIB might therefore be maxed out (causing * operations to fail or appear to fail) even though the overall system * load is still reasonable. * * Therefore, the 'normal' thresholds can not apply here, and we need a * special case. */ if (cores == 1) { cib_max_cpu = 0.4; } if ((throttle_load_target > 0.0) && (throttle_load_target < cib_max_cpu)) { cib_max_cpu = throttle_load_target; } thresholds[0] = cib_max_cpu * 0.8; thresholds[1] = cib_max_cpu * 0.9; thresholds[2] = cib_max_cpu; /* Can only happen on machines with a low number of cores */ thresholds[3] = cib_max_cpu * 1.5; mode = throttle_check_thresholds(load, "CIB load", thresholds); } if (throttle_load_target <= 0) { /* If we ever make this a valid value, the cluster will at least behave * as expected */ return mode; } - if (throttle_load_avg(&load)) { + if (pcmk__throttle_load_avg(&load)) { enum throttle_state_e cpu_load; cpu_load = throttle_handle_load(load, "CPU load", cores); if (cpu_load > mode) { mode = cpu_load; } crm_debug("Current load is %f across %u core(s)", load, cores); } -#endif // HAVE_LINUX_PROCFS + return mode; } static void throttle_send_command(enum throttle_state_e mode) { xmlNode *xml = NULL; static enum throttle_state_e last = -1; if(mode != last) { crm_info("New throttle mode: %s load (was %s)", load2str(mode), load2str(last)); last = mode; xml = pcmk__new_request(pcmk_ipc_controld, CRM_SYSTEM_CRMD, NULL, CRM_SYSTEM_CRMD, CRM_OP_THROTTLE, NULL); crm_xml_add_int(xml, PCMK__XA_CRM_LIMIT_MODE, mode); crm_xml_add_int(xml, PCMK__XA_CRM_LIMIT_MAX, throttle_job_max); pcmk__cluster_send_message(NULL, pcmk_ipc_controld, xml); pcmk__xml_free(xml); } } static gboolean throttle_timer_cb(gpointer data) { throttle_send_command(throttle_mode()); return TRUE; } static void throttle_record_free(gpointer p) { struct throttle_record_s *r = p; free(r->node); free(r); } static void 
throttle_set_load_target(float target) { throttle_load_target = target; } /*! * \internal * \brief Update the maximum number of simultaneous jobs * * \param[in] preference Cluster-wide \c PCMK_OPT_NODE_ACTION_LIMIT from the * CIB */ static void throttle_update_job_max(const char *preference) { long long max = 0LL; // Per-node override const char *env_limit = pcmk__env_option(PCMK__ENV_NODE_ACTION_LIMIT); if (env_limit != NULL) { int rc = pcmk__scan_ll(env_limit, &max, 0LL); if (rc != pcmk_rc_ok) { crm_warn("Ignoring local option PCMK_" PCMK__ENV_NODE_ACTION_LIMIT " because '%s' is not a valid value: %s", env_limit, pcmk_rc_str(rc)); env_limit = NULL; } } if (env_limit == NULL) { // Option validator should prevent invalid values CRM_LOG_ASSERT(pcmk__scan_ll(preference, &max, 0LL) == pcmk_rc_ok); } if (max > 0) { throttle_job_max = (max >= INT_MAX)? INT_MAX : (int) max; } else { // Default is based on the number of cores detected throttle_job_max = 2 * pcmk__procfs_num_cores(); } } void throttle_init(void) { if(throttle_records == NULL) { throttle_records = pcmk__strkey_table(NULL, throttle_record_free); throttle_timer = mainloop_timer_add("throttle", 30 * 1000, TRUE, throttle_timer_cb, NULL); } throttle_update_job_max(NULL); mainloop_timer_start(throttle_timer); } /*! 
* \internal * \brief Configure throttle options based on the CIB * * \param[in,out] options Name/value pairs for configured options */ void controld_configure_throttle(GHashTable *options) { const char *value = g_hash_table_lookup(options, PCMK_OPT_LOAD_THRESHOLD); if (value != NULL) { throttle_set_load_target(strtof(value, NULL) / 100.0); } value = g_hash_table_lookup(options, PCMK_OPT_NODE_ACTION_LIMIT); throttle_update_job_max(value); } void throttle_fini(void) { if (throttle_timer != NULL) { mainloop_timer_del(throttle_timer); throttle_timer = NULL; } if (throttle_records != NULL) { g_hash_table_destroy(throttle_records); throttle_records = NULL; } } int throttle_get_total_job_limit(int l) { /* Cluster-wide limit */ GHashTableIter iter; int limit = l; int peers = pcmk__cluster_num_active_nodes(); struct throttle_record_s *r = NULL; g_hash_table_iter_init(&iter, throttle_records); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &r)) { switch(r->mode) { case throttle_extreme: if(limit == 0 || limit > peers/4) { limit = QB_MAX(1, peers/4); } break; case throttle_high: if(limit == 0 || limit > peers/2) { limit = QB_MAX(1, peers/2); } break; default: break; } } if(limit == l) { } else if(l == 0) { crm_trace("Using " PCMK_OPT_BATCH_LIMIT "=%d", limit); } else { crm_trace("Using " PCMK_OPT_BATCH_LIMIT "=%d instead of %d", limit, l); } return limit; } int throttle_get_job_limit(const char *node) { int jobs = 1; struct throttle_record_s *r = NULL; r = g_hash_table_lookup(throttle_records, node); if(r == NULL) { r = pcmk__assert_alloc(1, sizeof(struct throttle_record_s)); r->node = pcmk__str_copy(node); r->mode = throttle_low; r->max = throttle_job_max; crm_trace("Defaulting to local values for unknown node %s", node); g_hash_table_insert(throttle_records, r->node, r); } switch(r->mode) { case throttle_extreme: case throttle_high: jobs = 1; /* At least one job must always be allowed */ break; case throttle_med: jobs = QB_MAX(1, r->max / 4); break; case 
throttle_low: jobs = QB_MAX(1, r->max / 2); break; case throttle_none: jobs = QB_MAX(1, r->max); break; default: crm_err("Unknown throttle mode %.4x on %s", r->mode, node); break; } return jobs; } void throttle_update(xmlNode *xml) { int max = 0; int mode = 0; struct throttle_record_s *r = NULL; const char *from = crm_element_value(xml, PCMK__XA_SRC); crm_element_value_int(xml, PCMK__XA_CRM_LIMIT_MODE, &mode); crm_element_value_int(xml, PCMK__XA_CRM_LIMIT_MAX, &max); r = g_hash_table_lookup(throttle_records, from); if(r == NULL) { r = pcmk__assert_alloc(1, sizeof(struct throttle_record_s)); r->node = pcmk__str_copy(from); g_hash_table_insert(throttle_records, r->node, r); } r->max = max; r->mode = (enum throttle_state_e) mode; crm_debug("Node %s has %s load and supports at most %d jobs; new job limit %d", from, load2str((enum throttle_state_e) mode), max, throttle_get_job_limit(from)); } diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h index ddbe67457d..82feb6476a 100644 --- a/include/crm/common/internal.h +++ b/include/crm/common/internal.h @@ -1,361 +1,363 @@ /* * Copyright 2015-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef PCMK__CRM_COMMON_INTERNAL__H #define PCMK__CRM_COMMON_INTERNAL__H #include // pid_t, getpid() #include // bool #include // uint8_t, uint64_t #include // guint, GList, GHashTable #include // xmlNode #include // do_crm_log_unlikely(), etc. 
#include // mainloop_io_t, struct ipc_client_callbacks #include // crm_strdup_printf() #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* This says whether the current application is a Pacemaker daemon or not, * and is used to change default logging settings such as whether to log to * stderr, etc., as well as a few other details such as whether blackbox signal * handling is enabled. * * It is set when logging is initialized, and does not need to be set directly. */ extern bool pcmk__is_daemon; // Number of elements in a statically defined array #define PCMK__NELEM(a) ((int) (sizeof(a)/sizeof(a[0])) ) #if PCMK__ENABLE_CIBSECRETS /* internal CIB utilities (from cib_secrets.c) */ int pcmk__substitute_secrets(const char *rsc_id, GHashTable *params); #endif /* internal main loop utilities (from mainloop.c) */ int pcmk__add_mainloop_ipc(crm_ipc_t *ipc, int priority, void *userdata, const struct ipc_client_callbacks *callbacks, mainloop_io_t **source); guint pcmk__mainloop_timer_get_period(const mainloop_timer_t *timer); /* internal node-related XML utilities (from nodes.c) */ /*! 
* \internal * \brief Add local node name and ID to an XML node * * \param[in,out] xml XML node to modify * \param[in] node The local node's name * \param[in] nodeid The local node's ID (can be 0) */ void pcmk__xe_add_node(xmlNode *xml, const char *node, int nodeid); /* internal name/value utilities (from nvpair.c) */ int pcmk__scan_nvpair(const char *input, char **name, char **value); char *pcmk__format_nvpair(const char *name, const char *value, const char *units); /* internal procfs utilities (from procfs.c) */ pid_t pcmk__procfs_pid_of(const char *name); unsigned int pcmk__procfs_num_cores(void); int pcmk__procfs_pid2path(pid_t pid, char path[], size_t path_size); bool pcmk__procfs_has_pids(void); DIR *pcmk__procfs_fd_dir(void); void pcmk__sysrq_trigger(char t); +bool pcmk__throttle_cib_load(const char *server, float *load); +bool pcmk__throttle_load_avg(float *load); /* internal functions related to process IDs (from pid.c) */ /*! * \internal * \brief Check whether process exists (by PID and optionally executable path) * * \param[in] pid PID of process to check * \param[in] daemon If not NULL, path component to match with procfs entry * * \return Standard Pacemaker return code * \note Particular return codes of interest include pcmk_rc_ok for alive, * ESRCH for process is not alive (verified by kill and/or executable path * match), EACCES for caller unable or not allowed to check. A result of * "alive" is less reliable when \p daemon is not provided or procfs is * not available, since there is no guarantee that the PID has not been * recycled for another process. * \note This function cannot be used to verify \e authenticity of the process. */ int pcmk__pid_active(pid_t pid, const char *daemon); int pcmk__read_pidfile(const char *filename, pid_t *pid); int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid); int pcmk__lock_pidfile(const char *filename, const char *name); // bitwise arithmetic utilities /*! 
* \internal * \brief Set specified flags in a flag group * * \param[in] function Function name of caller * \param[in] line Line number of caller * \param[in] log_level Log a message at this level * \param[in] flag_type Label describing this flag group (for logging) * \param[in] target Name of object whose flags these are (for logging) * \param[in] flag_group Flag group being manipulated * \param[in] flags Which flags in the group should be set * \param[in] flags_str Readable equivalent of \p flags (for logging) * * \return Possibly modified flag group */ static inline uint64_t pcmk__set_flags_as(const char *function, int line, uint8_t log_level, const char *flag_type, const char *target, uint64_t flag_group, uint64_t flags, const char *flags_str) { uint64_t result = flag_group | flags; if (result != flag_group) { do_crm_log_unlikely(log_level, "%s flags %#.8llx (%s) for %s set by %s:%d", ((flag_type == NULL)? "Group of" : flag_type), (unsigned long long) flags, ((flags_str == NULL)? "flags" : flags_str), ((target == NULL)? "target" : target), function, line); } return result; } /*! 
* \internal * \brief Clear specified flags in a flag group * * \param[in] function Function name of caller * \param[in] line Line number of caller * \param[in] log_level Log a message at this level * \param[in] flag_type Label describing this flag group (for logging) * \param[in] target Name of object whose flags these are (for logging) * \param[in] flag_group Flag group being manipulated * \param[in] flags Which flags in the group should be cleared * \param[in] flags_str Readable equivalent of \p flags (for logging) * * \return Possibly modified flag group */ static inline uint64_t pcmk__clear_flags_as(const char *function, int line, uint8_t log_level, const char *flag_type, const char *target, uint64_t flag_group, uint64_t flags, const char *flags_str) { uint64_t result = flag_group & ~flags; if (result != flag_group) { do_crm_log_unlikely(log_level, "%s flags %#.8llx (%s) for %s cleared by %s:%d", ((flag_type == NULL)? "Group of" : flag_type), (unsigned long long) flags, ((flags_str == NULL)? "flags" : flags_str), ((target == NULL)? "target" : target), function, line); } return result; } /*! * \internal * \brief Get readable string for whether specified flags are set * * \param[in] flag_group Group of flags to check * \param[in] flags Which flags in \p flag_group should be checked * * \return "true" if all \p flags are set in \p flag_group, otherwise "false" */ static inline const char * pcmk__flag_text(uint64_t flag_group, uint64_t flags) { return pcmk__btoa(pcmk_all_flags_set(flag_group, flags)); } // miscellaneous utilities (from utils.c) void pcmk__daemonize(const char *name, const char *pidfile); void pcmk__panic(const char *reason); pid_t pcmk__locate_sbd(void); void pcmk__sleep_ms(unsigned int ms); guint pcmk__create_timer(guint interval_ms, GSourceFunc fn, gpointer data); guint pcmk__timeout_ms2s(guint timeout_ms); extern int pcmk__score_red; extern int pcmk__score_green; extern int pcmk__score_yellow; /*! 
* \internal * \brief Allocate new zero-initialized memory, asserting on failure * * \param[in] file File where \p function is located * \param[in] function Calling function * \param[in] line Line within \p file * \param[in] nmemb Number of elements to allocate memory for * \param[in] size Size of each element * * \return Newly allocated memory of of size nmemb * size (guaranteed * not to be \c NULL) * * \note The caller is responsible for freeing the return value using \c free(). */ static inline void * pcmk__assert_alloc_as(const char *file, const char *function, uint32_t line, size_t nmemb, size_t size) { void *ptr = calloc(nmemb, size); if (ptr == NULL) { crm_abort(file, function, line, "Out of memory", FALSE, TRUE); crm_exit(CRM_EX_OSERR); } return ptr; } /*! * \internal * \brief Allocate new zero-initialized memory, asserting on failure * * \param[in] nmemb Number of elements to allocate memory for * \param[in] size Size of each element * * \return Newly allocated memory of of size nmemb * size (guaranteed * not to be \c NULL) * * \note The caller is responsible for freeing the return value using \c free(). */ #define pcmk__assert_alloc(nmemb, size) \ pcmk__assert_alloc_as(__FILE__, __func__, __LINE__, nmemb, size) /*! * \internal * \brief Resize a dynamically allocated memory block * * \param[in] ptr Memory block to resize (or NULL to allocate new memory) * \param[in] size New size of memory block in bytes (must be > 0) * * \return Pointer to resized memory block * * \note This asserts on error, so the result is guaranteed to be non-NULL * (which is the main advantage of this over directly using realloc()). 
*/ static inline void * pcmk__realloc(void *ptr, size_t size) { void *new_ptr; // realloc(p, 0) can replace free(p) but this wrapper can't pcmk__assert(size > 0); new_ptr = realloc(ptr, size); if (new_ptr == NULL) { free(ptr); abort(); } return new_ptr; } static inline char * pcmk__getpid_s(void) { return crm_strdup_printf("%lu", (unsigned long) getpid()); } // More efficient than g_list_length(list) == 1 static inline bool pcmk__list_of_1(GList *list) { return list && (list->next == NULL); } // More efficient than g_list_length(list) > 1 static inline bool pcmk__list_of_multiple(GList *list) { return list && (list->next != NULL); } /* convenience functions for failure-related node attributes */ #define PCMK__FAIL_COUNT_PREFIX "fail-count" #define PCMK__LAST_FAILURE_PREFIX "last-failure" /*! * \internal * \brief Generate a failure-related node attribute name for a resource * * \param[in] prefix Start of attribute name * \param[in] rsc_id Resource name * \param[in] op Operation name * \param[in] interval_ms Operation interval * * \return Newly allocated string with attribute name * * \note Failure attributes are named like PREFIX-RSC#OP_INTERVAL (for example, * "fail-count-myrsc#monitor_30000"). The '#' is used because it is not * a valid character in a resource ID, to reliably distinguish where the * operation name begins. The '_' is used simply to be more comparable to * action labels like "myrsc_monitor_30000". 
*/ static inline char * pcmk__fail_attr_name(const char *prefix, const char *rsc_id, const char *op, guint interval_ms) { CRM_CHECK(prefix && rsc_id && op, return NULL); return crm_strdup_printf("%s-%s#%s_%u", prefix, rsc_id, op, interval_ms); } static inline char * pcmk__failcount_name(const char *rsc_id, const char *op, guint interval_ms) { return pcmk__fail_attr_name(PCMK__FAIL_COUNT_PREFIX, rsc_id, op, interval_ms); } static inline char * pcmk__lastfailure_name(const char *rsc_id, const char *op, guint interval_ms) { return pcmk__fail_attr_name(PCMK__LAST_FAILURE_PREFIX, rsc_id, op, interval_ms); } // internal resource agent functions (from agents.c) int pcmk__effective_rc(int rc); #ifdef __cplusplus } #endif #endif // PCMK__CRM_COMMON_INTERNAL__H diff --git a/lib/common/procfs.c b/lib/common/procfs.c index 92738a9beb..a6b0d8e7ac 100644 --- a/lib/common/procfs.c +++ b/lib/common/procfs.c @@ -1,277 +1,465 @@ /* * Copyright 2015-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #if HAVE_LINUX_PROCFS +/*! + * \internal + * \brief Return name of /proc file containing the CIB daemon's load statistics + * + * \return Newly allocated memory with file name on success, NULL otherwise + * + * \note It is the caller's responsibility to free the return value. + * This will return NULL if the daemon is being run via valgrind. + * This should be called only on Linux systems. + */ +static char * +find_cib_loadfile(const char *server) +{ + pid_t pid = pcmk__procfs_pid_of(server); + + return pid? crm_strdup_printf("/proc/%lld/stat", (long long) pid) : NULL; +} + /*! 
* \internal * \brief Get process ID and name associated with a /proc directory entry * * \param[in] entry Directory entry (must be result of readdir() on /proc) * \param[out] name If not NULL, a char[16] to hold the process name * \param[out] pid If not NULL, will be set to process ID of entry * * \return Standard Pacemaker return code * \note This should be called only on Linux systems, as not all systems that * support /proc store process names and IDs in the same way. The kernel * limits the process name to the first 15 characters (plus terminator). * It would be nice if there were a public kernel API constant for that * limit, but there isn't. */ static int pcmk__procfs_process_info(const struct dirent *entry, char *name, pid_t *pid) { int fd, local_pid; FILE *file; struct stat statbuf; char procpath[128] = { 0 }; /* We're only interested in entries whose name is a PID, * so skip anything non-numeric or that is too long. * * 114 = 128 - strlen("/proc/") - strlen("/status") - 1 */ local_pid = atoi(entry->d_name); if ((local_pid <= 0) || (strlen(entry->d_name) > 114)) { return -1; } if (pid) { *pid = (pid_t) local_pid; } /* Get this entry's file information */ strcpy(procpath, "/proc/"); strcat(procpath, entry->d_name); fd = open(procpath, O_RDONLY); if (fd < 0 ) { return -1; } if (fstat(fd, &statbuf) < 0) { close(fd); return -1; } close(fd); /* We're only interested in subdirectories */ if (!S_ISDIR(statbuf.st_mode)) { return -1; } /* Read the first entry ("Name:") from the process's status file. * We could handle the valgrind case if we parsed the cmdline file * instead, but that's more of a pain than it's worth. */ if (name != NULL) { strcat(procpath, "/status"); file = fopen(procpath, "r"); if (!file) { return -1; } if (fscanf(file, "Name:\t%15[^\n]", name) != 1) { fclose(file); return -1; } name[15] = 0; fclose(file); } return 0; } #endif // HAVE_LINUX_PROCFS /*! 
* \internal * \brief Return process ID of a named process * * \param[in] name Process name (as used in /proc/.../status) * * \return Process ID of named process if running, 0 otherwise * * \note This will return 0 if the process is being run via valgrind. * This should be called only on Linux systems. */ pid_t pcmk__procfs_pid_of(const char *name) { #if HAVE_LINUX_PROCFS DIR *dp; struct dirent *entry; pid_t pid = 0; char entry_name[64] = { 0 }; dp = opendir("/proc"); if (dp == NULL) { crm_notice("Can not read /proc directory to track existing components"); return 0; } while ((entry = readdir(dp)) != NULL) { if ((pcmk__procfs_process_info(entry, entry_name, &pid) == pcmk_rc_ok) && pcmk__str_eq(entry_name, name, pcmk__str_casei) && (pcmk__pid_active(pid, NULL) == pcmk_rc_ok)) { crm_info("Found %s active as process %lld", name, (long long) pid); break; } pid = 0; } closedir(dp); return pid; #else return 0; #endif // HAVE_LINUX_PROCFS } /*! * \internal * \brief Calculate number of logical CPU cores from procfs * * \return Number of cores (or 1 if unable to determine) */ unsigned int pcmk__procfs_num_cores(void) { #if HAVE_LINUX_PROCFS int cores = 0; FILE *stream = NULL; /* Parse /proc/stat instead of /proc/cpuinfo because it's smaller */ stream = fopen("/proc/stat", "r"); if (stream == NULL) { crm_perror(LOG_INFO, "Could not open /proc/stat"); } else { char buffer[2048]; while (fgets(buffer, sizeof(buffer), stream)) { if (pcmk__starts_with(buffer, "cpu") && isdigit(buffer[3])) { ++cores; } } fclose(stream); } return cores? cores : 1; #else return 1; #endif // HAVE_LINUX_PROCFS } /*! 
* \internal * \brief Get the executable path corresponding to a process ID * * \param[in] pid Process ID to check * \param[out] path Where to store executable path * \param[in] path_size Size of \p path in characters (ideally PATH_MAX) * * \return Standard Pacemaker error code (as possible errno values from * readlink()) */ int pcmk__procfs_pid2path(pid_t pid, char path[], size_t path_size) { #if HAVE_LINUX_PROCFS char procfs_exe_path[PATH_MAX]; ssize_t link_rc; if (snprintf(procfs_exe_path, PATH_MAX, "/proc/%lld/exe", (long long) pid) >= PATH_MAX) { return ENAMETOOLONG; // Truncated (shouldn't be possible in practice) } link_rc = readlink(procfs_exe_path, path, path_size - 1); if (link_rc < 0) { return errno; } else if (link_rc >= (path_size - 1)) { return ENAMETOOLONG; } path[link_rc] = '\0'; return pcmk_rc_ok; #else return EOPNOTSUPP; #endif // HAVE_LINUX_PROCFS } /*! * \internal * \brief Check whether process ID information is available from procfs * * \return true if process ID information is available, otherwise false */ bool pcmk__procfs_has_pids(void) { #if HAVE_LINUX_PROCFS static bool have_pids = false; static bool checked = false; if (!checked) { char path[PATH_MAX]; have_pids = pcmk__procfs_pid2path(getpid(), path, sizeof(path)) == pcmk_rc_ok; checked = true; } return have_pids; #else return false; #endif // HAVE_LINUX_PROCFS } /*! * \internal * \brief Return an open handle on the directory containing links to open file * descriptors, or NULL on error */ DIR * pcmk__procfs_fd_dir(void) { DIR *dir = NULL; /* /proc/self/fd (on Linux) or /dev/fd (on most OSes) contains symlinks to * all open files for the current process, named as the file descriptor. * Use this if available, because it's more efficient than a shotgun * approach to closing descriptors. */ #if HAVE_LINUX_PROCFS dir = opendir("/proc/self/fd"); #endif // HAVE_LINUX_PROCFS return dir; } /*! 
/*!
 * \internal
 * \brief Trigger a sysrq command if supported on current platform
 *
 * \param[in] t  Sysrq command to trigger
 */
void
pcmk__sysrq_trigger(char t)
{
#if HAVE_LINUX_PROCFS
    // Root can always write here, regardless of kernel.sysrq value
    FILE *procf = fopen("/proc/sysrq-trigger", "a");

    if (procf == NULL) {
        crm_warn("Could not open sysrq-trigger: %s", strerror(errno));
    } else {
        fprintf(procf, "%c\n", t);
        fclose(procf);
    }
#endif // HAVE_LINUX_PROCFS
}

/*!
 * \internal
 * \brief Measure the CPU time used by a process since the previous call
 *
 * \param[in]  server  Process name passed to find_cib_loadfile() to locate
 *                     the stat file (consulted only while no file name is
 *                     remembered from an earlier call)
 * \param[out] load    Where to store the user + kernel ticks consumed since
 *                     the previous call, divided by ticks per second and by
 *                     elapsed seconds; set to 0.0 when no delta is computed
 *
 * \return true if the stat file was read and parsed, otherwise false
 *         (always false when built without Linux procfs support)
 *
 * \note The stat file name and the previous sample are kept in static
 *       variables, so all calls share one measurement history. The first
 *       successful call only records a baseline and leaves \p *load at 0.0.
 */
bool
pcmk__throttle_cib_load(const char *server, float *load)
{
/*    /proc/[pid]/stat
 *
 *    Status information about the process.  This is used by ps(1).  It is
 *    defined in /usr/src/linux/fs/proc/array.c.
 *
 *    The fields, in order, with their proper scanf(3) format specifiers, are:
 *
 *    pid %d      (1) The process ID.
 *
 *    comm %s     (2) The filename of the executable, in parentheses.  This is
 *                visible whether or not the executable is swapped out.
 *
 *    state %c    (3) One character from the string "RSDZTW" where R is
 *                running, S is sleeping in an interruptible wait, D is
 *                waiting in uninterruptible disk sleep, Z is zombie, T is
 *                traced or stopped (on a signal), and W is paging.
 *
 *    ppid %d     (4) The PID of the parent.
 *
 *    pgrp %d     (5) The process group ID of the process.
 *
 *    session %d  (6) The session ID of the process.
 *
 *    tty_nr %d   (7) The controlling terminal of the process.  (The minor
 *                device number is contained in the combination of bits 31 to
 *                20 and 7 to 0; the major device number is in bits 15 to 8.)
 *
 *    tpgid %d    (8) The ID of the foreground process group of the
 *                controlling terminal of the process.
 *
 *    flags %u    (9) The kernel flags word of the process.  For bit meanings,
 *                see the PF_* defines in the Linux kernel source file
 *                include/linux/sched.h.  Details depend on the kernel version.
 *
 *    minflt %lu  (10) The number of minor faults the process has made which
 *                have not required loading a memory page from disk.
 *
 *    cminflt %lu (11) The number of minor faults that the process's
 *                waited-for children have made.
 *
 *    majflt %lu  (12) The number of major faults the process has made which
 *                have required loading a memory page from disk.
 *
 *    cmajflt %lu (13) The number of major faults that the process's
 *                waited-for children have made.
 *
 *    utime %lu   (14) Amount of time that this process has been scheduled in
 *                user mode, measured in clock ticks (divide by
 *                sysconf(_SC_CLK_TCK)).  This includes guest time, guest_time
 *                (time spent running a virtual CPU, see below), so that
 *                applications that are not aware of the guest time field do
 *                not lose that time from their calculations.
 *
 *    stime %lu   (15) Amount of time that this process has been scheduled in
 *                kernel mode, measured in clock ticks (divide by
 *                sysconf(_SC_CLK_TCK)).
 */

#if HAVE_LINUX_PROCFS
    static char *loadfile = NULL;
    static time_t last_call = 0;
    static long ticks_per_s = 0;
    static unsigned long last_utime, last_stime;

    char buffer[64*1024];
    FILE *stream = NULL;
    time_t now = time(NULL);

    if (load == NULL) {
        return false;
    } else {
        *load = 0.0;
    }

    if (loadfile == NULL) {
        // No remembered file name: discard any stale baseline and look it up
        last_call = 0;
        last_utime = 0;
        last_stime = 0;

        loadfile = find_cib_loadfile(server);
        if (loadfile == NULL) {
            crm_warn("Couldn't find CIB load file");
            return false;
        }

        ticks_per_s = sysconf(_SC_CLK_TCK);
        crm_trace("Found %s", loadfile);
    }

    stream = fopen(loadfile, "r");
    if (stream == NULL) {
        int rc = errno;

        crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc);

        // Forget the file name so that the next call looks it up again
        free(loadfile);
        loadfile = NULL;
        return false;
    }

    if (fgets(buffer, sizeof(buffer), stream) != NULL) {
        char comm[256] = { 0 };
        char state = 0;
        int rc = 0, pid = 0, ppid = 0, pgrp = 0, session = 0, tty_nr = 0, tpgid = 0;
        unsigned long flags = 0, minflt = 0, cminflt = 0, majflt = 0, cmajflt = 0, utime = 0, stime = 0;

        /* The width on the comm conversion keeps sscanf() from writing past
         * the end of comm[] no matter what the stat line contains.
         */
        rc = sscanf(buffer,
                    "%d %255[^ ] %c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu",
                    &pid, comm, &state, &ppid, &pgrp, &session, &tty_nr, &tpgid,
                    &flags, &minflt, &cminflt, &majflt, &cmajflt, &utime, &stime);

        if (rc != 15) {
            crm_err("Only %d of 15 fields found in %s", rc, loadfile);
            fclose(stream);
            return false;

        } else if ((last_call > 0) && (last_call < now) && (last_utime <= utime) &&
                   (last_stime <= stime)) {
            time_t elapsed = now - last_call;
            unsigned long delta_utime = utime - last_utime;
            unsigned long delta_stime = stime - last_stime;

            *load = delta_utime + delta_stime; /* Cast to a float before division */
            *load /= ticks_per_s;
            *load /= elapsed;
            crm_debug("cib load: %f (%lu ticks in %lds)", *load,
                      delta_utime + delta_stime, (long) elapsed);

        } else {
            // No usable previous sample: just report the new baseline
            crm_debug("Init %lu + %lu ticks at %ld (%lu tps)", utime, stime,
                      (long) now, (unsigned long) ticks_per_s);
        }

        last_call = now;
        last_utime = utime;
        last_stime = stime;

        fclose(stream);
        return true;
    }

    fclose(stream);
#endif // HAVE_LINUX_PROCFS
    return false;
}

/*!
 * \internal
 * \brief Read the leading value of /proc/loadavg
 *
 * \param[out] load  Where to store the value parsed from the start of the
 *                   first line of /proc/loadavg (left untouched on failure)
 *
 * \return true if a line was read from /proc/loadavg, otherwise false
 *         (always false when built without Linux procfs support)
 */
bool
pcmk__throttle_load_avg(float *load)
{
#if HAVE_LINUX_PROCFS
    char buffer[256];
    FILE *stream = NULL;
    const char *loadfile = "/proc/loadavg";

    if (load == NULL) {
        return false;
    }

    stream = fopen(loadfile, "r");
    if (stream == NULL) {
        int rc = errno;
        crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc);
        return false;
    }

    if (fgets(buffer, sizeof(buffer), stream) != NULL) {
        /* Grab the 1-minute average, ignore the rest */
        *load = strtof(buffer, NULL);

        fclose(stream);
        return true;
    }

    fclose(stream);
#endif // HAVE_LINUX_PROCFS
    return false;
}