diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h index c3113e49c3..26a0d124ab 100644 --- a/daemons/controld/controld_lrm.h +++ b/daemons/controld/controld_lrm.h @@ -1,183 +1,183 @@ /* * Copyright 2004-2023 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef CONTROLD_LRM__H # define CONTROLD_LRM__H #include extern gboolean verify_stopped(enum crmd_fsa_state cur_state, int log_level); void lrm_clear_last_failure(const char *rsc_id, const char *node_name, const char *operation, guint interval_ms); void lrm_op_callback(lrmd_event_data_t * op); lrmd_t *crmd_local_lrmd_conn(void); typedef struct resource_history_s { char *id; uint32_t last_callid; lrmd_rsc_info_t rsc; lrmd_event_data_t *last; lrmd_event_data_t *failed; GList *recurring_op_list; /* Resources must be stopped using the same * parameters they were started with. This hashtable * holds the parameters that should be used for the next stop * cmd on this resource. */ GHashTable *stop_params; } rsc_history_t; void history_free(gpointer data); enum active_op_e { active_op_remove = (1 << 0), active_op_cancelled = (1 << 1), }; // In-flight action (recurring or pending) typedef struct active_op_s { guint interval_ms; int call_id; uint32_t flags; // bitmask of active_op_e time_t start_time; time_t lock_time; char *rsc_id; char *op_type; char *op_key; char *user_data; GHashTable *params; } active_op_t; #define controld_set_active_op_flags(active_op, flags_to_set) do { \ (active_op)->flags = pcmk__set_flags_as(__func__, __LINE__, \ LOG_TRACE, "Active operation", (active_op)->op_key, \ (active_op)->flags, (flags_to_set), #flags_to_set); \ } while (0) #define controld_clear_active_op_flags(active_op, flags_to_clear) do { \ (active_op)->flags = pcmk__clear_flags_as(__func__, __LINE__, \ LOG_TRACE, "Active operation", (active_op)->op_key, \ (active_op)->flags, (flags_to_clear), #flags_to_clear); \ } while (0) typedef struct lrm_state_s { const char *node_name; void *conn; // Reserved for controld_execd_state.c usage void *remote_ra_data; // Reserved for controld_remote_ra.c usage GHashTable *resource_history; GHashTable *active_ops; // Pending and recurring actions GHashTable *deletion_ops; GHashTable *rsc_info_cache; GHashTable *metadata_cache; // key = class[:provider]:agent, value = ra_metadata_s int num_lrm_register_fails; } lrm_state_t; struct pending_deletion_op_s { char *rsc; ha_msg_input_t *input; }; /*! * \brief Check whether this the local IPC connection to the executor */ gboolean lrm_state_is_local(lrm_state_t *lrm_state); /*! * \brief Clear all state information from a single state entry. * \note It sometimes useful to save metadata cache when it won't go stale. * \note This does not close the executor connection */ void lrm_state_reset_tables(lrm_state_t * lrm_state, gboolean reset_metadata); GList *lrm_state_get_list(void); /*! * \brief Initiate internal state tables */ gboolean lrm_state_init_local(void); /*! * \brief Destroy all state entries and internal state tables */ void lrm_state_destroy_all(void); /*! * \brief Find lrm_state data by node name */ lrm_state_t *lrm_state_find(const char *node_name); /*! * \brief Either find or create a new entry */ lrm_state_t *lrm_state_find_or_create(const char *node_name); /*! - * The functions below are wrappers for the executor API the the controller - * uses. These wrapper functions allow us to treat the controller's remote - * executor connection resources the same as regular resources. Internally, - * regular resources go to the executor, and remote connection resources are - * handled locally in the controller. + * The functions below are wrappers for the executor API the controller uses. + * These wrapper functions allow us to treat the controller's remote executor + * connection resources the same as regular resources. Internally, regular + * resources go to the executor, and remote connection resources are handled + * locally in the controller. */ void lrm_state_disconnect_only(lrm_state_t * lrm_state); void lrm_state_disconnect(lrm_state_t * lrm_state); int controld_connect_local_executor(lrm_state_t *lrm_state); int controld_connect_remote_executor(lrm_state_t *lrm_state, const char *server, int port, int timeout); int lrm_state_is_connected(lrm_state_t * lrm_state); int lrm_state_poke_connection(lrm_state_t * lrm_state); int lrm_state_get_metadata(lrm_state_t * lrm_state, const char *class, const char *provider, const char *agent, char **output, enum lrmd_call_options options); int lrm_state_cancel(lrm_state_t *lrm_state, const char *rsc_id, const char *action, guint interval_ms); int controld_execute_resource_agent(lrm_state_t *lrm_state, const char *rsc_id, const char *action, const char *userdata, guint interval_ms, int timeout_ms, int start_delay_ms, GHashTable *parameters, int *call_id); lrmd_rsc_info_t *lrm_state_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id, enum lrmd_call_options options); int lrm_state_register_rsc(lrm_state_t * lrm_state, const char *rsc_id, const char *class, const char *provider, const char *agent, enum lrmd_call_options options); int lrm_state_unregister_rsc(lrm_state_t * lrm_state, const char *rsc_id, enum lrmd_call_options options); // Functions used to manage remote executor connection resources void remote_lrm_op_callback(lrmd_event_data_t * op); gboolean is_remote_lrmd_ra(const char *agent, const char *provider, const char *id); lrmd_rsc_info_t *remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id); int remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id, const char *action, guint interval_ms); int controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id, const char *action, const char *userdata, guint interval_ms, int timeout_ms, int start_delay_ms, lrmd_key_value_t *params, int *call_id); void remote_ra_cleanup(lrm_state_t * lrm_state); void remote_ra_fail(const char *node_name); void remote_ra_process_pseudo(xmlNode *xml); gboolean remote_ra_is_in_maintenance(lrm_state_t * lrm_state); void remote_ra_process_maintenance_nodes(xmlNode *xml); gboolean remote_ra_controlling_guest(lrm_state_t * lrm_state); void process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, active_op_t *pending, const xmlNode *action_xml); void controld_ack_event_directly(const char *to_host, const char *to_sys, const lrmd_rsc_info_t *rsc, lrmd_event_data_t *op, const char *rsc_id); void controld_rc2event(lrmd_event_data_t *event, int rc); void controld_trigger_delete_refresh(const char *from_sys, const char *rsc_id); #endif diff --git a/doc/sphinx/Pacemaker_Administration/upgrading.rst b/doc/sphinx/Pacemaker_Administration/upgrading.rst index bccfc22842..121e26ee41 100644 --- a/doc/sphinx/Pacemaker_Administration/upgrading.rst +++ b/doc/sphinx/Pacemaker_Administration/upgrading.rst @@ -1,536 +1,536 @@ .. index:: upgrade Upgrading a Pacemaker Cluster ----------------------------- .. index:: version Pacemaker Versioning #################### Pacemaker has an overall release version, plus separate version numbers for certain internal components. .. index:: single: version; release * **Pacemaker release version:** This version consists of three numbers (*x.y.z*). The major version number (the *x* in *x.y.z*) increases when at least some rolling upgrades are not possible from the previous major version. For example, a rolling upgrade from 1.0.8 to 1.1.15 should always be supported, but a rolling upgrade from 1.0.8 to 2.0.0 may not be possible. The minor version (the *y* in *x.y.z*) increases when there are significant changes in cluster default behavior, tool behavior, and/or the API interface (for software that utilizes Pacemaker libraries). The main benefit is to alert you to pay closer attention to the release notes, to see if you might be affected. The release counter (the *z* in *x.y.z*) is increased with all public releases of Pacemaker, which typically include both bug fixes and new features. .. index:: single: feature set single: version; feature set * **CRM feature set:** This version number applies to the communication between full cluster nodes, and is used to avoid problems in mixed-version clusters. The major version number increases when nodes with different versions would not work (rolling upgrades are not allowed). The minor version number increases when mixed-version clusters are allowed only during rolling upgrades. The minor-minor version number is ignored, but allows resource agents to detect cluster support for various features. [#]_ Pacemaker ensures that the longest-running node is the cluster's DC. This ensures new features are not enabled until all nodes are upgraded to support them. .. index:: single: version; Pacemaker Remote protocol * **Pacemaker Remote protocol version:** This version applies to communication between a Pacemaker Remote node and the cluster. It increases when an older cluster node would have problems hosting the connection to a newer Pacemaker Remote node. To avoid these problems, Pacemaker Remote nodes will accept connections only from cluster nodes with the same or newer Pacemaker Remote protocol version. Unlike with CRM feature set differences between full cluster nodes, mixed Pacemaker Remote protocol versions between Pacemaker Remote nodes and full cluster nodes are fine, as long as the Pacemaker Remote nodes have the older version. This can be useful, for example, to host a legacy application in an older operating system version used as a Pacemaker Remote node. .. index:: single: version; XML schema * **XML schema version:** Pacemaker’s configuration syntax — what's allowed in the Configuration Information Base (CIB) — has its own version. This allows the configuration syntax to evolve over time while still allowing clusters with older configurations to work without change. .. index:: single: upgrade; methods Upgrading Cluster Software ########################## There are three approaches to upgrading a cluster, each with advantages and disadvantages. .. table:: **Upgrade Methods** +---------------------------------------------------+----------+----------+--------+---------+----------+----------+ | Method | Available| Can be | Service| Service | Exercises| Allows | | | between | used with| outage | recovery| failover | change of| | | all | Pacemaker| during | during | logic | messaging| | | versions | Remote | upgrade| upgrade | | layer | | | | nodes | | | | [#]_ | +===================================================+==========+==========+========+=========+==========+==========+ | Complete cluster shutdown | yes | yes | always | N/A | no | yes | +---------------------------------------------------+----------+----------+--------+---------+----------+----------+ | Rolling (node by node) | no | yes | always | yes | yes | no | | | | | [#]_ | | | | +---------------------------------------------------+----------+----------+--------+---------+----------+----------+ | Detach and reattach | yes | no | only | no | no | yes | | | | | due to | | | | | | | | failure| | | | +---------------------------------------------------+----------+----------+--------+---------+----------+----------+ .. index:: single: upgrade; shutdown Complete Cluster Shutdown _________________________ In this scenario, one shuts down all cluster nodes and resources, then upgrades all the nodes before restarting the cluster. #. On each node: a. Shutdown the cluster software (pacemaker and the messaging layer). #. Upgrade the Pacemaker software. This may also include upgrading the messaging layer and/or the underlying operating system. #. Check the configuration with the ``crm_verify`` tool. #. On each node: a. Start the cluster software. Currently, only Corosync version 2 and greater is supported as the cluster layer, but if another stack is supported in the future, the stack does not need to be the same one before the upgrade. One variation of this approach is to build a new cluster on new hosts. This allows the new version to be tested beforehand, and minimizes downtime by having the new nodes ready to be placed in production as soon as the old nodes are shut down. .. index:: single: upgrade; rolling upgrade Rolling (node by node) ______________________ In this scenario, each node is removed from the cluster, upgraded, and then brought back online, until all nodes are running the newest version. Special considerations when planning a rolling upgrade: * If you plan to upgrade other cluster software -- such as the messaging layer -- at the same time, consult that software's documentation for its compatibility with a rolling upgrade. * If the major version number is changing in the Pacemaker version you are upgrading to, a rolling upgrade may not be possible. Read the new version's release notes (as well the information here) for what limitations may exist. * If the CRM feature set is changing in the Pacemaker version you are upgrading to, you should run a mixed-version cluster only during a small rolling upgrade window. If one of the older nodes drops out of the cluster for any reason, it will not be able to rejoin until it is upgraded. * If the Pacemaker Remote protocol version is changing, all cluster nodes should be upgraded before upgrading any Pacemaker Remote nodes. See the `Pacemaker release calendar `_ on the ClusterLabs wiki to figure out whether the CRM feature set and/or -Pacemaker Remote protocol version changed between the the Pacemaker release -versions in your rolling upgrade. +Pacemaker Remote protocol version changed between the Pacemaker release versions +in your rolling upgrade. To perform a rolling upgrade, on each node in turn: #. Put the node into standby mode, and wait for any active resources to be moved cleanly to another node. (This step is optional, but allows you to deal with any resource issues before the upgrade.) #. Shutdown the cluster software (pacemaker and the messaging layer) on the node. #. Upgrade the Pacemaker software. This may also include upgrading the messaging layer and/or the underlying operating system. #. If this is the first node to be upgraded, check the configuration with the ``crm_verify`` tool. #. Start the messaging layer. This must be the same messaging layer (currently only Corosync version 2 and greater is supported) that the rest of the cluster is using. .. note:: Even if a rolling upgrade from the current version of the cluster to the newest version is not directly possible, it may be possible to perform a rolling upgrade in multiple steps, by upgrading to an intermediate version first. .. table:: **Version Compatibility Table** +-------------------------+---------------------------+ | Version being Installed | Oldest Compatible Version | +=========================+===========================+ | Pacemaker 2.y.z | Pacemaker 1.1.11 [#]_ | +-------------------------+---------------------------+ | Pacemaker 1.y.z | Pacemaker 1.0.0 | +-------------------------+---------------------------+ | Pacemaker 0.7.z | Pacemaker 0.6.z | +-------------------------+---------------------------+ .. index:: single: upgrade; detach and reattach Detach and Reattach ___________________ The reattach method is a variant of a complete cluster shutdown, where the resources are left active and get re-detected when the cluster is restarted. This method may not be used if the cluster contains any Pacemaker Remote nodes. #. Tell the cluster to stop managing services. This is required to allow the services to remain active after the cluster shuts down. .. code-block:: none # crm_attribute --name maintenance-mode --update true #. On each node, shutdown the cluster software (pacemaker and the messaging layer), and upgrade the Pacemaker software. This may also include upgrading the messaging layer. While the underlying operating system may be upgraded at the same time, that will be more likely to cause outages in the detached services (certainly, if a reboot is required). #. Check the configuration with the ``crm_verify`` tool. #. On each node, start the cluster software. Currently, only Corosync version 2 and greater is supported as the cluster layer, but if another stack is supported in the future, the stack does not need to be the same one before the upgrade. #. Verify that the cluster re-detected all resources correctly. #. Allow the cluster to resume managing resources again: .. code-block:: none # crm_attribute --name maintenance-mode --delete .. note:: While the goal of the detach-and-reattach method is to avoid disturbing running services, resources may still move after the upgrade if any resource's location is governed by a rule based on transient node attributes. Transient node attributes are erased when the node leaves the cluster. A common example is using the ``ocf:pacemaker:ping`` resource to set a node attribute used to locate other resources. .. index:: pair: upgrade; CIB Upgrading the Configuration ########################### The CIB schema version can change from one Pacemaker version to another. After cluster software is upgraded, the cluster will continue to use the older schema version that it was previously using. This can be useful, for example, when administrators have written tools that modify the configuration, and are based on the older syntax. [#]_ However, when using an older syntax, new features may be unavailable, and there is a performance impact, since the cluster must do a non-persistent configuration upgrade before each transition. So while using the old syntax is possible, it is not advisable to continue using it indefinitely. Even if you wish to continue using the old syntax, it is a good idea to follow the upgrade procedure outlined below, except for the last step, to ensure that the new software has no problems with your existing configuration (since it will perform much the same task internally). If you are brave, it is sufficient simply to run ``cibadmin --upgrade``. A more cautious approach would proceed like this: #. Create a shadow copy of the configuration. The later commands will automatically operate on this copy, rather than the live configuration. .. code-block:: none # crm_shadow --create shadow .. index:: single: configuration; verify #. Verify the configuration is valid with the new software (which may be stricter about syntax mistakes, or may have dropped support for deprecated features): .. code-block:: none # crm_verify --live-check #. Fix any errors or warnings. #. Perform the upgrade: .. code-block:: none # cibadmin --upgrade #. If this step fails, there are three main possibilities: a. The configuration was not valid to start with (did you do steps 2 and 3?). #. The transformation failed; `report a bug `_. #. The transformation was successful but produced an invalid result. If the result of the transformation is invalid, you may see a number of errors from the validation library. If these are not helpful, try the manual upgrade procedure described below. #. Check the changes: .. code-block:: none # crm_shadow --diff If at this point there is anything about the upgrade that you wish to fine-tune (for example, to change some of the automatic IDs), now is the time to do so: .. code-block:: none # crm_shadow --edit This will open the configuration in your favorite editor (whichever is specified by the standard ``$EDITOR`` environment variable). #. Preview how the cluster will react: .. code-block:: none # crm_simulate --live-check --save-dotfile shadow.dot -S # dot -Tsvg shadow.dot -o shadow.svg You can then view shadow.svg with any compatible image viewer or web browser. Verify that either no resource actions will occur or that you are happy with any that are scheduled. If the output contains actions you do not expect (possibly due to changes to the score calculations), you may need to make further manual changes. See :ref:`crm_simulate` for further details on how to interpret the output of ``crm_simulate`` and ``dot``. #. Upload the changes: .. code-block:: none # crm_shadow --commit shadow --force In the unlikely event this step fails, please report a bug. .. note:: It is also possible to perform the configuration upgrade steps manually: #. Locate the ``upgrade*.xsl`` conversion scripts provided with the source code. These will often be installed in a location such as ``/usr/share/pacemaker``, or may be obtained from the `source repository `_. #. Run the conversion scripts that apply to your older version, for example: .. code-block:: none # xsltproc /path/to/upgrade06.xsl config06.xml > config10.xml #. Locate the ``pacemaker.rng`` script (from the same location as the xsl files). #. Check the XML validity: .. code-block:: none # xmllint --relaxng /path/to/pacemaker.rng config10.xml The advantage of this method is that it can be performed without the cluster running, and any validation errors are often more informative. What Changed in 2.1 ################### The Pacemaker 2.1 release is fully backward-compatible in both the CIB XML and the C API. Highlights: * Pacemaker now supports the **OCF Resource Agent API version 1.1**. Most notably, the ``Master`` and ``Slave`` role names have been renamed to ``Promoted`` and ``Unpromoted``. * Pacemaker now supports colocations where the dependent resource does not affect the primary resource's placement (via a new ``influence`` colocation constraint option and ``critical`` resource meta-attribute). This is intended for cases where a less-important resource must be colocated with an essential resource, but it is preferred to leave the less-important resource stopped if it fails, rather than move both resources. * If Pacemaker is built with libqb 2.0 or later, the detail log will use **millisecond-resolution timestamps**. * In addition to crm_mon and stonith_admin, the crmadmin, crm_resource, crm_simulate, and crm_verify commands now support the ``--output-as`` and ``--output-to`` options, including **XML output** (which scripts and higher-level tools are strongly recommended to use instead of trying to parse the text output, which may change from release to release). For a detailed list of changes, see the release notes and `Pacemaker 2.1 Changes `_ on the ClusterLabs wiki. What Changed in 2.0 ################### The main goal of the 2.0 release was to remove support for deprecated syntax, along with some small changes in default configuration behavior and tool behavior. Highlights: * Only Corosync version 2 and greater is now supported as the underlying cluster layer. Support for Heartbeat and Corosync 1 (including CMAN) is removed. * The Pacemaker detail log file is now stored in ``/var/log/pacemaker/pacemaker.log`` by default. * The record-pending cluster property now defaults to true, which allows status tools such as crm_mon to show operations that are in progress. * Support for a number of deprecated build options, environment variables, and configuration settings has been removed. * The ``master`` tag has been deprecated in favor of using the ``clone`` tag with the new ``promotable`` meta-attribute set to ``true``. "Master/slave" clone resources are now referred to as "promotable" clone resources. * The public API for Pacemaker libraries that software applications can use has changed significantly. For a detailed list of changes, see the release notes and `Pacemaker 2.0 Changes `_ on the ClusterLabs wiki. What Changed in 1.0 ################### New ___ * Failure timeouts. * New section for resource and operation defaults. * Tool for making offline configuration changes. * ``Rules``, ``instance_attributes``, ``meta_attributes`` and sets of operations can be defined once and referenced in multiple places. * The CIB now accepts XPath-based create/modify/delete operations. See ``cibadmin --help``. * Multi-dimensional colocation and ordering constraints. * The ability to connect to the CIB from non-cluster machines. * Allow recurring actions to be triggered at known times. Changed _______ * Syntax * All resource and cluster options now use dashes (-) instead of underscores (_) * ``master_slave`` was renamed to ``master`` * The ``attributes`` container tag was removed * The operation field ``pre-req`` has been renamed ``requires`` * All operations must have an ``interval``, ``start``/``stop`` must have it set to zero * The ``stonith-enabled`` option now defaults to true. * The cluster will refuse to start resources if ``stonith-enabled`` is true (or unset) and no STONITH resources have been defined * The attributes of colocation and ordering constraints were renamed for clarity. * ``resource-failure-stickiness`` has been replaced by ``migration-threshold``. * The parameters for command-line tools have been made consistent * Switched to 'RelaxNG' schema validation and 'libxml2' parser * id fields are now XML IDs which have the following limitations: * id's cannot contain colons (:) * id's cannot begin with a number * id's must be globally unique (not just unique for that tag) * Some fields (such as those in constraints that refer to resources) are IDREFs. This means that they must reference existing resources or objects in order for the configuration to be valid. Removing an object which is referenced elsewhere will therefore fail. * The CIB representation, from which a MD5 digest is calculated to verify CIBs on the nodes, has changed. This means that every CIB update will require a full refresh on any upgraded nodes until the cluster is fully upgraded to 1.0. This will result in significant performance degradation and it is therefore highly inadvisable to run a mixed 1.0/0.6 cluster for any longer than absolutely necessary. * Ping node information no longer needs to be added to ``ha.cf``. Simply include the lists of hosts in your ping resource(s). Removed _______ * Syntax * It is no longer possible to set resource meta options as top-level attributes. Use meta-attributes instead. * Resource and operation defaults are no longer read from ``crm_config``. .. rubric:: Footnotes .. [#] Before CRM feature set 3.1.0 (Pacemaker 2.0.0), the minor-minor version number was treated the same as the minor version. .. [#] Currently, Corosync version 2 and greater is the only supported cluster stack, but other stacks have been supported by past versions, and may be supported by future versions. .. [#] Any active resources will be moved off the node being upgraded, so there will be at least a brief outage unless all resources can be migrated "live". .. [#] Rolling upgrades from Pacemaker 1.1.z to 2.y.z are possible only if the cluster uses corosync version 2 or greater as its messaging layer, and the Cluster Information Base (CIB) uses schema 1.0 or higher in its ``validate-with`` property. .. [#] As of Pacemaker 2.0.0, only schema versions pacemaker-1.0 and higher are supported (excluding pacemaker-1.1, which was a special case). diff --git a/doc/sphinx/Pacemaker_Development/helpers.rst b/doc/sphinx/Pacemaker_Development/helpers.rst index 6bd19266b4..dd244a4e48 100644 --- a/doc/sphinx/Pacemaker_Development/helpers.rst +++ b/doc/sphinx/Pacemaker_Development/helpers.rst @@ -1,520 +1,520 @@ C Development Helpers --------------------- .. index:: single: unit testing Refactoring ########### Pacemaker uses an optional tool called `coccinelle `_ to do automatic refactoring. coccinelle is a very complicated tool that can be difficult to understand, and the existing documentation makes it pretty tough to get started. Much of the documentation is either aimed at kernel developers or takes the form of grammars. However, it can apply very complex transformations across an entire source tree. This is useful for tasks like code refactoring, changing APIs (number or type of arguments, etc.), catching functions that should not be called, and changing existing patterns. coccinelle is driven by input scripts called `semantic patches `_ written in its own language. These scripts bear a passing resemblance to source code patches and tell coccinelle how to match and modify a piece of source code. They are stored in ``devel/coccinelle`` and each script either contains a single source transformation or several related transformations. In general, we try to keep these as simple as possible. In Pacemaker development, we use a couple targets in ``devel/Makefile.am`` to control coccinelle. The ``cocci`` target tries to apply each script to every Pacemaker source file, printing out any changes it would make to the console. The ``cocci-inplace`` target does the same but also makes those changes to the source files. A variety of warnings might also be printed. If you aren't working on a new script, these can usually be ignored. If you are working on a new coccinelle script, it can be useful (and faster) to skip everything else and only run the new script. The ``COCCI_FILES`` variable can be used for this: .. code-block:: none $ make -C devel COCCI_FILES=coccinelle/new-file.cocci cocci This variable is also used for preventing some coccinelle scripts in the Pacemaker source tree from running. Some scripts are disabled because they are not currently fully working or because they are there as templates. When adding a new script, remember to add it to this variable if it should always be run. One complication when writing coccinelle scripts is that certain Pacemaker source files may not use private functions (those whose name starts with ``pcmk__``). Handling this requires work in both the Makefile and in the coccinelle scripts. The Makefile deals with this by maintaining two lists of source files: those that may use private functions and those that may not. For those that may, a special argument (``-D internal``) is added to the coccinelle command line. This creates a virtual dependency named ``internal``. In the coccinelle scripts, those transformations that modify source code to use a private function also have a dependency on ``internal``. If that dependency was given on the command line, the transformation will be run. Otherwise, it will be skipped. This means that not all instances of an older style of code will be changed after running a given transformation. Some developer intervention is still necessary to know whether a source code block should have been changed or not. Probably the easiest way to learn how to use coccinelle is by following other people's scripts. In addition to the ones in the Pacemaker source directory, there's several others on the `coccinelle website `_. Sanitizers ########## gcc supports a variety of run-time checks called sanitizers. These can be used to catch programming errors with memory, race conditions, various undefined behavior conditions, and more. Because these are run-time checks, they should only be used during development and not in compiled packages or production code. Certain sanitizers cannot be combined with others because their run-time checks cause interfere. Instead of trying to figure out which combinations work, it is simplest to just enable one at a time. Each supported sanitizer requires an installed libray. In addition to just enabling the sanitizer, their use can be configured with environment variables. For example: .. code-block:: none $ ASAN_OPTIONS=verbosity=1:replace_str=true crm_mon -1R Pacemaker supports the following subset of gcc's sanitizers: +--------------------+-------------------------+----------+----------------------+ | Sanitizer | Configure Option | Library | Environment Variable | +====================+=========================+==========+======================+ | Address | --with-sanitizers=asan | libasan | ASAN_OPTIONS | +--------------------+-------------------------+----------+----------------------+ | Threads | --with-sanitizers=tsan | libtsan | TSAN_OPTIONS | +--------------------+-------------------------+----------+----------------------+ | Undefined behavior | --with-sanitizers=ubsan | libubsan | UBSAN_OPTIONS | +--------------------+-------------------------+----------+----------------------+ The undefined behavior sanitizer further supports suboptions that need to be given as CFLAGS when configuring pacemaker: .. code-block:: none $ CFLAGS=-fsanitize=integer-divide-by-zero ./configure --with-sanitizers=ubsan For more information, see the `gcc documentation `_ which also provides links to more information on each sanitizer. Unit Testing ############ Where possible, changes to the C side of Pacemaker should be accompanied by unit tests. Much of Pacemaker cannot effectively be unit tested (and there are other testing systems used for those parts), but the ``lib`` subdirectory is pretty easy to write tests for. Pacemaker uses the `cmocka unit testing framework `_ which looks a lot like other unit testing frameworks for C and should be fairly familiar. In addition to regular unit tests, cmocka also gives us the ability to use `mock functions `_ for unit testing functions that would otherwise be difficult to test. Organization ____________ Pay close attention to the organization and naming of test cases to ensure the unit tests continue to work as they should. Tests are spread throughout the source tree, alongside the source code they test. For instance, all the tests for the source code in ``lib/common/`` are in the ``lib/common/tests`` directory. If there is no ``tests`` subdirectory, there are no tests for that library yet. Under that directory, there is a ``Makefile.am`` and additional subdirectories. Each subdirectory contains the tests for a single library source file. For instance, all the tests for ``lib/common/strings.c`` are in the ``lib/common/tests/strings`` directory. Note that the test subdirectory does not have a ``.c`` suffix. If there is no test subdirectory, there are no tests for that file yet. Finally, under that directory, there is a ``Makefile.am`` and then various source files. Each of these source files tests the single function that it is named after. For instance, ``lib/common/tests/strings/pcmk__btoa_test.c`` tests the ``pcmk__btoa()`` function in ``lib/common/strings.c``. If there is no test source file, there are no tests for that function yet. The ``_test`` suffix on the test source file is important. All tests have this suffix, which means all the compiled test cases will also end with this suffix. That lets us ignore all the compiled tests with a single line in ``.gitignore``: .. code-block:: none /lib/*/tests/*/*_test Adding a test _____________ Testing a new function in an already testable source file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Follow these steps if you want to test a function in a source file where there are already other tested functions. For the purposes of this example, we will add a test for the ``pcmk__scan_port()`` function in ``lib/common/strings.c``. As you can see, there are already tests for other functions in this same file in the ``lib/common/tests/strings`` directory. * cd into ``lib/common/tests/strings`` -* Add the new file to the the ``check_PROGRAMS`` variable in ``Makefile.am``, - making it something like this: +* Add the new file to the ``check_PROGRAMS`` variable in ``Makefile.am``, making + it something like this: .. code-block:: none check_PROGRAMS = \ pcmk__add_word_test \ pcmk__btoa_test \ pcmk__scan_port_test * Create a new ``pcmk__scan_port_test.c`` file, copying the copyright and include boilerplate from another file in the same directory. * Continue with the steps in `Writing the test`_. Testing a function in a source file without tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Follow these steps if you want to test a function in a source file where there are not already other tested functions, but there are tests for other files in the same library. For the purposes of this example, we will add a test for the ``pcmk_acl_required()`` function in ``lib/common/acls.c``. At the time of this documentation being written, no tests existed for that source file, so there is no ``lib/common/tests/acls`` directory. * Add to ``AC_CONFIG_FILES`` in the top-level ``configure.ac`` file so the build process knows to use directory we're about to create. That variable would now look something like: .. code-block:: none dnl Other files we output AC_CONFIG_FILES(Makefile \ ... lib/common/tests/Makefile \ lib/common/tests/acls/Makefile \ lib/common/tests/agents/Makefile \ ... ) * cd into ``lib/common/tests`` * Add to the ``SUBDIRS`` variable in ``Makefile.am``, making it something like: .. code-block:: none SUBDIRS = agents acls cmdline flags operations strings utils xpath results * Create a new ``acls`` directory, copying the ``Makefile.am`` from some other directory. At this time, each ``Makefile.am`` is largely boilerplate with very little that needs to change from directory to directory. * cd into ``acls`` * Get rid of any existing values for ``check_PROGRAMS`` and set it to ``pcmk_acl_required_test`` like so: .. code-block:: none check_PROGRAMS = pcmk_acl_required_test * Double check that ``$(top_srcdir)/mk/tap.mk`` and ``$(top_srcdir)/mk/unittest.mk`` are included in the ``Makefile.am``. These files contain all the flags necessary for most unit tests. If necessary, individual settings can be overridden like so: .. code-block:: none AM_CPPFLAGS += -I$(top_srcdir) LDADD += $(top_builddir)/lib/pengine/libpe_status_test.la * Follow the steps in `Testing a new function in an already testable source file`_ to create the new ``pcmk_acl_required_test.c`` file. Testing a function in a library without tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Adding a test case for a function in a library that doesn't have any test cases to begin with is only slightly more complicated. In general, the steps are the same as for the previous section, except with an additional layer of directory creation. For the purposes of this example, we will add a test case for the ``lrmd_send_resource_alert()`` function in ``lib/lrmd/lrmd_alerts.c``. Note that this may not be a very good function or even library to write actual unit tests for. * Add to ``AC_CONFIG_FILES`` in the top-level ``configure.ac`` file so the build process knows to use directory we're about to create. That variable would now look something like: .. code-block:: none dnl Other files we output AC_CONFIG_FILES(Makefile \ ... lib/lrmd/Makefile \ lib/lrmd/tests/Makefile \ lib/services/Makefile \ ... ) * cd into ``lib/lrmd`` * Create a ``SUBDIRS`` variable in ``Makefile.am`` if it doesn't already exist. Most libraries should not have this variable already. .. code-block:: none SUBDIRS = tests * Create a new ``tests`` directory and add a ``Makefile.am`` with the following contents: .. code-block:: none SUBDIRS = lrmd_alerts * Follow the steps in `Testing a function in a source file without tests`_ to create the rest of the new directory structure. * Follow the steps in `Testing a new function in an already testable source file`_ to create the new ``lrmd_send_resource_alert_test.c`` file. Adding to an existing test case ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If all you need to do is add additional test cases to an existing file, none of the above work is necessary. All you need to do is find the test source file with the name matching your function and add to it and then follow the instructions in `Writing the test`_. Writing the test ________________ A test case file contains a fair amount of boilerplate. For this reason, it's usually easiest to just copy an existing file and adapt it to your needs. However, here's the basic structure: .. code-block:: c /* * Copyright 2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include /* Put your test-specific includes here */ /* Put your test functions here */ PCMK__UNIT_TEST(NULL, NULL, /* Register your test functions here */) Each test-specific function should test one aspect of the library function, though it can include many assertions if there are many ways of testing that one aspect. For instance, there might be multiple ways of testing regular expression matching: .. code-block:: c static void regex(void **state) { const char *s1 = "abcd"; const char *s2 = "ABCD"; assert_true(pcmk__strcmp(NULL, "a..d", pcmk__str_regex) < 0); assert_true(pcmk__strcmp(s1, NULL, pcmk__str_regex) > 0); assert_int_equal(pcmk__strcmp(s1, "a..d", pcmk__str_regex), 0); } Each test-specific function must also be registered or it will not be called. This is done with ``cmocka_unit_test()`` in the ``PCMK__UNIT_TEST`` macro: .. code-block:: c PCMK__UNIT_TEST(NULL, NULL, cmocka_unit_test(regex)) Most unit tests do not require a setup and teardown function to be executed around the entire group of tests. On occassion, this may be necessary. Simply pass those functions in as the first two parameters to ``PCMK__UNIT_TEST`` instead of using NULL. Assertions __________ In addition to the `assertions provided by `_, ``unittest_internal.h`` also provides ``pcmk__assert_asserts``. This macro takes an expression and verifies that the expression aborts due to a failed call to ``CRM_ASSERT`` or some other similar function. It can be used like so: .. code-block:: c static void null_input_variables(void **state) { long long start, end; pcmk__assert_asserts(pcmk__parse_ll_range("1234", NULL, &end)); pcmk__assert_asserts(pcmk__parse_ll_range("1234", &start, NULL)); } Here, ``pcmk__parse_ll_range`` expects non-NULL for its second and third arguments. If one of those arguments is NULL, ``CRM_ASSERT`` will fail and the program will abort. ``pcmk__assert_asserts`` checks that the code would abort and the test passes. If the code does not abort, the test fails. Running _______ If you had to create any new files or directories, you will first need to run ``./configure`` from the top level of the source directory. This will regenerate the Makefiles throughout the tree. If you skip this step, your changes will be skipped and you'll be left wondering why the output doesn't match what you expected. To run the tests, simply run ``make check`` after previously building the source with ``make``. The test cases in each directory will be built and then run. This should not take long. If all the tests succeed, you will be back at the prompt. Scrolling back through the history, you should see lines like the following: .. code-block:: none PASS: pcmk__strcmp_test 1 - same_pointer PASS: pcmk__strcmp_test 2 - one_is_null PASS: pcmk__strcmp_test 3 - case_matters PASS: pcmk__strcmp_test 4 - case_insensitive PASS: pcmk__strcmp_test 5 - regex ============================================================================ Testsuite summary for pacemaker 2.1.0 ============================================================================ # TOTAL: 33 # PASS: 33 # SKIP: 0 # XFAIL: 0 # FAIL: 0 # XPASS: 0 # ERROR: 0 ============================================================================ make[7]: Leaving directory '/home/clumens/src/pacemaker/lib/common/tests/strings' The testing process will quit on the first failed test, and you will see lines like these: .. code-block:: none PASS: pcmk__scan_double_test 3 - trailing_chars FAIL: pcmk__scan_double_test 4 - typical_case PASS: pcmk__scan_double_test 5 - double_overflow PASS: pcmk__scan_double_test 6 - double_underflow ERROR: pcmk__scan_double_test - exited with status 1 PASS: pcmk__starts_with_test 1 - bad_input ============================================================================ Testsuite summary for pacemaker 2.1.0 ============================================================================ # TOTAL: 56 # PASS: 54 # SKIP: 0 # XFAIL: 0 # FAIL: 1 # XPASS: 0 # ERROR: 1 ============================================================================ See lib/common/tests/strings/test-suite.log Please report to users@clusterlabs.org ============================================================================ make[7]: *** [Makefile:1218: test-suite.log] Error 1 make[7]: Leaving directory '/home/clumens/src/pacemaker/lib/common/tests/strings' The failure is in ``lib/common/tests/strings/test-suite.log``: .. code-block:: none ERROR: pcmk__scan_double_test ============================= 1..6 ok 1 - empty_input_string PASS: pcmk__scan_double_test 1 - empty_input_string ok 2 - bad_input_string PASS: pcmk__scan_double_test 2 - bad_input_string ok 3 - trailing_chars PASS: pcmk__scan_double_test 3 - trailing_chars not ok 4 - typical_case FAIL: pcmk__scan_double_test 4 - typical_case # 0.000000 != 3.000000 # pcmk__scan_double_test.c:80: error: Failure! ok 5 - double_overflow PASS: pcmk__scan_double_test 5 - double_overflow ok 6 - double_underflow PASS: pcmk__scan_double_test 6 - double_underflow # not ok - tests ERROR: pcmk__scan_double_test - exited with status 1 At this point, you need to determine whether your test case is incorrect or whether the code being tested is incorrect. Fix whichever is wrong and continue. Code Coverage ############# Figuring out what needs unit tests written is the purpose of a code coverage tool. The Pacemaker build process uses ``lcov`` and special make targets to generate an HTML coverage report that can be inspected with any web browser. To start, you'll need to install the ``lcov`` package which is included in most distributions. Next, reconfigure the source tree: .. code-block:: none $ ./configure --with-coverage Then run ``make -C devel coverage``. This will do the same thing as ``make check``, but will generate a bunch of intermediate files as part of the compiler's output. Essentially, the coverage tools run all the unit tests and make a note if a given line if code is executed as a part of some test program. This will include not just things run as part of the tests but anything in the setup and teardown functions as well. Afterwards, the HTML report will be in ``coverage/index.html``. You can drill down into individual source files to see exactly which lines are covered and which are not, which makes it easy to target new unit tests. Note that sometimes, it is impossible to achieve 100% coverage for a source file. For instance, how do you test a function with a return type of void that simply returns on some condition? Note that Pacemaker's overall code coverage numbers are very low at the moment. One reason for this is the large amount of code in the ``daemons`` directory that will be very difficult to write unit tests for. For now, it is best to focus efforts on increasing the coverage on individual libraries. Additionally, there is a ``coverage-cts`` target that does the same thing but instead of testing ``make check``, it tests ``cts/cts-cli``. The idea behind this target is to see what parts of our command line tools are covered by our regression tests. It is probably best to clean and rebuild the source tree when switching between these various targets. Debugging ######### gdb ___ If you use ``gdb`` for debugging, some helper functions are defined in ``devel/gdbhelpers``, which can be given to ``gdb`` using the ``-x`` option. From within the debugger, you can then invoke the ``pcmk`` command that will describe the helper functions available.