diff --git a/libknet/libknet.h b/libknet/libknet.h index 45fb3cbb..8a0790b3 100644 --- a/libknet/libknet.h +++ b/libknet/libknet.h @@ -1,1976 +1,1979 @@ /* * Copyright (C) 2010-2018 Red Hat, Inc. All rights reserved. * * Authors: Fabio M. Di Nitto * Federico Simoncelli * * This software licensed under GPL-2.0+, LGPL-2.0+ */ #ifndef __LIBKNET_H__ #define __LIBKNET_H__ #include #include #include #include #include /** * @file libknet.h * @brief kronosnet API include file * @copyright Copyright (C) 2010-2018 Red Hat, Inc. All rights reserved. * * Kronosnet is an advanced VPN system for High Availability applications. */ /* * libknet limits */ /* * Maximum number of hosts */ typedef uint16_t knet_node_id_t; #define KNET_MAX_HOST 65536 /* * Maximum number of links between 2 hosts */ #define KNET_MAX_LINK 8 /* * Maximum packet size that should be written to datafd * see knet_handle_new for details */ #define KNET_MAX_PACKET_SIZE 65536 /* * Buffers used for pretty logging * host is used to store both ip addresses and hostnames */ #define KNET_MAX_HOST_LEN 256 #define KNET_MAX_PORT_LEN 6 /* * Some notifications can be generated either on TX or RX */ #define KNET_NOTIFY_TX 0 #define KNET_NOTIFY_RX 1 /* * Link flags */ /* * Where possible, set traffic priority to high. * On Linux this sets the TOS to INTERACTIVE (6), * see tc-prio(8) for more infomation */ #define KNET_LINK_FLAG_TRAFFICHIPRIO (1ULL << 0) /* * Handle flags */ /* * Use privileged operations during socket setup. */ #define KNET_HANDLE_FLAG_PRIVILEGED (1ULL << 0) typedef struct knet_handle *knet_handle_t; /* * Handle structs/API calls */ /** * knet_handle_new_ex * * @brief create a new instance of a knet handle * * host_id - Each host in a knet is identified with a unique * ID. when creating a new handle local host_id * must be specified (0 to UINT16_MAX are all valid). * It is the user's responsibility to check that the value * is unique, or bad things might happen. * * log_fd - Write file descriptor. If set to a value > 0, it will be used * to write log packets from libknet to the application. * Setting to 0 will disable logging from libknet. * It is possible to enable logging at any given time (see logging API). * Make sure to either read from this filedescriptor properly and/or * mark it O_NONBLOCK, otherwise if the fd becomes full, libknet could * block. * It is strongly encouraged to use pipes (ex: pipe(2) or pipe2(2)) for * logging fds due to the atomic nature of writes between fds. * See also libknet test suite for reference and guidance. * * default_log_level - * If logfd is specified, it will initialize all subsystems to log * at default_log_level value. (see logging API) * * flags - bitwise OR of some of the following flags: * KNET_HANDLE_FLAG_PRIVILEGED: use privileged operations setting up the * communication sockets. If disabled, failure to acquire large * enough socket buffers is ignored but logged. Inadequate buffers * lead to poor performance. * * @return * on success, a new knet_handle_t is returned. * on failure, NULL is returned and errno is set. * knet-specific errno values: * ENAMETOOLONG - socket buffers couldn't be set big enough and KNET_HANDLE_FLAG_PRIVILEGED was specified * ERANGE - buffer size readback returned unexpected type */ knet_handle_t knet_handle_new_ex(knet_node_id_t host_id, int log_fd, uint8_t default_log_level, uint64_t flags); /** * knet_handle_new * * @brief knet_handle_new_ex with flags = KNET_HANDLE_FLAG_PRIVILEGED. */ knet_handle_t knet_handle_new(knet_node_id_t host_id, int log_fd, uint8_t default_log_level); /** * knet_handle_free * @brief Destroy a knet handle, free all resources * * knet_h - pointer to knet_handle_t * * @return * knet_handle_free returns * 0 on success * -1 on error and errno is set. */ int knet_handle_free(knet_handle_t knet_h); /** * knet_handle_enable_sock_notify * @brief Register a callback to receive socket events * * knet_h - pointer to knet_handle_t * * sock_notify_fn_private_data * void pointer to data that can be used to identify * the callback. * * sock_notify_fn * A callback function that is invoked every time * a socket in the datafd pool will report an error (-1) * or an end of read (0) (see socket.7). * This function MUST NEVER block or add substantial delays. * The callback is invoked in an internal unlocked area * to allow calls to knet_handle_add_datafd/knet_handle_remove_datafd * to swap/replace the bad fd. * if both err and errno are 0, it means that the socket * has received a 0 byte packet (EOF?). * The callback function must either remove the fd from knet * (by calling knet_handle_remove_fd()) or dup a new fd in its place. * Failure to do this can cause problems. * * @return * knet_handle_enable_sock_notify returns * 0 on success * -1 on error and errno is set. */ int knet_handle_enable_sock_notify(knet_handle_t knet_h, void *sock_notify_fn_private_data, void (*sock_notify_fn) ( void *private_data, int datafd, int8_t channel, uint8_t tx_rx, int error, int errorno)); /* sorry! can't call it errno ;) */ #define KNET_DATAFD_MAX 32 /** * knet_handle_add_datafd * * @brief Install a file descriptor for communication * * IMPORTANT: In order to add datafd to knet, knet_handle_enable_sock_notify * _MUST_ be set and be able to handle both errors (-1) and * 0 bytes read / write from the provided datafd. * On read error (< 0) from datafd, the socket is automatically * removed from polling to avoid spinning on dead sockets. * It is safe to call knet_handle_remove_datafd even on sockets * that have been removed. * * knet_h - pointer to knet_handle_t * * *datafd - read/write file descriptor. * knet will read data here to send to the other hosts * and will write data received from the network. * Each data packet can be of max size KNET_MAX_PACKET_SIZE! * Applications using knet_send/knet_recv will receive a * proper error if the packet size is not within boundaries. * Applications using their own functions to write to the * datafd should NOT write more than KNET_MAX_PACKET_SIZE. * * Please refer to handle.c on how to set up a socketpair. * * datafd can be 0, and knet_handle_add_datafd will create a properly * populated socket pair the same way as ping_test, or a value * higher than 0. A negative number will return an error. * On exit knet_handle_free will take care to cleanup the * socketpair only if they have been created by knet_handle_add_datafd. * * It is possible to pass either sockets or normal fds. * User provided datafd will be marked as non-blocking and close-on-exit. * * *channel - This value has the same effect of VLAN tagging. * A negative value will auto-allocate a channel. * Setting a value between 0 and 31 will try to allocate that * specific channel (unless already in use). * * It is possible to add up to 32 datafds but be aware that each * one of them must have a receiving end on the other host. * * Example: * hostA channel 0 will be delivered to datafd on hostB channel 0 * hostA channel 1 to hostB channel 1. * * Each channel must have a unique file descriptor. * * If your application could have 2 channels on one host and one * channel on another host, then you can use dst_host_filter * to manipulate channel values on TX and RX. * * @return * knet_handle_add_datafd returns * @retval 0 on success, * *datafd will be populated with a socket if the original value was 0 * or if a specific fd was set, the value is untouched. * *channel will be populated with a channel number if the original value * was negative or the value is untouched if a specific channel * was requested. * * @retval -1 on error and errno is set. * *datafd and *channel are untouched or empty. */ int knet_handle_add_datafd(knet_handle_t knet_h, int *datafd, int8_t *channel); /** * knet_handle_remove_datafd * @brief Remove a file descriptor from knet * * knet_h - pointer to knet_handle_t * * datafd - file descriptor to remove. * NOTE that if the socket/fd was created by knet_handle_add_datafd, * the socket will be closed by libknet. * * @return * knet_handle_remove_datafd returns * 0 on success * -1 on error and errno is set. */ int knet_handle_remove_datafd(knet_handle_t knet_h, int datafd); /** * knet_handle_get_channel * @brief Get the channel associated with a file descriptor * * knet_h - pointer to knet_handle_t * * datafd - get the channel associated to this datafd * * *channel - will contain the result * * @return * knet_handle_get_channel returns * @retval 0 on success * and *channel will contain the result * @retval -1 on error and errno is set. * and *channel content is meaningless */ int knet_handle_get_channel(knet_handle_t knet_h, const int datafd, int8_t *channel); /** * knet_handle_get_datafd * @brief Get the file descriptor associated with a channel * * knet_h - pointer to knet_handle_t * * channel - get the datafd associated to this channel * * *datafd - will contain the result * * @return * knet_handle_get_datafd returns * @retval 0 on success * and *datafd will contain the results * @retval -1 on error and errno is set. * and *datafd content is meaningless */ int knet_handle_get_datafd(knet_handle_t knet_h, const int8_t channel, int *datafd); /** * knet_recv * @brief Receive data from knet nodes * * knet_h - pointer to knet_handle_t * * buff - pointer to buffer to store the received data * * buff_len - buffer length * * channel - channel number * * @return * knet_recv is a commodity function to wrap iovec operations * around a socket. It returns a call to readv(2). */ ssize_t knet_recv(knet_handle_t knet_h, char *buff, const size_t buff_len, const int8_t channel); /** * knet_send * @brief Send data to knet nodes * * knet_h - pointer to knet_handle_t * * buff - pointer to the buffer of data to send * * buff_len - length of data to send * * channel - channel number * * @return * knet_send is a commodity function to wrap iovec operations * around a socket. It returns a call to writev(2). */ ssize_t knet_send(knet_handle_t knet_h, const char *buff, const size_t buff_len, const int8_t channel); /** * knet_send_sync * * @brief Synchronously send data to knet nodes * * knet_h - pointer to knet_handle_t * * buff - pointer to the buffer of data to send * * buff_len - length of data to send * * channel - data channel to use (see knet_handle_add_datafd(3)) * * All knet RX/TX operations are async for performance reasons. * There are applications that might need a sync version of data * transmission and receive errors in case of failure to deliver * to another host. * knet_send_sync bypasses the whole TX async layer and delivers * data directly to the link layer, and returns errors accordingly. * knet_send_sync sends only one packet to one host at a time. * It does NOT support multiple destinations or multicast packets. * Decision is still based on dst_host_filter_fn. * * @return * knet_send_sync returns 0 on success and -1 on error. * In addition to normal sendmmsg errors, knet_send_sync can fail * due to: * * @retval ECANCELED - data forward is disabled * @retval EFAULT - dst_host_filter fatal error * @retval EINVAL - dst_host_filter did not provide dst_host_ids_entries on unicast pckts * @retval E2BIG - dst_host_filter did return more than one dst_host_ids_entries on unicast pckts * @retval ENOMSG - received unknown message type * @retval EHOSTDOWN - unicast pckt cannot be delivered because dest host is not connected yet * @retval ECHILD - crypto failed * @retval EAGAIN - sendmmsg was unable to send all messages and there was no progress during retry */ int knet_send_sync(knet_handle_t knet_h, const char *buff, const size_t buff_len, const int8_t channel); /** * knet_handle_enable_filter * * @brief install a filter to route packets * * knet_h - pointer to knet_handle_t * * dst_host_filter_fn_private_data * void pointer to data that can be used to identify * the callback. * * dst_host_filter_fn - * is a callback function that is invoked every time * a packet hits datafd (see knet_handle_new(3)). * the function allows users to tell libknet where the * packet has to be delivered. * * const unsigned char *outdata - is a pointer to the * current packet * ssize_t outdata_len - length of the above data * uint8_t tx_rx - filter is called on tx or rx * (KNET_NOTIFY_TX, KNET_NOTIFY_RX) * knet_node_id_t this_host_id - host_id processing the packet * knet_node_id_t src_host_id - host_id that generated the * packet * knet_node_id_t *dst_host_ids - array of KNET_MAX_HOST knet_node_id_t * where to store the destinations * size_t *dst_host_ids_entries - number of hosts to send the message * * dst_host_filter_fn should return * -1 on error, packet is discarded. * 0 packet is unicast and should be sent to dst_host_ids and there are * dst_host_ids_entries in the buffer. * 1 packet is broadcast/multicast and is sent all hosts. * contents of dst_host_ids and dst_host_ids_entries are ignored. * (see also kronosnetd/etherfilter.* for an example that filters based * on ether protocol) * * @return * knet_handle_enable_filter returns * 0 on success * -1 on error and errno is set. */ int knet_handle_enable_filter(knet_handle_t knet_h, void *dst_host_filter_fn_private_data, int (*dst_host_filter_fn) ( void *private_data, const unsigned char *outdata, ssize_t outdata_len, uint8_t tx_rx, knet_node_id_t this_host_id, knet_node_id_t src_host_id, int8_t *channel, knet_node_id_t *dst_host_ids, size_t *dst_host_ids_entries)); /** * knet_handle_setfwd * * @brief Start packet forwarding * * knet_h - pointer to knet_handle_t * * enable - set to 1 to allow data forwarding, 0 to disable data forwarding. * * @return * knet_handle_setfwd returns * 0 on success * -1 on error and errno is set. * * By default data forwarding is off and no traffic will pass through knet until * it is set on. */ int knet_handle_setfwd(knet_handle_t knet_h, unsigned int enabled); #define KNET_PMTUD_DEFAULT_INTERVAL 60 /** * knet_handle_pmtud_setfreq * * @brief Set the interval between PMTUd scans * * knet_h - pointer to knet_handle_t * * interval - define the interval in seconds between PMTUd scans * range from 1 to 86400 (24h) * * @return * knet_handle_pmtud_setfreq returns * 0 on success * -1 on error and errno is set. * * default interval is 60. */ int knet_handle_pmtud_setfreq(knet_handle_t knet_h, unsigned int interval); /** * knet_handle_pmtud_getfreq * * @brief Get the interval between PMTUd scans * * knet_h - pointer to knet_handle_t * * interval - pointer where to store the current interval value * * @return * knet_handle_pmtud_setfreq returns * 0 on success * -1 on error and errno is set. */ int knet_handle_pmtud_getfreq(knet_handle_t knet_h, unsigned int *interval); /** * knet_handle_enable_pmtud_notify * * @brief install a callback to receive PMTUd changes * * knet_h - pointer to knet_handle_t * * pmtud_notify_fn_private_data * void pointer to data that can be used to identify * the callback. * * pmtud_notify_fn * is a callback function that is invoked every time * a path MTU size change is detected. * The function allows libknet to notify the user * of data MTU, that's the max value that can be send * onwire without fragmentation. The data MTU will always * be lower than real link MTU because it accounts for * protocol overhead, knet packet header and (if configured) * crypto overhead, * This function MUST NEVER block or add substantial delays. * * @return * knet_handle_enable_pmtud_notify returns * 0 on success * -1 on error and errno is set. */ int knet_handle_enable_pmtud_notify(knet_handle_t knet_h, void *pmtud_notify_fn_private_data, void (*pmtud_notify_fn) ( void *private_data, unsigned int data_mtu)); /** * knet_handle_pmtud_get * * @brief Get the current data MTU * * knet_h - pointer to knet_handle_t * * data_mtu - pointer where to store data_mtu * * @return * knet_handle_pmtud_get returns * 0 on success * -1 on error and errno is set. */ int knet_handle_pmtud_get(knet_handle_t knet_h, unsigned int *data_mtu); #define KNET_MIN_KEY_LEN 256 #define KNET_MAX_KEY_LEN 4096 struct knet_handle_crypto_cfg { char crypto_model[16]; char crypto_cipher_type[16]; char crypto_hash_type[16]; unsigned char private_key[KNET_MAX_KEY_LEN]; unsigned int private_key_len; }; /** * knet_handle_crypto * * @brief set up packet cryptographic signing & encryption * * knet_h - pointer to knet_handle_t * * knet_handle_crypto_cfg - * pointer to a knet_handle_crypto_cfg structure * * crypto_model should contain the model name. * Currently only "openssl" and "nss" are supported. * Setting to "none" will disable crypto. * * crypto_cipher_type * should contain the cipher algo name. * It can be set to "none" to disable * encryption. * Currently supported by "nss" model: * "3des", "aes128", "aes192" and "aes256". * "openssl" model supports more modes and it strictly * depends on the openssl build. See: EVP_get_cipherbyname * openssl API call for details. * * crypto_hash_type * should contain the hashing algo name. * It can be set to "none" to disable * hashing. * Currently supported by "nss" model: * "md5", "sha1", "sha256", "sha384" and "sha512". * "openssl" model supports more modes and it strictly * depends on the openssl build. See: EVP_get_digestbyname * openssl API call for details. * * private_key will contain the private shared key. * It has to be at least KNET_MIN_KEY_LEN long. * * private_key_len * length of the provided private_key. * * Implementation notes/current limitations: * - enabling crypto, will increase latency as packets have * to processed. * - enabling crypto might reduce the overall throughtput * due to crypto data overhead. * - re-keying is not implemented yet. * - private/public key encryption/hashing is not currently * planned. * - crypto key must be the same for all hosts in the same * knet instance. * - it is safe to call knet_handle_crypto multiple times at runtime. * The last config will be used. * IMPORTANT: a call to knet_handle_crypto can fail due to: * 1) failure to obtain locking * 2) errors to initializing the crypto level. * This can happen even in subsequent calls to knet_handle_crypto. * A failure in crypto init, might leave your traffic unencrypted! * It's best to stop data forwarding (see knet_handle_setfwd(3)), change crypto config, * start forward again. * * @return * knet_handle_crypto returns: * @retval 0 on success * @retval -1 on error and errno is set. * @retval -2 on crypto subsystem initialization error. No errno is provided at the moment (yet). */ int knet_handle_crypto(knet_handle_t knet_h, struct knet_handle_crypto_cfg *knet_handle_crypto_cfg); #define KNET_COMPRESS_THRESHOLD 100 struct knet_handle_compress_cfg { char compress_model[16]; uint32_t compress_threshold; int compress_level; }; /** * knet_handle_compress * * @brief Set up packet compression * * knet_h - pointer to knet_handle_t * * knet_handle_compress_cfg - * pointer to a knet_handle_compress_cfg structure * * compress_model contains the model name. * See "compress_level" for the list of accepted values. * Setting the value to "none" disables compression. * * compress_threshold * tells the transmission thread to NOT compress * any packets that are smaller than the value * indicated. Default 100 bytes. * Set to 0 to reset to the default. * Set to 1 to compress everything. * Max accepted value is KNET_MAX_PACKET_SIZE. * * compress_level is the "level" parameter for most models: * zlib: 0 (no compression), 1 (minimal) .. 9 (max compression). * lz4: 1 (max compression)... 9 (fastest compression). * lz4hc: 1 (min compression) ... LZ4HC_MAX_CLEVEL (16) or LZ4HC_CLEVEL_MAX (12) * depending on the version of lz4hc libknet was built with. * lzma: 0 (minimal) .. 9 (max compression) * bzip2: 1 (minimal) .. 9 (max compression) * For lzo2 it selects the algorithm to use: * 1 : lzo1x_1_compress (default) * 11 : lzo1x_1_11_compress * 12 : lzo1x_1_12_compress * 15 : lzo1x_1_15_compress * 999: lzo1x_999_compress * Other values select the default algorithm. * Please refer to the documentation of the respective * compression library for guidance about setting this * value. * * Implementation notes: * - it is possible to enable/disable compression at any time. * - nodes can be using a different compression algorithm at any time. * - knet does NOT implement the compression algorithm directly. it relies * on external libraries for this functionality. Please read * the libraries man pages to figure out which algorithm/compression * level is best for the data you are planning to transmit. * * @return * knet_handle_compress returns * 0 on success * -1 on error and errno is set. EINVAL means that either the model or the * level are not supported. */ int knet_handle_compress(knet_handle_t knet_h, struct knet_handle_compress_cfg *knet_handle_compress_cfg); struct knet_handle_stats { size_t size; uint64_t tx_uncompressed_packets; uint64_t tx_compressed_packets; uint64_t tx_compressed_original_bytes; uint64_t tx_compressed_size_bytes; uint64_t tx_compress_time_ave; uint64_t tx_compress_time_min; uint64_t tx_compress_time_max; + uint64_t tx_failed_to_compress; + uint64_t tx_unable_to_compress; uint64_t rx_compressed_packets; uint64_t rx_compressed_original_bytes; uint64_t rx_compressed_size_bytes; uint64_t rx_compress_time_ave; uint64_t rx_compress_time_min; uint64_t rx_compress_time_max; + uint64_t rx_failed_to_decompress; /* Overhead times, measured in usecs */ uint64_t tx_crypt_packets; uint64_t tx_crypt_byte_overhead; uint64_t tx_crypt_time_ave; uint64_t tx_crypt_time_min; uint64_t tx_crypt_time_max; uint64_t rx_crypt_packets; uint64_t rx_crypt_time_ave; uint64_t rx_crypt_time_min; uint64_t rx_crypt_time_max; }; /** * knet_handle_get_stats * * @brief Get statistics for compression & crypto * * knet_h - pointer to knet_handle_t * * knet_handle_stats * pointer to a knet_handle_stats structure * * struct_size * size of knet_handle_stats structure to allow * for backwards compatibility. libknet will only * copy this much data into the stats structure * so that older callers will not get overflowed if * new fields are added. * * @return * 0 on success * -1 on error and errno is set. * */ int knet_handle_get_stats(knet_handle_t knet_h, struct knet_handle_stats *stats, size_t struct_size); /* * Tell knet_handle_clear_stats whether to clear just the handle stats * or all of them. */ #define KNET_CLEARSTATS_HANDLE_ONLY 1 #define KNET_CLEARSTATS_HANDLE_AND_LINK 2 /** * knet_handle_clear_stats * * @brief Clear knet stats, link and/or handle * * knet_h - pointer to knet_handle_t * * clear_option - Which stats to clear, must be one of * * KNET_CLEARSTATS_HANDLE_ONLY or * KNET_CLEARSTATS_HANDLE_AND_LINK * * @return * 0 on success * -1 on error and errno is set. * */ int knet_handle_clear_stats(knet_handle_t knet_h, int clear_option); struct knet_crypto_info { const char *name; /* openssl,nss,etc.. */ uint8_t properties; /* currently unused */ char pad[256]; /* currently unused */ }; /** * knet_get_crypto_list * * @brief Get a list of supported crypto libraries * * crypto_list - array of struct knet_crypto_info * * If NULL then only the number of structs is returned in crypto_list_entries * to allow the caller to allocate sufficient space. * libknet does not allow more than 256 crypto methods at the moment. * it is safe to allocate 256 structs to avoid calling * knet_get_crypto_list twice. * * crypto_list_entries - returns the number of structs in crypto_list * * @return * knet_get_crypto_list returns * 0 on success * -1 on error and errno is set. */ int knet_get_crypto_list(struct knet_crypto_info *crypto_list, size_t *crypto_list_entries); struct knet_compress_info { const char *name; /* bzip2, lz4, etc.. */ uint8_t properties; /* currently unused */ char pad[256]; /* currently unused */ }; /** * knet_get_compress_list * * @brief Get a list of support compression types * * compress_list - array of struct knet_compress_info * * If NULL then only the number of structs is returned in compress_list_entries * to allow the caller to allocate sufficient space. * libknet does not allow more than 256 compress methods at the moment. * it is safe to allocate 256 structs to avoid calling * knet_get_compress_list twice. * * compress_list_entries - returns the number of structs in compress_list * * @return * knet_get_compress_list returns * 0 on success * -1 on error and errno is set. */ int knet_get_compress_list(struct knet_compress_info *compress_list, size_t *compress_list_entries); /* * host structs/API calls */ /** * knet_host_add * * @brief Add a new host ID to knet * * knet_h - pointer to knet_handle_t * * host_id - each host in a knet is identified with a unique ID * (see also knet_handle_new(3)) * * @return * knet_host_add returns: * 0 on success * -1 on error and errno is set. */ int knet_host_add(knet_handle_t knet_h, knet_node_id_t host_id); /** * knet_host_remove * * @brief Remove a host ID from knet * * knet_h - pointer to knet_handle_t * * host_id - each host in a knet is identified with a unique ID * (see also knet_handle_new(3)) * * @return * knet_host_remove returns: * 0 on success * -1 on error and errno is set. */ int knet_host_remove(knet_handle_t knet_h, knet_node_id_t host_id); /** * knet_host_set_name * * @brief Set the name of a knet host * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * name - this name will be used for pretty logging and eventually * search for hosts (see also knet_handle_host_get_name(2) and knet_handle_host_get_id(3)). * Only up to KNET_MAX_HOST_LEN - 1 bytes will be accepted and * name has to be unique for each host. * * @return * knet_host_set_name returns: * 0 on success * -1 on error and errno is set. */ int knet_host_set_name(knet_handle_t knet_h, knet_node_id_t host_id, const char *name); /** * knet_host_get_name_by_host_id * * @brief Get the name of a host given its ID * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * name - pointer to a preallocated buffer of at least size KNET_MAX_HOST_LEN * where the current host name will be stored * (as set by knet_host_set_name or default by knet_host_add) * * @return * knet_host_get_name_by_host_id returns: * 0 on success * -1 on error and errno is set (name is left untouched) */ int knet_host_get_name_by_host_id(knet_handle_t knet_h, knet_node_id_t host_id, char *name); /** * knet_host_get_id_by_host_name * * @brief Get the ID of a host given its name * * knet_h - pointer to knet_handle_t * * name - name to lookup, max len KNET_MAX_HOST_LEN * * host_id - where to store the result * * @return * knet_host_get_id_by_host_name returns: * 0 on success * -1 on error and errno is set. */ int knet_host_get_id_by_host_name(knet_handle_t knet_h, const char *name, knet_node_id_t *host_id); /** * knet_host_get_host_list * * @brief Get a list of hosts known to knet * * knet_h - pointer to knet_handle_t * * host_ids - array of at lest KNET_MAX_HOST size * * host_ids_entries - * number of entries writted in host_ids * * @return * knet_host_get_host_list returns * 0 on success * -1 on error and errno is set. */ int knet_host_get_host_list(knet_handle_t knet_h, knet_node_id_t *host_ids, size_t *host_ids_entries); /* * define switching policies */ #define KNET_LINK_POLICY_PASSIVE 0 #define KNET_LINK_POLICY_ACTIVE 1 #define KNET_LINK_POLICY_RR 2 /** * knet_host_set_policy * * knet_h - pointer to knet_handle_t * * @brief Set the switching policy for a host's links * * host_id - see knet_host_add(3) * * policy - there are currently 3 kind of simple switching policies * based on link configuration. * KNET_LINK_POLICY_PASSIVE - the active link with the lowest * priority will be used. * if one or more active links share * the same priority, the one with * lowest link_id will be used. * * KNET_LINK_POLICY_ACTIVE - all active links will be used * simultaneously to send traffic. * link priority is ignored. * * KNET_LINK_POLICY_RR - round-robin policy, every packet * will be send on a different active * link. * * @return * knet_host_set_policy returns * 0 on success * -1 on error and errno is set. */ int knet_host_set_policy(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t policy); /** * knet_host_get_policy * * @brief Get the switching policy for a host's links * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * policy - will contain the current configured switching policy. * Default is passive when creating a new host. * * @return * knet_host_get_policy returns * 0 on success * -1 on error and errno is set. */ int knet_host_get_policy(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t *policy); /** * knet_host_enable_status_change_notify * * @brief Install a callback to get host status change events * * knet_h - pointer to knet_handle_t * * host_status_change_notify_fn_private_data - * void pointer to data that can be used to identify * the callback * * host_status_change_notify_fn - * is a callback function that is invoked every time * there is a change in the host status. * host status is identified by: * - reachable, this host can send/receive data to/from host_id * - remote, 0 if the host_id is connected locally or 1 if * the there is one or more knet host(s) in between. * NOTE: re-switching is NOT currently implemented, * but this is ready for future and can avoid * an API/ABI breakage later on. * - external, 0 if the host_id is configured locally or 1 if * it has been added from remote nodes config. * NOTE: dynamic topology is NOT currently implemented, * but this is ready for future and can avoid * an API/ABI breakage later on. * This function MUST NEVER block or add substantial delays. * * @return * knet_host_status_change_notify returns * 0 on success * -1 on error and errno is set. */ int knet_host_enable_status_change_notify(knet_handle_t knet_h, void *host_status_change_notify_fn_private_data, void (*host_status_change_notify_fn) ( void *private_data, knet_node_id_t host_id, uint8_t reachable, uint8_t remote, uint8_t external)); /* * define host status structure for quick lookup * struct is in flux as more stats will be added soon * * reachable host_id can be seen either directly connected * or via another host_id * * remote 0 = node is connected locally, 1 is visible via * via another host_id * * external 0 = node is configured/known locally, * 1 host_id has been received via another host_id */ struct knet_host_status { uint8_t reachable; uint8_t remote; uint8_t external; /* add host statistics */ }; /** * knet_host_status_get * * @brief Get the status of a host * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * status - pointer to knet_host_status struct * * @return * knet_handle_pmtud_get returns * 0 on success * -1 on error and errno is set. */ int knet_host_get_status(knet_handle_t knet_h, knet_node_id_t host_id, struct knet_host_status *status); /* * link structs/API calls * * every host allocated/managed by knet_host_* has * KNET_MAX_LINK structures to define the network * paths that connect 2 hosts. * * Each link is identified by a link_id that has a * values between 0 and KNET_MAX_LINK - 1. * * KNOWN LIMITATIONS: * * - let's assume the scenario where two hosts are connected * with any number of links. link_id must match on both sides. * If host_id 0 link_id 0 is configured to connect IP1 to IP2 and * host_id 0 link_id 1 is configured to connect IP3 to IP4, * host_id 1 link_id 0 _must_ connect IP2 to IP1 and likewise * host_id 1 link_id 1 _must_ connect IP4 to IP3. * We might be able to lift this restriction in future, by using * other data to determine src/dst link_id, but for now, deal with it. */ /* * commodity functions to convert strings to sockaddr and viceversa */ /** * knet_strtoaddr * * @brief Convert a hostname string to an address * * host - IPaddr/hostname to convert * be aware only the first IP address will be returned * in case a hostname resolves to multiple IP * * port - port to connect to * * ss - sockaddr_storage where to store the converted data * * sslen - len of the sockaddr_storage * * @return * knet_strtoaddr returns same error codes as getaddrinfo * */ int knet_strtoaddr(const char *host, const char *port, struct sockaddr_storage *ss, socklen_t sslen); /** * knet_addrtostr * * @brief Convert an address to a host name * * ss - sockaddr_storage to convert * * sslen - len of the sockaddr_storage * * host - IPaddr/hostname where to store data * (recommended size: KNET_MAX_HOST_LEN) * * port - port buffer where to store data * (recommended size: KNET_MAX_PORT_LEN) * * @return * knet_strtoaddr returns same error codes as getnameinfo */ int knet_addrtostr(const struct sockaddr_storage *ss, socklen_t sslen, char *addr_buf, size_t addr_buf_size, char *port_buf, size_t port_buf_size); #define KNET_TRANSPORT_LOOPBACK 0 #define KNET_TRANSPORT_UDP 1 #define KNET_TRANSPORT_SCTP 2 #define KNET_MAX_TRANSPORTS UINT8_MAX /* * The Loopback transport is only valid for connections to localhost, the host * with the same node_id specified in knet_handle_new(). Only one link of this * type is allowed. Data sent down a LOOPBACK link will be copied directly from * the knet send datafd to the knet receive datafd so the application must be set * up to take data from that socket at least as often as it is sent or deadlocks * could occur. If used, a LOOPBACK link must be the only link configured to the * local host. */ struct knet_transport_info { const char *name; /* UDP/SCTP/etc... */ uint8_t id; /* value that can be used for link_set_config */ uint8_t properties; /* currently unused */ char pad[256]; /* currently unused */ }; /** * knet_get_transport_list * * @brief Get a list of the transports support by this build of knet * * transport_list - an array of struct transport_info that must be * at least of size struct transport_info * KNET_MAX_TRANSPORTS * * transport_list_entries - pointer to a size_t where to store how many transports * are available in this build of libknet. * * @return * knet_get_transport_list returns * 0 on success * -1 on error and errno is set. */ int knet_get_transport_list(struct knet_transport_info *transport_list, size_t *transport_list_entries); /** * knet_get_transport_name_by_id * * @brief Get a transport name from its ID number * * transport - one of the KNET_TRANSPORT_xxx constants * * @return * knet_get_transport_name_by_id returns: * * @retval pointer to the name on success or * @retval NULL on error and errno is set. */ const char *knet_get_transport_name_by_id(uint8_t transport); /** * knet_get_transport_id_by_name * * @brief Get a transport ID from its name * * name - transport name (UDP/SCTP/etc) * * @return * knet_get_transport_name_by_id returns: * * @retval KNET_MAX_TRANSPORTS on error and errno is set accordingly * @retval KNET_TRANSPORT_xxx on success. */ uint8_t knet_get_transport_id_by_name(const char *name); #define KNET_TRANSPORT_DEFAULT_RECONNECT_INTERVAL 1000 /** * knet_handle_set_transport_reconnect_interval * * @brief Set the interval between transport attempts to reconnect a failed link * * knet_h - pointer to knet_handle_t * * msecs - milliseconds * * @return * knet_handle_set_transport_reconnect_interval returns * 0 on success * -1 on error and errno is set. */ int knet_handle_set_transport_reconnect_interval(knet_handle_t knet_h, uint32_t msecs); /** * knet_handle_get_transport_reconnect_interval * * @brief Get the interval between transport attempts to reconnect a failed link * * knet_h - pointer to knet_handle_t * * msecs - milliseconds * * @return * knet_handle_get_transport_reconnect_interval returns * 0 on success * -1 on error and errno is set. */ int knet_handle_get_transport_reconnect_interval(knet_handle_t knet_h, uint32_t *msecs); /** * knet_link_set_config * * @brief Configure the link to a host * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * transport - one of the KNET_TRANSPORT_xxx constants * * src_addr - sockaddr_storage that can be either IPv4 or IPv6 * * dst_addr - sockaddr_storage that can be either IPv4 or IPv6 * this can be null if we don't know the incoming * IP address/port and the link will remain quiet * till the node on the other end will initiate a * connection * * flags - KNET_LINK_FLAG_* * * @return * knet_link_set_config returns * 0 on success * -1 on error and errno is set. */ int knet_link_set_config(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, uint8_t transport, struct sockaddr_storage *src_addr, struct sockaddr_storage *dst_addr, uint64_t flags); /** * knet_link_get_config * * @brief Get the link configutation information * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * transport - see knet_link_set_config(3) * * src_addr - sockaddr_storage that can be either IPv4 or IPv6 * * dst_addr - sockaddr_storage that can be either IPv4 or IPv6 * * dynamic - 0 if dst_addr is static or 1 if dst_addr is dynamic. * In case of 1, dst_addr can be NULL and it will be left * untouched. * * flags - KNET_LINK_FLAG_* * * @return * knet_link_get_config returns * 0 on success. * -1 on error and errno is set. */ int knet_link_get_config(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, uint8_t *transport, struct sockaddr_storage *src_addr, struct sockaddr_storage *dst_addr, uint8_t *dynamic, uint64_t *flags); /** * knet_link_clear_config * * @brief Clear link information and disconnect the link * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * @return * knet_link_clear_config returns * 0 on success. * -1 on error and errno is set. */ int knet_link_clear_config(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id); /** * knet_link_set_enable * * @brief Enable traffic on a link * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * enabled - 0 disable the link, 1 enable the link * * @return * knet_link_set_enable returns * 0 on success * -1 on error and errno is set. */ int knet_link_set_enable(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, unsigned int enabled); /** * knet_link_get_enable * * @brief Find out whether a link is enabled or not * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * enabled - 0 disable the link, 1 enable the link * * @return * knet_link_get_enable returns * 0 on success * -1 on error and errno is set. */ int knet_link_get_enable(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, unsigned int *enabled); #define KNET_LINK_DEFAULT_PING_INTERVAL 1000 /* 1 second */ #define KNET_LINK_DEFAULT_PING_TIMEOUT 2000 /* 2 seconds */ #define KNET_LINK_DEFAULT_PING_PRECISION 2048 /* samples */ /** * knet_link_set_ping_timers * * @brief Set the ping timers for a link * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * interval - specify the ping interval in milliseconds. * * timeout - if no pong is received within this time, * the link is declared dead, in milliseconds. * NOTE: in future it will be possible to set timeout to 0 * for an autocalculated timeout based on interval, pong_count * and latency. The API already accept 0 as value and it will * return ENOSYS / -1. Once the automatic calculation feature * will be implemented, this call will only return EINVAL * for incorrect values. * * precision - how many values of latency are used to calculate * the average link latency (see also knet_link_get_status(3)) * * @return * knet_link_set_ping_timers returns * 0 on success * -1 on error and errno is set. */ int knet_link_set_ping_timers(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, time_t interval, time_t timeout, unsigned int precision); /** * knet_link_get_ping_timers * * @brief Get the ping timers for a link * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * interval - ping interval * * timeout - if no pong is received within this time, * the link is declared dead * * precision - how many values of latency are used to calculate * the average link latency (see also knet_link_get_status(3)) * * @return * knet_link_get_ping_timers returns * 0 on success * -1 on error and errno is set. */ int knet_link_get_ping_timers(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, time_t *interval, time_t *timeout, unsigned int *precision); #define KNET_LINK_DEFAULT_PONG_COUNT 5 /** * knet_link_set_pong_count * * @brief Set the pong count for a link * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * pong_count - how many valid ping/pongs before a link is marked UP. * default: 5, value should be > 0 * * @return * knet_link_set_pong_count returns * 0 on success * -1 on error and errno is set. */ int knet_link_set_pong_count(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, uint8_t pong_count); /** * knet_link_get_pong_count * * @brief Get the pong count for a link * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * pong_count - how many valid ping/pongs before a link is marked UP. * default: 5, value should be > 0 * * @return * knet_link_get_pong_count returns * 0 on success * -1 on error and errno is set. */ int knet_link_get_pong_count(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, uint8_t *pong_count); /** * knet_link_set_priority * * @brief Set the priority for a link * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * priority - specify the switching priority for this link * see also knet_host_set_policy * * @return * knet_link_set_priority returns * 0 on success * -1 on error and errno is set. */ int knet_link_set_priority(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, uint8_t priority); /** * knet_link_get_priority * * @brief Get the priority for a link * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * priority - gather the switching priority for this link * see also knet_host_set_policy * * @return * knet_link_get_priority returns * 0 on success * -1 on error and errno is set. */ int knet_link_get_priority(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, uint8_t *priority); /** * knet_link_get_link_list * * @brief Get a list of links connecting a host * * knet_h - pointer to knet_handle_t * * link_ids - array of at lest KNET_MAX_LINK size * with the list of configured links for a certain host. * * link_ids_entries - * number of entries contained in link_ids * * @return * knet_link_get_link_list returns * 0 on success * -1 on error and errno is set. */ int knet_link_get_link_list(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t *link_ids, size_t *link_ids_entries); /* * define link status structure for quick lookup * * src/dst_{ipaddr,port} strings are filled by * getnameinfo(3) when configuring the link. * if the link is dynamic (see knet_link_set_config(3)) * dst_ipaddr/port will contain ipaddr/port of the currently * connected peer or "Unknown" if it was not possible * to determine the ipaddr/port at runtime. * * enabled see also knet_link_set/get_enable. * * connected the link is connected to a peer and ping/pong traffic * is flowing. * * dynconnected the link has dynamic ip on the other end, and * we can see the other host is sending pings to us. * * latency average latency of this link * see also knet_link_set/get_timeout. * * pong_last if the link is down, this value tells us how long * ago this link was active. A value of 0 means that the link * has never been active. * * knet_link_stats structure that contains details statistics for the link */ #define MAX_LINK_EVENTS 16 struct knet_link_stats { /* onwire values */ uint64_t tx_data_packets; uint64_t rx_data_packets; uint64_t tx_data_bytes; uint64_t rx_data_bytes; uint64_t rx_ping_packets; uint64_t tx_ping_packets; uint64_t rx_ping_bytes; uint64_t tx_ping_bytes; uint64_t rx_pong_packets; uint64_t tx_pong_packets; uint64_t rx_pong_bytes; uint64_t tx_pong_bytes; uint64_t rx_pmtu_packets; uint64_t tx_pmtu_packets; uint64_t rx_pmtu_bytes; uint64_t tx_pmtu_bytes; /* Only filled in when requested */ uint64_t tx_total_packets; uint64_t rx_total_packets; uint64_t tx_total_bytes; uint64_t rx_total_bytes; uint64_t tx_total_errors; uint64_t tx_total_retries; uint32_t tx_pmtu_errors; uint32_t tx_pmtu_retries; uint32_t tx_ping_errors; uint32_t tx_ping_retries; uint32_t tx_pong_errors; uint32_t tx_pong_retries; uint32_t tx_data_errors; uint32_t tx_data_retries; /* measured in usecs */ uint32_t latency_min; uint32_t latency_max; uint32_t latency_ave; uint32_t latency_samples; /* how many times the link has been going up/down */ uint32_t down_count; uint32_t up_count; /* * circular buffer of time_t structs collecting the history * of up/down events on this link. * the index indicates current/last event. * it is safe to walk back the history by decreasing the index */ time_t last_up_times[MAX_LINK_EVENTS]; time_t last_down_times[MAX_LINK_EVENTS]; int8_t last_up_time_index; int8_t last_down_time_index; /* Always add new stats at the end */ }; struct knet_link_status { size_t size; /* For ABI checking */ char src_ipaddr[KNET_MAX_HOST_LEN]; char src_port[KNET_MAX_PORT_LEN]; char dst_ipaddr[KNET_MAX_HOST_LEN]; char dst_port[KNET_MAX_PORT_LEN]; uint8_t enabled; /* link is configured and admin enabled for traffic */ uint8_t connected; /* link is connected for data (local view) */ uint8_t dynconnected; /* link has been activated by remote dynip */ unsigned long long latency; /* average latency computed by fix/exp */ struct timespec pong_last; unsigned int mtu; /* current detected MTU on this link */ unsigned int proto_overhead; /* contains the size of the IP protocol, knet headers and * crypto headers (if configured). This value is filled in * ONLY after the first PMTUd run on that given link, * and can change if link configuration or crypto configuration * changes at runtime. * WARNING: in general mtu + proto_overhead might or might * not match the output of ifconfig mtu due to crypto * requirements to pad packets to some specific boundaries. */ /* Link statistics */ struct knet_link_stats stats; }; /** * knet_link_get_status * * @brief Get the status (and statistics) for a link * * knet_h - pointer to knet_handle_t * * host_id - see knet_host_add(3) * * link_id - see knet_link_set_config(3) * * status - pointer to knet_link_status struct * * struct_size - max size of knet_link_status - allows library to * add fields without ABI change. Returned structure * will be truncated to this length and .size member * indicates the full size. * * @return * knet_link_get_status returns * 0 on success * -1 on error and errno is set. */ int knet_link_get_status(knet_handle_t knet_h, knet_node_id_t host_id, uint8_t link_id, struct knet_link_status *status, size_t struct_size); /* * logging structs/API calls */ /* * libknet is composed of several subsystems. In order * to easily distinguish log messages coming from different * places, each subsystem has its own ID. * * 0-19 config/management * 20-39 internal threads * 40-59 transports * 60-69 crypto implementations */ #define KNET_SUB_COMMON 0 /* common.c */ #define KNET_SUB_HANDLE 1 /* handle.c alloc/dealloc config changes */ #define KNET_SUB_HOST 2 /* host add/del/modify */ #define KNET_SUB_LISTENER 3 /* listeners add/del/modify... */ #define KNET_SUB_LINK 4 /* link add/del/modify */ #define KNET_SUB_TRANSPORT 5 /* Transport common */ #define KNET_SUB_CRYPTO 6 /* crypto.c config generic layer */ #define KNET_SUB_COMPRESS 7 /* compress.c config generic layer */ #define KNET_SUB_FILTER 19 /* allocated for users to log from dst_filter */ #define KNET_SUB_DSTCACHE 20 /* switching thread (destination cache handling) */ #define KNET_SUB_HEARTBEAT 21 /* heartbeat thread */ #define KNET_SUB_PMTUD 22 /* Path MTU Discovery thread */ #define KNET_SUB_TX 23 /* send to link thread */ #define KNET_SUB_RX 24 /* recv from link thread */ #define KNET_SUB_TRANSP_BASE 40 /* Base log level for transports */ #define KNET_SUB_TRANSP_LOOPBACK (KNET_SUB_TRANSP_BASE + KNET_TRANSPORT_LOOPBACK) #define KNET_SUB_TRANSP_UDP (KNET_SUB_TRANSP_BASE + KNET_TRANSPORT_UDP) #define KNET_SUB_TRANSP_SCTP (KNET_SUB_TRANSP_BASE + KNET_TRANSPORT_SCTP) #define KNET_SUB_NSSCRYPTO 60 /* nsscrypto.c */ #define KNET_SUB_OPENSSLCRYPTO 61 /* opensslcrypto.c */ #define KNET_SUB_ZLIBCOMP 70 /* compress_zlib.c */ #define KNET_SUB_LZ4COMP 71 /* compress_lz4.c */ #define KNET_SUB_LZ4HCCOMP 72 /* compress_lz4.c */ #define KNET_SUB_LZO2COMP 73 /* compress_lzo.c */ #define KNET_SUB_LZMACOMP 74 /* compress_lzma.c */ #define KNET_SUB_BZIP2COMP 75 /* compress_bzip2.c */ #define KNET_SUB_UNKNOWN UINT8_MAX - 1 #define KNET_MAX_SUBSYSTEMS UINT8_MAX /* * Convert between subsystem IDs and names */ /** * knet_log_get_subsystem_name * * @brief Get a logging system name from its numeric ID * * @return * returns internal name of the subsystem or "common" */ const char *knet_log_get_subsystem_name(uint8_t subsystem); /** * knet_log_get_subsystem_id * * @brief Get a logging system ID from its name * * @return * returns internal ID of the subsystem or KNET_SUB_COMMON */ uint8_t knet_log_get_subsystem_id(const char *name); /* * 4 log levels are enough for everybody */ #define KNET_LOG_ERR 0 /* unrecoverable errors/conditions */ #define KNET_LOG_WARN 1 /* recoverable errors/conditions */ #define KNET_LOG_INFO 2 /* info, link up/down, config changes.. */ #define KNET_LOG_DEBUG 3 /* * Convert between log level values and names */ /** * knet_log_get_loglevel_name * * @brief Get a logging level name from its numeric ID * * @return * returns internal name of the log level or "ERROR" for unknown values */ const char *knet_log_get_loglevel_name(uint8_t level); /** * knet_log_get_loglevel_id * * @brief Get a logging level ID from its name * * @return * returns internal log level ID or KNET_LOG_ERR for invalid names */ uint8_t knet_log_get_loglevel_id(const char *name); /* * every log message is composed by a text message * and message level/subsystem IDs. * In order to make debugging easier it is possible to send those packets * straight to stdout/stderr (see knet_bench.c stdout option). */ #define KNET_MAX_LOG_MSG_SIZE 254 #if KNET_MAX_LOG_MSG_SIZE > PIPE_BUF #error KNET_MAX_LOG_MSG_SIZE cannot be bigger than PIPE_BUF for guaranteed system atomic writes #endif struct knet_log_msg { char msg[KNET_MAX_LOG_MSG_SIZE]; uint8_t subsystem; /* KNET_SUB_* */ uint8_t msglevel; /* KNET_LOG_* */ }; /** * knet_log_set_log_level * * @brief Set the logging level for a subsystem * * knet_h - same as above * * subsystem - same as above * * level - same as above * * knet_log_set_loglevel allows fine control of log levels by subsystem. * See also knet_handle_new for defaults. * * @return * knet_log_set_loglevel returns * 0 on success * -1 on error and errno is set. */ int knet_log_set_loglevel(knet_handle_t knet_h, uint8_t subsystem, uint8_t level); /** * knet_log_get_log_level * * @brief Get the logging level for a subsystem * * knet_h - same as above * * subsystem - same as above * * level - same as above * * @return * knet_log_get_loglevel returns * 0 on success * -1 on error and errno is set. */ int knet_log_get_loglevel(knet_handle_t knet_h, uint8_t subsystem, uint8_t *level); #endif diff --git a/libknet/tests/knet_bench.c b/libknet/tests/knet_bench.c index 9a4948cd..2e58e727 100644 --- a/libknet/tests/knet_bench.c +++ b/libknet/tests/knet_bench.c @@ -1,1296 +1,1299 @@ /* * Copyright (C) 2016-2018 Red Hat, Inc. All rights reserved. * * Authors: Fabio M. Di Nitto * * This software licensed under GPL-2.0+, LGPL-2.0+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #include "libknet.h" #include "compat.h" #include "internals.h" #include "netutils.h" #include "transport_common.h" #include "threads_common.h" #include "test-common.h" #define MAX_NODES 128 static int senderid = -1; static int thisnodeid = -1; static knet_handle_t knet_h; static int datafd = 0; static int8_t channel = 0; static int globallistener = 0; static int continous = 0; static int show_stats = 0; static struct sockaddr_storage allv4; static struct sockaddr_storage allv6; static int broadcast_test = 1; static pthread_t rx_thread = (pthread_t)NULL; static char *rx_buf[PCKT_FRAG_MAX]; static int wait_for_perf_rx = 0; static char *compresscfg = NULL; static char *cryptocfg = NULL; static int machine_output = 0; static int bench_shutdown_in_progress = 0; static pthread_mutex_t shutdown_mutex = PTHREAD_MUTEX_INITIALIZER; #define TEST_PING 0 #define TEST_PING_AND_DATA 1 #define TEST_PERF_BY_SIZE 2 #define TEST_PERF_BY_TIME 3 static int test_type = TEST_PING; #define TEST_START 2 #define TEST_STOP 4 #define TEST_COMPLETE 6 #define ONE_GIGABYTE 1073741824 static uint64_t perf_by_size_size = 1 * ONE_GIGABYTE; static uint64_t perf_by_time_secs = 10; struct node { int nodeid; int links; uint8_t transport[KNET_MAX_LINK]; struct sockaddr_storage address[KNET_MAX_LINK]; }; static void print_help(void) { printf("knet_bench usage:\n"); printf(" -h print this help (no really)\n"); printf(" -d enable debug logs (default INFO)\n"); printf(" -c [implementation]:[crypto]:[hashing] crypto configuration. (default disabled)\n"); printf(" Example: -c nss:aes128:sha1\n"); printf(" -z [implementation]:[level]:[threshold] compress configuration. (default disabled)\n"); printf(" Example: -z zlib:5:100\n"); printf(" -p [active|passive|rr] (default: passive)\n"); printf(" -P [UDP|SCTP] (default: UDP) protocol (transport) to use for all links\n"); printf(" -t [nodeid] This nodeid (required)\n"); printf(" -n [nodeid],[proto]/[link1_ip],[link2_..] Other nodes information (at least one required)\n"); printf(" Example: -t 1,192.168.8.1,SCTP/3ffe::8:1,UDP/172...\n"); printf(" can be repeated up to %d and should contain also the localnode info\n", MAX_NODES); printf(" -b [port] baseport (default: 50000)\n"); printf(" -l enable global listener on 0.0.0.0/:: (default: off, incompatible with -o)\n"); printf(" -o enable baseport offset per nodeid\n"); printf(" -m change PMTUd interval in seconds (default: 60)\n"); printf(" -w dont wait for all nodes to be up before starting the test (default: wait)\n"); printf(" -T [ping|ping_data|perf-by-size|perf-by-time]\n"); printf(" test type (default: ping)\n"); printf(" ping: will wait for all hosts to join the knet network, sleep 5 seconds and quit\n"); printf(" ping_data: will wait for all hosts to join the knet network, sends some data to all nodes and quit\n"); printf(" perf-by-size: will wait for all hosts to join the knet network,\n"); printf(" perform a series of benchmarks by transmitting a known\n"); printf(" size/quantity of packets and measuring the time, then quit\n"); printf(" perf-by-time: will wait for all hosts to join the knet network,\n"); printf(" perform a series of benchmarks by transmitting a known\n"); printf(" size of packets for a given amount of time (10 seconds)\n"); printf(" and measuring the quantity of data transmitted, then quit\n"); printf(" -s nodeid that will generate traffic for benchmarks\n"); printf(" -S [size|seconds] when used in combination with -T perf-by-size it indicates how many GB of traffic to generate for the test. (default: 1GB)\n"); printf(" when used in combination with -T perf-by-time it indicates how many Seconds of traffic to generate for the test. (default: 10 seconds)\n"); printf(" -C repeat the test continously (default: off)\n"); printf(" -X[XX] show stats at the end of the run (default: 1)\n"); printf(" 1: show handle stats, 2: show summary link stats\n"); printf(" 3: show detailed link stats\n"); printf(" -a enable machine parsable output (default: off).\n"); } static void parse_nodes(char *nodesinfo[MAX_NODES], int onidx, int port, struct node nodes[MAX_NODES], int *thisidx) { int i; char *temp = NULL; char port_str[10]; memset(port_str, 0, sizeof(port_str)); sprintf(port_str, "%d", port); for (i = 0; i < onidx; i++) { nodes[i].nodeid = atoi(strtok(nodesinfo[i], ",")); if ((nodes[i].nodeid < 0) || (nodes[i].nodeid > KNET_MAX_HOST)) { printf("Invalid nodeid: %d (0 - %d)\n", nodes[i].nodeid, KNET_MAX_HOST); exit(FAIL); } if (thisnodeid == nodes[i].nodeid) { *thisidx = i; } while((temp = strtok(NULL, ","))) { char *slash = NULL; uint8_t transport; if (nodes[i].links == KNET_MAX_LINK) { printf("Too many links configured. Max %d\n", KNET_MAX_LINK); exit(FAIL); } slash = strstr(temp, "/"); if (slash) { memset(slash, 0, 1); transport = knet_get_transport_id_by_name(temp); if (transport == KNET_MAX_TRANSPORTS) { printf("Unknown transport: %s\n", temp); exit(FAIL); } nodes[i].transport[nodes[i].links] = transport; temp = slash + 1; } else { nodes[i].transport[nodes[i].links] = KNET_TRANSPORT_UDP; } if (knet_strtoaddr(temp, port_str, &nodes[i].address[nodes[i].links], sizeof(struct sockaddr_storage)) < 0) { printf("Unable to convert %s to sockaddress\n", temp); exit(FAIL); } nodes[i].links++; } } if (knet_strtoaddr("0.0.0.0", port_str, &allv4, sizeof(struct sockaddr_storage)) < 0) { printf("Unable to convert 0.0.0.0 to sockaddress\n"); exit(FAIL); } if (knet_strtoaddr("::", port_str, &allv6, sizeof(struct sockaddr_storage)) < 0) { printf("Unable to convert :: to sockaddress\n"); exit(FAIL); } for (i = 1; i < onidx; i++) { if (nodes[0].links != nodes[i].links) { printf("knet_bench does not support unbalanced link configuration\n"); exit(FAIL); } } return; } static int private_data; static void sock_notify(void *pvt_data, int local_datafd, int8_t local_channel, uint8_t tx_rx, int error, int errorno) { printf("[info]: error (%d - %d - %s) from socket: %d\n", error, errorno, strerror(errno), local_datafd); return; } static int ping_dst_host_filter(void *pvt_data, const unsigned char *outdata, ssize_t outdata_len, uint8_t tx_rx, knet_node_id_t this_host_id, knet_node_id_t src_host_id, int8_t *dst_channel, knet_node_id_t *dst_host_ids, size_t *dst_host_ids_entries) { if (broadcast_test) { return 1; } if (tx_rx == KNET_NOTIFY_TX) { memmove(&dst_host_ids[0], outdata, 2); } else { dst_host_ids[0] = this_host_id; } *dst_host_ids_entries = 1; return 0; } static void setup_knet(int argc, char *argv[]) { int logfd = 0; int rv; char *policystr = NULL, *protostr = NULL; char *othernodeinfo[MAX_NODES]; struct node nodes[MAX_NODES]; int thisidx = -1; int onidx = 0; int debug = KNET_LOG_INFO; int port = 50000, portoffset = 0; int thisport = 0, otherport = 0; int thisnewport = 0, othernewport = 0; struct sockaddr_in *so_in; struct sockaddr_in6 *so_in6; struct sockaddr_storage *src; int i, link_idx, allnodesup = 0; int policy = KNET_LINK_POLICY_PASSIVE, policyfound = 0; int protocol = KNET_TRANSPORT_UDP, protofound = 0; int wait = 1; int pmtud_interval = 60; struct knet_handle_crypto_cfg knet_handle_crypto_cfg; char *cryptomodel = NULL, *cryptotype = NULL, *cryptohash = NULL; struct knet_handle_compress_cfg knet_handle_compress_cfg; memset(nodes, 0, sizeof(nodes)); while ((rv = getopt(argc, argv, "aCT:S:s:ldom:wb:t:n:c:p:X::P:z:h")) != EOF) { switch(rv) { case 'h': print_help(); exit(PASS); break; case 'a': machine_output = 1; break; case 'd': debug = KNET_LOG_DEBUG; break; case 'c': if (cryptocfg) { printf("Error: -c can only be specified once\n"); exit(FAIL); } cryptocfg = optarg; break; case 'p': if (policystr) { printf("Error: -p can only be specified once\n"); exit(FAIL); } policystr = optarg; if (!strcmp(policystr, "active")) { policy = KNET_LINK_POLICY_ACTIVE; policyfound = 1; } /* * we can't use rr because clangs can't compile * an array of 3 strings, one of which is 2 bytes long */ if (!strcmp(policystr, "round-robin")) { policy = KNET_LINK_POLICY_RR; policyfound = 1; } if (!strcmp(policystr, "passive")) { policy = KNET_LINK_POLICY_PASSIVE; policyfound = 1; } if (!policyfound) { printf("Error: invalid policy %s specified. -p accepts active|passive|rr\n", policystr); exit(FAIL); } break; case 'P': if (protostr) { printf("Error: -P can only be specified once\n"); exit(FAIL); } protostr = optarg; if (!strcmp(protostr, "UDP")) { protocol = KNET_TRANSPORT_UDP; protofound = 1; } if (!strcmp(protostr, "SCTP")) { protocol = KNET_TRANSPORT_SCTP; protofound = 1; } if (!protofound) { printf("Error: invalid protocol %s specified. -P accepts udp|sctp\n", policystr); exit(FAIL); } break; case 't': if (thisnodeid >= 0) { printf("Error: -t can only be specified once\n"); exit(FAIL); } thisnodeid = atoi(optarg); if ((thisnodeid < 0) || (thisnodeid > 65536)) { printf("Error: -t nodeid out of range %d (1 - 65536)\n", thisnodeid); exit(FAIL); } break; case 'n': if (onidx == MAX_NODES) { printf("Error: too many other nodes. Max %d\n", MAX_NODES); exit(FAIL); } othernodeinfo[onidx] = optarg; onidx++; break; case 'b': port = atoi(optarg); if ((port < 1) || (port > 65536)) { printf("Error: port %d out of range (1 - 65536)\n", port); exit(FAIL); } break; case 'o': if (globallistener) { printf("Error: -l cannot be used with -o\n"); exit(FAIL); } portoffset = 1; break; case 'm': pmtud_interval = atoi(optarg); if (pmtud_interval < 1) { printf("Error: pmtud interval %d out of range (> 0)\n", pmtud_interval); exit(FAIL); } break; case 'l': if (portoffset) { printf("Error: -o cannot be used with -l\n"); exit(FAIL); } globallistener = 1; break; case 'w': wait = 0; break; case 's': if (senderid >= 0) { printf("Error: -s can only be specified once\n"); exit(FAIL); } senderid = atoi(optarg); if ((senderid < 0) || (senderid > 65536)) { printf("Error: -s nodeid out of range %d (1 - 65536)\n", senderid); exit(FAIL); } break; case 'T': if (!strcmp("ping", optarg)) { test_type = TEST_PING; } if (!strcmp("ping_data", optarg)) { test_type = TEST_PING_AND_DATA; } if (!strcmp("perf-by-size", optarg)) { test_type = TEST_PERF_BY_SIZE; } if (!strcmp("perf-by-time", optarg)) { test_type = TEST_PERF_BY_TIME; } break; case 'S': perf_by_size_size = (uint64_t)atoi(optarg) * ONE_GIGABYTE; perf_by_time_secs = (uint64_t)atoi(optarg); break; case 'C': continous = 1; break; case 'X': if (optarg) { show_stats = atoi(optarg); } else { show_stats = 1; } break; case 'z': if (compresscfg) { printf("Error: -c can only be specified once\n"); exit(FAIL); } compresscfg = optarg; break; default: break; } } if (thisnodeid < 0) { printf("Who am I?!? missing -t from command line?\n"); exit(FAIL); } if (onidx < 1) { printf("no other nodes configured?!? missing -n from command line\n"); exit(FAIL); } parse_nodes(othernodeinfo, onidx, port, nodes, &thisidx); if (thisidx < 0) { printf("no config for this node found\n"); exit(FAIL); } if (senderid >= 0) { for (i=0; i < onidx; i++) { if (senderid == nodes[i].nodeid) { break; } } if (i == onidx) { printf("Unable to find senderid in nodelist\n"); exit(FAIL); } } if (((test_type == TEST_PERF_BY_SIZE) || (test_type == TEST_PERF_BY_TIME)) && (senderid < 0)) { printf("Error: performance test requires -s to be set (for now)\n"); exit(FAIL); } logfd = start_logging(stdout); knet_h = knet_handle_new(thisnodeid, logfd, debug); if (!knet_h) { printf("Unable to knet_handle_new: %s\n", strerror(errno)); exit(FAIL); } if (cryptocfg) { memset(&knet_handle_crypto_cfg, 0, sizeof(knet_handle_crypto_cfg)); cryptomodel = strtok(cryptocfg, ":"); cryptotype = strtok(NULL, ":"); cryptohash = strtok(NULL, ":"); if (cryptomodel) { strncpy(knet_handle_crypto_cfg.crypto_model, cryptomodel, sizeof(knet_handle_crypto_cfg.crypto_model) - 1); } if (cryptotype) { strncpy(knet_handle_crypto_cfg.crypto_cipher_type, cryptotype, sizeof(knet_handle_crypto_cfg.crypto_cipher_type) - 1); } if (cryptohash) { strncpy(knet_handle_crypto_cfg.crypto_hash_type, cryptohash, sizeof(knet_handle_crypto_cfg.crypto_hash_type) - 1); } knet_handle_crypto_cfg.private_key_len = KNET_MAX_KEY_LEN; if (knet_handle_crypto(knet_h, &knet_handle_crypto_cfg)) { printf("Unable to init crypto\n"); exit(FAIL); } } if (compresscfg) { memset(&knet_handle_compress_cfg, 0, sizeof(struct knet_handle_compress_cfg)); snprintf(knet_handle_compress_cfg.compress_model, 16, "%s", strtok(compresscfg, ":")); knet_handle_compress_cfg.compress_level = atoi(strtok(NULL, ":")); knet_handle_compress_cfg.compress_threshold = atoi(strtok(NULL, ":")); if (knet_handle_compress(knet_h, &knet_handle_compress_cfg)) { printf("Unable to configure compress\n"); exit(FAIL); } } if (knet_handle_enable_sock_notify(knet_h, &private_data, sock_notify) < 0) { printf("knet_handle_enable_sock_notify failed: %s\n", strerror(errno)); knet_handle_free(knet_h); exit(FAIL); } datafd = 0; channel = -1; if (knet_handle_add_datafd(knet_h, &datafd, &channel) < 0) { printf("knet_handle_add_datafd failed: %s\n", strerror(errno)); knet_handle_free(knet_h); exit(FAIL); } if (knet_handle_pmtud_setfreq(knet_h, pmtud_interval) < 0) { printf("knet_handle_pmtud_setfreq failed: %s\n", strerror(errno)); knet_handle_free(knet_h); exit(FAIL); } for (i=0; i < onidx; i++) { if (i == thisidx) { continue; } if (knet_host_add(knet_h, nodes[i].nodeid) < 0) { printf("knet_host_add failed: %s\n", strerror(errno)); exit(FAIL); } if (knet_host_set_policy(knet_h, nodes[i].nodeid, policy) < 0) { printf("knet_host_set_policy failed: %s\n", strerror(errno)); exit(FAIL); } for (link_idx = 0; link_idx < nodes[i].links; link_idx++) { if (portoffset) { if (nodes[thisidx].address[link_idx].ss_family == AF_INET) { so_in = (struct sockaddr_in *)&nodes[thisidx].address[link_idx]; thisport = ntohs(so_in->sin_port); thisnewport = thisport + nodes[i].nodeid; so_in->sin_port = (htons(thisnewport)); so_in = (struct sockaddr_in *)&nodes[i].address[link_idx]; otherport = ntohs(so_in->sin_port); othernewport = otherport + nodes[thisidx].nodeid; so_in->sin_port = (htons(othernewport)); } else { so_in6 = (struct sockaddr_in6 *)&nodes[thisidx].address[link_idx]; thisport = ntohs(so_in6->sin6_port); thisnewport = thisport + nodes[i].nodeid; so_in6->sin6_port = (htons(thisnewport)); so_in6 = (struct sockaddr_in6 *)&nodes[i].address[link_idx]; otherport = ntohs(so_in6->sin6_port); othernewport = otherport + nodes[thisidx].nodeid; so_in6->sin6_port = (htons(othernewport)); } } if (!globallistener) { src = &nodes[thisidx].address[link_idx]; } else { if (nodes[thisidx].address[link_idx].ss_family == AF_INET) { src = &allv4; } else { src = &allv6; } } /* * -P overrides per link protocol configuration */ if (protofound) { nodes[i].transport[link_idx] = protocol; } if (knet_link_set_config(knet_h, nodes[i].nodeid, link_idx, nodes[i].transport[link_idx], src, &nodes[i].address[link_idx], 0) < 0) { printf("Unable to configure link: %s\n", strerror(errno)); exit(FAIL); } if (portoffset) { if (nodes[thisidx].address[link_idx].ss_family == AF_INET) { so_in = (struct sockaddr_in *)&nodes[thisidx].address[link_idx]; so_in->sin_port = (htons(thisport)); so_in = (struct sockaddr_in *)&nodes[i].address[link_idx]; so_in->sin_port = (htons(otherport)); } else { so_in6 = (struct sockaddr_in6 *)&nodes[thisidx].address[link_idx]; so_in6->sin6_port = (htons(thisport)); so_in6 = (struct sockaddr_in6 *)&nodes[i].address[link_idx]; so_in6->sin6_port = (htons(otherport)); } } if (knet_link_set_enable(knet_h, nodes[i].nodeid, link_idx, 1) < 0) { printf("knet_link_set_enable failed: %s\n", strerror(errno)); exit(FAIL); } if (knet_link_set_ping_timers(knet_h, nodes[i].nodeid, link_idx, 1000, 10000, 2048) < 0) { printf("knet_link_set_ping_timers failed: %s\n", strerror(errno)); exit(FAIL); } if (knet_link_set_pong_count(knet_h, nodes[i].nodeid, link_idx, 2) < 0) { printf("knet_link_set_pong_count failed: %s\n", strerror(errno)); exit(FAIL); } } } if (knet_handle_enable_filter(knet_h, NULL, ping_dst_host_filter)) { printf("Unable to enable dst_host_filter: %s\n", strerror(errno)); exit(FAIL); } if (knet_handle_setfwd(knet_h, 1) < 0) { printf("knet_handle_setfwd failed: %s\n", strerror(errno)); exit(FAIL); } if (wait) { while(!allnodesup) { allnodesup = 1; for (i=0; i < onidx; i++) { if (i == thisidx) { continue; } if (knet_h->host_index[nodes[i].nodeid]->status.reachable != 1) { printf("[info]: waiting host %d to be reachable\n", nodes[i].nodeid); allnodesup = 0; } } if (!allnodesup) { sleep(1); } } sleep(1); } } static void *_rx_thread(void *args) { int rx_epoll; struct epoll_event ev; struct epoll_event events[KNET_EPOLL_MAX_EVENTS]; struct sockaddr_storage address[PCKT_FRAG_MAX]; struct knet_mmsghdr msg[PCKT_FRAG_MAX]; struct iovec iov_in[PCKT_FRAG_MAX]; int i, msg_recv; struct timespec clock_start, clock_end; unsigned long long time_diff = 0; uint64_t rx_pkts = 0; uint64_t rx_bytes = 0; unsigned int current_pckt_size = 0; for (i = 0; i < PCKT_FRAG_MAX; i++) { rx_buf[i] = malloc(KNET_MAX_PACKET_SIZE); if (!rx_buf[i]) { printf("RXT: Unable to malloc!\nHALTING RX THREAD!\n"); return NULL; } memset(rx_buf[i], 0, KNET_MAX_PACKET_SIZE); iov_in[i].iov_base = (void *)rx_buf[i]; iov_in[i].iov_len = KNET_MAX_PACKET_SIZE; memset(&msg[i].msg_hdr, 0, sizeof(struct msghdr)); msg[i].msg_hdr.msg_name = &address[i]; msg[i].msg_hdr.msg_namelen = sizeof(struct sockaddr_storage); msg[i].msg_hdr.msg_iov = &iov_in[i]; msg[i].msg_hdr.msg_iovlen = 1; } rx_epoll = epoll_create(KNET_EPOLL_MAX_EVENTS + 1); if (rx_epoll < 0) { printf("RXT: Unable to create epoll!\nHALTING RX THREAD!\n"); return NULL; } memset(&ev, 0, sizeof(struct epoll_event)); ev.events = EPOLLIN; ev.data.fd = datafd; if (epoll_ctl(rx_epoll, EPOLL_CTL_ADD, datafd, &ev)) { printf("RXT: Unable to add datafd to epoll\nHALTING RX THREAD!\n"); return NULL; } memset(&clock_start, 0, sizeof(clock_start)); memset(&clock_end, 0, sizeof(clock_start)); while (!bench_shutdown_in_progress) { if (epoll_wait(rx_epoll, events, KNET_EPOLL_MAX_EVENTS, 1) >= 1) { msg_recv = _recvmmsg(datafd, &msg[0], PCKT_FRAG_MAX, MSG_DONTWAIT | MSG_NOSIGNAL); if (msg_recv < 0) { printf("[info]: RXT: error from recvmmsg: %s\n", strerror(errno)); } switch(test_type) { case TEST_PING_AND_DATA: for (i = 0; i < msg_recv; i++) { if (msg[i].msg_len == 0) { printf("[info]: RXT: received 0 bytes message?\n"); } printf("[info]: received %u bytes message: %s\n", msg[i].msg_len, (char *)msg[i].msg_hdr.msg_iov->iov_base); } break; case TEST_PERF_BY_TIME: case TEST_PERF_BY_SIZE: for (i = 0; i < msg_recv; i++) { if (msg[i].msg_len < 64) { if (msg[i].msg_len == 0) { printf("[info]: RXT: received 0 bytes message?\n"); } if (msg[i].msg_len == TEST_START) { if (clock_gettime(CLOCK_MONOTONIC, &clock_start) != 0) { printf("[info]: unable to get start time!\n"); } } if (msg[i].msg_len == TEST_STOP) { double average_rx_mbytes; double average_rx_pkts; double time_diff_sec; if (clock_gettime(CLOCK_MONOTONIC, &clock_end) != 0) { printf("[info]: unable to get end time!\n"); } timespec_diff(clock_start, clock_end, &time_diff); /* * adjust for sleep(2) between sending the last data and TEST_STOP */ time_diff = time_diff - 2000000000llu; /* * convert to seconds */ time_diff_sec = (double)time_diff / 1000000000llu; average_rx_mbytes = (double)((rx_bytes / time_diff_sec) / (1024 * 1024)); average_rx_pkts = (double)(rx_pkts / time_diff_sec); if (!machine_output) { printf("[perf] execution time: %8.4f secs Average speed: %8.4f MB/sec %8.4f pckts/sec (size: %u total: %" PRIu64 ")\n", time_diff_sec, average_rx_mbytes, average_rx_pkts, current_pckt_size, rx_pkts); } else { printf("[perf],%.4f,%u,%" PRIu64 ",%.4f,%.4f\n", time_diff_sec, current_pckt_size, rx_pkts, average_rx_mbytes, average_rx_pkts); } rx_pkts = 0; rx_bytes = 0; current_pckt_size = 0; } if (msg[i].msg_len == TEST_COMPLETE) { wait_for_perf_rx = 1; } continue; } rx_pkts++; rx_bytes = rx_bytes + msg[i].msg_len; current_pckt_size = msg[i].msg_len; } break; } } } epoll_ctl(rx_epoll, EPOLL_CTL_DEL, datafd, &ev); close(rx_epoll); return NULL; } static void setup_data_txrx_common(void) { if (!rx_thread) { if (knet_handle_enable_filter(knet_h, NULL, ping_dst_host_filter)) { printf("Unable to enable dst_host_filter: %s\n", strerror(errno)); exit(FAIL); } printf("[info]: setting up rx thread\n"); if (pthread_create(&rx_thread, 0, _rx_thread, NULL)) { printf("Unable to start rx thread\n"); exit(FAIL); } } } static void stop_rx_thread(void) { void *retval; int i; if (rx_thread) { printf("[info]: shutting down rx thread\n"); sleep(2); pthread_cancel(rx_thread); pthread_join(rx_thread, &retval); for (i = 0; i < PCKT_FRAG_MAX; i ++) { free(rx_buf[i]); } } } static void send_ping_data(void) { char buf[65535]; ssize_t len; memset(&buf, 0, sizeof(buf)); snprintf(buf, sizeof(buf), "Hello world!"); if (compresscfg) { len = sizeof(buf); } else { len = strlen(buf); } if (knet_send(knet_h, buf, len, channel) != len) { printf("[info]: Error sending hello world: %s\n", strerror(errno)); } sleep(1); } static int send_messages(struct knet_mmsghdr *msg, int msgs_to_send) { int sent_msgs, prev_sent, progress, total_sent; total_sent = 0; sent_msgs = 0; prev_sent = 0; progress = 1; retry: errno = 0; sent_msgs = _sendmmsg(datafd, &msg[0], msgs_to_send, MSG_NOSIGNAL); if (sent_msgs < 0) { if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { usleep(KNET_THREADS_TIMERES / 16); goto retry; } printf("[info]: Unable to send messages: %s\n", strerror(errno)); return -1; } total_sent = total_sent + sent_msgs; if ((sent_msgs >= 0) && (sent_msgs < msgs_to_send)) { if ((sent_msgs) || (progress)) { msgs_to_send = msgs_to_send - sent_msgs; prev_sent = prev_sent + sent_msgs; if (sent_msgs) { progress = 1; } else { progress = 0; } goto retry; } if (!progress) { printf("[info]: Unable to send more messages after retry\n"); } } return total_sent; } static int setup_send_buffers_common(struct knet_mmsghdr *msg, struct iovec *iov_out, char *tx_buf[]) { int i; for (i = 0; i < PCKT_FRAG_MAX; i++) { tx_buf[i] = malloc(KNET_MAX_PACKET_SIZE); if (!tx_buf[i]) { printf("TXT: Unable to malloc!\n"); return -1; } memset(tx_buf[i], 0, KNET_MAX_PACKET_SIZE); iov_out[i].iov_base = (void *)tx_buf[i]; memset(&msg[i].msg_hdr, 0, sizeof(struct msghdr)); msg[i].msg_hdr.msg_iov = &iov_out[i]; msg[i].msg_hdr.msg_iovlen = 1; } return 0; } static void send_perf_data_by_size(void) { char *tx_buf[PCKT_FRAG_MAX]; struct knet_mmsghdr msg[PCKT_FRAG_MAX]; struct iovec iov_out[PCKT_FRAG_MAX]; char ctrl_message[16]; int sent_msgs; int i; uint64_t total_pkts_to_tx; uint64_t packets_to_send; uint32_t packetsize = 64; setup_send_buffers_common(msg, iov_out, tx_buf); while (packetsize <= KNET_MAX_PACKET_SIZE) { for (i = 0; i < PCKT_FRAG_MAX; i++) { iov_out[i].iov_len = packetsize; } total_pkts_to_tx = perf_by_size_size / packetsize; printf("[info]: testing with %u packet size. total bytes to transfer: %" PRIu64 " (%" PRIu64 " packets)\n", packetsize, perf_by_size_size, total_pkts_to_tx); memset(ctrl_message, 0, sizeof(ctrl_message)); knet_send(knet_h, ctrl_message, TEST_START, channel); while (total_pkts_to_tx > 0) { if (total_pkts_to_tx >= PCKT_FRAG_MAX) { packets_to_send = PCKT_FRAG_MAX; } else { packets_to_send = total_pkts_to_tx; } sent_msgs = send_messages(&msg[0], packets_to_send); if (sent_msgs < 0) { printf("Something went wrong, aborting\n"); exit(FAIL); } total_pkts_to_tx = total_pkts_to_tx - sent_msgs; } sleep(2); knet_send(knet_h, ctrl_message, TEST_STOP, channel); if (packetsize == KNET_MAX_PACKET_SIZE) { break; } /* * Use a multiplier that can always divide properly a GB * into smaller chunks without worry about boundaries */ packetsize *= 4; if (packetsize > KNET_MAX_PACKET_SIZE) { packetsize = KNET_MAX_PACKET_SIZE; } } knet_send(knet_h, ctrl_message, TEST_COMPLETE, channel); for (i = 0; i < PCKT_FRAG_MAX; i++) { free(tx_buf[i]); } } /* For sorting the node list into order */ static int node_compare(const void *aptr, const void *bptr) { uint16_t a,b; a = *(uint16_t *)aptr; b = *(uint16_t *)bptr; return a > b; } static void display_stats(int level) { struct knet_handle_stats handle_stats; struct knet_link_status link_status; struct knet_link_stats total_link_stats; knet_node_id_t host_list[KNET_MAX_HOST]; uint8_t link_list[KNET_MAX_LINK]; int res; unsigned int i,j; size_t num_hosts, num_links; res = knet_handle_get_stats(knet_h, &handle_stats, sizeof(handle_stats)); if (res) { perror("[info]: failed to get knet handle stats"); return; } if (compresscfg || cryptocfg) { printf("\n"); printf("[stat]: handle stats\n"); printf("[stat]: ------------\n"); if (compresscfg) { printf("[stat]: tx_uncompressed_packets: %" PRIu64 "\n", handle_stats.tx_uncompressed_packets); printf("[stat]: tx_compressed_packets: %" PRIu64 "\n", handle_stats.tx_compressed_packets); printf("[stat]: tx_compressed_original_bytes: %" PRIu64 "\n", handle_stats.tx_compressed_original_bytes); printf("[stat]: tx_compressed_size_bytes: %" PRIu64 "\n", handle_stats.tx_compressed_size_bytes ); printf("[stat]: tx_compress_time_ave: %" PRIu64 "\n", handle_stats.tx_compress_time_ave); printf("[stat]: tx_compress_time_min: %" PRIu64 "\n", handle_stats.tx_compress_time_min); printf("[stat]: tx_compress_time_max: %" PRIu64 "\n", handle_stats.tx_compress_time_max); + printf("[stat]: tx_failed_to_compress: %" PRIu64 "\n", handle_stats.tx_failed_to_compress); + printf("[stat]: tx_unable_to_compress: %" PRIu64 "\n", handle_stats.tx_unable_to_compress); printf("[stat]: rx_compressed_packets: %" PRIu64 "\n", handle_stats.rx_compressed_packets); printf("[stat]: rx_compressed_original_bytes: %" PRIu64 "\n", handle_stats.rx_compressed_original_bytes); printf("[stat]: rx_compressed_size_bytes: %" PRIu64 "\n", handle_stats.rx_compressed_size_bytes); printf("[stat]: rx_compress_time_ave: %" PRIu64 "\n", handle_stats.rx_compress_time_ave); printf("[stat]: rx_compress_time_min: %" PRIu64 "\n", handle_stats.rx_compress_time_min); printf("[stat]: rx_compress_time_max: %" PRIu64 "\n", handle_stats.rx_compress_time_max); + printf("[stat]: rx_failed_to_decompress: %" PRIu64 "\n", handle_stats.rx_failed_to_decompress); printf("\n"); } if (cryptocfg) { printf("[stat]: tx_crypt_packets: %" PRIu64 "\n", handle_stats.tx_crypt_packets); printf("[stat]: tx_crypt_byte_overhead: %" PRIu64 "\n", handle_stats.tx_crypt_byte_overhead); printf("[stat]: tx_crypt_time_ave: %" PRIu64 "\n", handle_stats.tx_crypt_time_ave); printf("[stat]: tx_crypt_time_min: %" PRIu64 "\n", handle_stats.tx_crypt_time_min); printf("[stat]: tx_crypt_time_max: %" PRIu64 "\n", handle_stats.tx_crypt_time_max); printf("[stat]: rx_crypt_packets: %" PRIu64 "\n", handle_stats.rx_crypt_packets); printf("[stat]: rx_crypt_time_ave: %" PRIu64 "\n", handle_stats.rx_crypt_time_ave); printf("[stat]: rx_crypt_time_min: %" PRIu64 "\n", handle_stats.rx_crypt_time_min); printf("[stat]: rx_crypt_time_max: %" PRIu64 "\n", handle_stats.rx_crypt_time_max); printf("\n"); } } if (level < 2) { return; } memset(&total_link_stats, 0, sizeof(struct knet_link_stats)); res = knet_host_get_host_list(knet_h, host_list, &num_hosts); if (res) { perror("[info]: cannot get host list for stats"); return; } /* Print in host ID order */ qsort(host_list, num_hosts, sizeof(uint16_t), node_compare); for (j=0; j 2) { printf("\n"); printf("[stat]: Node %d Link %d\n", host_list[j], link_list[i]); printf("[stat]: tx_data_packets: %" PRIu64 "\n", link_status.stats.tx_data_packets); printf("[stat]: rx_data_packets: %" PRIu64 "\n", link_status.stats.rx_data_packets); printf("[stat]: tx_data_bytes: %" PRIu64 "\n", link_status.stats.tx_data_bytes); printf("[stat]: rx_data_bytes: %" PRIu64 "\n", link_status.stats.rx_data_bytes); printf("[stat]: rx_ping_packets: %" PRIu64 "\n", link_status.stats.rx_ping_packets); printf("[stat]: tx_ping_packets: %" PRIu64 "\n", link_status.stats.tx_ping_packets); printf("[stat]: rx_ping_bytes: %" PRIu64 "\n", link_status.stats.rx_ping_bytes); printf("[stat]: tx_ping_bytes: %" PRIu64 "\n", link_status.stats.tx_ping_bytes); printf("[stat]: rx_pong_packets: %" PRIu64 "\n", link_status.stats.rx_pong_packets); printf("[stat]: tx_pong_packets: %" PRIu64 "\n", link_status.stats.tx_pong_packets); printf("[stat]: rx_pong_bytes: %" PRIu64 "\n", link_status.stats.rx_pong_bytes); printf("[stat]: tx_pong_bytes: %" PRIu64 "\n", link_status.stats.tx_pong_bytes); printf("[stat]: rx_pmtu_packets: %" PRIu64 "\n", link_status.stats.rx_pmtu_packets); printf("[stat]: tx_pmtu_packets: %" PRIu64 "\n", link_status.stats.tx_pmtu_packets); printf("[stat]: rx_pmtu_bytes: %" PRIu64 "\n", link_status.stats.rx_pmtu_bytes); printf("[stat]: tx_pmtu_bytes: %" PRIu64 "\n", link_status.stats.tx_pmtu_bytes); printf("[stat]: tx_total_packets: %" PRIu64 "\n", link_status.stats.tx_total_packets); printf("[stat]: rx_total_packets: %" PRIu64 "\n", link_status.stats.rx_total_packets); printf("[stat]: tx_total_bytes: %" PRIu64 "\n", link_status.stats.tx_total_bytes); printf("[stat]: rx_total_bytes: %" PRIu64 "\n", link_status.stats.rx_total_bytes); printf("[stat]: tx_total_errors: %" PRIu64 "\n", link_status.stats.tx_total_errors); printf("[stat]: tx_total_retries: %" PRIu64 "\n", link_status.stats.tx_total_retries); printf("[stat]: tx_pmtu_errors: %" PRIu32 "\n", link_status.stats.tx_pmtu_errors); printf("[stat]: tx_pmtu_retries: %" PRIu32 "\n", link_status.stats.tx_pmtu_retries); printf("[stat]: tx_ping_errors: %" PRIu32 "\n", link_status.stats.tx_ping_errors); printf("[stat]: tx_ping_retries: %" PRIu32 "\n", link_status.stats.tx_ping_retries); printf("[stat]: tx_pong_errors: %" PRIu32 "\n", link_status.stats.tx_pong_errors); printf("[stat]: tx_pong_retries: %" PRIu32 "\n", link_status.stats.tx_pong_retries); printf("[stat]: tx_data_errors: %" PRIu32 "\n", link_status.stats.tx_data_errors); printf("[stat]: tx_data_retries: %" PRIu32 "\n", link_status.stats.tx_data_retries); printf("[stat]: latency_min: %" PRIu32 "\n", link_status.stats.latency_min); printf("[stat]: latency_max: %" PRIu32 "\n", link_status.stats.latency_max); printf("[stat]: latency_ave: %" PRIu32 "\n", link_status.stats.latency_ave); printf("[stat]: latency_samples: %" PRIu32 "\n", link_status.stats.latency_samples); printf("[stat]: down_count: %" PRIu32 "\n", link_status.stats.down_count); printf("[stat]: up_count: %" PRIu32 "\n", link_status.stats.up_count); } } } printf("\n"); printf("[stat]: Total link stats\n"); printf("[stat]: ----------------\n"); printf("[stat]: tx_data_packets: %" PRIu64 "\n", total_link_stats.tx_data_packets); printf("[stat]: rx_data_packets: %" PRIu64 "\n", total_link_stats.rx_data_packets); printf("[stat]: tx_data_bytes: %" PRIu64 "\n", total_link_stats.tx_data_bytes); printf("[stat]: rx_data_bytes: %" PRIu64 "\n", total_link_stats.rx_data_bytes); printf("[stat]: rx_ping_packets: %" PRIu64 "\n", total_link_stats.rx_ping_packets); printf("[stat]: tx_ping_packets: %" PRIu64 "\n", total_link_stats.tx_ping_packets); printf("[stat]: rx_ping_bytes: %" PRIu64 "\n", total_link_stats.rx_ping_bytes); printf("[stat]: tx_ping_bytes: %" PRIu64 "\n", total_link_stats.tx_ping_bytes); printf("[stat]: rx_pong_packets: %" PRIu64 "\n", total_link_stats.rx_pong_packets); printf("[stat]: tx_pong_packets: %" PRIu64 "\n", total_link_stats.tx_pong_packets); printf("[stat]: rx_pong_bytes: %" PRIu64 "\n", total_link_stats.rx_pong_bytes); printf("[stat]: tx_pong_bytes: %" PRIu64 "\n", total_link_stats.tx_pong_bytes); printf("[stat]: rx_pmtu_packets: %" PRIu64 "\n", total_link_stats.rx_pmtu_packets); printf("[stat]: tx_pmtu_packets: %" PRIu64 "\n", total_link_stats.tx_pmtu_packets); printf("[stat]: rx_pmtu_bytes: %" PRIu64 "\n", total_link_stats.rx_pmtu_bytes); printf("[stat]: tx_pmtu_bytes: %" PRIu64 "\n", total_link_stats.tx_pmtu_bytes); printf("[stat]: tx_total_packets: %" PRIu64 "\n", total_link_stats.tx_total_packets); printf("[stat]: rx_total_packets: %" PRIu64 "\n", total_link_stats.rx_total_packets); printf("[stat]: tx_total_bytes: %" PRIu64 "\n", total_link_stats.tx_total_bytes); printf("[stat]: rx_total_bytes: %" PRIu64 "\n", total_link_stats.rx_total_bytes); printf("[stat]: tx_total_errors: %" PRIu64 "\n", total_link_stats.tx_total_errors); printf("[stat]: tx_total_retries: %" PRIu64 "\n", total_link_stats.tx_total_retries); printf("[stat]: tx_pmtu_errors: %" PRIu32 "\n", total_link_stats.tx_pmtu_errors); printf("[stat]: tx_pmtu_retries: %" PRIu32 "\n", total_link_stats.tx_pmtu_retries); printf("[stat]: tx_ping_errors: %" PRIu32 "\n", total_link_stats.tx_ping_errors); printf("[stat]: tx_ping_retries: %" PRIu32 "\n", total_link_stats.tx_ping_retries); printf("[stat]: tx_pong_errors: %" PRIu32 "\n", total_link_stats.tx_pong_errors); printf("[stat]: tx_pong_retries: %" PRIu32 "\n", total_link_stats.tx_pong_retries); printf("[stat]: tx_data_errors: %" PRIu32 "\n", total_link_stats.tx_data_errors); printf("[stat]: tx_data_retries: %" PRIu32 "\n", total_link_stats.tx_data_retries); printf("[stat]: down_count: %" PRIu32 "\n", total_link_stats.down_count); printf("[stat]: up_count: %" PRIu32 "\n", total_link_stats.up_count); } static void send_perf_data_by_time(void) { char *tx_buf[PCKT_FRAG_MAX]; struct knet_mmsghdr msg[PCKT_FRAG_MAX]; struct iovec iov_out[PCKT_FRAG_MAX]; char ctrl_message[16]; int sent_msgs; int i; uint32_t packetsize = 64; struct timespec clock_start, clock_end; unsigned long long time_diff = 0; setup_send_buffers_common(msg, iov_out, tx_buf); memset(&clock_start, 0, sizeof(clock_start)); memset(&clock_end, 0, sizeof(clock_start)); while (packetsize <= KNET_MAX_PACKET_SIZE) { for (i = 0; i < PCKT_FRAG_MAX; i++) { iov_out[i].iov_len = packetsize; } printf("[info]: testing with %u bytes packet size for %" PRIu64 " seconds.\n", packetsize, perf_by_time_secs); memset(ctrl_message, 0, sizeof(ctrl_message)); knet_send(knet_h, ctrl_message, TEST_START, channel); if (clock_gettime(CLOCK_MONOTONIC, &clock_start) != 0) { printf("[info]: unable to get start time!\n"); } time_diff = 0; while (time_diff < (perf_by_time_secs * 1000000000llu)) { sent_msgs = send_messages(&msg[0], PCKT_FRAG_MAX); if (sent_msgs < 0) { printf("Something went wrong, aborting\n"); exit(FAIL); } if (clock_gettime(CLOCK_MONOTONIC, &clock_end) != 0) { printf("[info]: unable to get end time!\n"); } timespec_diff(clock_start, clock_end, &time_diff); } sleep(2); knet_send(knet_h, ctrl_message, TEST_STOP, channel); if (packetsize == KNET_MAX_PACKET_SIZE) { break; } /* * Use a multiplier that can always divide properly a GB * into smaller chunks without worry about boundaries */ packetsize *= 4; if (packetsize > KNET_MAX_PACKET_SIZE) { packetsize = KNET_MAX_PACKET_SIZE; } } knet_send(knet_h, ctrl_message, TEST_COMPLETE, channel); for (i = 0; i < PCKT_FRAG_MAX; i++) { free(tx_buf[i]); } } static void cleanup_all(void) { if (pthread_mutex_lock(&shutdown_mutex)) { return; } if (bench_shutdown_in_progress) { pthread_mutex_unlock(&shutdown_mutex); return; } bench_shutdown_in_progress = 1; pthread_mutex_unlock(&shutdown_mutex); if (rx_thread) { stop_rx_thread(); } knet_handle_stop(knet_h); } static void sigint_handler(int signum) { printf("[info]: cleaning up... got signal: %d\n", signum); cleanup_all(); exit(PASS); } int main(int argc, char *argv[]) { if (signal(SIGINT, sigint_handler) == SIG_ERR) { printf("Unable to configure SIGINT handler\n"); exit(FAIL); } setup_knet(argc, argv); setup_data_txrx_common(); sleep(5); restart: switch(test_type) { default: case TEST_PING: /* basic ping, no data */ sleep(5); break; case TEST_PING_AND_DATA: send_ping_data(); break; case TEST_PERF_BY_SIZE: if (senderid == thisnodeid) { send_perf_data_by_size(); } else { printf("[info]: waiting for perf rx thread to finish\n"); while(!wait_for_perf_rx) { sleep(1); } } break; case TEST_PERF_BY_TIME: if (senderid == thisnodeid) { send_perf_data_by_time(); } else { printf("[info]: waiting for perf rx thread to finish\n"); while(!wait_for_perf_rx) { sleep(1); } } break; } if (continous) { goto restart; } if (show_stats) { display_stats(show_stats); } cleanup_all(); return PASS; } diff --git a/libknet/threads_rx.c b/libknet/threads_rx.c index 2aa1164a..49d574eb 100644 --- a/libknet/threads_rx.c +++ b/libknet/threads_rx.c @@ -1,851 +1,852 @@ /* * Copyright (C) 2012-2018 Red Hat, Inc. All rights reserved. * * Authors: Fabio M. Di Nitto * Federico Simoncelli * * This software licensed under GPL-2.0+, LGPL-2.0+ */ #include "config.h" #include #include #include #include #include #include "compat.h" #include "compress.h" #include "crypto.h" #include "host.h" #include "links.h" #include "logging.h" #include "transports.h" #include "transport_common.h" #include "threads_common.h" #include "threads_heartbeat.h" #include "threads_rx.h" #include "netutils.h" /* * RECV */ /* * return 1 if a > b * return -1 if b > a * return 0 if they are equal */ static inline int timecmp(struct timespec a, struct timespec b) { if (a.tv_sec != b.tv_sec) { if (a.tv_sec > b.tv_sec) { return 1; } else { return -1; } } else { if (a.tv_nsec > b.tv_nsec) { return 1; } else if (a.tv_nsec < b.tv_nsec) { return -1; } else { return 0; } } } /* * this functions needs to return an index (0 to 7) * to a knet_host_defrag_buf. (-1 on errors) */ static int find_pckt_defrag_buf(knet_handle_t knet_h, struct knet_header *inbuf) { struct knet_host *src_host = knet_h->host_index[inbuf->kh_node]; int i, oldest; /* * check if there is a buffer already in use handling the same seq_num */ for (i = 0; i < KNET_MAX_LINK; i++) { if (src_host->defrag_buf[i].in_use) { if (src_host->defrag_buf[i].pckt_seq == inbuf->khp_data_seq_num) { return i; } } } /* * If there is no buffer that's handling the current seq_num * either it's new or it's been reclaimed already. * check if it's been reclaimed/seen before using the defrag circular * buffer. If the pckt has been seen before, the buffer expired (ETIME) * and there is no point to try to defrag it again. */ if (!_seq_num_lookup(src_host, inbuf->khp_data_seq_num, 1, 0)) { errno = ETIME; return -1; } /* * register the pckt as seen */ _seq_num_set(src_host, inbuf->khp_data_seq_num, 1); /* * see if there is a free buffer */ for (i = 0; i < KNET_MAX_LINK; i++) { if (!src_host->defrag_buf[i].in_use) { return i; } } /* * at this point, there are no free buffers, the pckt is new * and we need to reclaim a buffer, and we will take the one * with the oldest timestamp. It's as good as any. */ oldest = 0; for (i = 0; i < KNET_MAX_LINK; i++) { if (timecmp(src_host->defrag_buf[i].last_update, src_host->defrag_buf[oldest].last_update) < 0) { oldest = i; } } src_host->defrag_buf[oldest].in_use = 0; return oldest; } static int pckt_defrag(knet_handle_t knet_h, struct knet_header *inbuf, ssize_t *len) { struct knet_host_defrag_buf *defrag_buf; int defrag_buf_idx; defrag_buf_idx = find_pckt_defrag_buf(knet_h, inbuf); if (defrag_buf_idx < 0) { if (errno == ETIME) { log_debug(knet_h, KNET_SUB_RX, "Defrag buffer expired"); } return 1; } defrag_buf = &knet_h->host_index[inbuf->kh_node]->defrag_buf[defrag_buf_idx]; /* * if the buf is not is use, then make sure it's clean */ if (!defrag_buf->in_use) { memset(defrag_buf, 0, sizeof(struct knet_host_defrag_buf)); defrag_buf->in_use = 1; defrag_buf->pckt_seq = inbuf->khp_data_seq_num; } /* * update timestamp on the buffer */ clock_gettime(CLOCK_MONOTONIC, &defrag_buf->last_update); /* * check if we already received this fragment */ if (defrag_buf->frag_map[inbuf->khp_data_frag_seq]) { /* * if we have received this fragment and we didn't clear the buffer * it means that we don't have all fragments yet */ return 1; } /* * we need to handle the last packet with gloves due to its different size */ if (inbuf->khp_data_frag_seq == inbuf->khp_data_frag_num) { defrag_buf->last_frag_size = *len; /* * in the event when the last packet arrives first, * we still don't know the offset vs the other fragments (based on MTU), * so we store the fragment at the end of the buffer where it's safe * and take a copy of the len so that we can restore its offset later. * remember we can't use the local MTU for this calculation because pMTU * can be asymettric between the same hosts. */ if (!defrag_buf->frag_size) { defrag_buf->last_first = 1; memmove(defrag_buf->buf + (KNET_MAX_PACKET_SIZE - *len), inbuf->khp_data_userdata, *len); } } else { defrag_buf->frag_size = *len; } memmove(defrag_buf->buf + ((inbuf->khp_data_frag_seq - 1) * defrag_buf->frag_size), inbuf->khp_data_userdata, *len); defrag_buf->frag_recv++; defrag_buf->frag_map[inbuf->khp_data_frag_seq] = 1; /* * check if we received all the fragments */ if (defrag_buf->frag_recv == inbuf->khp_data_frag_num) { /* * special case the last pckt */ if (defrag_buf->last_first) { memmove(defrag_buf->buf + ((inbuf->khp_data_frag_num - 1) * defrag_buf->frag_size), defrag_buf->buf + (KNET_MAX_PACKET_SIZE - defrag_buf->last_frag_size), defrag_buf->last_frag_size); } /* * recalculate packet lenght */ *len = ((inbuf->khp_data_frag_num - 1) * defrag_buf->frag_size) + defrag_buf->last_frag_size; /* * copy the pckt back in the user data */ memmove(inbuf->khp_data_userdata, defrag_buf->buf, *len); /* * free this buffer */ defrag_buf->in_use = 0; return 0; } return 1; } static void _parse_recv_from_links(knet_handle_t knet_h, int sockfd, const struct knet_mmsghdr *msg) { int err = 0, savederrno = 0; ssize_t outlen; struct knet_host *src_host; struct knet_link *src_link; unsigned long long latency_last; knet_node_id_t dst_host_ids[KNET_MAX_HOST]; size_t dst_host_ids_entries = 0; int bcast = 1; int was_decrypted = 0; uint64_t crypt_time = 0; struct timespec recvtime; struct knet_header *inbuf = msg->msg_hdr.msg_iov->iov_base; unsigned char *outbuf = (unsigned char *)msg->msg_hdr.msg_iov->iov_base; ssize_t len = msg->msg_len; struct knet_hostinfo *knet_hostinfo; struct iovec iov_out[1]; int8_t channel; struct sockaddr_storage pckt_src; seq_num_t recv_seq_num; int wipe_bufs = 0; if (knet_h->crypto_instance) { struct timespec start_time; struct timespec end_time; clock_gettime(CLOCK_MONOTONIC, &start_time); if (crypto_authenticate_and_decrypt(knet_h, (unsigned char *)inbuf, len, knet_h->recv_from_links_buf_decrypt, &outlen) < 0) { log_debug(knet_h, KNET_SUB_RX, "Unable to decrypt/auth packet"); return; } clock_gettime(CLOCK_MONOTONIC, &end_time); timespec_diff(start_time, end_time, &crypt_time); if (crypt_time < knet_h->stats.rx_crypt_time_min) { knet_h->stats.rx_crypt_time_min = crypt_time; } if (crypt_time > knet_h->stats.rx_crypt_time_max) { knet_h->stats.rx_crypt_time_max = crypt_time; } len = outlen; inbuf = (struct knet_header *)knet_h->recv_from_links_buf_decrypt; was_decrypted++; } if (len < (ssize_t)(KNET_HEADER_SIZE + 1)) { log_debug(knet_h, KNET_SUB_RX, "Packet is too short: %ld", (long)len); return; } if (inbuf->kh_version != KNET_HEADER_VERSION) { log_debug(knet_h, KNET_SUB_RX, "Packet version does not match"); return; } inbuf->kh_node = ntohs(inbuf->kh_node); src_host = knet_h->host_index[inbuf->kh_node]; if (src_host == NULL) { /* host not found */ log_debug(knet_h, KNET_SUB_RX, "Unable to find source host for this packet"); return; } src_link = NULL; src_link = src_host->link + (inbuf->khp_ping_link % KNET_MAX_LINK); if ((inbuf->kh_type & KNET_HEADER_TYPE_PMSK) != 0) { if (src_link->dynamic == KNET_LINK_DYNIP) { /* * cpyaddrport will only copy address and port of the incoming * packet and strip extra bits such as flow and scopeid */ cpyaddrport(&pckt_src, msg->msg_hdr.msg_name); if (cmpaddr(&src_link->dst_addr, sockaddr_len(&src_link->dst_addr), &pckt_src, sockaddr_len(&pckt_src)) != 0) { log_debug(knet_h, KNET_SUB_RX, "host: %u link: %u appears to have changed ip address", src_host->host_id, src_link->link_id); memmove(&src_link->dst_addr, &pckt_src, sizeof(struct sockaddr_storage)); if (knet_addrtostr(&src_link->dst_addr, sockaddr_len(msg->msg_hdr.msg_name), src_link->status.dst_ipaddr, KNET_MAX_HOST_LEN, src_link->status.dst_port, KNET_MAX_PORT_LEN) != 0) { log_debug(knet_h, KNET_SUB_RX, "Unable to resolve ???"); snprintf(src_link->status.dst_ipaddr, KNET_MAX_HOST_LEN - 1, "Unknown!!!"); snprintf(src_link->status.dst_port, KNET_MAX_PORT_LEN - 1, "??"); } else { log_info(knet_h, KNET_SUB_RX, "host: %u link: %u new connection established from: %s %s", src_host->host_id, src_link->link_id, src_link->status.dst_ipaddr, src_link->status.dst_port); } } /* * transport has already accepted the connection here * otherwise we would not be receiving packets */ transport_link_dyn_connect(knet_h, sockfd, src_link); } } switch (inbuf->kh_type) { case KNET_HEADER_TYPE_HOST_INFO: case KNET_HEADER_TYPE_DATA: /* * TODO: should we accept data even if we can't reply to the other node? * how would that work with SCTP and guaranteed delivery? */ if (!src_host->status.reachable) { log_debug(knet_h, KNET_SUB_RX, "Source host %u not reachable yet", src_host->host_id); //return; } inbuf->khp_data_seq_num = ntohs(inbuf->khp_data_seq_num); channel = inbuf->khp_data_channel; src_host->got_data = 1; if (src_link) { src_link->status.stats.rx_data_packets++; src_link->status.stats.rx_data_bytes += len; } if (!_seq_num_lookup(src_host, inbuf->khp_data_seq_num, 0, 0)) { if (src_host->link_handler_policy != KNET_LINK_POLICY_ACTIVE) { log_debug(knet_h, KNET_SUB_RX, "Packet has already been delivered"); } return; } if (inbuf->khp_data_frag_num > 1) { /* * len as received from the socket also includes extra stuff * that the defrag code doesn't care about. So strip it * here and readd only for repadding once we are done * defragging */ len = len - KNET_HEADER_DATA_SIZE; if (pckt_defrag(knet_h, inbuf, &len)) { return; } len = len + KNET_HEADER_DATA_SIZE; } if (inbuf->khp_data_compress) { ssize_t decmp_outlen = KNET_DATABUFSIZE_COMPRESS; struct timespec start_time; struct timespec end_time; uint64_t compress_time; clock_gettime(CLOCK_MONOTONIC, &start_time); err = decompress(knet_h, inbuf->khp_data_compress, (const unsigned char *)inbuf->khp_data_userdata, len - KNET_HEADER_DATA_SIZE, knet_h->recv_from_links_buf_decompress, &decmp_outlen); if (!err) { /* Collect stats */ clock_gettime(CLOCK_MONOTONIC, &end_time); timespec_diff(start_time, end_time, &compress_time); if (compress_time < knet_h->stats.rx_compress_time_min) { knet_h->stats.rx_compress_time_min = compress_time; } if (compress_time > knet_h->stats.rx_compress_time_max) { knet_h->stats.rx_compress_time_max = compress_time; } knet_h->stats.rx_compress_time_ave = (knet_h->stats.rx_compress_time_ave * knet_h->stats.rx_compressed_packets + compress_time) / (knet_h->stats.rx_compressed_packets+1); knet_h->stats.rx_compressed_packets++; knet_h->stats.rx_compressed_original_bytes += decmp_outlen; knet_h->stats.rx_compressed_size_bytes += len - KNET_HEADER_SIZE; memmove(inbuf->khp_data_userdata, knet_h->recv_from_links_buf_decompress, decmp_outlen); len = decmp_outlen + KNET_HEADER_DATA_SIZE; } else { + knet_h->stats.rx_failed_to_decompress++; log_warn(knet_h, KNET_SUB_COMPRESS, "Unable to decompress packet (%d): %s", err, strerror(errno)); return; } } if (inbuf->kh_type == KNET_HEADER_TYPE_DATA) { if (knet_h->enabled != 1) /* data forward is disabled */ break; /* Only update the crypto overhead for data packets. Mainly to be consistent with TX */ knet_h->stats.rx_crypt_time_ave = (knet_h->stats.rx_crypt_time_ave * knet_h->stats.rx_crypt_packets + crypt_time) / (knet_h->stats.rx_crypt_packets+1); knet_h->stats.rx_crypt_packets++; if (knet_h->dst_host_filter_fn) { size_t host_idx; int found = 0; bcast = knet_h->dst_host_filter_fn( knet_h->dst_host_filter_fn_private_data, (const unsigned char *)inbuf->khp_data_userdata, len - KNET_HEADER_DATA_SIZE, KNET_NOTIFY_RX, knet_h->host_id, inbuf->kh_node, &channel, dst_host_ids, &dst_host_ids_entries); if (bcast < 0) { log_debug(knet_h, KNET_SUB_RX, "Error from dst_host_filter_fn: %d", bcast); return; } if ((!bcast) && (!dst_host_ids_entries)) { log_debug(knet_h, KNET_SUB_RX, "Message is unicast but no dst_host_ids_entries"); return; } /* check if we are dst for this packet */ if (!bcast) { if (dst_host_ids_entries > KNET_MAX_HOST) { log_debug(knet_h, KNET_SUB_RX, "dst_host_filter_fn returned too many destinations"); return; } for (host_idx = 0; host_idx < dst_host_ids_entries; host_idx++) { if (dst_host_ids[host_idx] == knet_h->host_id) { found = 1; break; } } if (!found) { log_debug(knet_h, KNET_SUB_RX, "Packet is not for us"); return; } } } } if (inbuf->kh_type == KNET_HEADER_TYPE_DATA) { if (!knet_h->sockfd[channel].in_use) { log_debug(knet_h, KNET_SUB_RX, "received packet for channel %d but there is no local sock connected", channel); return; } memset(iov_out, 0, sizeof(iov_out)); iov_out[0].iov_base = (void *) inbuf->khp_data_userdata; iov_out[0].iov_len = len - KNET_HEADER_DATA_SIZE; outlen = writev(knet_h->sockfd[channel].sockfd[knet_h->sockfd[channel].is_created], iov_out, 1); if (outlen <= 0) { knet_h->sock_notify_fn(knet_h->sock_notify_fn_private_data, knet_h->sockfd[channel].sockfd[0], channel, KNET_NOTIFY_RX, outlen, errno); return; } if ((size_t)outlen == iov_out[0].iov_len) { _seq_num_set(src_host, inbuf->khp_data_seq_num, 0); } } else { /* HOSTINFO */ knet_hostinfo = (struct knet_hostinfo *)inbuf->khp_data_userdata; if (knet_hostinfo->khi_bcast == KNET_HOSTINFO_UCAST) { bcast = 0; knet_hostinfo->khi_dst_node_id = ntohs(knet_hostinfo->khi_dst_node_id); } if (!_seq_num_lookup(src_host, inbuf->khp_data_seq_num, 0, 0)) { return; } _seq_num_set(src_host, inbuf->khp_data_seq_num, 0); switch(knet_hostinfo->khi_type) { case KNET_HOSTINFO_TYPE_LINK_UP_DOWN: break; case KNET_HOSTINFO_TYPE_LINK_TABLE: break; default: log_warn(knet_h, KNET_SUB_RX, "Receiving unknown host info message from host %u", src_host->host_id); break; } } break; case KNET_HEADER_TYPE_PING: outlen = KNET_HEADER_PING_SIZE; inbuf->kh_type = KNET_HEADER_TYPE_PONG; inbuf->kh_node = htons(knet_h->host_id); recv_seq_num = ntohs(inbuf->khp_ping_seq_num); src_link->status.stats.rx_ping_packets++; src_link->status.stats.rx_ping_bytes += len; wipe_bufs = 0; if (!inbuf->khp_ping_timed) { /* * we might be receiving this message from all links, but we want * to process it only the first time */ if (recv_seq_num != src_host->untimed_rx_seq_num) { /* * cache the untimed seq num */ src_host->untimed_rx_seq_num = recv_seq_num; /* * if the host has received data in between * untimed ping, then we don't need to wipe the bufs */ if (src_host->got_data) { src_host->got_data = 0; wipe_bufs = 0; } else { wipe_bufs = 1; } } _seq_num_lookup(src_host, recv_seq_num, 0, wipe_bufs); } else { /* * pings always arrives in bursts over all the link * catch the first of them to cache the seq num and * avoid duplicate processing */ if (recv_seq_num != src_host->timed_rx_seq_num) { src_host->timed_rx_seq_num = recv_seq_num; if (recv_seq_num == 0) { _seq_num_lookup(src_host, recv_seq_num, 0, 1); } } } if (knet_h->crypto_instance) { if (crypto_encrypt_and_sign(knet_h, (const unsigned char *)inbuf, outlen, knet_h->recv_from_links_buf_crypt, &outlen) < 0) { log_debug(knet_h, KNET_SUB_RX, "Unable to encrypt pong packet"); break; } outbuf = knet_h->recv_from_links_buf_crypt; knet_h->stats_extra.tx_crypt_pong_packets++; } retry_pong: len = sendto(src_link->outsock, outbuf, outlen, MSG_DONTWAIT | MSG_NOSIGNAL, (struct sockaddr *) &src_link->dst_addr, sizeof(struct sockaddr_storage)); savederrno = errno; if (len != outlen) { err = transport_tx_sock_error(knet_h, src_link->transport_type, src_link->outsock, len, savederrno); switch(err) { case -1: /* unrecoverable error */ log_debug(knet_h, KNET_SUB_RX, "Unable to send pong reply (sock: %d) packet (sendto): %d %s. recorded src ip: %s src port: %s dst ip: %s dst port: %s", src_link->outsock, errno, strerror(errno), src_link->status.src_ipaddr, src_link->status.src_port, src_link->status.dst_ipaddr, src_link->status.dst_port); src_link->status.stats.tx_pong_errors++; break; case 0: /* ignore error and continue */ break; case 1: /* retry to send those same data */ src_link->status.stats.tx_pong_retries++; goto retry_pong; break; } } src_link->status.stats.tx_pong_packets++; src_link->status.stats.tx_pong_bytes += outlen; break; case KNET_HEADER_TYPE_PONG: src_link->status.stats.rx_pong_packets++; src_link->status.stats.rx_pong_bytes += len; clock_gettime(CLOCK_MONOTONIC, &src_link->status.pong_last); memmove(&recvtime, &inbuf->khp_ping_time[0], sizeof(struct timespec)); timespec_diff(recvtime, src_link->status.pong_last, &latency_last); src_link->status.latency = ((src_link->status.latency * src_link->latency_exp) + ((latency_last / 1000llu) * (src_link->latency_fix - src_link->latency_exp))) / src_link->latency_fix; if (src_link->status.latency < src_link->pong_timeout_adj) { if (!src_link->status.connected) { if (src_link->received_pong >= src_link->pong_count) { log_info(knet_h, KNET_SUB_RX, "host: %u link: %u is up", src_host->host_id, src_link->link_id); _link_updown(knet_h, src_host->host_id, src_link->link_id, src_link->status.enabled, 1); } else { src_link->received_pong++; log_debug(knet_h, KNET_SUB_RX, "host: %u link: %u received pong: %u", src_host->host_id, src_link->link_id, src_link->received_pong); } } } /* Calculate latency stats */ if (src_link->status.latency > src_link->status.stats.latency_max) { src_link->status.stats.latency_max = src_link->status.latency; } if (src_link->status.latency < src_link->status.stats.latency_min) { src_link->status.stats.latency_min = src_link->status.latency; } src_link->status.stats.latency_ave = (src_link->status.stats.latency_ave * src_link->status.stats.latency_samples + src_link->status.latency) / (src_link->status.stats.latency_samples+1); src_link->status.stats.latency_samples++; break; case KNET_HEADER_TYPE_PMTUD: src_link->status.stats.rx_pmtu_packets++; src_link->status.stats.rx_pmtu_bytes += len; outlen = KNET_HEADER_PMTUD_SIZE; inbuf->kh_type = KNET_HEADER_TYPE_PMTUD_REPLY; inbuf->kh_node = htons(knet_h->host_id); if (knet_h->crypto_instance) { if (crypto_encrypt_and_sign(knet_h, (const unsigned char *)inbuf, outlen, knet_h->recv_from_links_buf_crypt, &outlen) < 0) { log_debug(knet_h, KNET_SUB_RX, "Unable to encrypt PMTUd reply packet"); break; } outbuf = knet_h->recv_from_links_buf_crypt; knet_h->stats_extra.tx_crypt_pmtu_reply_packets++; } savederrno = pthread_mutex_lock(&knet_h->tx_mutex); if (savederrno) { log_err(knet_h, KNET_SUB_RX, "Unable to get TX mutex lock: %s", strerror(savederrno)); goto out_pmtud; } retry_pmtud: len = sendto(src_link->outsock, outbuf, outlen, MSG_DONTWAIT | MSG_NOSIGNAL, (struct sockaddr *) &src_link->dst_addr, sizeof(struct sockaddr_storage)); savederrno = errno; if (len != outlen) { err = transport_tx_sock_error(knet_h, src_link->transport_type, src_link->outsock, len, savederrno); switch(err) { case -1: /* unrecoverable error */ log_debug(knet_h, KNET_SUB_RX, "Unable to send PMTUd reply (sock: %d) packet (sendto): %d %s. recorded src ip: %s src port: %s dst ip: %s dst port: %s", src_link->outsock, errno, strerror(errno), src_link->status.src_ipaddr, src_link->status.src_port, src_link->status.dst_ipaddr, src_link->status.dst_port); src_link->status.stats.tx_pmtu_errors++; break; case 0: /* ignore error and continue */ src_link->status.stats.tx_pmtu_errors++; break; case 1: /* retry to send those same data */ src_link->status.stats.tx_pmtu_retries++; goto retry_pmtud; break; } } pthread_mutex_unlock(&knet_h->tx_mutex); out_pmtud: break; case KNET_HEADER_TYPE_PMTUD_REPLY: src_link->status.stats.rx_pmtu_packets++; src_link->status.stats.rx_pmtu_bytes += len; if (pthread_mutex_lock(&knet_h->pmtud_mutex) != 0) { log_debug(knet_h, KNET_SUB_RX, "Unable to get mutex lock"); break; } src_link->last_recv_mtu = inbuf->khp_pmtud_size; pthread_cond_signal(&knet_h->pmtud_cond); pthread_mutex_unlock(&knet_h->pmtud_mutex); break; default: return; } } static void _handle_recv_from_links(knet_handle_t knet_h, int sockfd, struct knet_mmsghdr *msg) { int err, savederrno; int i, msg_recv, transport; if (pthread_rwlock_rdlock(&knet_h->global_rwlock) != 0) { log_debug(knet_h, KNET_SUB_RX, "Unable to get global read lock"); return; } if (_is_valid_fd(knet_h, sockfd) < 1) { /* * this is normal if a fd got an event and before we grab the read lock * and the link is removed by another thread */ goto exit_unlock; } transport = knet_h->knet_transport_fd_tracker[sockfd].transport; /* * reset msg_namelen to buffer size because after recvmmsg * each msg_namelen will contain sizeof sockaddr_in or sockaddr_in6 */ for (i = 0; i < PCKT_RX_BUFS; i++) { msg[i].msg_hdr.msg_namelen = sizeof(struct sockaddr_storage); } msg_recv = _recvmmsg(sockfd, &msg[0], PCKT_RX_BUFS, MSG_DONTWAIT | MSG_NOSIGNAL); savederrno = errno; /* * WARNING: man page for recvmmsg is wrong. Kernel implementation here: * recvmmsg can return: * -1 on error * 0 if the previous run of recvmmsg recorded an error on the socket * N number of messages (see exception below). * * If there is an error from recvmsg after receiving a frame or more, the recvmmsg * loop is interrupted, error recorded in the socket (getsockopt(SO_ERROR) and * it will be visibile in the next run. * * Need to be careful how we handle errors at this stage. * * error messages need to be handled on a per transport/protocol base * at this point we have different layers of error handling * - msg_recv < 0 -> error from this run * msg_recv = 0 -> error from previous run and error on socket needs to be cleared * - per-transport message data * example: msg[i].msg_hdr.msg_flags & MSG_NOTIFICATION or msg_len for SCTP == EOF, * but for UDP it is perfectly legal to receive a 0 bytes message.. go figure * - NOTE: on SCTP MSG_NOTIFICATION we get msg_recv == PCKT_FRAG_MAX messages and no * errno set. That means the error api needs to be able to abort the loop below. */ if (msg_recv <= 0) { transport_rx_sock_error(knet_h, transport, sockfd, msg_recv, savederrno); goto exit_unlock; } for (i = 0; i < msg_recv; i++) { err = transport_rx_is_data(knet_h, transport, sockfd, &msg[i]); /* * TODO: make this section silent once we are confident * all protocols packet handlers are good */ switch(err) { case -1: /* on error */ log_debug(knet_h, KNET_SUB_RX, "Transport reported error parsing packet"); goto exit_unlock; break; case 0: /* packet is not data and we should continue the packet process loop */ log_debug(knet_h, KNET_SUB_RX, "Transport reported no data, continue"); break; case 1: /* packet is not data and we should STOP the packet process loop */ log_debug(knet_h, KNET_SUB_RX, "Transport reported no data, stop"); goto exit_unlock; break; case 2: /* packet is data and should be parsed as such */ _parse_recv_from_links(knet_h, sockfd, &msg[i]); break; } } exit_unlock: pthread_rwlock_unlock(&knet_h->global_rwlock); } void *_handle_recv_from_links_thread(void *data) { int i, nev; knet_handle_t knet_h = (knet_handle_t) data; struct epoll_event events[KNET_EPOLL_MAX_EVENTS]; struct sockaddr_storage address[PCKT_RX_BUFS]; struct knet_mmsghdr msg[PCKT_RX_BUFS]; struct iovec iov_in[PCKT_RX_BUFS]; set_thread_status(knet_h, KNET_THREAD_RX, KNET_THREAD_RUNNING); memset(&msg, 0, sizeof(msg)); for (i = 0; i < PCKT_RX_BUFS; i++) { iov_in[i].iov_base = (void *)knet_h->recv_from_links_buf[i]; iov_in[i].iov_len = KNET_DATABUFSIZE; memset(&msg[i].msg_hdr, 0, sizeof(struct msghdr)); msg[i].msg_hdr.msg_name = &address[i]; msg[i].msg_hdr.msg_namelen = sizeof(struct sockaddr_storage); msg[i].msg_hdr.msg_iov = &iov_in[i]; msg[i].msg_hdr.msg_iovlen = 1; } while (!shutdown_in_progress(knet_h)) { nev = epoll_wait(knet_h->recv_from_links_epollfd, events, KNET_EPOLL_MAX_EVENTS, KNET_THREADS_TIMERES / 1000); /* * we use timeout to detect if thread is shutting down */ if (nev == 0) { continue; } for (i = 0; i < nev; i++) { _handle_recv_from_links(knet_h, events[i].data.fd, msg); } } set_thread_status(knet_h, KNET_THREAD_RX, KNET_THREAD_STOPPED); return NULL; } diff --git a/libknet/threads_tx.c b/libknet/threads_tx.c index a1b407d3..d1451ff2 100644 --- a/libknet/threads_tx.c +++ b/libknet/threads_tx.c @@ -1,748 +1,751 @@ /* * Copyright (C) 2012-2018 Red Hat, Inc. All rights reserved. * * Authors: Fabio M. Di Nitto * Federico Simoncelli * * This software licensed under GPL-2.0+, LGPL-2.0+ */ #include "config.h" #include #include #include #include #include #include #include "compat.h" #include "compress.h" #include "crypto.h" #include "host.h" #include "link.h" #include "logging.h" #include "transports.h" #include "transport_common.h" #include "threads_common.h" #include "threads_heartbeat.h" #include "threads_tx.h" #include "netutils.h" /* * SEND */ static int _dispatch_to_links(knet_handle_t knet_h, struct knet_host *dst_host, struct knet_mmsghdr *msg, int msgs_to_send) { int link_idx, msg_idx, sent_msgs, prev_sent, progress; int err = 0, savederrno = 0; unsigned int i; struct knet_mmsghdr *cur; struct knet_link *cur_link; for (link_idx = 0; link_idx < dst_host->active_link_entries; link_idx++) { sent_msgs = 0; prev_sent = 0; progress = 1; cur_link = &dst_host->link[dst_host->active_links[link_idx]]; if (cur_link->transport_type == KNET_TRANSPORT_LOOPBACK) { continue; } msg_idx = 0; while (msg_idx < msgs_to_send) { msg[msg_idx].msg_hdr.msg_name = &cur_link->dst_addr; /* Cast for Linux/BSD compatibility */ for (i=0; i<(unsigned int)msg[msg_idx].msg_hdr.msg_iovlen; i++) { cur_link->status.stats.tx_data_bytes += msg[msg_idx].msg_hdr.msg_iov[i].iov_len; } cur_link->status.stats.tx_data_packets++; msg_idx++; } retry: cur = &msg[prev_sent]; sent_msgs = _sendmmsg(dst_host->link[dst_host->active_links[link_idx]].outsock, &cur[0], msgs_to_send - prev_sent, MSG_DONTWAIT | MSG_NOSIGNAL); savederrno = errno; err = transport_tx_sock_error(knet_h, dst_host->link[dst_host->active_links[link_idx]].transport_type, dst_host->link[dst_host->active_links[link_idx]].outsock, sent_msgs, savederrno); switch(err) { case -1: /* unrecoverable error */ cur_link->status.stats.tx_data_errors++; goto out_unlock; break; case 0: /* ignore error and continue */ break; case 1: /* retry to send those same data */ cur_link->status.stats.tx_data_retries++; goto retry; break; } prev_sent = prev_sent + sent_msgs; if ((sent_msgs >= 0) && (prev_sent < msgs_to_send)) { if ((sent_msgs) || (progress)) { if (sent_msgs) { progress = 1; } else { progress = 0; } #ifdef DEBUG log_debug(knet_h, KNET_SUB_TX, "Unable to send all (%d/%d) data packets to host %s (%u) link %s:%s (%u)", sent_msgs, msg_idx, dst_host->name, dst_host->host_id, dst_host->link[dst_host->active_links[link_idx]].status.dst_ipaddr, dst_host->link[dst_host->active_links[link_idx]].status.dst_port, dst_host->link[dst_host->active_links[link_idx]].link_id); #endif goto retry; } if (!progress) { savederrno = EAGAIN; err = -1; goto out_unlock; } } if ((dst_host->link_handler_policy == KNET_LINK_POLICY_RR) && (dst_host->active_link_entries > 1)) { uint8_t cur_link_id = dst_host->active_links[0]; memmove(&dst_host->active_links[0], &dst_host->active_links[1], KNET_MAX_LINK - 1); dst_host->active_links[dst_host->active_link_entries - 1] = cur_link_id; break; } } out_unlock: errno = savederrno; return err; } static int _parse_recv_from_sock(knet_handle_t knet_h, size_t inlen, int8_t channel, int is_sync) { size_t outlen, frag_len; struct knet_host *dst_host; knet_node_id_t dst_host_ids_temp[KNET_MAX_HOST]; size_t dst_host_ids_entries_temp = 0; knet_node_id_t dst_host_ids[KNET_MAX_HOST]; size_t dst_host_ids_entries = 0; int bcast = 1; struct knet_hostinfo *knet_hostinfo; struct iovec iov_out[PCKT_FRAG_MAX][2]; int iovcnt_out = 2; uint8_t frag_idx; unsigned int temp_data_mtu; size_t host_idx; int send_mcast = 0; struct knet_header *inbuf; int savederrno = 0; int err = 0; seq_num_t tx_seq_num; struct knet_mmsghdr msg[PCKT_FRAG_MAX]; int msgs_to_send, msg_idx; unsigned int i; int j; int send_local = 0; int data_compressed = 0; size_t uncrypted_frag_size; inbuf = knet_h->recv_from_sock_buf; if ((knet_h->enabled != 1) && (inbuf->kh_type != KNET_HEADER_TYPE_HOST_INFO)) { /* data forward is disabled */ log_debug(knet_h, KNET_SUB_TX, "Received data packet but forwarding is disabled"); savederrno = ECANCELED; err = -1; goto out_unlock; } /* * move this into a separate function to expand on * extra switching rules */ switch(inbuf->kh_type) { case KNET_HEADER_TYPE_DATA: if (knet_h->dst_host_filter_fn) { bcast = knet_h->dst_host_filter_fn( knet_h->dst_host_filter_fn_private_data, (const unsigned char *)inbuf->khp_data_userdata, inlen, KNET_NOTIFY_TX, knet_h->host_id, knet_h->host_id, &channel, dst_host_ids_temp, &dst_host_ids_entries_temp); if (bcast < 0) { log_debug(knet_h, KNET_SUB_TX, "Error from dst_host_filter_fn: %d", bcast); savederrno = EFAULT; err = -1; goto out_unlock; } if ((!bcast) && (!dst_host_ids_entries_temp)) { log_debug(knet_h, KNET_SUB_TX, "Message is unicast but no dst_host_ids_entries"); savederrno = EINVAL; err = -1; goto out_unlock; } if ((!bcast) && (dst_host_ids_entries_temp > KNET_MAX_HOST)) { log_debug(knet_h, KNET_SUB_TX, "dst_host_filter_fn returned too many destinations"); savederrno = EINVAL; err = -1; goto out_unlock; } } /* Send to localhost if appropriate and enabled */ if (knet_h->has_loop_link) { send_local = 0; if (bcast) { send_local = 1; } else { for (i=0; i< dst_host_ids_entries_temp; i++) { if (dst_host_ids_temp[i] == knet_h->host_id) { send_local = 1; } } } if (send_local) { const unsigned char *buf = inbuf->khp_data_userdata; ssize_t buflen = inlen; struct knet_link *local_link; local_link = knet_h->host_index[knet_h->host_id]->link; local_retry: err = write(knet_h->sockfd[channel].sockfd[knet_h->sockfd[channel].is_created], buf, buflen); if (err < 0) { log_err(knet_h, KNET_SUB_TRANSP_LOOPBACK, "send local failed. error=%s\n", strerror(errno)); local_link->status.stats.tx_data_errors++; } if (err > 0 && err < buflen) { log_debug(knet_h, KNET_SUB_TRANSP_LOOPBACK, "send local incomplete=%d bytes of %zu\n", err, inlen); local_link->status.stats.tx_data_retries++; buf += err; buflen -= err; usleep(KNET_THREADS_TIMERES / 16); goto local_retry; } if (err == buflen) { local_link->status.stats.tx_data_packets++; local_link->status.stats.tx_data_bytes += inlen; } } } break; case KNET_HEADER_TYPE_HOST_INFO: knet_hostinfo = (struct knet_hostinfo *)inbuf->khp_data_userdata; if (knet_hostinfo->khi_bcast == KNET_HOSTINFO_UCAST) { bcast = 0; dst_host_ids_temp[0] = knet_hostinfo->khi_dst_node_id; dst_host_ids_entries_temp = 1; knet_hostinfo->khi_dst_node_id = htons(knet_hostinfo->khi_dst_node_id); } break; default: log_warn(knet_h, KNET_SUB_TX, "Receiving unknown messages from socket"); savederrno = ENOMSG; err = -1; goto out_unlock; break; } if (is_sync) { if ((bcast) || ((!bcast) && (dst_host_ids_entries_temp > 1))) { log_debug(knet_h, KNET_SUB_TX, "knet_send_sync is only supported with unicast packets for one destination"); savederrno = E2BIG; err = -1; goto out_unlock; } } /* * check destinations hosts before spending time * in fragmenting/encrypting packets to save * time processing data for unreachable hosts. * for unicast, also remap the destination data * to skip unreachable hosts. */ if (!bcast) { dst_host_ids_entries = 0; for (host_idx = 0; host_idx < dst_host_ids_entries_temp; host_idx++) { dst_host = knet_h->host_index[dst_host_ids_temp[host_idx]]; if (!dst_host) { continue; } if (!(dst_host->host_id == knet_h->host_id && knet_h->has_loop_link) && dst_host->status.reachable) { dst_host_ids[dst_host_ids_entries] = dst_host_ids_temp[host_idx]; dst_host_ids_entries++; } } if (!dst_host_ids_entries) { savederrno = EHOSTDOWN; err = -1; goto out_unlock; } } else { send_mcast = 0; for (dst_host = knet_h->host_head; dst_host != NULL; dst_host = dst_host->next) { if (!(dst_host->host_id == knet_h->host_id && knet_h->has_loop_link) && dst_host->status.reachable) { send_mcast = 1; break; } } if (!send_mcast) { savederrno = EHOSTDOWN; err = -1; goto out_unlock; } } if (!knet_h->data_mtu) { /* * using MIN_MTU_V4 for data mtu is not completely accurate but safe enough */ log_debug(knet_h, KNET_SUB_TX, "Received data packet but data MTU is still unknown." " Packet might not be delivered." " Assuming minimum IPv4 MTU (%d)", KNET_PMTUD_MIN_MTU_V4); temp_data_mtu = KNET_PMTUD_MIN_MTU_V4; } else { /* * take a copy of the mtu to avoid value changing under * our feet while we are sending a fragmented pckt */ temp_data_mtu = knet_h->data_mtu; } /* * compress data */ if ((knet_h->compress_model > 0) && (inlen > knet_h->compress_threshold)) { size_t cmp_outlen = KNET_DATABUFSIZE_COMPRESS; struct timespec start_time; struct timespec end_time; uint64_t compress_time; clock_gettime(CLOCK_MONOTONIC, &start_time); err = compress(knet_h, (const unsigned char *)inbuf->khp_data_userdata, inlen, knet_h->send_to_links_buf_compress, (ssize_t *)&cmp_outlen); if (err < 0) { + knet_h->stats.tx_failed_to_compress++; log_warn(knet_h, KNET_SUB_COMPRESS, "Compression failed (%d): %s", err, strerror(errno)); } else { /* Collect stats */ clock_gettime(CLOCK_MONOTONIC, &end_time); timespec_diff(start_time, end_time, &compress_time); if (compress_time < knet_h->stats.tx_compress_time_min) { knet_h->stats.tx_compress_time_min = compress_time; } if (compress_time > knet_h->stats.tx_compress_time_max) { knet_h->stats.tx_compress_time_max = compress_time; } knet_h->stats.tx_compress_time_ave = (unsigned long long)(knet_h->stats.tx_compress_time_ave * knet_h->stats.tx_compressed_packets + compress_time) / (knet_h->stats.tx_compressed_packets+1); knet_h->stats.tx_compressed_packets++; knet_h->stats.tx_compressed_original_bytes += inlen; knet_h->stats.tx_compressed_size_bytes += cmp_outlen; if (cmp_outlen < inlen) { memmove(inbuf->khp_data_userdata, knet_h->send_to_links_buf_compress, cmp_outlen); inlen = cmp_outlen; data_compressed = 1; + } else { + knet_h->stats.tx_unable_to_compress++; } } } if ((knet_h->compress_model > 0) && (inlen <= knet_h->compress_threshold)) { knet_h->stats.tx_uncompressed_packets++; } /* * prepare the outgoing buffers */ frag_len = inlen; frag_idx = 0; inbuf->khp_data_bcast = bcast; inbuf->khp_data_frag_num = ceil((float)inlen / temp_data_mtu); inbuf->khp_data_channel = channel; if (data_compressed) { inbuf->khp_data_compress = knet_h->compress_model; } else { inbuf->khp_data_compress = 0; } if (pthread_mutex_lock(&knet_h->tx_seq_num_mutex)) { log_debug(knet_h, KNET_SUB_TX, "Unable to get seq mutex lock"); goto out_unlock; } knet_h->tx_seq_num++; /* * force seq_num 0 to detect a node that has crashed and rejoining * the knet instance. seq_num 0 will clear the buffers in the RX * thread */ if (knet_h->tx_seq_num == 0) { knet_h->tx_seq_num++; } /* * cache the value in locked context */ tx_seq_num = knet_h->tx_seq_num; inbuf->khp_data_seq_num = htons(knet_h->tx_seq_num); pthread_mutex_unlock(&knet_h->tx_seq_num_mutex); /* * forcefully broadcast a ping to all nodes every SEQ_MAX / 8 * pckts. * this solves 2 problems: * 1) on TX socket overloads we generate extra pings to keep links alive * 2) in 3+ nodes setup, where all the traffic is flowing between node 1 and 2, * node 3+ will be able to keep in sync on the TX seq_num even without * receiving traffic or pings in betweens. This avoids issues with * rollover of the circular buffer */ if (tx_seq_num % (SEQ_MAX / 8) == 0) { _send_pings(knet_h, 0); } if (inbuf->khp_data_frag_num > 1) { while (frag_idx < inbuf->khp_data_frag_num) { /* * set the iov_base */ iov_out[frag_idx][0].iov_base = (void *)knet_h->send_to_links_buf[frag_idx]; iov_out[frag_idx][0].iov_len = KNET_HEADER_DATA_SIZE; iov_out[frag_idx][1].iov_base = inbuf->khp_data_userdata + (temp_data_mtu * frag_idx); /* * set the len */ if (frag_len > temp_data_mtu) { iov_out[frag_idx][1].iov_len = temp_data_mtu; } else { iov_out[frag_idx][1].iov_len = frag_len; } /* * copy the frag info on all buffers */ knet_h->send_to_links_buf[frag_idx]->kh_type = inbuf->kh_type; knet_h->send_to_links_buf[frag_idx]->khp_data_seq_num = inbuf->khp_data_seq_num; knet_h->send_to_links_buf[frag_idx]->khp_data_frag_num = inbuf->khp_data_frag_num; knet_h->send_to_links_buf[frag_idx]->khp_data_bcast = inbuf->khp_data_bcast; knet_h->send_to_links_buf[frag_idx]->khp_data_channel = inbuf->khp_data_channel; knet_h->send_to_links_buf[frag_idx]->khp_data_compress = inbuf->khp_data_compress; frag_len = frag_len - temp_data_mtu; frag_idx++; } iovcnt_out = 2; } else { iov_out[frag_idx][0].iov_base = (void *)inbuf; iov_out[frag_idx][0].iov_len = frag_len + KNET_HEADER_DATA_SIZE; iovcnt_out = 1; } if (knet_h->crypto_instance) { struct timespec start_time; struct timespec end_time; uint64_t crypt_time; frag_idx = 0; while (frag_idx < inbuf->khp_data_frag_num) { clock_gettime(CLOCK_MONOTONIC, &start_time); if (crypto_encrypt_and_signv( knet_h, iov_out[frag_idx], iovcnt_out, knet_h->send_to_links_buf_crypt[frag_idx], (ssize_t *)&outlen) < 0) { log_debug(knet_h, KNET_SUB_TX, "Unable to encrypt packet"); savederrno = ECHILD; err = -1; goto out_unlock; } clock_gettime(CLOCK_MONOTONIC, &end_time); timespec_diff(start_time, end_time, &crypt_time); if (crypt_time < knet_h->stats.tx_crypt_time_min) { knet_h->stats.tx_crypt_time_min = crypt_time; } if (crypt_time > knet_h->stats.tx_crypt_time_max) { knet_h->stats.tx_crypt_time_max = crypt_time; } knet_h->stats.tx_crypt_time_ave = (knet_h->stats.tx_crypt_time_ave * knet_h->stats.tx_crypt_packets + crypt_time) / (knet_h->stats.tx_crypt_packets+1); uncrypted_frag_size = 0; for (j=0; j < iovcnt_out; j++) { uncrypted_frag_size += iov_out[frag_idx][j].iov_len; } knet_h->stats.tx_crypt_byte_overhead += (outlen - uncrypted_frag_size); knet_h->stats.tx_crypt_packets++; iov_out[frag_idx][0].iov_base = knet_h->send_to_links_buf_crypt[frag_idx]; iov_out[frag_idx][0].iov_len = outlen; frag_idx++; } iovcnt_out = 1; } memset(&msg, 0, sizeof(msg)); msgs_to_send = inbuf->khp_data_frag_num; msg_idx = 0; while (msg_idx < msgs_to_send) { msg[msg_idx].msg_hdr.msg_namelen = sizeof(struct sockaddr_storage); msg[msg_idx].msg_hdr.msg_iov = &iov_out[msg_idx][0]; msg[msg_idx].msg_hdr.msg_iovlen = iovcnt_out; msg_idx++; } if (!bcast) { for (host_idx = 0; host_idx < dst_host_ids_entries; host_idx++) { dst_host = knet_h->host_index[dst_host_ids[host_idx]]; err = _dispatch_to_links(knet_h, dst_host, &msg[0], msgs_to_send); savederrno = errno; if (err) { goto out_unlock; } } } else { for (dst_host = knet_h->host_head; dst_host != NULL; dst_host = dst_host->next) { if (dst_host->status.reachable) { err = _dispatch_to_links(knet_h, dst_host, &msg[0], msgs_to_send); savederrno = errno; if (err) { goto out_unlock; } } } } out_unlock: errno = savederrno; return err; } int knet_send_sync(knet_handle_t knet_h, const char *buff, const size_t buff_len, const int8_t channel) { int savederrno = 0, err = 0; if (!knet_h) { errno = EINVAL; return -1; } if (buff == NULL) { errno = EINVAL; return -1; } if (buff_len <= 0) { errno = EINVAL; return -1; } if (buff_len > KNET_MAX_PACKET_SIZE) { errno = EINVAL; return -1; } if (channel < 0) { errno = EINVAL; return -1; } if (channel >= KNET_DATAFD_MAX) { errno = EINVAL; return -1; } savederrno = pthread_rwlock_rdlock(&knet_h->global_rwlock); if (savederrno) { log_err(knet_h, KNET_SUB_TX, "Unable to get read lock: %s", strerror(savederrno)); errno = savederrno; return -1; } if (!knet_h->sockfd[channel].in_use) { savederrno = EINVAL; err = -1; goto out; } savederrno = pthread_mutex_lock(&knet_h->tx_mutex); if (savederrno) { log_err(knet_h, KNET_SUB_TX, "Unable to get TX mutex lock: %s", strerror(savederrno)); err = -1; goto out; } knet_h->recv_from_sock_buf->kh_type = KNET_HEADER_TYPE_DATA; memmove(knet_h->recv_from_sock_buf->khp_data_userdata, buff, buff_len); err = _parse_recv_from_sock(knet_h, buff_len, channel, 1); savederrno = errno; pthread_mutex_unlock(&knet_h->tx_mutex); out: pthread_rwlock_unlock(&knet_h->global_rwlock); errno = savederrno; return err; } static void _handle_send_to_links(knet_handle_t knet_h, struct msghdr *msg, int sockfd, int8_t channel, int type) { ssize_t inlen = 0; int savederrno = 0, docallback = 0; if ((channel >= 0) && (channel < KNET_DATAFD_MAX) && (!knet_h->sockfd[channel].is_socket)) { inlen = readv(sockfd, msg->msg_iov, 1); } else { inlen = recvmsg(sockfd, msg, MSG_DONTWAIT | MSG_NOSIGNAL); } if (inlen == 0) { savederrno = 0; docallback = 1; } else if (inlen < 0) { struct epoll_event ev; savederrno = errno; docallback = 1; memset(&ev, 0, sizeof(struct epoll_event)); if (epoll_ctl(knet_h->send_to_links_epollfd, EPOLL_CTL_DEL, knet_h->sockfd[channel].sockfd[knet_h->sockfd[channel].is_created], &ev)) { log_err(knet_h, KNET_SUB_TX, "Unable to del datafd %d from linkfd epoll pool: %s", knet_h->sockfd[channel].sockfd[0], strerror(savederrno)); } else { knet_h->sockfd[channel].has_error = 1; } } else { knet_h->recv_from_sock_buf->kh_type = type; _parse_recv_from_sock(knet_h, inlen, channel, 0); } if (docallback) { knet_h->sock_notify_fn(knet_h->sock_notify_fn_private_data, knet_h->sockfd[channel].sockfd[0], channel, KNET_NOTIFY_TX, inlen, savederrno); } } void *_handle_send_to_links_thread(void *data) { knet_handle_t knet_h = (knet_handle_t) data; struct epoll_event events[KNET_EPOLL_MAX_EVENTS]; int i, nev, type; int8_t channel; struct iovec iov_in; struct msghdr msg; struct sockaddr_storage address; set_thread_status(knet_h, KNET_THREAD_TX, KNET_THREAD_RUNNING); memset(&iov_in, 0, sizeof(iov_in)); iov_in.iov_base = (void *)knet_h->recv_from_sock_buf->khp_data_userdata; iov_in.iov_len = KNET_MAX_PACKET_SIZE; memset(&msg, 0, sizeof(struct msghdr)); msg.msg_name = &address; msg.msg_namelen = sizeof(struct sockaddr_storage); msg.msg_iov = &iov_in; msg.msg_iovlen = 1; knet_h->recv_from_sock_buf->kh_version = KNET_HEADER_VERSION; knet_h->recv_from_sock_buf->khp_data_frag_seq = 0; knet_h->recv_from_sock_buf->kh_node = htons(knet_h->host_id); for (i = 0; i < PCKT_FRAG_MAX; i++) { knet_h->send_to_links_buf[i]->kh_version = KNET_HEADER_VERSION; knet_h->send_to_links_buf[i]->khp_data_frag_seq = i + 1; knet_h->send_to_links_buf[i]->kh_node = htons(knet_h->host_id); } while (!shutdown_in_progress(knet_h)) { nev = epoll_wait(knet_h->send_to_links_epollfd, events, KNET_EPOLL_MAX_EVENTS + 1, KNET_THREADS_TIMERES / 1000); /* * we use timeout to detect if thread is shutting down */ if (nev == 0) { continue; } if (pthread_rwlock_rdlock(&knet_h->global_rwlock) != 0) { log_debug(knet_h, KNET_SUB_TX, "Unable to get read lock"); continue; } for (i = 0; i < nev; i++) { if (events[i].data.fd == knet_h->hostsockfd[0]) { type = KNET_HEADER_TYPE_HOST_INFO; channel = -1; } else { type = KNET_HEADER_TYPE_DATA; for (channel = 0; channel < KNET_DATAFD_MAX; channel++) { if ((knet_h->sockfd[channel].in_use) && (knet_h->sockfd[channel].sockfd[knet_h->sockfd[channel].is_created] == events[i].data.fd)) { break; } } if (channel >= KNET_DATAFD_MAX) { log_debug(knet_h, KNET_SUB_TX, "No available channels"); continue; /* channel not found */ } } if (pthread_mutex_lock(&knet_h->tx_mutex) != 0) { log_debug(knet_h, KNET_SUB_TX, "Unable to get mutex lock"); continue; } _handle_send_to_links(knet_h, &msg, events[i].data.fd, channel, type); pthread_mutex_unlock(&knet_h->tx_mutex); } pthread_rwlock_unlock(&knet_h->global_rwlock); } set_thread_status(knet_h, KNET_THREAD_TX, KNET_THREAD_STOPPED); return NULL; }