diff --git a/libknet/crypto.c b/libknet/crypto.c index 9f05fbaf..9d6757b6 100644 --- a/libknet/crypto.c +++ b/libknet/crypto.c @@ -1,234 +1,234 @@ /* * Copyright (C) 2012-2019 Red Hat, Inc. All rights reserved. * * Author: Fabio M. Di Nitto * * This software licensed under LGPL-2.0+ */ #include "config.h" #include #include #include #include #include #include "crypto.h" #include "crypto_model.h" #include "internals.h" #include "logging.h" #include "common.h" /* * internal module switch data */ static crypto_model_t crypto_modules_cmds[] = { { "nss", WITH_CRYPTO_NSS, 0, NULL }, { "openssl", WITH_CRYPTO_OPENSSL, 0, NULL }, { NULL, 0, 0, NULL } }; static int crypto_get_model(const char *model) { int idx = 0; while (crypto_modules_cmds[idx].model_name != NULL) { if (!strcmp(crypto_modules_cmds[idx].model_name, model)) return idx; idx++; } return -1; } /* * exported API */ int crypto_encrypt_and_sign ( knet_handle_t knet_h, const unsigned char *buf_in, const ssize_t buf_in_len, unsigned char *buf_out, ssize_t *buf_out_len) { return crypto_modules_cmds[knet_h->crypto_instance->model].ops->crypt(knet_h, buf_in, buf_in_len, buf_out, buf_out_len); } int crypto_encrypt_and_signv ( knet_handle_t knet_h, const struct iovec *iov_in, int iovcnt_in, unsigned char *buf_out, ssize_t *buf_out_len) { return crypto_modules_cmds[knet_h->crypto_instance->model].ops->cryptv(knet_h, iov_in, iovcnt_in, buf_out, buf_out_len); } int crypto_authenticate_and_decrypt ( knet_handle_t knet_h, const unsigned char *buf_in, const ssize_t buf_in_len, unsigned char *buf_out, ssize_t *buf_out_len) { return crypto_modules_cmds[knet_h->crypto_instance->model].ops->decrypt(knet_h, buf_in, buf_in_len, buf_out, buf_out_len); } int crypto_init( knet_handle_t knet_h, struct knet_handle_crypto_cfg *knet_handle_crypto_cfg) { int err = 0, savederrno = 0; int model = 0; struct crypto_instance *current = NULL, *new = NULL; current = knet_h->crypto_instance; model = crypto_get_model(knet_handle_crypto_cfg->crypto_model); if (model < 0) { log_err(knet_h, KNET_SUB_CRYPTO, "model %s not supported", knet_handle_crypto_cfg->crypto_model); return -1; } if (crypto_modules_cmds[model].built_in == 0) { log_err(knet_h, KNET_SUB_CRYPTO, "this version of libknet was built without %s support. Please contact your vendor or fix the build.", knet_handle_crypto_cfg->crypto_model); return -1; } savederrno = pthread_rwlock_wrlock(&shlib_rwlock); if (savederrno) { log_err(knet_h, KNET_SUB_CRYPTO, "Unable to get write lock: %s", strerror(savederrno)); return -1; } if (!crypto_modules_cmds[model].loaded) { crypto_modules_cmds[model].ops = load_module (knet_h, "crypto", crypto_modules_cmds[model].model_name); if (!crypto_modules_cmds[model].ops) { savederrno = errno; err = -1; log_err(knet_h, KNET_SUB_CRYPTO, "Unable to load %s lib", crypto_modules_cmds[model].model_name); goto out; } if (crypto_modules_cmds[model].ops->abi_ver != KNET_CRYPTO_MODEL_ABI) { savederrno = EINVAL; err = -1; log_err(knet_h, KNET_SUB_CRYPTO, "ABI mismatch loading module %s. knet ver: %d, module ver: %d", crypto_modules_cmds[model].model_name, KNET_CRYPTO_MODEL_ABI, crypto_modules_cmds[model].ops->abi_ver); goto out; } crypto_modules_cmds[model].loaded = 1; } log_debug(knet_h, KNET_SUB_CRYPTO, "Initizializing crypto module [%s/%s/%s]", knet_handle_crypto_cfg->crypto_model, knet_handle_crypto_cfg->crypto_cipher_type, knet_handle_crypto_cfg->crypto_hash_type); new = malloc(sizeof(struct crypto_instance)); if (!new) { savederrno = ENOMEM; err = -1; log_err(knet_h, KNET_SUB_CRYPTO, "Unable to allocate memory for crypto instance"); goto out; } /* * if crypto_modules_cmds.ops->init fails, it is expected that * it will clean everything by itself. * crypto_modules_cmds.ops->fini is not invoked on error. */ new->model = model; if (crypto_modules_cmds[model].ops->init(knet_h, new, knet_handle_crypto_cfg)) { savederrno = errno; err = -1; goto out; } - log_debug(knet_h, KNET_SUB_CRYPTO, "security network overhead: %zu", knet_h->sec_header_size); - out: if (!err) { knet_h->crypto_instance = new; knet_h->sec_header_size = new->sec_header_size; knet_h->sec_block_size = new->sec_block_size; knet_h->sec_hash_size = new->sec_hash_size; knet_h->sec_salt_size = new->sec_salt_size; + log_debug(knet_h, KNET_SUB_CRYPTO, "security network overhead: %zu", knet_h->sec_header_size); + if (current) { if (crypto_modules_cmds[current->model].ops->fini != NULL) { crypto_modules_cmds[current->model].ops->fini(knet_h, current); } free(current); } } else { if (new) { free(new); } } pthread_rwlock_unlock(&shlib_rwlock); errno = err ? savederrno : 0; return err; } void crypto_fini( knet_handle_t knet_h) { int savederrno = 0; savederrno = pthread_rwlock_wrlock(&shlib_rwlock); if (savederrno) { log_err(knet_h, KNET_SUB_CRYPTO, "Unable to get write lock: %s", strerror(savederrno)); return; } if (knet_h->crypto_instance) { if (crypto_modules_cmds[knet_h->crypto_instance->model].ops->fini != NULL) { crypto_modules_cmds[knet_h->crypto_instance->model].ops->fini(knet_h, knet_h->crypto_instance); } free(knet_h->crypto_instance); knet_h->sec_header_size = 0; knet_h->sec_block_size = 0; knet_h->sec_hash_size = 0; knet_h->sec_salt_size = 0; knet_h->crypto_instance = NULL; } pthread_rwlock_unlock(&shlib_rwlock); return; } int knet_get_crypto_list(struct knet_crypto_info *crypto_list, size_t *crypto_list_entries) { int err = 0; int idx = 0; int outidx = 0; if (!crypto_list_entries) { errno = EINVAL; return -1; } while (crypto_modules_cmds[idx].model_name != NULL) { if (crypto_modules_cmds[idx].built_in) { if (crypto_list) { crypto_list[outidx].name = crypto_modules_cmds[idx].model_name; } outidx++; } idx++; } *crypto_list_entries = outidx; if (!err) errno = 0; return err; } diff --git a/libknet/threads_pmtud.c b/libknet/threads_pmtud.c index 463fe1b3..a09516f2 100644 --- a/libknet/threads_pmtud.c +++ b/libknet/threads_pmtud.c @@ -1,572 +1,637 @@ /* * Copyright (C) 2015-2019 Red Hat, Inc. All rights reserved. * * Authors: Fabio M. Di Nitto * Federico Simoncelli * * This software licensed under LGPL-2.0+ */ #include "config.h" #include #include #include #include #include "crypto.h" #include "links.h" #include "host.h" #include "logging.h" #include "transports.h" #include "threads_common.h" #include "threads_pmtud.h" static int _handle_check_link_pmtud(knet_handle_t knet_h, struct knet_host *dst_host, struct knet_link *dst_link) { int err, ret, savederrno, mutex_retry_limit, failsafe, use_kernel_mtu, warn_once; uint32_t kernel_mtu; /* record kernel_mtu from EMSGSIZE */ size_t onwire_len; /* current packet onwire size */ size_t overhead_len; /* onwire packet overhead (protocol based) */ size_t max_mtu_len; /* max mtu for protocol */ size_t data_len; /* how much data we can send in the packet * generally would be onwire_len - overhead_len * needs to be adjusted for crypto */ size_t pad_len; /* crypto packet pad size, needs to move into crypto.c callbacks */ ssize_t len; /* len of what we were able to sendto onwire */ struct timespec ts; unsigned long long pong_timeout_adj_tmp; unsigned char *outbuf = (unsigned char *)knet_h->pmtudbuf; warn_once = 0; mutex_retry_limit = 0; failsafe = 0; dst_link->last_bad_mtu = 0; knet_h->pmtudbuf->khp_pmtud_link = dst_link->link_id; switch (dst_link->dst_addr.ss_family) { case AF_INET6: max_mtu_len = KNET_PMTUD_SIZE_V6; overhead_len = KNET_PMTUD_OVERHEAD_V6 + dst_link->proto_overhead; dst_link->last_good_mtu = dst_link->last_ping_size + overhead_len; break; case AF_INET: max_mtu_len = KNET_PMTUD_SIZE_V4; overhead_len = KNET_PMTUD_OVERHEAD_V4 + dst_link->proto_overhead; dst_link->last_good_mtu = dst_link->last_ping_size + overhead_len; break; default: log_debug(knet_h, KNET_SUB_PMTUD, "PMTUD aborted, unknown protocol"); return -1; break; } /* * discovery starts from the top because kernel will * refuse to send packets > current iface mtu. * this saves us some time and network bw. */ onwire_len = max_mtu_len; restart: /* * prevent a race when interface mtu is changed _exactly_ during * the discovery process and it's complex to detect. Easier * to wait the next loop. * 30 is not an arbitrary value. To bisect from 576 to 128000 doesn't * take more than 18/19 steps. */ if (failsafe == 30) { log_err(knet_h, KNET_SUB_PMTUD, "Aborting PMTUD process: Too many attempts. MTU might have changed during discovery."); return -1; } else { failsafe++; } + /* + * unencrypted packet looks like: + * + * | ip | protocol | knet_header | unencrypted data | + * | onwire_len | + * | overhead_len | + * | data_len | + * | app MTU | + * + * encrypted packet looks like (not to scale): + * + * | ip | protocol | salt | crypto(knet_header | data) | crypto_data_pad | hash | + * | onwire_len | + * | overhead_len | + * | data_len | + * | app MTU | + * + * knet_h->sec_block_size is >= 0 if encryption will pad the data + * knet_h->sec_salt_size is >= 0 if encryption is enabled + * knet_h->sec_hash_size is >= 0 if signing is enabled + */ + + /* + * common to all packets + */ data_len = onwire_len - overhead_len; if (knet_h->crypto_instance) { +realign: if (knet_h->sec_block_size) { + + /* + * drop both salt and hash, that leaves only the crypto data and padding + * we need to calculate the padding based on the real encrypted data. + */ + data_len = data_len - (knet_h->sec_salt_size + knet_h->sec_hash_size); + + /* + * if the crypto mechanism requires padding, calculate the padding + * and add it back to data_len because that's what the crypto layer + * would do. + */ pad_len = knet_h->sec_block_size - (data_len % knet_h->sec_block_size); + + /* + * if are at the boundary, reset padding + */ if (pad_len == knet_h->sec_block_size) { pad_len = 0; } data_len = data_len + pad_len; - } - - data_len = data_len + (knet_h->sec_hash_size + knet_h->sec_salt_size + knet_h->sec_block_size); - if (knet_h->sec_block_size) { + /* + * if our current data_len is higher than max_mtu_len + * then we need to reduce by padding size (that is our + * increment / decrement value) + * + * this generally happens only on the first PMTUd run + */ while (data_len + overhead_len >= max_mtu_len) { data_len = data_len - knet_h->sec_block_size; } + + /* + * add both hash and salt size back, similar to padding above, + * the crypto layer will add them to the data_len + */ + data_len = data_len + (knet_h->sec_salt_size + knet_h->sec_hash_size); } if (dst_link->last_bad_mtu) { - while (data_len + overhead_len >= dst_link->last_bad_mtu) { - data_len = data_len - (knet_h->sec_hash_size + knet_h->sec_salt_size + knet_h->sec_block_size); + if (data_len + overhead_len >= dst_link->last_bad_mtu) { + /* + * reduce data_len to something lower than last_bad_mtu, overhead_len + * and sec_block_size (decrementing step) - 1 (granularity) + */ + data_len = dst_link->last_bad_mtu - overhead_len - knet_h->sec_block_size - 1; + if (knet_h->sec_block_size) { + /* + * make sure that data_len is aligned to the sec_block_size boundary + */ + goto realign; + } } } if (data_len < (knet_h->sec_hash_size + knet_h->sec_salt_size + knet_h->sec_block_size) + 1) { log_debug(knet_h, KNET_SUB_PMTUD, "Aborting PMTUD process: link mtu smaller than crypto header detected (link might have been disconnected)"); return -1; } + /* + * recalculate onwire_len based on crypto information + * and place it in the PMTUd packet info + */ onwire_len = data_len + overhead_len; knet_h->pmtudbuf->khp_pmtud_size = onwire_len; if (crypto_encrypt_and_sign(knet_h, (const unsigned char *)knet_h->pmtudbuf, data_len - (knet_h->sec_hash_size + knet_h->sec_salt_size + knet_h->sec_block_size), knet_h->pmtudbuf_crypt, (ssize_t *)&data_len) < 0) { log_debug(knet_h, KNET_SUB_PMTUD, "Unable to crypto pmtud packet"); return -1; } outbuf = knet_h->pmtudbuf_crypt; knet_h->stats_extra.tx_crypt_pmtu_packets++; } else { knet_h->pmtudbuf->khp_pmtud_size = onwire_len; } /* link has gone down, aborting pmtud */ if (dst_link->status.connected != 1) { log_debug(knet_h, KNET_SUB_PMTUD, "PMTUD detected host (%u) link (%u) has been disconnected", dst_host->host_id, dst_link->link_id); return -1; } if (dst_link->transport_connected != 1) { log_debug(knet_h, KNET_SUB_PMTUD, "PMTUD detected host (%u) link (%u) has been disconnected", dst_host->host_id, dst_link->link_id); return -1; } if (pthread_mutex_lock(&knet_h->pmtud_mutex) != 0) { log_debug(knet_h, KNET_SUB_PMTUD, "Unable to get mutex lock"); return -1; } if (knet_h->pmtud_abort) { pthread_mutex_unlock(&knet_h->pmtud_mutex); errno = EDEADLK; return -1; } savederrno = pthread_mutex_lock(&knet_h->tx_mutex); if (savederrno) { log_err(knet_h, KNET_SUB_PMTUD, "Unable to get TX mutex lock: %s", strerror(savederrno)); return -1; } retry: if (transport_get_connection_oriented(knet_h, dst_link->transport) == TRANSPORT_PROTO_NOT_CONNECTION_ORIENTED) { len = sendto(dst_link->outsock, outbuf, data_len, MSG_DONTWAIT | MSG_NOSIGNAL, (struct sockaddr *) &dst_link->dst_addr, sizeof(struct sockaddr_storage)); } else { len = sendto(dst_link->outsock, outbuf, data_len, MSG_DONTWAIT | MSG_NOSIGNAL, NULL, 0); } savederrno = errno; /* * we cannot hold a lock on kmtu_mutex between resetting * knet_h->kernel_mtu here and below where it's used. * use_kernel_mtu tells us if the knet_h->kernel_mtu was * set to 0 and we can trust its value later. */ use_kernel_mtu = 0; if (pthread_mutex_lock(&knet_h->kmtu_mutex) == 0) { use_kernel_mtu = 1; knet_h->kernel_mtu = 0; pthread_mutex_unlock(&knet_h->kmtu_mutex); } kernel_mtu = 0; err = transport_tx_sock_error(knet_h, dst_link->transport, dst_link->outsock, len, savederrno); switch(err) { case -1: /* unrecoverable error */ log_debug(knet_h, KNET_SUB_PMTUD, "Unable to send pmtu packet (sendto): %d %s", savederrno, strerror(savederrno)); pthread_mutex_unlock(&knet_h->tx_mutex); pthread_mutex_unlock(&knet_h->pmtud_mutex); dst_link->status.stats.tx_pmtu_errors++; return -1; case 0: /* ignore error and continue */ break; case 1: /* retry to send those same data */ dst_link->status.stats.tx_pmtu_retries++; goto retry; break; } pthread_mutex_unlock(&knet_h->tx_mutex); if (len != (ssize_t )data_len) { if (savederrno == EMSGSIZE) { /* * we cannot hold a lock on kmtu_mutex between resetting * knet_h->kernel_mtu and here. * use_kernel_mtu tells us if the knet_h->kernel_mtu was * set to 0 previously and we can trust its value now. */ if (use_kernel_mtu) { use_kernel_mtu = 0; if (pthread_mutex_lock(&knet_h->kmtu_mutex) == 0) { kernel_mtu = knet_h->kernel_mtu; pthread_mutex_unlock(&knet_h->kmtu_mutex); } } if (kernel_mtu > 0) { dst_link->last_bad_mtu = kernel_mtu + 1; } else { dst_link->last_bad_mtu = onwire_len; } } else { log_debug(knet_h, KNET_SUB_PMTUD, "Unable to send pmtu packet len: %zu err: %s", onwire_len, strerror(savederrno)); } } else { dst_link->last_sent_mtu = onwire_len; dst_link->last_recv_mtu = 0; dst_link->status.stats.tx_pmtu_packets++; dst_link->status.stats.tx_pmtu_bytes += data_len; if (clock_gettime(CLOCK_REALTIME, &ts) < 0) { log_debug(knet_h, KNET_SUB_PMTUD, "Unable to get current time: %s", strerror(errno)); pthread_mutex_unlock(&knet_h->pmtud_mutex); return -1; } /* * set PMTUd reply timeout to match pong_timeout on a given link * * math: internally pong_timeout is expressed in microseconds, while * the public API exports milliseconds. So careful with the 0's here. * the loop is necessary because we are grabbing the current time just above * and add values to it that could overflow into seconds. */ if (pthread_mutex_lock(&knet_h->backoff_mutex)) { log_debug(knet_h, KNET_SUB_PMTUD, "Unable to get backoff_mutex"); pthread_mutex_unlock(&knet_h->pmtud_mutex); return -1; } if (knet_h->crypto_instance) { /* * crypto, under pressure, is a royal PITA */ pong_timeout_adj_tmp = dst_link->pong_timeout_adj * 2; } else { pong_timeout_adj_tmp = dst_link->pong_timeout_adj; } ts.tv_sec += pong_timeout_adj_tmp / 1000000; ts.tv_nsec += (((pong_timeout_adj_tmp) % 1000000) * 1000); while (ts.tv_nsec > 1000000000) { ts.tv_sec += 1; ts.tv_nsec -= 1000000000; } pthread_mutex_unlock(&knet_h->backoff_mutex); knet_h->pmtud_waiting = 1; ret = pthread_cond_timedwait(&knet_h->pmtud_cond, &knet_h->pmtud_mutex, &ts); knet_h->pmtud_waiting = 0; if (knet_h->pmtud_abort) { pthread_mutex_unlock(&knet_h->pmtud_mutex); errno = EDEADLK; return -1; } /* * we cannot use shutdown_in_progress in here because * we already hold the read lock */ if (knet_h->fini_in_progress) { pthread_mutex_unlock(&knet_h->pmtud_mutex); log_debug(knet_h, KNET_SUB_PMTUD, "PMTUD aborted. shutdown in progress"); return -1; } if (ret) { if (ret == ETIMEDOUT) { if (!warn_once) { log_warn(knet_h, KNET_SUB_PMTUD, "possible MTU misconfiguration detected. " "kernel is reporting MTU: %u bytes for " "host %u link %u but the other node is " "not acknowledging packets of this size. ", dst_link->last_sent_mtu, dst_host->host_id, dst_link->link_id); log_warn(knet_h, KNET_SUB_PMTUD, "This can be caused by this node interface MTU " "too big or a network device that does not " "support or has been misconfigured to manage MTU " "of this size, or packet loss. knet will continue " "to run but performances might be affected."); warn_once = 1; } } else { pthread_mutex_unlock(&knet_h->pmtud_mutex); if (mutex_retry_limit == 3) { log_debug(knet_h, KNET_SUB_PMTUD, "PMTUD aborted, unable to get mutex lock"); return -1; } mutex_retry_limit++; goto restart; } } if ((dst_link->last_recv_mtu != onwire_len) || (ret)) { dst_link->last_bad_mtu = onwire_len; } else { int found_mtu = 0; if (knet_h->sec_block_size) { if ((onwire_len + knet_h->sec_block_size >= max_mtu_len) || ((dst_link->last_bad_mtu) && (dst_link->last_bad_mtu <= (onwire_len + knet_h->sec_block_size)))) { found_mtu = 1; } } else { if ((onwire_len == max_mtu_len) || ((dst_link->last_bad_mtu) && (dst_link->last_bad_mtu == (onwire_len + 1))) || (dst_link->last_bad_mtu == dst_link->last_good_mtu)) { found_mtu = 1; } } if (found_mtu) { /* * account for IP overhead, knet headers and crypto in PMTU calculation */ dst_link->status.mtu = onwire_len - dst_link->status.proto_overhead; pthread_mutex_unlock(&knet_h->pmtud_mutex); return 0; } dst_link->last_good_mtu = onwire_len; } } if (kernel_mtu) { onwire_len = kernel_mtu; } else { onwire_len = (dst_link->last_good_mtu + dst_link->last_bad_mtu) / 2; } pthread_mutex_unlock(&knet_h->pmtud_mutex); goto restart; } static int _handle_check_pmtud(knet_handle_t knet_h, struct knet_host *dst_host, struct knet_link *dst_link, unsigned int *min_mtu, int force_run) { uint8_t saved_valid_pmtud; unsigned int saved_pmtud; struct timespec clock_now; unsigned long long diff_pmtud, interval; if (clock_gettime(CLOCK_MONOTONIC, &clock_now) != 0) { log_debug(knet_h, KNET_SUB_PMTUD, "Unable to get monotonic clock"); return 0; } if (!force_run) { interval = knet_h->pmtud_interval * 1000000000llu; /* nanoseconds */ timespec_diff(dst_link->pmtud_last, clock_now, &diff_pmtud); if (diff_pmtud < interval) { *min_mtu = dst_link->status.mtu; return dst_link->has_valid_mtu; } } switch (dst_link->dst_addr.ss_family) { case AF_INET6: dst_link->status.proto_overhead = KNET_PMTUD_OVERHEAD_V6 + dst_link->proto_overhead + KNET_HEADER_ALL_SIZE + knet_h->sec_header_size; break; case AF_INET: dst_link->status.proto_overhead = KNET_PMTUD_OVERHEAD_V4 + dst_link->proto_overhead + KNET_HEADER_ALL_SIZE + knet_h->sec_header_size; break; } saved_pmtud = dst_link->status.mtu; saved_valid_pmtud = dst_link->has_valid_mtu; log_debug(knet_h, KNET_SUB_PMTUD, "Starting PMTUD for host: %u link: %u", dst_host->host_id, dst_link->link_id); errno = 0; if (_handle_check_link_pmtud(knet_h, dst_host, dst_link) < 0) { if (errno == EDEADLK) { log_debug(knet_h, KNET_SUB_PMTUD, "PMTUD for host: %u link: %u has been rescheduled", dst_host->host_id, dst_link->link_id); dst_link->status.mtu = saved_pmtud; dst_link->has_valid_mtu = saved_valid_pmtud; errno = EDEADLK; return dst_link->has_valid_mtu; } dst_link->has_valid_mtu = 0; } else { dst_link->has_valid_mtu = 1; switch (dst_link->dst_addr.ss_family) { case AF_INET6: if (((dst_link->status.mtu + dst_link->status.proto_overhead) < KNET_PMTUD_MIN_MTU_V6) || ((dst_link->status.mtu + dst_link->status.proto_overhead) > KNET_PMTUD_SIZE_V6)) { log_debug(knet_h, KNET_SUB_PMTUD, "PMTUD detected an IPv6 MTU out of bound value (%u) for host: %u link: %u.", dst_link->status.mtu + dst_link->status.proto_overhead, dst_host->host_id, dst_link->link_id); dst_link->has_valid_mtu = 0; } break; case AF_INET: if (((dst_link->status.mtu + dst_link->status.proto_overhead) < KNET_PMTUD_MIN_MTU_V4) || ((dst_link->status.mtu + dst_link->status.proto_overhead) > KNET_PMTUD_SIZE_V4)) { log_debug(knet_h, KNET_SUB_PMTUD, "PMTUD detected an IPv4 MTU out of bound value (%u) for host: %u link: %u.", dst_link->status.mtu + dst_link->status.proto_overhead, dst_host->host_id, dst_link->link_id); dst_link->has_valid_mtu = 0; } break; } if (dst_link->has_valid_mtu) { if ((saved_pmtud) && (saved_pmtud != dst_link->status.mtu)) { log_info(knet_h, KNET_SUB_PMTUD, "PMTUD link change for host: %u link: %u from %u to %u", dst_host->host_id, dst_link->link_id, saved_pmtud, dst_link->status.mtu); } log_debug(knet_h, KNET_SUB_PMTUD, "PMTUD completed for host: %u link: %u current link mtu: %u", dst_host->host_id, dst_link->link_id, dst_link->status.mtu); if (dst_link->status.mtu < *min_mtu) { *min_mtu = dst_link->status.mtu; } /* * set pmtud_last, if we can, after we are done with the PMTUd process * because it can take a very long time. */ dst_link->pmtud_last = clock_now; if (!clock_gettime(CLOCK_MONOTONIC, &clock_now)) { dst_link->pmtud_last = clock_now; } } } if (saved_valid_pmtud != dst_link->has_valid_mtu) { _host_dstcache_update_sync(knet_h, dst_host); } return dst_link->has_valid_mtu; } void *_handle_pmtud_link_thread(void *data) { knet_handle_t knet_h = (knet_handle_t) data; struct knet_host *dst_host; struct knet_link *dst_link; int link_idx; unsigned int min_mtu, have_mtu; unsigned int lower_mtu; int link_has_mtu; int force_run = 0; set_thread_status(knet_h, KNET_THREAD_PMTUD, KNET_THREAD_STARTED); knet_h->data_mtu = KNET_PMTUD_MIN_MTU_V4 - KNET_HEADER_ALL_SIZE - knet_h->sec_header_size; /* preparing pmtu buffer */ knet_h->pmtudbuf->kh_version = KNET_HEADER_VERSION; knet_h->pmtudbuf->kh_type = KNET_HEADER_TYPE_PMTUD; knet_h->pmtudbuf->kh_node = htons(knet_h->host_id); while (!shutdown_in_progress(knet_h)) { usleep(knet_h->threads_timer_res); if (pthread_mutex_lock(&knet_h->pmtud_mutex) != 0) { log_debug(knet_h, KNET_SUB_PMTUD, "Unable to get mutex lock"); continue; } knet_h->pmtud_abort = 0; knet_h->pmtud_running = 1; force_run = knet_h->pmtud_forcerun; knet_h->pmtud_forcerun = 0; pthread_mutex_unlock(&knet_h->pmtud_mutex); if (force_run) { log_debug(knet_h, KNET_SUB_PMTUD, "PMTUd request to rerun has been received"); } if (pthread_rwlock_rdlock(&knet_h->global_rwlock) != 0) { log_debug(knet_h, KNET_SUB_PMTUD, "Unable to get read lock"); continue; } lower_mtu = KNET_PMTUD_SIZE_V4; min_mtu = KNET_PMTUD_SIZE_V4 - KNET_HEADER_ALL_SIZE - knet_h->sec_header_size; have_mtu = 0; for (dst_host = knet_h->host_head; dst_host != NULL; dst_host = dst_host->next) { for (link_idx = 0; link_idx < KNET_MAX_LINK; link_idx++) { dst_link = &dst_host->link[link_idx]; if ((dst_link->status.enabled != 1) || (dst_link->status.connected != 1) || (dst_host->link[link_idx].transport == KNET_TRANSPORT_LOOPBACK) || (!dst_link->last_ping_size) || ((dst_link->dynamic == KNET_LINK_DYNIP) && (dst_link->status.dynconnected != 1))) continue; link_has_mtu = _handle_check_pmtud(knet_h, dst_host, dst_link, &min_mtu, force_run); if (errno == EDEADLK) { goto out_unlock; } if (link_has_mtu) { have_mtu = 1; if (min_mtu < lower_mtu) { lower_mtu = min_mtu; } } } } if (have_mtu) { if (knet_h->data_mtu != lower_mtu) { knet_h->data_mtu = lower_mtu; log_info(knet_h, KNET_SUB_PMTUD, "Global data MTU changed to: %u", knet_h->data_mtu); if (knet_h->pmtud_notify_fn) { knet_h->pmtud_notify_fn(knet_h->pmtud_notify_fn_private_data, knet_h->data_mtu); } } } out_unlock: pthread_rwlock_unlock(&knet_h->global_rwlock); if (pthread_mutex_lock(&knet_h->pmtud_mutex) != 0) { log_debug(knet_h, KNET_SUB_PMTUD, "Unable to get mutex lock"); } else { knet_h->pmtud_running = 0; pthread_mutex_unlock(&knet_h->pmtud_mutex); } } set_thread_status(knet_h, KNET_THREAD_PMTUD, KNET_THREAD_STOPPED); return NULL; } diff --git a/libknet/transport_udp.c b/libknet/transport_udp.c index 2baaa58f..a8f24d36 100644 --- a/libknet/transport_udp.c +++ b/libknet/transport_udp.c @@ -1,431 +1,432 @@ /* * Copyright (C) 2016-2019 Red Hat, Inc. All rights reserved. * * Author: Christine Caulfield * * This software licensed under LGPL-2.0+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #if defined (IP_RECVERR) || defined (IPV6_RECVERR) #include #endif #include "libknet.h" #include "compat.h" #include "host.h" #include "link.h" #include "logging.h" #include "common.h" #include "transport_common.h" #include "transport_udp.h" #include "threads_common.h" typedef struct udp_handle_info { struct knet_list_head links_list; } udp_handle_info_t; typedef struct udp_link_info { struct knet_list_head list; struct sockaddr_storage local_address; int socket_fd; int on_epoll; } udp_link_info_t; int udp_transport_link_set_config(knet_handle_t knet_h, struct knet_link *kn_link) { int err = 0, savederrno = 0; int sock = -1; struct epoll_event ev; udp_link_info_t *info; udp_handle_info_t *handle_info = knet_h->transports[KNET_TRANSPORT_UDP]; #if defined (IP_RECVERR) || defined (IPV6_RECVERR) int value; #endif /* * Only allocate a new link if the local address is different */ knet_list_for_each_entry(info, &handle_info->links_list, list) { if (memcmp(&info->local_address, &kn_link->src_addr, sizeof(struct sockaddr_storage)) == 0) { log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Re-using existing UDP socket for new link"); kn_link->outsock = info->socket_fd; kn_link->transport_link = info; kn_link->transport_connected = 1; return 0; } } info = malloc(sizeof(udp_link_info_t)); if (!info) { err = -1; goto exit_error; } memset(info, 0, sizeof(udp_link_info_t)); sock = socket(kn_link->src_addr.ss_family, SOCK_DGRAM, 0); if (sock < 0) { savederrno = errno; err = -1; log_err(knet_h, KNET_SUB_TRANSP_UDP, "Unable to create listener socket: %s", strerror(savederrno)); goto exit_error; } if (_configure_transport_socket(knet_h, sock, &kn_link->src_addr, kn_link->flags, "UDP") < 0) { savederrno = errno; err = -1; goto exit_error; } #ifdef IP_RECVERR if (kn_link->src_addr.ss_family == AF_INET) { value = 1; if (setsockopt(sock, SOL_IP, IP_RECVERR, &value, sizeof(value)) <0) { savederrno = errno; err = -1; log_err(knet_h, KNET_SUB_TRANSP_UDP, "Unable to set RECVERR on socket: %s", strerror(savederrno)); goto exit_error; } log_debug(knet_h, KNET_SUB_TRANSP_UDP, "IP_RECVERR enabled on socket: %i", sock); } #else log_debug(knet_h, KNET_SUB_TRANSP_UDP, "IP_RECVERR not available in this build/platform"); #endif #ifdef IPV6_RECVERR if (kn_link->src_addr.ss_family == AF_INET6) { value = 1; if (setsockopt(sock, SOL_IPV6, IPV6_RECVERR, &value, sizeof(value)) <0) { savederrno = errno; err = -1; log_err(knet_h, KNET_SUB_TRANSP_UDP, "Unable to set RECVERR on socket: %s", strerror(savederrno)); goto exit_error; } log_debug(knet_h, KNET_SUB_TRANSP_UDP, "IPV6_RECVERR enabled on socket: %i", sock); } #else log_debug(knet_h, KNET_SUB_TRANSP_UDP, "IPV6_RECVERR not available in this build/platform"); #endif if (bind(sock, (struct sockaddr *)&kn_link->src_addr, sockaddr_len(&kn_link->src_addr))) { savederrno = errno; err = -1; log_err(knet_h, KNET_SUB_TRANSP_UDP, "Unable to bind listener socket: %s", strerror(savederrno)); goto exit_error; } memset(&ev, 0, sizeof(struct epoll_event)); ev.events = EPOLLIN; ev.data.fd = sock; if (epoll_ctl(knet_h->recv_from_links_epollfd, EPOLL_CTL_ADD, sock, &ev)) { savederrno = errno; err = -1; log_err(knet_h, KNET_SUB_TRANSP_UDP, "Unable to add listener to epoll pool: %s", strerror(savederrno)); goto exit_error; } info->on_epoll = 1; if (_set_fd_tracker(knet_h, sock, KNET_TRANSPORT_UDP, 0, info) < 0) { savederrno = errno; err = -1; log_err(knet_h, KNET_SUB_TRANSP_UDP, "Unable to set fd tracker: %s", strerror(savederrno)); goto exit_error; } memmove(&info->local_address, &kn_link->src_addr, sizeof(struct sockaddr_storage)); info->socket_fd = sock; knet_list_add(&info->list, &handle_info->links_list); kn_link->outsock = sock; kn_link->transport_link = info; kn_link->transport_connected = 1; exit_error: if (err) { if (info) { if (info->on_epoll) { epoll_ctl(knet_h->recv_from_links_epollfd, EPOLL_CTL_DEL, sock, &ev); } free(info); } if (sock >= 0) { close(sock); } } errno = savederrno; return err; } int udp_transport_link_clear_config(knet_handle_t knet_h, struct knet_link *kn_link) { int err = 0, savederrno = 0; int found = 0; struct knet_host *host; int link_idx; udp_link_info_t *info = kn_link->transport_link; struct epoll_event ev; for (host = knet_h->host_head; host != NULL; host = host->next) { for (link_idx = 0; link_idx < KNET_MAX_LINK; link_idx++) { if (&host->link[link_idx] == kn_link) continue; if (host->link[link_idx].transport_link == info) { found = 1; break; } } } if (found) { log_debug(knet_h, KNET_SUB_TRANSP_UDP, "UDP socket %d still in use", info->socket_fd); savederrno = EBUSY; err = -1; goto exit_error; } if (info->on_epoll) { memset(&ev, 0, sizeof(struct epoll_event)); ev.events = EPOLLIN; ev.data.fd = info->socket_fd; if (epoll_ctl(knet_h->recv_from_links_epollfd, EPOLL_CTL_DEL, info->socket_fd, &ev) < 0) { savederrno = errno; err = -1; log_err(knet_h, KNET_SUB_TRANSP_UDP, "Unable to remove UDP socket from epoll poll: %s", strerror(errno)); goto exit_error; } info->on_epoll = 0; } if (_set_fd_tracker(knet_h, info->socket_fd, KNET_MAX_TRANSPORTS, 0, NULL) < 0) { savederrno = errno; err = -1; log_err(knet_h, KNET_SUB_TRANSP_UDP, "Unable to set fd tracker: %s", strerror(savederrno)); goto exit_error; } close(info->socket_fd); knet_list_del(&info->list); free(kn_link->transport_link); exit_error: errno = savederrno; return err; } int udp_transport_free(knet_handle_t knet_h) { udp_handle_info_t *handle_info; if (!knet_h->transports[KNET_TRANSPORT_UDP]) { errno = EINVAL; return -1; } handle_info = knet_h->transports[KNET_TRANSPORT_UDP]; /* * keep it here while we debug list usage and such */ if (!knet_list_empty(&handle_info->links_list)) { log_err(knet_h, KNET_SUB_TRANSP_UDP, "Internal error. handle list is not empty"); return -1; } free(handle_info); knet_h->transports[KNET_TRANSPORT_UDP] = NULL; return 0; } int udp_transport_init(knet_handle_t knet_h) { udp_handle_info_t *handle_info; if (knet_h->transports[KNET_TRANSPORT_UDP]) { errno = EEXIST; return -1; } handle_info = malloc(sizeof(udp_handle_info_t)); if (!handle_info) { return -1; } memset(handle_info, 0, sizeof(udp_handle_info_t)); knet_h->transports[KNET_TRANSPORT_UDP] = handle_info; knet_list_init(&handle_info->links_list); return 0; } #if defined (IP_RECVERR) || defined (IPV6_RECVERR) static int read_errs_from_sock(knet_handle_t knet_h, int sockfd) { int err = 0, savederrno = 0; int got_err = 0; char buffer[1024]; struct iovec iov; struct msghdr msg; struct cmsghdr *cmsg; struct sock_extended_err *sock_err; struct icmphdr icmph; struct sockaddr_storage remote; struct sockaddr_storage *origin; char addr_str[KNET_MAX_HOST_LEN]; char port_str[KNET_MAX_PORT_LEN]; char addr_remote_str[KNET_MAX_HOST_LEN]; char port_remote_str[KNET_MAX_PORT_LEN]; iov.iov_base = &icmph; iov.iov_len = sizeof(icmph); msg.msg_name = (void*)&remote; msg.msg_namelen = sizeof(remote); msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_flags = 0; msg.msg_control = buffer; msg.msg_controllen = sizeof(buffer); for (;;) { err = recvmsg(sockfd, &msg, MSG_ERRQUEUE); savederrno = errno; if (err < 0) { if (!got_err) { errno = savederrno; return -1; } else { return 0; } } got_err = 1; for (cmsg = CMSG_FIRSTHDR(&msg);cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { if (((cmsg->cmsg_level == SOL_IP) && (cmsg->cmsg_type == IP_RECVERR)) || ((cmsg->cmsg_level == SOL_IPV6 && (cmsg->cmsg_type == IPV6_RECVERR)))) { sock_err = (struct sock_extended_err*)(void *)CMSG_DATA(cmsg); if (sock_err) { switch (sock_err->ee_origin) { case SO_EE_ORIGIN_NONE: /* no origin */ case SO_EE_ORIGIN_LOCAL: /* local source (EMSGSIZE) */ if (sock_err->ee_errno == EMSGSIZE) { if (pthread_mutex_lock(&knet_h->kmtu_mutex) != 0) { log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Unable to get mutex lock"); knet_h->kernel_mtu = 0; break; } else { knet_h->kernel_mtu = sock_err->ee_info; + log_debug(knet_h, KNET_SUB_TRANSP_UDP, "detected kernel MTU: %u", knet_h->kernel_mtu); pthread_mutex_unlock(&knet_h->kmtu_mutex); } force_pmtud_run(knet_h, KNET_SUB_TRANSP_UDP, 0); } /* * those errors are way too noisy */ break; case SO_EE_ORIGIN_ICMP: /* ICMP */ case SO_EE_ORIGIN_ICMP6: /* ICMP6 */ origin = (struct sockaddr_storage *)(void *)SO_EE_OFFENDER(sock_err); if (knet_addrtostr(origin, sizeof(*origin), addr_str, KNET_MAX_HOST_LEN, port_str, KNET_MAX_PORT_LEN) < 0) { log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from unknown source: %s", strerror(sock_err->ee_errno)); } else { if (knet_addrtostr(&remote, sizeof(remote), addr_remote_str, KNET_MAX_HOST_LEN, port_remote_str, KNET_MAX_PORT_LEN) < 0) { log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s destination unknown", addr_str, strerror(sock_err->ee_errno)); } else { log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s %s", addr_str, strerror(sock_err->ee_errno), addr_remote_str); } } break; } } else { log_debug(knet_h, KNET_SUB_TRANSP_UDP, "No data in MSG_ERRQUEUE"); } } } } } #else static int read_errs_from_sock(knet_handle_t knet_h, int sockfd) { return 0; } #endif int udp_transport_rx_sock_error(knet_handle_t knet_h, int sockfd, int recv_err, int recv_errno) { if (recv_errno == EAGAIN) { read_errs_from_sock(knet_h, sockfd); } return 0; } int udp_transport_tx_sock_error(knet_handle_t knet_h, int sockfd, int recv_err, int recv_errno) { if (recv_err < 0) { if (recv_errno == EMSGSIZE) { read_errs_from_sock(knet_h, sockfd); return 0; } if (recv_errno == EINVAL || recv_errno == EPERM) { return -1; } if ((recv_errno == ENOBUFS) || (recv_errno == EAGAIN)) { #ifdef DEBUG log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Sock: %d is overloaded. Slowing TX down", sockfd); #endif usleep(knet_h->threads_timer_res / 16); } else { read_errs_from_sock(knet_h, sockfd); } return 1; } return 0; } int udp_transport_rx_is_data(knet_handle_t knet_h, int sockfd, struct knet_mmsghdr *msg) { if (msg->msg_len == 0) return 0; return 2; } int udp_transport_link_dyn_connect(knet_handle_t knet_h, int sockfd, struct knet_link *kn_link) { kn_link->status.dynconnected = 1; return 0; } int udp_transport_link_get_acl_fd(knet_handle_t knet_h, struct knet_link *kn_link) { return kn_link->outsock; }