diff --git a/src/config.c b/src/config.c index 7fb3cd1..b575561 100644 --- a/src/config.c +++ b/src/config.c @@ -1,977 +1,977 @@ /* * Copyright (C) 2011 Jiaju Zhang * Copyright (C) 2013-2014 Philipp Marek * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include #include #include #include #include #include #include #include #include #include "b_config.h" #include "booth.h" #include "config.h" #include "raft.h" #include "ticket.h" #include "log.h" static int ticket_size = 0; static int ticket_realloc(void) { const int added = 5; int had, want; void *p; had = booth_conf->ticket_allocated; want = had + added; p = realloc(booth_conf->ticket, sizeof(struct ticket_config) * want); if (!p) { log_error("can't alloc more tickets"); return -ENOMEM; } booth_conf->ticket = p; memset(booth_conf->ticket + had, 0, sizeof(struct ticket_config) * added); booth_conf->ticket_allocated = want; return 0; } int add_site(char *address, int type); int add_site(char *addr_string, int type) { int rv; struct booth_site *site; uLong nid; uint32_t mask; int i; rv = 1; if (booth_conf->site_count == MAX_NODES) { log_error("too many nodes"); goto out; } if (strlen(addr_string)+1 >= sizeof(booth_conf->site[0].addr_string)) { log_error("site address \"%s\" too long", addr_string); goto out; } site = booth_conf->site + booth_conf->site_count; site->family = AF_INET; site->type = type; /* Make site_id start at a non-zero point. * Perhaps use hash over string or address? */ strcpy(site->addr_string, addr_string); site->index = booth_conf->site_count; site->bitmask = 1 << booth_conf->site_count; /* Catch site overflow */ assert(site->bitmask); booth_conf->all_bits |= site->bitmask; if (type == SITE) booth_conf->sites_bits |= site->bitmask; site->tcp_fd = -1; booth_conf->site_count++; rv = 0; memset(&site->sa6, 0, sizeof(site->sa6)); nid = crc32(0L, NULL, 0); /* Using the ASCII representation in site->addr_string (both sizeof() * and strlen()) gives quite a lot of collisions; a brute-force run * from 0.0.0.0 to 24.0.0.0 gives ~4% collisions, and this tends to * increase even more. * Whether there'll be a collision in real-life, with 3 or 5 nodes, is * another question ... but for now get the ID from the binary * representation - that had *no* collisions up to 32.0.0.0. */ if (inet_pton(AF_INET, site->addr_string, &site->sa4.sin_addr) > 0) { site->family = AF_INET; site->sa4.sin_family = site->family; site->sa4.sin_port = htons(booth_conf->port); site->saddrlen = sizeof(site->sa4); site->addrlen = sizeof(site->sa4.sin_addr); site->site_id = crc32(nid, (void*)&site->sa4.sin_addr, site->addrlen); } else if (inet_pton(AF_INET6, site->addr_string, &site->sa6.sin6_addr) > 0) { site->family = AF_INET6; site->sa6.sin6_family = site->family; site->sa6.sin6_flowinfo = 0; site->sa6.sin6_port = htons(booth_conf->port); site->saddrlen = sizeof(site->sa6); site->addrlen = sizeof(site->sa6.sin6_addr); site->site_id = crc32(nid, (void*)&site->sa6.sin6_addr, site->addrlen); } else { log_error("Address string \"%s\" is bad", site->addr_string); rv = EINVAL; } /* Make sure we will never collide with NO_ONE, * or be negative (to get "get_local_id() < 0" working). */ mask = 1 << (sizeof(site->site_id)*8 -1); assert(NO_ONE & mask); site->site_id &= ~mask; /* Test for collisions with other sites */ for(i=0; iindex; i++) if (booth_conf->site[i].site_id == site->site_id) { log_error("Got a site-ID collision. Please file a bug on https://github.com/ClusterLabs/booth/issues/new, attaching the configuration file."); exit(1); } out: return rv; } inline static char *skip_while_in(const char *cp, int (*fn)(int), const char *allowed) { /* strchr() returns a pointer to the terminator if *cp == 0. */ while (*cp && (fn(*cp) || strchr(allowed, *cp))) cp++; /* discard "const" qualifier */ return (char*)cp; } inline static char *skip_while(char *cp, int (*fn)(int)) { while (fn(*cp)) cp++; return cp; } inline static char *skip_until(char *cp, char expected) { while (*cp && *cp != expected) cp++; return cp; } static inline int is_end_of_line(char *cp) { char c = *cp; return c == '\n' || c == 0 || c == '#'; } static int add_ticket(const char *name, struct ticket_config **tkp, const struct ticket_config *def) { int rv; struct ticket_config *tk; if (booth_conf->ticket_count == booth_conf->ticket_allocated) { rv = ticket_realloc(); if (rv < 0) return rv; } tk = booth_conf->ticket + booth_conf->ticket_count; booth_conf->ticket_count++; tk->last_valid_tk = malloc(sizeof(struct ticket_config)); if (!tk->last_valid_tk) { log_error("out of memory"); return -ENOMEM; } memset(tk->last_valid_tk, 0, sizeof(struct ticket_config)); if (!check_max_len_valid(name, sizeof(tk->name))) { log_error("ticket name \"%s\" too long.", name); return -EINVAL; } if (find_ticket_by_name(name, NULL)) { log_error("ticket name \"%s\" used again.", name); return -EINVAL; } if (* skip_while_in(name, isalnum, "-/")) { log_error("ticket name \"%s\" invalid; only alphanumeric names.", name); return -EINVAL; } strcpy(tk->name, name); tk->timeout = def->timeout; tk->term_duration = def->term_duration; tk->retries = def->retries; memcpy(tk->weight, def->weight, sizeof(tk->weight)); if (tkp) *tkp = tk; return 0; } static int postproc_ticket(struct ticket_config *tk) { if (!tk) return 1; if (!tk->renewal_freq) { tk->renewal_freq = tk->term_duration/2; } if (tk->timeout*(tk->retries+1) >= tk->renewal_freq) { log_error("%s: total amount of time to " "retry sending packets cannot exceed " "renewal frequency " "(%d*(%d+1) >= %d)", tk->name, tk->timeout, tk->retries, tk->renewal_freq); return 0; } return 1; } /* returns number of weights, or -1 on bad input. */ static int parse_weights(const char *input, int weights[MAX_NODES]) { int i, v; char *cp; for(i=0; i= MAX_ARGS) { log_error("too many arguments for the acquire-handler"); free(tk_test.prog); return -1; } tk_test.argv[i++] = p; } while (p); return 0; } struct toktab grant_type[] = { { "auto", GRANT_AUTO}, { "manual", GRANT_MANUAL}, { NULL, 0}, }; struct toktab attr_op[] = { {"eq", ATTR_OP_EQ}, {"ne", ATTR_OP_NE}, {NULL, 0}, }; static int lookup_tokval(char *key, struct toktab *tab) { struct toktab *tp; for (tp = tab; tp->str; tp++) { if (!strcmp(tp->str, key)) return tp->val; } return 0; } /* attribute prerequisite */ static int parse_attr_prereq(char *val, struct ticket_config *tk) { struct attr_prereq *ap = NULL; char *p; ap = (struct attr_prereq *)calloc(1, sizeof(struct attr_prereq)); if (!ap) { log_error("out of memory"); return -1; } p = strtok(val, " \t"); if (!p) { log_error("not enough arguments to attr-prereq"); goto err_out; } ap->grant_type = lookup_tokval(p, grant_type); if (!ap->grant_type) { log_error("%s is not a grant type", p); goto err_out; } p = strtok(NULL, " \t"); if (!p) { log_error("not enough arguments to attr-prereq"); goto err_out; } if (!(ap->attr_name = strdup(p))) { log_error("out of memory"); goto err_out; } p = strtok(NULL, " \t"); if (!p) { log_error("not enough arguments to attr-prereq"); goto err_out; } ap->op = lookup_tokval(p, attr_op); if (!ap->op) { log_error("%s is not an attribute operation", p); goto err_out; } p = strtok(NULL, " \t"); if (!p) { log_error("not enough arguments to attr-prereq"); goto err_out; } if (!(ap->attr_val = strdup(p))) { log_error("out of memory"); goto err_out; } tk->attr_prereqs = g_list_append(tk->attr_prereqs, ap); if (!tk->attr_prereqs) { log_error("out of memory"); goto err_out; } return 0; err_out: if (ap) { if (ap->attr_val) free(ap->attr_val); if (ap->attr_name) free(ap->attr_name); free(ap); } return -1; } extern int poll_timeout; int read_config(const char *path, int type) { char line[1024]; FILE *fp; char *s, *key, *val, *end_of_key; const char *error; char *cp, *cp2; int i; int lineno = 0; int got_transport = 0; int min_timeout = 0; struct ticket_config defaults = { { 0 } }; struct ticket_config *current_tk = NULL; fp = fopen(path, "r"); if (!fp) { log_error("failed to open %s: %s", path, strerror(errno)); return -1; } booth_conf = malloc(sizeof(struct booth_config) + TICKET_ALLOC * sizeof(struct ticket_config)); if (!booth_conf) { log_error("failed to alloc memory for booth config"); return -ENOMEM; } memset(booth_conf, 0, sizeof(struct booth_config) + TICKET_ALLOC * sizeof(struct ticket_config)); ticket_size = TICKET_ALLOC; booth_conf->proto = UDP; booth_conf->port = BOOTH_DEFAULT_PORT; booth_conf->maxtimeskew = BOOTH_DEFAULT_MAX_TIME_SKEW; booth_conf->authkey[0] = '\0'; /* Provide safe defaults. -1 is reserved, though. */ booth_conf->uid = -2; booth_conf->gid = -2; strcpy(booth_conf->site_user, "hacluster"); strcpy(booth_conf->site_group, "haclient"); strcpy(booth_conf->arb_user, "nobody"); strcpy(booth_conf->arb_group, "nobody"); parse_weights("", defaults.weight); defaults.clu_test.prog = NULL; defaults.clu_test.pid = 0; defaults.clu_test.status = 0; defaults.clu_test.progstate = EXTPROG_IDLE; defaults.term_duration = DEFAULT_TICKET_EXPIRY; defaults.timeout = DEFAULT_TICKET_TIMEOUT; defaults.retries = DEFAULT_RETRIES; defaults.acquire_after = 0; error = ""; log_debug("reading config file %s", path); while (fgets(line, sizeof(line), fp)) { lineno++; s = skip_while(line, isspace); - if (is_end_of_line(s)) + if (is_end_of_line(s) || *s == '#') continue; key = s; /* Key */ end_of_key = skip_while_in(key, isalnum, "-_"); if (end_of_key == key) { error = "No key"; goto err; } if (!*end_of_key) goto exp_equal; /* whitespace, and something else but nothing more? */ s = skip_while(end_of_key, isspace); if (*s != '=') { exp_equal: error = "Expected '=' after key"; goto err; } s++; /* It's my buffer, and I terminate if I want to. */ /* But not earlier than that, because we had to check for = */ *end_of_key = 0; /* Value tokenizing */ s = skip_while(s, isspace); switch (*s) { case '"': case '\'': val = s+1; s = skip_until(val, *s); /* Terminate value */ if (!*s) { error = "Unterminated quoted string"; goto err; } /* Remove and skip quote */ *s = 0; s++; - if (* skip_while(s, isspace)) { + if (*(s = skip_while(s, isspace)) && *s != '#') { error = "Surplus data after value"; goto err; } *s = 0; break; case 0: no_value: error = "No value"; goto err; break; default: val = s; /* Rest of line. */ i = strlen(s); /* i > 0 because of "case 0" above. */ while (i > 0 && isspace(s[i-1])) i--; s += i; *s = 0; } if (val == s) goto no_value; if (strlen(key) > BOOTH_NAME_LEN || strlen(val) > BOOTH_NAME_LEN) { error = "key/value too long"; goto err; } if (strcmp(key, "transport") == 0) { if (got_transport) { error = "config file has multiple transport lines"; goto err; } if (strcasecmp(val, "UDP") == 0) booth_conf->proto = UDP; else if (strcasecmp(val, "SCTP") == 0) booth_conf->proto = SCTP; else { error = "invalid transport protocol"; goto err; } got_transport = 1; continue; } if (strcmp(key, "port") == 0) { booth_conf->port = atoi(val); continue; } if (strcmp(key, "name") == 0) { safe_copy(booth_conf->name, val, BOOTH_NAME_LEN, "name"); continue; } #if HAVE_LIBGCRYPT || HAVE_LIBMHASH if (strcmp(key, "authfile") == 0) { safe_copy(booth_conf->authfile, val, BOOTH_PATH_LEN, "authfile"); continue; } if (strcmp(key, "maxtimeskew") == 0) { booth_conf->maxtimeskew = atoi(val); continue; } #endif if (strcmp(key, "site") == 0) { if (add_site(val, SITE)) goto out; continue; } if (strcmp(key, "arbitrator") == 0) { if (add_site(val, ARBITRATOR)) goto out; continue; } if (strcmp(key, "site-user") == 0) { safe_copy(booth_conf->site_user, optarg, BOOTH_NAME_LEN, "site-user"); continue; } if (strcmp(key, "site-group") == 0) { safe_copy(booth_conf->site_group, optarg, BOOTH_NAME_LEN, "site-group"); continue; } if (strcmp(key, "arbitrator-user") == 0) { safe_copy(booth_conf->arb_user, optarg, BOOTH_NAME_LEN, "arbitrator-user"); continue; } if (strcmp(key, "arbitrator-group") == 0) { safe_copy(booth_conf->arb_group, optarg, BOOTH_NAME_LEN, "arbitrator-group"); continue; } if (strcmp(key, "debug") == 0) { if (type != CLIENT && type != GEOSTORE) debug_level = max(debug_level, atoi(val)); continue; } if (strcmp(key, "ticket") == 0) { if (current_tk && strcmp(current_tk->name, "__defaults__")) { if (!postproc_ticket(current_tk)) { goto out; } } if (!strcmp(val, "__defaults__")) { current_tk = &defaults; } else if (add_ticket(val, ¤t_tk, &defaults)) { goto out; } continue; } /* current_tk must be allocated at this point, otherwise * we don't know to which ticket the key refers */ if (!current_tk) { error = "Unexpected keyword"; goto err; } if (strcmp(key, "expire") == 0) { current_tk->term_duration = read_time(val); if (current_tk->term_duration <= 0) { error = "Expected time >0 for expire"; goto err; } continue; } if (strcmp(key, "timeout") == 0) { current_tk->timeout = read_time(val); if (current_tk->timeout <= 0) { error = "Expected time >0 for timeout"; goto err; } if (!min_timeout) { min_timeout = current_tk->timeout; } else { min_timeout = min(min_timeout, current_tk->timeout); } continue; } if (strcmp(key, "retries") == 0) { current_tk->retries = strtol(val, &s, 0); if (*s || s == val || current_tk->retries<3 || current_tk->retries > 100) { error = "Expected plain integer value in the range [3, 100] for retries"; goto err; } continue; } if (strcmp(key, "renewal-freq") == 0) { current_tk->renewal_freq = read_time(val); if (current_tk->renewal_freq <= 0) { error = "Expected time >0 for renewal-freq"; goto err; } continue; } if (strcmp(key, "acquire-after") == 0) { current_tk->acquire_after = read_time(val); if (current_tk->acquire_after < 0) { error = "Expected time >=0 for acquire-after"; goto err; } continue; } if (strcmp(key, "before-acquire-handler") == 0) { if (parse_extprog(val, current_tk)) { goto err; } continue; } if (strcmp(key, "attr-prereq") == 0) { if (parse_attr_prereq(val, current_tk)) { goto err; } continue; } if (strcmp(key, "weights") == 0) { if (parse_weights(val, current_tk->weight) < 0) goto out; continue; } error = "Unknown keyword"; goto err; } if ((booth_conf->site_count % 2) == 0) { log_warn("Odd number of nodes is strongly recommended!"); } /* Default: make config name match config filename. */ if (!booth_conf->name[0]) { cp = strrchr(path, '/'); cp = cp ? cp+1 : (char *)path; cp2 = strrchr(cp, '.'); if (!cp2) cp2 = cp + strlen(cp); if (cp2-cp >= BOOTH_NAME_LEN) { log_error("booth config file name too long"); goto err; } strncpy(booth_conf->name, cp, cp2-cp); *(booth_conf->name+(cp2-cp)) = '\0'; } if (!postproc_ticket(current_tk)) { goto out; } poll_timeout = min(POLL_TIMEOUT, min_timeout/10); if (!poll_timeout) poll_timeout = POLL_TIMEOUT; return 0; err: out: log_error("%s in config file line %d", error, lineno); free(booth_conf); booth_conf = NULL; return -1; } int check_config(int type) { struct passwd *pw; struct group *gr; char *cp, *input; if (!booth_conf) return -1; input = (type == ARBITRATOR) ? booth_conf->arb_user : booth_conf->site_user; if (!*input) goto u_inval; if (isdigit(input[0])) { booth_conf->uid = strtol(input, &cp, 0); if (*cp != 0) { u_inval: log_error("User \"%s\" cannot be resolved into a UID.", input); return ENOENT; } } else { pw = getpwnam(input); if (!pw) goto u_inval; booth_conf->uid = pw->pw_uid; } input = (type == ARBITRATOR) ? booth_conf->arb_group : booth_conf->site_group; if (!*input) goto g_inval; if (isdigit(input[0])) { booth_conf->gid = strtol(input, &cp, 0); if (*cp != 0) { g_inval: log_error("Group \"%s\" cannot be resolved into a UID.", input); return ENOENT; } } else { gr = getgrnam(input); if (!gr) goto g_inval; booth_conf->gid = gr->gr_gid; } return 0; } static int get_other_site(struct booth_site **node) { struct booth_site *n; int i; *node = NULL; if (!booth_conf) return 0; for (i = 0; i < booth_conf->site_count; i++) { n = booth_conf->site + i; if (n != local && n->type == SITE) { if (!*node) { *node = n; } else { return 0; } } } return !*node ? 0 : 1; } int find_site_by_name(unsigned char *site, struct booth_site **node, int any_type) { struct booth_site *n; int i; if (!booth_conf) return 0; if (!strcmp(site, OTHER_SITE)) return get_other_site(node); for (i = 0; i < booth_conf->site_count; i++) { n = booth_conf->site + i; if ((n->type == SITE || any_type) && strcmp(n->addr_string, site) == 0) { *node = n; return 1; } } return 0; } int find_site_by_id(uint32_t site_id, struct booth_site **node) { struct booth_site *n; int i; if (site_id == NO_ONE) { *node = no_leader; return 1; } if (!booth_conf) return 0; for (i = 0; i < booth_conf->site_count; i++) { n = booth_conf->site + i; if (n->site_id == site_id) { *node = n; return 1; } } return 0; } const char *type_to_string(int type) { switch (type) { case ARBITRATOR: return "arbitrator"; case SITE: return "site"; case CLIENT: return "client"; case GEOSTORE: return "attr"; } return "??invalid-type??"; } diff --git a/test/live_test.sh b/test/live_test.sh index 0289d1c..43caf0e 100755 --- a/test/live_test.sh +++ b/test/live_test.sh @@ -1,1252 +1,1259 @@ #!/bin/sh # # see README-testing for more information # do some basic booth operation tests for the given config # PROG=`basename $0` usage() { cat<[:]] $PROG [ ...] EOF if [ $1 -eq 0 ]; then list_all examples fi exit } list_all() { echo "Tests:" grep "^test_.*{$" $0 | sed 's/test_//;s/(.*//;s/^/ /' echo echo "Netem functions:" grep "^NETEM_ENV_.*{$" $0 | sed 's/NETEM_ENV_//;s/(.*//;s/^/ /' } examples() { cat< /dev/null } stop_site() { manage_site $1 stop } stop_arbitrator() { manage_arbitrator $1 stop } restart_site() { manage_site $1 restart } cleanup_site() { manage_site $1 cleanup } reload_site() { runcmd $1 OCF_ROOT=/usr/lib/ocf /usr/lib/ocf/resource.d/pacemaker/booth-site reload } restart_arbitrator() { manage_arbitrator $1 restart } booth_status() { test "`runcmd $1 booth status | get_stat_fld booth_state`" = "started" } cleanup_booth() { local h procs for h in $sites; do cleanup_site $h & procs="$! $procs" done >/dev/null 2>&1 wait $procs wait_timeout } cleanup_dep_rsc() { local dep_rsc=`get_rsc` test -z "$dep_rsc" && return local h procs for h in $sites; do runcmd $h crm -w resource cleanup $dep_rsc & procs="$! $procs" done >/dev/null 2>&1 wait $procs } check_dep_rsc() { local dep_rsc=`get_rsc` test -z "$dep_rsc" && return 0 local h for h in $sites; do runcmd $h BOOTH_TICKET=$tkt /usr/share/booth/service-runnable $dep_rsc || return 1 done return 0 } stop_booth() { local h rc for h in $sites; do stop_site $h rc=$((rc|$?)) done >/dev/null 2>&1 for h in $arbitrators; do stop_arbitrator $h rc=$((rc|$?)) done >/dev/null 2>&1 wait_timeout return $rc } start_booth() { local h rc for h in $sites; do start_site $h rc=$((rc|$?)) done >/dev/null 2>&1 for h in $arbitrators; do start_arbitrator $h rc=$((rc|$?)) done >/dev/null 2>&1 wait_timeout return $rc } restart_booth() { local h procs for h in $sites; do restart_site $h & procs="$! $procs" done >/dev/null 2>&1 for h in $arbitrators; do restart_arbitrator $h done >/dev/null 2>&1 wait $procs wait_timeout } reboot_test() { cleanup_booth restart_booth cleanup_dep_rsc } is_we_server() { local h for h in $sites $arbitrators; do ip a l | fgrep -wq $h && return done return 1 } is_pacemaker_running() { local h for h in $sites; do crmadmin -D >/dev/null || return 1 done return 0 } sync_conf() { local h rc=0 for h in $sites $arbitrators; do rsync -q $cnf root@$h:$run_cnf rc=$((rc|$?)) if [ -n "$authfile" ]; then run_site 1 rsync -q $authfile root@$h:$BOOTH_DIR rc=$((rc|$?)) fi done return $rc } dump_conf() { echo "test configuration file $cnf:" grep -v '^#' $cnf | grep -v '^[[:space:]]*$' | sed "s/^/$cnf: /" } forall() { local h rc=0 for h in $sites $arbitrators; do runcmd $h $@ rc=$((rc|$?)) done return $rc } forall_withname() { local h rc=0 output for h in $sites $arbitrators; do output=`runcmd $h $@` rc=$((rc|$?)) echo $h: $output done return $rc } forall_sites() { local h rc=0 for h in $sites; do runcmd $h $@ rc=$((rc|$?)) done return $rc } forall_fun() { local h rc=0 f=$1 for h in $sites $arbitrators; do $f $h rc=$((rc|$?)) [ $rc -ne 0 ] && break done return $rc } # run on all hosts whatever function produced on stdout forall_fun2() { local h rc=0 f f=$1 shift 1 for h in $sites $arbitrators; do $f $@ | ssh $SSH_OPTS $h rc=$((rc|$?)) [ $rc -ne 0 ] && break done return $rc } run_site() { local n=$1 h shift 1 h=`echo $sites | awk '{print $'$n'}'` runcmd $h $@ } run_arbitrator() { local n=$1 h shift 1 h=`echo $arbitrators | awk '{print $'$n'}'` runcmd $h $@ } # need to get logs from _all_ clusters' nodes get_all_nodes() { for h in $sites; do runcmd $h crm_node -l | awk '{print $2}' done } +get_servers() { + grep "^$1" | + awk ' + { if(/# *external-ip=/) print $NF; else print; } + ' | + sed 's/ *#.*//;s/.*=//;s/"//g' +} get_value() { grep "^$1" | - sed 's/.*=[ "]*//;s/"//' + sed 's/ *#.*//;s/.*=//;s/"//g' } get_rsc() { awk ' n && /^[[:space:]]*before-acquire-handler/ {print $NF; exit} n && (/^$/ || /^ticket.*/) {exit} /^ticket.*'$tkt'/ {n=1} ' $cnf } get_attr() { awk ' n && /^[[:space:]]*attr-prereq = auto .* eq / {print $4,$6; exit} n && (/^$/ || /^ticket.*/) {exit} /^ticket.*'$tkt'/ {n=1} ' $cnf } set_site_attr() { local site site=`get_site $1` set -- `get_attr` geostore set -s $site $1 $2 } del_site_attr() { local site site=`get_site $1` set -- `get_attr` geostore delete -s $site $1 } break_external_prog() { run_site $1 crm configure "location $PREFNAME `get_rsc` rule -inf: defined \#uname" } show_pref() { run_site $1 crm configure show $PREFNAME > /dev/null } repair_external_prog() { run_site $1 crm configure delete __pref_booth_live_test } get_tkt() { grep "^ticket=" | head -1 | sed 's/ticket=//;s/"//g' } get_tkt_settings() { awk ' n && /^[[:space:]]*(expire|timeout|renewal-freq)/ { sub(" = ", "=", $0); gsub("-", "_", $0); sub("^[[:space:]]*", "T_", $0); if ($0 ~ /ms$/) { sub("ms$", "", $0); eq = match($0, "="); print substr($0, 1, eq)""substr($0, eq+1)/1000; } else { print; } next } n && (/^$/ || /^ticket.*/) {exit} /^ticket.*'$tkt'/ {n=1} ' $cnf } wait_exp() { sleep $T_expire } wait_renewal() { sleep $T_renewal_freq } wait_timeout() { sleep $MIN_TIMEOUT } set_netem_env() { local modfun args modfun=`echo $1 | sed 's/:.*//'` args=`echo $1 | sed 's/[^:]*//;s/:/ /g'` if ! is_function NETEM_ENV_$modfun; then echo "NETEM_ENV_$modfun: doesn't exist" exit 1 fi NETEM_ENV_$modfun $args } reset_netem_env() { [ -z "$NETEM_ENV" ] && return [ -n "$__NETEM_RESET" ] && return __NETEM_RESET=1 forall $ABSPATH $run_cnf __netem__ netem_reset } setup_netem() { [ -z "$NETEM_ENV" ] && return __NETEM_RESET= echo "-------------------------------------------------- (netem)" | logmsg for env in $NETEM_ENV; do set_netem_env $env done trap "reset_netem_env" EXIT } cib_status() { local h=$1 stat stat=`runcmd $h crm_ticket -L | grep "^$tkt" | awk '{print $2}'` test "$stat" != "-1" } is_cib_granted() { local stat h=$1 stat=`runcmd $h crm_ticket -L | grep "^$tkt" | awk '{print $2}'` [ "$stat" = "granted" ] } check_cib_consistency() { local h gh="" rc=0 for h in $sites; do if is_cib_granted $h; then [ -n "$gh" ] && rc=1 # granted twice gh="$gh $h" fi done [ -z "$gh" ] && gh="none" if [ $rc -eq 0 ]; then echo $gh return $rc fi cat<= 0 ? x : -x; } } ' | sort -n | tail -1 } booth_leader_consistency() { test `booth_list_fld 2 | sort -u | wc -l` -eq 1 } # are there two leaders or is it just that some booths are outdated booth_leader_consistency_2() { test `booth_list_fld 2 | sort -u | grep -iv none | wc -l` -le 1 } # do all booths have the same info? # possible differences: # a) more than one leader # b) some booths not uptodate (have no leader for the ticket) # c) ticket expiry times differ check_booth_consistency() { local tlist tlist_validate rc rc_lead maxdiff tlist=`forall_withname booth list 2>/dev/null | grep $tkt` tlist_validate=`echo "$tlist" | sed 's/[^:]*: //;s/commit:.*//;s/NONE/none/'` maxdiff=`echo "$tlist" | max_booth_time_diff` test "$maxdiff" -eq 0 rc=$? echo "$tlist" | booth_leader_consistency rc_lead=$? if [ $rc_lead -ne 0 ]; then echo "$tlist" | booth_leader_consistency_2 rc_lead=$(($rc_lead + $?)) # rc_lead=2 if the prev test failed fi rc=$(($rc | $rc_lead<<1)) test $rc -eq 0 && return cat</dev/null wait_timeout } run_report() { local start_ts=$1 end_ts=$2 name=$3 local hb_report_opts="" local quick_opt="" logmsg "running hb_report" hb_report -Q 2>&1 | grep -sq "illegal.option" || quick_opt="-Q" if [ `id -u` != 0 ]; then hb_report_opts="-u root" fi hb_report $hb_report_opts $quick_opt -f "`date -d @$((start_ts-5))`" \ -t "`date -d @$((end_ts+60))`" \ -n "$all_nodes $arbitrators" $name 2>&1 | logmsg } runtest() { local start_ts end_ts local rc booth_status dep_rsc_status local start_time end_time local usrmsg TEST=$1 start_time=`date` start_ts=`date +%s` echo -n "Testing: $1... " can_run_test $1 || return 0 echo "==================================================" | logmsg echo "starting booth test $1 ..." | logmsg if is_function setup_$1; then echo "-------------------------------------------------- (setup)" | logmsg setup_$1 rc=$? [ "$rc" -ne 0 ] && rc=$ERR_SETUP_FAILED fi if [ "$rc" -eq 0 ]; then setup_netem echo "-------------------------------------------------- (test)" | logmsg test_$1 rc=$? fi case $rc in 0) # wait a bit more if we're losing packets [ -n "$PKT_LOSS" ] && wait_timeout echo "-------------------------------------------------- (check)" | logmsg check_$1 rc=$? if [ $rc -eq 0 ]; then usrmsg="SUCCESS" else usrmsg="check FAIL: $rc" fi ;; $ERR_SETUP_FAILED) usrmsg="setup FAIL" ;; *) usrmsg="test FAIL: $rc" ;; esac end_time=`date` end_ts=`date +%s` echo "finished booth test $1 ($usrmsg)" | logmsg echo "==================================================" | logmsg is_function recover_$1 && recover_$1 reset_netem_env #sleep 3 all_booth_status booth_status=$? check_dep_rsc dep_rsc_status=$? if [ $((rc|booth_status|dep_rsc_status)) -eq 0 ]; then echo OK [ "$GET_REPORT" ] && run_report $start_ts $end_ts $TEST else echo "$usrmsg (running hb_report ... $1.tar.bz2; see also $logf)" [ $booth_status -ne 0 ] && echo "unexpected: some booth daemons not running" [ $dep_rsc_status -ne 0 ] && echo "unexpected: dependent resource failure" run_report $start_ts $end_ts $TEST reboot_test master_rc=1 fi revoke_ticket } # # the tests # # most tests start by granting ticket grant_ticket() { run_site $1 booth grant -w $tkt >/dev/null } grant_ticket_cib() { run_site $1 booth grant -C $tkt >/dev/null } ## TEST: grant ## # just a grant test_grant() { grant_ticket 1 } check_grant() { check_consistency `get_site 1` } ## TEST: longgrant ## # just a grant followed by three expire times setup_longgrant() { grant_ticket 1 } test_longgrant() { wait_exp wait_exp wait_exp } check_longgrant() { check_consistency `get_site 1` } ## TEST: longgrant2 ## # just a grant followed by 10 expire times setup_longgrant2() { grant_ticket_cib 1 } test_longgrant2() { local i for i in `seq 10`; do wait_exp done } check_longgrant2() { check_consistency `get_site 1` } ## TEST: grant_noarb ## # just a grant with no arbitrators setup_grant_noarb() { local h for h in $arbitrators; do stop_arbitrator $h || return 1 done >/dev/null 2>&1 #sleep 1 } test_grant_noarb() { grant_ticket 1 } check_grant_noarb() { check_consistency `get_site 1` } recover_grant_noarb() { local h for h in $arbitrators; do start_arbitrator $h done >/dev/null 2>&1 } applicable_grant_noarb() { [ -n "$arbitrators" ] } ## TEST: revoke ## # just a revoke setup_revoke() { grant_ticket 1 } test_revoke() { revoke_ticket } check_revoke() { check_consistency } ## TEST: grant_elsewhere ## # just a grant to another site test_grant_elsewhere() { run_site 1 booth grant -w -s `get_site 2` $tkt >/dev/null } check_grant_elsewhere() { check_consistency `get_site 2` } ## TEST: grant_site_lost ## # grant with one site lost setup_grant_site_lost() { stop_site `get_site 2` booth_status `get_site 2` && return 1 return 0 } test_grant_site_lost() { grant_ticket 1 wait_exp } check_grant_site_lost() { check_consistency `get_site 1` } recover_grant_site_lost() { start_site `get_site 2` } ## TEST: grant_site_reappear ## # grant with one site lost then reappearing setup_grant_site_reappear() { stop_site `get_site 2` booth_status `get_site 2` && return 1 return 0 #sleep 1 } test_grant_site_reappear() { grant_ticket 1 || return $ERR_SETUP_FAILED check_cib `get_site 1` || return $ERR_SETUP_FAILED wait_timeout start_site `get_site 2` || return $ERR_SETUP_FAILED wait_timeout wait_timeout } check_grant_site_reappear() { check_consistency `get_site 1` && is_cib_granted `get_site 1` } recover_grant_site_reappear() { start_site `get_site 2` } ## TEST: simultaneous_start_even ## # simultaneous start of even number of members setup_simultaneous_start_even() { grant_ticket_cib 2 || return 1 stop_booth || return 1 #wait_timeout } test_simultaneous_start_even() { local serv for serv in $(echo $sites | sed "s/`get_site 1` //"); do start_site $serv & done for serv in $arbitrators; do start_arbitrator $serv & done wait_renewal start_site `get_site 1` wait_timeout wait_timeout } check_simultaneous_start_even() { check_consistency `get_site 2` } ## TEST: slow_start_granted ## # slow start setup_slow_start_granted() { grant_ticket_cib 1 || return 1 stop_booth || return 1 #wait_timeout } test_slow_start_granted() { for serv in $sites; do start_site $serv wait_timeout done for serv in $arbitrators; do start_arbitrator $serv wait_timeout done } check_slow_start_granted() { check_consistency `get_site 1` } ## TEST: restart_granted ## # restart with ticket granted setup_restart_granted() { grant_ticket_cib 1 } test_restart_granted() { restart_site `get_site 1` || return 1 wait_timeout } check_restart_granted() { check_consistency `get_site 1` } ## TEST: reload_granted ## # reload with ticket granted setup_reload_granted() { grant_ticket_cib 1 } test_reload_granted() { reload_site `get_site 1` || return 1 wait_timeout } check_reload_granted() { check_consistency `get_site 1` } ## TEST: restart_granted_nocib ## # restart with ticket granted (but cib empty) setup_restart_granted_nocib() { grant_ticket_cib 1 } test_restart_granted_nocib() { stop_site_clean `get_site 1` || return 1 #wait_timeout start_site `get_site 1` || return 1 wait_timeout wait_timeout wait_timeout } check_restart_granted_nocib() { check_consistency `get_site 1` } ## TEST: restart_notgranted ## # restart with ticket not granted setup_restart_notgranted() { grant_ticket_cib 1 } test_restart_notgranted() { stop_site `get_site 2` || return 1 #sleep 1 start_site `get_site 2` || return 1 wait_timeout } check_restart_notgranted() { check_consistency `get_site 1` } ## TEST: failover ## # ticket failover setup_failover() { grant_ticket 1 [ -n "`get_attr`" ] && set_site_attr 2 return 0 } test_failover() { stop_site_clean `get_site 1` || return 1 booth_status `get_site 1` && return 1 wait_exp wait_timeout wait_timeout wait_timeout } check_failover() { check_consistency any } recover_failover() { start_site `get_site 1` } ## TEST: split_leader ## # split brain (leader alone) setup_split_leader() { grant_ticket_cib 1 [ -n "`get_attr`" ] && set_site_attr 2 return 0 } test_split_leader() { run_site 1 $iprules stop $port >/dev/null wait_exp wait_timeout wait_timeout wait_timeout wait_timeout check_cib any || return 1 run_site 1 $iprules start $port >/dev/null wait_timeout wait_timeout wait_timeout } check_split_leader() { check_consistency any } recover_split_leader() { run_site 1 $iprules start $port >/dev/null } ## TEST: split_follower ## # split brain (follower alone) setup_split_follower() { grant_ticket_cib 1 } test_split_follower() { run_site 2 $iprules stop $port >/dev/null wait_exp wait_timeout run_site 2 $iprules start $port >/dev/null wait_timeout } check_split_follower() { check_consistency `get_site 1` } ## TEST: split_edge ## # split brain (leader alone) setup_split_edge() { grant_ticket_cib 1 } test_split_edge() { run_site 1 $iprules stop $port >/dev/null wait_exp run_site 1 $iprules start $port >/dev/null wait_timeout wait_timeout } check_split_edge() { check_consistency any } ## TEST: external_prog_failed ## # external test prog failed setup_external_prog_failed() { grant_ticket 1 || return 1 [ -n "`get_attr`" ] && set_site_attr 2 break_external_prog 1 show_pref 1 || return 1 } test_external_prog_failed() { wait_renewal wait_timeout } check_external_prog_failed() { check_consistency any && [ `booth_where_granted` != `get_site 1` ] } recover_external_prog_failed() { repair_external_prog 1 } applicable_external_prog_failed() { [ -n "`get_rsc`" ] } ## TEST: attr_prereq_ok ## # failover with attribute prerequisite setup_attr_prereq_ok() { grant_ticket 1 || return 1 set_site_attr 2 stop_site_clean `get_site 1` booth_status `get_site 1` && return 1 return 0 } test_attr_prereq_ok() { wait_exp wait_timeout } check_attr_prereq_ok() { check_consistency `get_site 2` } recover_attr_prereq_ok() { start_site `get_site 1` del_site_attr 2 } applicable_attr_prereq_ok() { [ -n "`get_attr`" ] } ## TEST: attr_prereq_fail ## # failover with failed attribute prerequisite setup_attr_prereq_fail() { grant_ticket 1 || return 1 del_site_attr 2 >/dev/null 2>&1 stop_site_clean `get_site 1` booth_status `get_site 1` && return 1 return 0 } test_attr_prereq_fail() { wait_exp wait_exp wait_exp } check_attr_prereq_fail() { check_consistency && booth_where_granted | grep -qwi none } recover_attr_prereq_fail() { start_site `get_site 1` } applicable_attr_prereq_fail() { [ -n "`get_attr`" ] } # # environment modifications # # packet loss at one site 30% NETEM_ENV_single_loss() { run_site 1 $ABSPATH $run_cnf __netem__ netem_loss ${1:-30} PKT_LOSS=${1:-30} } # packet loss everywhere 30% NETEM_ENV_loss() { forall $ABSPATH $run_cnf __netem__ netem_loss ${1:-30} PKT_LOSS=${1:-30} } # network delay 100ms NETEM_ENV_net_delay() { forall $ABSPATH $run_cnf __netem__ netem_delay ${1:-100} } # duplicate packets NETEM_ENV_duplicate() { forall $ABSPATH $run_cnf __netem__ netem_duplicate ${1:-10} } # reorder packets NETEM_ENV_reorder() { forall $ABSPATH $run_cnf __netem__ netem_reorder ${1:-25} ${2:-50} } # need this if we're run from a local directory or such get_prog_abspath() { local p p=`run_site 1 rpm -ql booth-test | fgrep -w $PROG` echo ${p:-/usr/share/booth/tests/test/live_test.sh} } [ -f "$cnf" ] || { echo "ERROR: configuration file $cnf doesn't exist" usage 1 } is_pacemaker_running || { echo "ERROR: sites must run pacemaker" exit 1 } -sites=`get_value site < $cnf` -arbitrators=`get_value arbitrator < $cnf` +sites=`get_servers site < $cnf` +arbitrators=`get_servers arbitrator < $cnf` all_nodes=`get_all_nodes` port=`get_value port < $cnf` : ${port:=9929} site_cnt=`echo $sites | wc -w` arbitrator_cnt=`echo $arbitrators | wc -w` tkt=`get_tkt < $cnf` eval `get_tkt_settings` MIN_TIMEOUT=`awk -v tm=$T_timeout 'BEGIN{ if (tm >= 2) print tm; else print 2*tm; }'` if [ "$1" = "__netem__" ]; then shift 1 _JUST_NETEM=1 local_netem_env $@ exit fi [ -z "$sites" ] && { echo no sites in $cnf usage 1 } [ -z "$T_expire" ] && { echo set $tkt expire time in $cnf usage 1 } if [ -z "$T_renewal_freq" ]; then T_renewal_freq=$((T_expire/2)) fi exec 2>$logf BASH_XTRACEFD=2 PS4='+ `date +"%T"`: ' set -x WE_SERVER="" is_we_server && WE_SERVER=1 PREFNAME=__pref_booth_live_test authfile=`get_value authfile < $cnf` run_site 1 'test -f '"$authfile"' || booth-keygen '"$authfile" sync_conf || exit reboot_test all_booth_status || { start_booth all_booth_status || { echo "some booth servers couldn't be started" exit 1 } } revoke_ticket ABSPATH=`get_prog_abspath` dump_conf | logmsg TESTS="$@" : ${TESTS:="grant longgrant grant_noarb grant_elsewhere grant_site_lost grant_site_reappear revoke simultaneous_start_even slow_start_granted restart_granted reload_granted restart_granted_nocib restart_notgranted failover split_leader split_follower split_edge external_prog_failed attr_prereq_ok attr_prereq_fail"} master_rc=0 # updated in runtest for t in $TESTS; do runtest $t done exit $master_rc