From 715816e5f84e543f08a995f623147f76e33f2da0 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Sat, 13 Oct 2018 04:14:01 +0000 Subject: [PATCH] btl tcp: Simplify modex address selection Simplify selection of the address to publish for a given BTL TCP module in the module exchange code. Rather than looping through all IP addresses associated with a node, looking for one that matches the kindex of a module, loop over the modules and use the address stored in the module structure. This also happens to be the address that the source will use to bind() in a connect() call, so this should eliminate any confusion (read: bugs) when an interface has multiple IPs associated with it. Refs #5818 Signed-off-by: Brian Barrett --- opal/mca/btl/tcp/btl_tcp_addr.h | 60 ++++++----- opal/mca/btl/tcp/btl_tcp_component.c | 143 +++++++++++---------------- opal/mca/btl/tcp/btl_tcp_proc.c | 130 ++++++++++++++---------- 3 files changed, 169 insertions(+), 164 deletions(-) diff --git a/opal/mca/btl/tcp/btl_tcp_addr.h b/opal/mca/btl/tcp/btl_tcp_addr.h index aa53535cce0..26acfd29304 100644 --- a/opal/mca/btl/tcp/btl_tcp_addr.h +++ b/opal/mca/btl/tcp/btl_tcp_addr.h @@ -33,40 +33,52 @@ /** - * Structure used to publish TCP connection information to peers. + * Modex address structure. + * + * One of these structures will be sent for every btl module in use by + * the local BTL TCP component. */ +struct mca_btl_tcp_modex_addr_t { + uint8_t addr[16]; /* endpoint address. for addr_family + of MCA_BTL_TCP_AF_INET, only the + first 4 bytes have meaning. */ + uint32_t addr_ifkindex; /* endpoint kernel index */ + uint16_t addr_port; /* endpoint listen port */ + uint8_t addr_family; /* endpoint address family. Note that + this is + MCA_BTL_TCP_AF_{INET,INET6}, not + the traditional + AF_INET/AF_INET6. */ + uint8_t padding[1]; /* padd out to an 8-byte word */ +}; +typedef struct mca_btl_tcp_modex_addr_t mca_btl_tcp_modex_addr_t; + +/** + * Remote peer address structure + * + * One of these structures will be allocated for every remote endpoint + * associated with a remote proc. The data is pulled from the + * mca_btl_tcp_modex_addr_t structure, except for the addr_inuse + * field, which is local. + */ struct mca_btl_tcp_addr_t { - /* the following information is exchanged between different - machines (read: byte order), so use network byte order - for everything and don't add padding - */ + union { + struct in_addr addr_inet; /* IPv6 listen address */ #if OPAL_ENABLE_IPV6 - struct in6_addr addr_inet; /**< IPv4/IPv6 listen address > */ -#else - /* Bug, FIXME: needs testing */ - struct my_in6_addr { - union { - uint32_t u6_addr32[4]; - struct _my_in6_addr { - struct in_addr _addr_inet; - uint32_t _pad[3]; - } _addr__inet; - } _union_inet; - } addr_inet; + struct in6_addr addr_inet6; /* IPv6 listen address */ #endif - in_port_t addr_port; /**< listen port */ - uint16_t addr_ifkindex; /**< remote interface index assigned with + }; + in_port_t addr_port; /**< listen port */ + int addr_ifkindex; /**< remote interface index assigned with this address */ - unsigned short addr_inuse; /**< local meaning only */ - uint8_t addr_family; /**< AF_INET or AF_INET6 */ + uint8_t addr_family; /**< AF_INET or AF_INET6 */ + bool addr_inuse; /**< local meaning only */ }; typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t; #define MCA_BTL_TCP_AF_INET 0 -#if OPAL_ENABLE_IPV6 -# define MCA_BTL_TCP_AF_INET6 1 -#endif +#define MCA_BTL_TCP_AF_INET6 1 #endif diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 911b47e1153..47d1621b25a 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -1117,97 +1117,64 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family) static int mca_btl_tcp_component_exchange(void) { - int rc = 0, index; - size_t i = 0; - size_t size = mca_btl_tcp_component.tcp_addr_count * - mca_btl_tcp_component.tcp_num_links * sizeof(mca_btl_tcp_addr_t); - /* adi@2007-04-12: - * - * We'll need to explain things a bit here: - * 1. We normally have as many BTLs as physical NICs. - * 2. With num_links, we now have num_btl = num_links * #NICs - * 3. we might have more than one address per NIC - */ - size_t xfer_size = 0; /* real size to transfer (may differ from 'size') */ - size_t current_addr = 0; - - if(mca_btl_tcp_component.tcp_num_btls != 0) { - char ifn[32]; - mca_btl_tcp_addr_t *addrs = (mca_btl_tcp_addr_t *)malloc(size); - memset(addrs, 0, size); - - /* here we start populating our addresses */ - for( i = 0; i < mca_btl_tcp_component.tcp_num_btls; i++ ) { - for (index = opal_ifbegin(); index >= 0; - index = opal_ifnext(index)) { - struct sockaddr_storage my_ss; - - /* Look if the module's address belongs to this - * kernel IP interface. If not, go to next address. - */ - if (opal_ifindextokindex (index) != - mca_btl_tcp_component.tcp_btls[i]->tcp_ifkindex) { - continue; - } - - opal_ifindextoname(index, ifn, sizeof(ifn)); - opal_output_verbose(30, opal_btl_base_framework.framework_output, - "btl: tcp: component_exchange: examining interface %s", - ifn); - if (OPAL_SUCCESS != - opal_ifindextoaddr(index, (struct sockaddr*) &my_ss, - sizeof (my_ss))) { - opal_output (0, - "btl_tcp_component: problems getting address for index %i (kernel index %i)\n", - index, opal_ifindextokindex (index)); - continue; - } + int rc; + size_t i; + size_t num_btls = mca_btl_tcp_component.tcp_num_btls; + size_t size = num_btls * sizeof(mca_btl_tcp_modex_addr_t); + mca_btl_tcp_modex_addr_t *addrs; + + if (num_btls <= 0) { + return 0; + } + + addrs = (mca_btl_tcp_modex_addr_t*)malloc(size); + if (NULL == addrs) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + memset(addrs, 0, size); + + for (i = 0 ; i < num_btls ; i++) { + struct mca_btl_tcp_module_t *btl = mca_btl_tcp_component.tcp_btls[i]; + struct sockaddr *addr = (struct sockaddr*)&(btl->tcp_ifaddr); #if OPAL_ENABLE_IPV6 - if ((AF_INET6 == my_ss.ss_family) && - (6 != mca_btl_tcp_component.tcp_disable_family)) { - memcpy(&addrs[current_addr].addr_inet, - &((struct sockaddr_in6*)&my_ss)->sin6_addr, - sizeof(addrs[0].addr_inet)); - addrs[current_addr].addr_port = - mca_btl_tcp_component.tcp6_listen_port; - addrs[current_addr].addr_family = MCA_BTL_TCP_AF_INET6; - xfer_size += sizeof (mca_btl_tcp_addr_t); - addrs[current_addr].addr_inuse = 0; - addrs[current_addr].addr_ifkindex = - opal_ifindextokindex (index); - current_addr++; - opal_output_verbose(30, opal_btl_base_framework.framework_output, - "btl: tcp: component_exchange: " - "%s IPv6 %s", ifn, - opal_net_get_hostname((struct sockaddr*)&my_ss)); - } else + if (AF_INET6 == addr->sa_family) { + struct sockaddr_in6 *inaddr6 = (struct sockaddr_in6*)addr; + + memcpy(&addrs[i].addr, &(inaddr6->sin6_addr), + sizeof(struct in6_addr)); + addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port; + addrs[i].addr_ifkindex = btl->tcp_ifkindex; + addrs[i].addr_family = MCA_BTL_TCP_AF_INET6; + opal_output_verbose(5, opal_btl_base_framework.framework_output, + "btl: tcp: exchange: %d %d IPv6 %s", + (int)i, btl->tcp_ifkindex, + opal_net_get_hostname(addr)); + } else #endif - if ((AF_INET == my_ss.ss_family) && - (4 != mca_btl_tcp_component.tcp_disable_family)) { - memcpy(&addrs[current_addr].addr_inet, - &((struct sockaddr_in*)&my_ss)->sin_addr, - sizeof(struct in_addr)); - addrs[current_addr].addr_port = - mca_btl_tcp_component.tcp_listen_port; - addrs[current_addr].addr_family = MCA_BTL_TCP_AF_INET; - xfer_size += sizeof (mca_btl_tcp_addr_t); - addrs[current_addr].addr_inuse = 0; - addrs[current_addr].addr_ifkindex = - opal_ifindextokindex (index); - current_addr++; - opal_output_verbose(30, opal_btl_base_framework.framework_output, - "btl: tcp: component_exchange: " - "%s IPv4 %s", ifn, - opal_net_get_hostname((struct sockaddr*)&my_ss)); - } - } /* end of for opal_ifbegin() */ - } /* end of for tcp_num_btls */ - OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, - &mca_btl_tcp_component.super.btl_version, - addrs, xfer_size); - free(addrs); - } /* end if */ + if (AF_INET == addr->sa_family) { + struct sockaddr_in *inaddr = (struct sockaddr_in*)addr; + + memcpy(&addrs[i].addr, &(inaddr->sin_addr), + sizeof(struct in_addr)); + addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port; + addrs[i].addr_ifkindex = btl->tcp_ifkindex; + addrs[i].addr_family = MCA_BTL_TCP_AF_INET; + opal_output_verbose(5, opal_btl_base_framework.framework_output, + "btl: tcp: exchange: %d %d IPv4 %s", + (int)i, btl->tcp_ifkindex, + opal_net_get_hostname(addr)); + } else { + BTL_ERROR(("Unexpected address family: %d", addr->sa_family)); + return OPAL_ERR_BAD_PARAM; + } + } + + OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, + &mca_btl_tcp_component.super.btl_version, + addrs, size); + free(addrs); + return rc; } diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index a1bec00b828..e84a26cbf5c 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c @@ -117,73 +117,99 @@ void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc) mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) { mca_btl_tcp_proc_t* btl_proc; - size_t size; int rc; + mca_btl_tcp_modex_addr_t *remote_addrs = NULL; + size_t i, size; OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); rc = opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs, proc->proc_name, (void**)&btl_proc); - if(OPAL_SUCCESS == rc) { + if (OPAL_SUCCESS == rc) { OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); return btl_proc; } - do { /* This loop is only necessary so that we can break out of the serial code */ - btl_proc = OBJ_NEW(mca_btl_tcp_proc_t); - if(NULL == btl_proc) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - break; - } - - /* Retain the proc, but don't store the ref into the btl_proc just yet. This - * provides a way to release the btl_proc in case of failure without having to - * unlock the mutex. - */ - OBJ_RETAIN(proc); - - /* lookup tcp parameters exported by this proc */ - OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version, - &proc->proc_name, (uint8_t**)&btl_proc->proc_addrs, &size); - if(rc != OPAL_SUCCESS) { - if(OPAL_ERR_NOT_FOUND != rc) - BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc)); - break; - } + /* proc was not found, so create one */ + btl_proc = OBJ_NEW(mca_btl_tcp_proc_t); + if (NULL == btl_proc) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + goto cleanup; + } - if(0 != (size % sizeof(mca_btl_tcp_addr_t))) { - BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n", - (unsigned long) size, (unsigned long)sizeof(mca_btl_tcp_addr_t))); - rc = OPAL_ERROR; - break; + /* Retain the proc, but don't store the ref into the btl_proc just yet. This + * provides a way to release the btl_proc in case of failure without having to + * unlock the mutex. + */ + OBJ_RETAIN(proc); + + /* lookup tcp parameters exported by this proc */ + OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version, + &proc->proc_name, (uint8_t**)&remote_addrs, &size); + if (OPAL_SUCCESS != rc) { + if (OPAL_ERR_NOT_FOUND != rc) { + BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc)); } + goto cleanup; + } - btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_addr_t); + if (0 != (size % sizeof(mca_btl_tcp_modex_addr_t))) { + BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n", + (unsigned long)size, + (unsigned long)sizeof(mca_btl_tcp_modex_addr_t))); + rc = OPAL_ERROR; + goto cleanup; + } - /* allocate space for endpoint array - one for each exported address */ - btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**) - malloc((1 + btl_proc->proc_addr_count) * - sizeof(mca_btl_base_endpoint_t*)); - if(NULL == btl_proc->proc_endpoints) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - break; - } + btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_modex_addr_t); + btl_proc->proc_addrs = malloc(btl_proc->proc_addr_count * + sizeof(mca_btl_tcp_addr_t)); + if (NULL == btl_proc->proc_addrs) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + goto cleanup; + } - /* convert the OPAL addr_family field to OS constants, - * so we can check for AF_INET (or AF_INET6) and don't have - * to deal with byte ordering anymore. - */ - for (unsigned int i = 0; i < btl_proc->proc_addr_count; i++) { - if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[i].addr_family) { - btl_proc->proc_addrs[i].addr_family = AF_INET; - } + /* the modex and proc structures differ slightly, so copy the + fields needed in the proc version */ + for (i = 0 ; i < btl_proc->proc_addr_count ; i++) { + if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) { + memcpy(&btl_proc->proc_addrs[i].addr_inet, + remote_addrs[i].addr, sizeof(struct in_addr)); + btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port; + btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex; + btl_proc->proc_addrs[i].addr_family = AF_INET; + btl_proc->proc_addrs[i].addr_inuse = false; + } else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) { #if OPAL_ENABLE_IPV6 - if (MCA_BTL_TCP_AF_INET6 == btl_proc->proc_addrs[i].addr_family) { - btl_proc->proc_addrs[i].addr_family = AF_INET6; - } + memcpy(&btl_proc->proc_addrs[i].addr_inet6, + remote_addrs[i].addr, sizeof(struct in6_addr)); + btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port; + btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex; + btl_proc->proc_addrs[i].addr_family = AF_INET6; + btl_proc->proc_addrs[i].addr_inuse = false; +#else + rc = OPAL_ERR_NOT_SUPPORTED; + goto cleanup; #endif + } else { + BTL_ERROR(("Unexpected address family %d", + (int)remote_addrs[i].addr_family)); + rc = OPAL_ERR_BAD_PARAM; + goto cleanup; } - } while (0); + } + + free(remote_addrs); + + /* allocate space for endpoint array - one for each exported address */ + btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**) + malloc((1 + btl_proc->proc_addr_count) * + sizeof(mca_btl_base_endpoint_t*)); + if (NULL == btl_proc->proc_endpoints) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + goto cleanup; + } +cleanup: if (OPAL_SUCCESS == rc) { btl_proc->proc_opal = proc; /* link with the proc */ /* add to hash table of all proc instance. */ @@ -669,7 +695,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc, } peer_interfaces[best]->inuse++; btl_endpoint->endpoint_addr = proc_data->best_addr[i][best]; - btl_endpoint->endpoint_addr->addr_inuse++; + btl_endpoint->endpoint_addr->addr_inuse = true; rc = OPAL_SUCCESS; break; } @@ -695,7 +721,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc, if (CQ_NO_CONNECTION != max) { peer_interfaces[j_max]->inuse++; btl_endpoint->endpoint_addr = proc_data->best_addr[i_max][j_max]; - btl_endpoint->endpoint_addr->addr_inuse++; + btl_endpoint->endpoint_addr->addr_inuse = true; rc = OPAL_SUCCESS; } } @@ -772,7 +798,7 @@ int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_ being removed early in the wireup sequence (e.g., if it is unreachable by all other procs) */ if (NULL != btl_endpoint->endpoint_addr) { - btl_endpoint->endpoint_addr->addr_inuse--; + btl_endpoint->endpoint_addr->addr_inuse = false; } break; }