Skip to content

Commit

Permalink
btl tcp: Simplify modex address selection
Browse files Browse the repository at this point in the history
Simplify selection of the address to publish for a given BTL TCP
module in the module exchange code.  Rather than looping through
all IP addresses associated with a node, looking for one that
matches the kindex of a module, loop over the modules and
use the address stored in the module structure.  This also
happens to be the address that the source will use to bind()
in a connect() call, so this should eliminate any confusion
(read: bugs) when an interface has multiple IPs associated with
it.

Refs open-mpi#5818

Signed-off-by: Brian Barrett <bbarrett@amazon.com>
  • Loading branch information
bwbarrett committed Oct 17, 2018
1 parent ebbcf80 commit 715816e
Show file tree
Hide file tree
Showing 3 changed files with 169 additions and 164 deletions.
60 changes: 36 additions & 24 deletions opal/mca/btl/tcp/btl_tcp_addr.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,40 +33,52 @@


/**
 * Modex address structure: the TCP connection information published
 * to peers.
 *
 * One of these structures will be sent for every btl module in use by
 * the local BTL TCP component.
 *
 * The members are fixed-width types and the trailing padding is
 * explicit; the declared field sizes total 24 bytes (16 + 4 + 2 + 1 + 1).
 */
typedef struct mca_btl_tcp_modex_addr_t {
    /* Endpoint address.  When addr_family is MCA_BTL_TCP_AF_INET,
       only the first 4 bytes have meaning. */
    uint8_t  addr[16];

    /* Endpoint kernel interface index. */
    uint32_t addr_ifkindex;

    /* Endpoint listen port. */
    uint16_t addr_port;

    /* Endpoint address family.  Note that this holds
       MCA_BTL_TCP_AF_{INET,INET6}, not the traditional
       AF_INET/AF_INET6 values. */
    uint8_t  addr_family;

    /* Pad out to an 8-byte word. */
    uint8_t  padding[1];
} mca_btl_tcp_modex_addr_t;


/**
 * Remote peer address structure
 *
 * One of these structures will be allocated for every remote endpoint
 * associated with a remote proc.  The data is pulled from the
 * mca_btl_tcp_modex_addr_t structure, except for the addr_inuse
 * field, which is local.
 */
struct mca_btl_tcp_addr_t {
    /* Anonymous union: the members are accessed directly on the
       enclosing structure.  Presumably addr_family tells which member
       is meaningful -- confirm against the code that fills this in. */
    union {
        struct in_addr addr_inet;       /**< IPv4 listen address */
#if OPAL_ENABLE_IPV6
        struct in6_addr addr_inet6;     /**< IPv6 listen address */
#endif
    };
    in_port_t addr_port;     /**< listen port */
    int addr_ifkindex;       /**< remote interface index assigned with
                                  this address */
    uint8_t addr_family;     /**< AF_INET or AF_INET6 */
    bool addr_inuse;         /**< local meaning only */
};
typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t;

/* Address family tags stored in the addr_family field of
   mca_btl_tcp_modex_addr_t.  These are not the traditional
   AF_INET/AF_INET6 values.  Both are defined regardless of
   OPAL_ENABLE_IPV6. */
#define MCA_BTL_TCP_AF_INET  0
#define MCA_BTL_TCP_AF_INET6 1

#endif

143 changes: 55 additions & 88 deletions opal/mca/btl/tcp/btl_tcp_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -1117,97 +1117,64 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)

/**
 * Publish the local BTL TCP addresses through the modex.
 *
 * Builds one mca_btl_tcp_modex_addr_t per BTL TCP module, filled from
 * the address stored in the module itself (btl->tcp_ifaddr), and
 * sends the whole array with OPAL_MODEX_SEND.  Using the module's own
 * address, rather than searching every IP address on the node for a
 * matching kernel index, means the published address is the one the
 * source will bind() to before connect(), even when an interface has
 * several IPs associated with it.
 *
 * @return 0 when there are no modules to publish;
 *         OPAL_ERR_OUT_OF_RESOURCE if the address array cannot be
 *         allocated; OPAL_ERR_BAD_PARAM if a module's address is
 *         neither AF_INET nor (when IPv6 is enabled) AF_INET6;
 *         otherwise the status set by OPAL_MODEX_SEND.
 */
static int mca_btl_tcp_component_exchange(void)
{
    int rc;
    size_t i;
    size_t num_btls = mca_btl_tcp_component.tcp_num_btls;
    size_t size = num_btls * sizeof(mca_btl_tcp_modex_addr_t);
    mca_btl_tcp_modex_addr_t *addrs;

    if (0 == num_btls) {
        return 0;
    }

    /* Zero-initialized so that the unused tail of addr[] (for IPv4)
       and the padding byte have a defined value when sent. */
    addrs = calloc(num_btls, sizeof(*addrs));
    if (NULL == addrs) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    for (i = 0 ; i < num_btls ; i++) {
        struct mca_btl_tcp_module_t *btl = mca_btl_tcp_component.tcp_btls[i];
        struct sockaddr *addr = (struct sockaddr*)&(btl->tcp_ifaddr);

#if OPAL_ENABLE_IPV6
        if (AF_INET6 == addr->sa_family) {
            struct sockaddr_in6 *inaddr6 = (struct sockaddr_in6*)addr;

            memcpy(&addrs[i].addr, &(inaddr6->sin6_addr),
                   sizeof(struct in6_addr));
            addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port;
            addrs[i].addr_ifkindex = btl->tcp_ifkindex;
            addrs[i].addr_family = MCA_BTL_TCP_AF_INET6;
            opal_output_verbose(5, opal_btl_base_framework.framework_output,
                                "btl: tcp: exchange: %d %d IPv6 %s",
                                (int)i, btl->tcp_ifkindex,
                                opal_net_get_hostname(addr));
        } else
#endif
        if (AF_INET == addr->sa_family) {
            struct sockaddr_in *inaddr = (struct sockaddr_in*)addr;

            /* Only the first 4 bytes of addr[] are meaningful here. */
            memcpy(&addrs[i].addr, &(inaddr->sin_addr),
                   sizeof(struct in_addr));
            addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port;
            addrs[i].addr_ifkindex = btl->tcp_ifkindex;
            addrs[i].addr_family = MCA_BTL_TCP_AF_INET;
            opal_output_verbose(5, opal_btl_base_framework.framework_output,
                                "btl: tcp: exchange: %d %d IPv4 %s",
                                (int)i, btl->tcp_ifkindex,
                                opal_net_get_hostname(addr));
        } else {
            BTL_ERROR(("Unexpected address family: %d", addr->sa_family));
            /* Release the array before bailing out; nothing has been
               sent yet, so this path must not leak it. */
            free(addrs);
            return OPAL_ERR_BAD_PARAM;
        }
    }

    OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
                    &mca_btl_tcp_component.super.btl_version,
                    addrs, size);
    free(addrs);

    return rc;
}

Expand Down
Loading

0 comments on commit 715816e

Please sign in to comment.