Skip to content

Commit

Permalink
Check peer listener failed in IC-PROXY mode (#16438)
Browse files Browse the repository at this point in the history
In IC-PROXY mode, the user doesn't get a notification when the peer listener bind/listen fails. Because the related code runs in the IC-PROXY process, it only records warning logs, so the user's query just hangs silently. This behavior can leave the user very confused.

This commit introduces a shared-memory (SHM) variable so backends can detect the failure and report it to the user promptly. An output example:
```
# using nc to occupy one ic-proxy process port
$ nc -l 2001
...
$ PGOPTIONS="-c gp_interconnect_type=proxy" psql demo
demo=# select count(*) from test;
ERROR:  SetupInterconnect: We are in IC_PROXY mode, but IC-Proxy Listener failed, please check. (ic_tcp.c:1296)  (seg1 slice1 127.0.1.1:6001 pid=189355) (ic_tcp.c:1296)
```
  • Loading branch information
interma authored Oct 8, 2023
1 parent 64e9d0e commit 8915cd0
Show file tree
Hide file tree
Showing 8 changed files with 191 additions and 1 deletion.
20 changes: 20 additions & 0 deletions src/backend/cdb/motion/ic_proxy_backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "cdb/cdbvars.h"
#include "cdb/ml_ipc.h"
#include "executor/execdesc.h"
#include "storage/shmem.h"

#include "ic_proxy.h"
#include "ic_proxy_backend.h"
Expand Down Expand Up @@ -517,6 +518,25 @@ ic_proxy_backend_init_context(ChunkTransportState *state)
uv_unref((uv_handle_t *)&context->connectTimer);
}

/*
 * Check whether the current segment's IC-PROXY peer listener failed.
 *
 * The failure flag lives in shared memory; it is set by the ic-proxy server
 * process when its peer listener bind()/listen() fails, and read here by
 * backends so SetupInterconnect can raise an ERROR instead of letting the
 * query hang silently.
 *
 * Returns true when the listener is known to have failed.
 */
bool
ic_proxy_backend_check_listener_failed(void)
{
	/*
	 * Attach to the shared-memory flag only once per backend: the address is
	 * stable for the life of the process, so there is no need to repeat the
	 * shmem-index lookup (and take its lock) on every query setup.
	 */
	if (ic_proxy_peer_listener_failed == NULL)
	{
		bool		found;

		ic_proxy_peer_listener_failed =
			ShmemInitStruct("IC_PROXY Listener Failure Flag",
							sizeof(*ic_proxy_peer_listener_failed),
							&found);
		Assert(ic_proxy_peer_listener_failed != NULL);

		/* the first process to attach initializes the flag to 0 */
		if (!found)
			pg_atomic_init_u32(ic_proxy_peer_listener_failed, 0);
	}

	return pg_atomic_read_u32(ic_proxy_peer_listener_failed) > 0;
}

/*
* Close the icproxy backend context
*/
Expand Down
4 changes: 4 additions & 0 deletions src/backend/cdb/motion/ic_proxy_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#define IC_PROXY_BACKEND_H

#include "postgres.h"
#include "port/atomics.h"

#include "cdb/cdbinterconnect.h"

Expand All @@ -38,12 +39,15 @@ typedef struct ICProxyBackendContext
ChunkTransportState *transportState;
} ICProxyBackendContext;

extern pg_atomic_uint32 *ic_proxy_peer_listener_failed;

extern void ic_proxy_backend_connect(ICProxyBackendContext *context,
ChunkTransportStateEntry *pEntry,
MotionConn *conn, bool isSender);

extern void ic_proxy_backend_init_context(ChunkTransportState *state);
extern void ic_proxy_backend_close_context(ChunkTransportState *state);
extern void ic_proxy_backend_run_loop(ICProxyBackendContext *context);
extern bool ic_proxy_backend_check_listener_failed(void);

#endif /* IC_PROXY_BACKEND_H */
21 changes: 20 additions & 1 deletion src/backend/cdb/motion/ic_proxy_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "storage/shmem.h"

#include "ic_proxy_server.h"
#include "ic_proxy_addr.h"
Expand All @@ -39,6 +40,8 @@ static uv_timer_t ic_proxy_server_timer;
static uv_tcp_t ic_proxy_peer_listener;
static bool ic_proxy_peer_listening;
static bool ic_proxy_peer_relistening;
/* flag (in shared memory) indicating whether the peer listener bind/listen failed */
pg_atomic_uint32 *ic_proxy_peer_listener_failed;

static uv_pipe_t ic_proxy_client_listener;
static bool ic_proxy_client_listening;
Expand Down Expand Up @@ -154,8 +157,12 @@ ic_proxy_server_peer_listener_init(uv_loop_t *loop)
if (ic_proxy_addrs == NIL)
return;

Assert(ic_proxy_peer_listener_failed != NULL);
if (ic_proxy_peer_listening)
{
Assert(pg_atomic_read_u32(ic_proxy_peer_listener_failed) == 0);
return;
}

/* Get the addr from the gp_interconnect_proxy_addresses */
addr = ic_proxy_get_my_addr();
Expand Down Expand Up @@ -195,6 +202,7 @@ ic_proxy_server_peer_listener_init(uv_loop_t *loop)
{
elog(WARNING, "ic-proxy: tcp: failed to bind: %s",
uv_strerror(ret));
pg_atomic_exchange_u32(ic_proxy_peer_listener_failed, 1);
return;
}

Expand All @@ -204,13 +212,15 @@ ic_proxy_server_peer_listener_init(uv_loop_t *loop)
{
elog(WARNING, "ic-proxy: tcp: failed to listen: %s",
uv_strerror(ret));
pg_atomic_exchange_u32(ic_proxy_peer_listener_failed, 1);
return;
}

uv_fileno((uv_handle_t *) listener, &fd);
elogif(gp_log_interconnect >= GPVARS_VERBOSITY_VERBOSE, LOG,
"ic-proxy: tcp: listening on socket %d", fd);

pg_atomic_exchange_u32(ic_proxy_peer_listener_failed, 0);
ic_proxy_peer_listening = true;
}

Expand Down Expand Up @@ -527,10 +537,19 @@ int
ic_proxy_server_main(void)
{
char path[MAXPGPATH];

bool found;
elogif(gp_log_interconnect >= GPVARS_VERBOSITY_TERSE,
LOG, "ic-proxy: server setting up");

/* get and init failure flag */
ic_proxy_peer_listener_failed = ShmemInitStruct("IC_PROXY Listener Failure Flag",
sizeof(*ic_proxy_peer_listener_failed),
&found);
if (!found)
pg_atomic_init_u32(ic_proxy_peer_listener_failed, 0);
else
pg_atomic_exchange_u32(ic_proxy_peer_listener_failed, 0);

ic_proxy_pkt_cache_init(IC_PROXY_MAX_PKT_SIZE);

uv_loop_init(&ic_proxy_server_loop);
Expand Down
2 changes: 2 additions & 0 deletions src/backend/cdb/motion/ic_tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1292,6 +1292,8 @@ SetupTCPInterconnect(EState *estate)
interconnect_context->doSendStopMessage = doSendStopMessageTCP;

#ifdef ENABLE_IC_PROXY
if (ic_proxy_backend_check_listener_failed())
elog(ERROR, "SetupInterconnect: We are in IC_PROXY mode, but IC-Proxy Listener failed, please check.");
ic_proxy_backend_init_context(interconnect_context);
#endif /* ENABLE_IC_PROXY */

Expand Down
93 changes: 93 additions & 0 deletions src/test/isolation2/expected/ic_proxy_listen_failed.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
-- Test case for the scenario which ic-proxy peer listener port has been occupied

-- start_matchsubs
-- m/ic_tcp.c:\d+/
-- s/ic_tcp.c:\d+/ic_tcp.c:LINE/
-- end_matchsubs

1:create table PR_16438 (i int);
CREATE TABLE
1:insert into PR_16438 select generate_series(1,100);
INSERT 0 100
1q: ... <quitting>

-- get one port and occupy it (start_py_httpserver.sh), then restart cluster
-- start_ignore
! ic_proxy_port=`psql postgres -Atc "show gp_interconnect_proxy_addresses;" | awk -F ',' '{print $1}' | awk -F ':' '{print $4}'` && gpstop -ai && ./script/start_py_httpserver.sh $ic_proxy_port;
20230917:16:14:48:484878 gpstop:gpdb:ubuntu-[INFO]:-Starting gpstop with args: -ai
20230917:16:14:48:484878 gpstop:gpdb:ubuntu-[INFO]:-Gathering information and validating the environment...
20230917:16:14:48:484878 gpstop:gpdb:ubuntu-[INFO]:-Obtaining Greenplum Coordinator catalog information
20230917:16:14:48:484878 gpstop:gpdb:ubuntu-[INFO]:-Obtaining Segment details from coordinator...
20230917:16:14:48:484878 gpstop:gpdb:ubuntu-[INFO]:-Greenplum Version: 'postgres (Greenplum Database) 7.0.0-alpha.0+dev.19124.g5c36a44ab0 build dev'
20230917:16:14:48:484878 gpstop:gpdb:ubuntu-[INFO]:-Commencing Coordinator instance shutdown with mode='immediate'
20230917:16:14:48:484878 gpstop:gpdb:ubuntu-[INFO]:-Coordinator segment instance directory=/home/ubuntu/gp/data7/master/gpseg-1
20230917:16:14:49:484878 gpstop:gpdb:ubuntu-[INFO]:-Attempting forceful termination of any leftover coordinator process
20230917:16:14:49:484878 gpstop:gpdb:ubuntu-[INFO]:-Terminating processes for segment /home/ubuntu/gp/data7/master/gpseg-1
20230917:16:14:52:484878 gpstop:gpdb:ubuntu-[INFO]:-No standby coordinator host configured
20230917:16:14:52:484878 gpstop:gpdb:ubuntu-[INFO]:-Targeting dbid [2, 5, 3, 6, 4, 7] for shutdown
20230917:16:14:52:484878 gpstop:gpdb:ubuntu-[INFO]:-Commencing parallel primary segment instance shutdown, please wait...
20230917:16:14:52:484878 gpstop:gpdb:ubuntu-[INFO]:-0.00% of jobs completed
20230917:16:14:56:484878 gpstop:gpdb:ubuntu-[INFO]:-100.00% of jobs completed
20230917:16:14:56:484878 gpstop:gpdb:ubuntu-[INFO]:-Commencing parallel mirror segment instance shutdown, please wait...
20230917:16:14:56:484878 gpstop:gpdb:ubuntu-[INFO]:-0.00% of jobs completed
20230917:16:14:58:484878 gpstop:gpdb:ubuntu-[INFO]:-100.00% of jobs completed
20230917:16:14:58:484878 gpstop:gpdb:ubuntu-[INFO]:-----------------------------------------------------
20230917:16:14:58:484878 gpstop:gpdb:ubuntu-[INFO]:- Segments stopped successfully = 6
20230917:16:14:58:484878 gpstop:gpdb:ubuntu-[INFO]:- Segments with errors during stop = 0
20230917:16:14:58:484878 gpstop:gpdb:ubuntu-[INFO]:-----------------------------------------------------
20230917:16:14:58:484878 gpstop:gpdb:ubuntu-[INFO]:-Successfully shutdown 6 of 6 segment instances
20230917:16:14:58:484878 gpstop:gpdb:ubuntu-[INFO]:-Database successfully shutdown with no errors reported
started a http server

! sleep 2 && gpstart -a;
20230917:16:15:00:488914 gpstart:gpdb:ubuntu-[INFO]:-Starting gpstart with args: -a
20230917:16:15:00:488914 gpstart:gpdb:ubuntu-[INFO]:-Gathering information and validating the environment...
20230917:16:15:00:488914 gpstart:gpdb:ubuntu-[INFO]:-Greenplum Binary Version: 'postgres (Greenplum Database) 7.0.0-alpha.0+dev.19124.g5c36a44ab0 build dev'
20230917:16:15:00:488914 gpstart:gpdb:ubuntu-[INFO]:-Greenplum Catalog Version: '302307241'
20230917:16:15:00:488914 gpstart:gpdb:ubuntu-[INFO]:-Starting Coordinator instance in admin mode
20230917:16:15:00:488914 gpstart:gpdb:ubuntu-[INFO]:-CoordinatorStart pg_ctl cmd is env GPSESSID=0000000000 GPERA=None $GPHOME/bin/pg_ctl -D /home/ubuntu/gp/data7/master/gpseg-1 -l /home/ubuntu/gp/data7/master/gpseg-1/log/startup.log -w -t 600 -o " -c gp_role=utility " start
20230917:16:15:01:488914 gpstart:gpdb:ubuntu-[INFO]:-Obtaining Greenplum Coordinator catalog information
20230917:16:15:01:488914 gpstart:gpdb:ubuntu-[INFO]:-Obtaining Segment details from coordinator...
20230917:16:15:01:488914 gpstart:gpdb:ubuntu-[INFO]:-Setting new coordinator era
20230917:16:15:01:488914 gpstart:gpdb:ubuntu-[INFO]:-Coordinator Started...
20230917:16:15:01:488914 gpstart:gpdb:ubuntu-[INFO]:-Shutting down coordinator
20230917:16:15:04:488914 gpstart:gpdb:ubuntu-[INFO]:-Commencing parallel primary and mirror segment instance startup, please wait...
.
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-Process results...
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-----------------------------------------------------
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:- Successful segment starts = 6
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:- Failed segment starts = 0
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:- Skipped segment starts (segments are marked down in configuration) = 0
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-----------------------------------------------------
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-Successfully started 6 of 6 segment instances
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-----------------------------------------------------
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-Starting Coordinator instance gpdb directory /home/ubuntu/gp/data7/master/gpseg-1
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-CoordinatorStart pg_ctl cmd is env GPSESSID=0000000000 GPERA=c25b3b36b0fe1320_230917161501 $GPHOME/bin/pg_ctl -D /home/ubuntu/gp/data7/master/gpseg-1 -l /home/ubuntu/gp/data7/master/gpseg-1/log/startup.log -w -t 600 -o " -c gp_role=dispatch " start
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-Command pg_ctl reports Coordinator gpdb instance active
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-Connecting to db template1 on host localhost
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-No standby coordinator configured. skipping...
20230917:16:15:05:488914 gpstart:gpdb:ubuntu-[INFO]:-Database successfully started

-- end_ignore

-- execute a query (should fail)
2&:select count(*) from PR_16438; <waiting ...>
FAILED: Forked command is not blocking; got output: ERROR: SetupInterconnect: We are in IC_PROXY mode, but IC-Proxy Listener failed, please check. (ic_tcp.c:1296)
2<: <... completed>
FAILED: Execution failed

-- kill the script to release the port and execute the query again (should succeed)
-- start_ignore
! ps aux | grep http.server | grep -v grep | awk '{print $2}' | xargs kill;

! sleep 2;

-- end_ignore
3:select count(*) from PR_16438;
count
-------
100
(1 row)
3:drop table PR_16438;
DROP TABLE
3 changes: 3 additions & 0 deletions src/test/isolation2/isolation2_ic_proxy_schedule
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ test: tcp_ic_teardown

# test TCP proxy peer shutdown
test: ic_proxy_peer_shutdown

# test ic-proxy listen failed
test: ic_proxy_listen_failed
21 changes: 21 additions & 0 deletions src/test/isolation2/script/start_py_httpserver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Start a Python HTTP server on port $1 in the background.
#
# Tries python3 first, then falls back to python2. Prints a status line so
# callers (e.g. the ic_proxy_listen_failed isolation test) can match the
# output. Exits 0 on success, 1 when no Python interpreter is available.

port=$1

# Prefer python3's built-in http.server module.
# `command -v` is the POSIX way to probe for an executable (unlike `which`).
if command -v python3 > /dev/null
then
    python3 -m http.server "$port" >/dev/null 2>&1 &
    echo "started a http server"
    exit 0
fi

# Fall back to python2's SimpleHTTPServer module.
if command -v python2 > /dev/null
then
    python2 -m SimpleHTTPServer "$port" >/dev/null 2>&1 &
    echo "started a http server"
    exit 0
fi

echo "no python found"
exit 1
28 changes: 28 additions & 0 deletions src/test/isolation2/sql/ic_proxy_listen_failed.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Test case for the scenario which ic-proxy peer listener port has been occupied

-- start_matchsubs
-- m/ic_tcp.c:\d+/
-- s/ic_tcp.c:\d+/ic_tcp.c:LINE/
-- end_matchsubs

1:create table PR_16438 (i int);
1:insert into PR_16438 select generate_series(1,100);
1q:

-- get one port and occupy it (start_py_httpserver.sh), then restart cluster
-- start_ignore
! ic_proxy_port=`psql postgres -Atc "show gp_interconnect_proxy_addresses;" | awk -F ',' '{print $1}' | awk -F ':' '{print $4}'` && gpstop -ai && ./script/start_py_httpserver.sh $ic_proxy_port;
! sleep 2 && gpstart -a;
-- end_ignore

-- execute a query (should fail)
2&:select count(*) from PR_16438;
2<:

-- kill the script to release the port and execute the query again (should succeed)
-- start_ignore
! ps aux | grep http.server | grep -v grep | awk '{print $2}' | xargs kill;
! sleep 2;
-- end_ignore
3:select count(*) from PR_16438;
3:drop table PR_16438;

0 comments on commit 8915cd0

Please sign in to comment.