Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable brpc use rdma #1836

Merged
merged 7 commits into from
Oct 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ COPTS = [
}) + select({
"//bazel/config:brpc_with_thrift_legacy_version": [],
"//conditions:default": ["-DTHRIFT_STDCXX=std"],
}) + select({
"//bazel/config:brpc_with_rdma": ["-DBRPC_WITH_RDMA=1"],
"//conditions:default": [""],
})

LINKOPTS = [
Expand All @@ -68,6 +71,11 @@ LINKOPTS = [
"-lmesalink",
],
"//conditions:default": [],
}) + select({
"//bazel/config:brpc_with_rdma": [
"-libverbs",
],
"//conditions:default": [],
})

genrule(
Expand Down
21 changes: 20 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ option(DEBUG "Print debug logs" OFF)
option(WITH_DEBUG_SYMBOLS "With debug symbols" ON)
option(WITH_THRIFT "With thrift framed protocol supported" OFF)
option(WITH_SNAPPY "With snappy" OFF)
option(WITH_RDMA "With RDMA" OFF)
option(BUILD_UNIT_TESTS "Whether to build unit tests" OFF)
option(BUILD_BRPC_TOOLS "Whether to build brpc tools" ON)
option(DOWNLOAD_GTEST "Download and build a fresh copy of googletest. Requires Internet access." ON)
Expand Down Expand Up @@ -68,6 +69,11 @@ if(WITH_THRIFT)
set(THRIFT_LIB "thrift")
endif()

# Translate the WITH_RDMA option into the "1"/"0" value that is substituted
# into the -DBRPC_WITH_RDMA=<value> preprocessor definition.
if(WITH_RDMA)
    set(WITH_RDMA_VAL "1")
else()
    set(WITH_RDMA_VAL "0")
endif()

include(GNUInstallDirs)

configure_file(${PROJECT_SOURCE_DIR}/config.h.in ${PROJECT_SOURCE_DIR}/src/butil/config.h @ONLY)
Expand Down Expand Up @@ -106,7 +112,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -Wno-deprecated-declarations -Wno-inconsistent-missing-override")
endif()

set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEFINE_CLOCK_GETTIME} -DBRPC_WITH_GLOG=${WITH_GLOG_VAL} -DGFLAGS_NS=${GFLAGS_NS}")
set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} ${DEFINE_CLOCK_GETTIME} -DBRPC_WITH_GLOG=${WITH_GLOG_VAL} -DBRPC_WITH_RDMA=${WITH_RDMA_VAL} -DGFLAGS_NS=${GFLAGS_NS}")
if(WITH_MESALINK)
set(CMAKE_CPP_FLAGS "${CMAKE_CPP_FLAGS} -DUSE_MESALINK")
endif()
Expand Down Expand Up @@ -182,6 +188,15 @@ if(WITH_MESALINK)
include_directories(${MESALINK_INCLUDE_PATH})
endif()

# Locate the ibverbs header and library needed for RDMA support.
# Configuration aborts if either of them cannot be found.
if(WITH_RDMA)
    message("brpc compile with rdma")
    find_path(RDMA_INCLUDE_PATH NAMES infiniband/verbs.h)
    find_library(RDMA_LIB NAMES ibverbs)
    if((NOT RDMA_INCLUDE_PATH) OR (NOT RDMA_LIB))
        message(FATAL_ERROR "Fail to find ibverbs")
    endif()
    # Put the discovered header directory on the include path so that
    # <infiniband/verbs.h> resolves even when it lives outside the compiler's
    # default search directories (mirrors how MESALINK_INCLUDE_PATH is handled).
    include_directories(${RDMA_INCLUDE_PATH})
endif()

find_library(PROTOC_LIB NAMES protoc)
if(NOT PROTOC_LIB)
message(FATAL_ERROR "Fail to find protoc lib")
Expand Down Expand Up @@ -220,6 +235,10 @@ else()
list(APPEND DYNAMIC_LIB ${OPENSSL_SSL_LIBRARY})
endif()

# When RDMA is enabled, add the ibverbs library located by find_library()
# (stored in RDMA_LIB) to the list of dynamic libraries.
if(WITH_RDMA)
list(APPEND DYNAMIC_LIB ${RDMA_LIB})
endif()

set(BRPC_PRIVATE_LIBS "-lgflags -lprotobuf -lleveldb -lprotoc -lssl -lcrypto -ldl -lz")

if(WITH_GLOG)
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ JSON2PB_DIRS = src/json2pb
JSON2PB_SOURCES = $(foreach d,$(JSON2PB_DIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
JSON2PB_OBJS = $(addsuffix .o, $(basename $(JSON2PB_SOURCES)))

BRPC_DIRS = src/brpc src/brpc/details src/brpc/builtin src/brpc/policy
BRPC_DIRS = src/brpc src/brpc/details src/brpc/builtin src/brpc/policy src/brpc/rdma
THRIFT_SOURCES = $(foreach d,$(BRPC_DIRS),$(wildcard $(addprefix $(d)/thrift*,$(SRCEXTS))))
EXCLUDE_SOURCES = $(foreach d,$(BRPC_DIRS),$(wildcard $(addprefix $(d)/event_dispatcher_*,$(SRCEXTS))))
BRPC_SOURCES_ALL = $(foreach d,$(BRPC_DIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
Expand Down
6 changes: 6 additions & 0 deletions bazel/config/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,9 @@ config_setting(
values = {"cpu": "darwin"},
visibility = ["//:__subpkgs__"],
)

# Matches builds invoked with `--define BRPC_WITH_RDMA=true`. The top-level
# BUILD.bazel selects on this setting to add -DBRPC_WITH_RDMA=1 to COPTS and
# -libverbs to LINKOPTS.
config_setting(
name = "brpc_with_rdma",
define_values = {"BRPC_WITH_RDMA": "true"},
visibility = ["//visibility:public"],
)
16 changes: 15 additions & 1 deletion config_brpc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,10 @@ else
LDD=ldd
fi

TEMP=`getopt -o v: --long headers:,libs:,cc:,cxx:,with-glog,with-thrift,with-mesalink,nodebugsymbols -n 'config_brpc' -- "$@"`
TEMP=`getopt -o v: --long headers:,libs:,cc:,cxx:,with-glog,with-thrift,with-rdma,with-mesalink,nodebugsymbols -n 'config_brpc' -- "$@"`
WITH_GLOG=0
WITH_THRIFT=0
WITH_RDMA=0
WITH_MESALINK=0
DEBUGSYMBOLS=-g

Expand All @@ -64,6 +65,7 @@ while true; do
--cxx ) CXX=$2; shift 2 ;;
--with-glog ) WITH_GLOG=1; shift 1 ;;
--with-thrift) WITH_THRIFT=1; shift 1 ;;
--with-rdma) WITH_RDMA=1; shift 1 ;;
--with-mesalink) WITH_MESALINK=1; shift 1 ;;
--nodebugsymbols ) DEBUGSYMBOLS=; shift 1 ;;
-- ) shift; break ;;
Expand Down Expand Up @@ -352,6 +354,18 @@ if [ $WITH_THRIFT != 0 ]; then
fi
fi

# Optional RDMA support (enabled by --with-rdma): locate ibverbs, then record
# the extra search paths, the compile definition and the link flag.
if [ $WITH_RDMA != 0 ]; then
# NOTE(review): the *_or_die helpers are defined outside this view; presumably
# they abort the script when the library/header cannot be found - confirm.
RDMA_LIB=$(find_dir_of_lib_or_die ibverbs)
RDMA_HDR=$(find_dir_of_header_or_die infiniband/verbs.h)
append_to_output_libs "$RDMA_LIB"
append_to_output_headers "$RDMA_HDR"

# Define BRPC_WITH_RDMA for the preprocessor.
CPPFLAGS="${CPPFLAGS} -DBRPC_WITH_RDMA"

# Link against libibverbs and record that RDMA is enabled.
append_to_output "DYNAMIC_LINKINGS+=-libverbs"
append_to_output "WITH_RDMA=1"
fi

if [ $WITH_MESALINK != 0 ]; then
CPPFLAGS="${CPPFLAGS} -DUSE_MESALINK"
fi
Expand Down
60 changes: 60 additions & 0 deletions docs/cn/rdma.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# 编译

由于RDMA对驱动与硬件有要求,目前仅支持在Linux系统编译并运行RDMA功能。

使用config_brpc:
```bash
sh config_brpc.sh --with-rdma --headers="/usr/include" --libs="/usr/lib64 /usr/bin"
make

cd example/rdma_performance # 示例程序
make
```

使用cmake:
```bash
mkdir bld && cd bld && cmake -DWITH_RDMA=ON ..
make

cd example/rdma_performance # 示例程序
mkdir bld && cd bld && cmake ..
make
```

# 基本实现

RDMA与TCP不同,不使用socket接口进行通信。但是在实现上仍然复用了brpc中原本的Socket类。当用户选择ChannelOptions或ServerOptions中的use_rdma为true时,创建出的Socket类中则有对应的RdmaEndpoint(参见src/brpc/rdma/rdma_endpoint.cpp)。当RDMA被使能时,写入Socket的数据会通过RdmaEndpoint提交给RDMA QP(通过verbs API),而非拷贝到fd。对于数据读取,RdmaEndpoint中则调用verbs API从RDMA CQ中获取对应完成信息(事件获取有独立的fd,复用EventDispatcher,处理函数采用RdmaEndpoint::PollCq),最后复用InputMessenger完成RPC消息解析。

brpc内部使用RDMA RC模式,每个RdmaEndpoint对应一个QP。RDMA连接建立依赖于前置TCP建连,TCP建连后双方交换必要参数,如GID、QPN等,再发起RDMA连接并实现数据传输。这个过程我们称为握手(参见RdmaEndpoint)。因为握手需要TCP连接,因此RdmaEndpoint所在的Socket类中,原本的TCP fd仍然有效。握手过程采用了brpc中已有的AppConnect逻辑。注意,握手用的TCP连接在后续数据传输阶段并不会收发数据,但仍保持为EST状态。一旦TCP连接中断,其上对应的RDMA连接同样会置错。

RdmaEndpoint数据传输逻辑的第一个重要特性是零拷贝。要发送的所有数据默认都存放在IOBuf的Block中,因此所发送的Block需要等到对端确认接收完成后才可以释放,这些Block的引用被存放于RdmaEndpoint::_sbuf中。而要实现接收零拷贝,则需要确保接收端所预提交的接收缓冲区必须直接在IOBuf的Block里面,被存放于RdmaEndpoint::_rbuf。注意,接收端预提交的每一段Block,有一个固定的大小(recv_block_size)。发送端发送时,一个请求最多只能有这么大,否则接收端则无法成功接收。

RdmaEndpoint数据传输逻辑的第二个重要特性是滑动窗口流控。这一流控机制是为了避免发送端持续在发送,其速度超过了接收端处理的速度。TCP传输中也有类似的逻辑,但是是由内核协议栈来实现的。RdmaEndpoint内实现了这一流控机制,通过接收端显式回复ACK来确认接收端处理完毕。为了减少ACK本身的开销,让ACK以立即数形式返回,可以被附在数据消息里。

RdmaEndpoint数据传输逻辑的第三个重要特性是事件聚合。每个消息的大小被限定在一个recv_block_size,默认为8KB。如果每个消息都触发事件进行处理,会导致性能退化严重,甚至不如TCP传输(TCP拥有GSO、GRO等诸多优化)。因此,RdmaEndpoint综合考虑数据大小、窗口与ACK的情况,对每个发送消息选择性设置solicited标志,来控制是否在接收端触发事件通知。

RDMA要求数据收发所使用的内存空间必须被注册(memory register),把对应的页表映射注册给网卡,这一操作非常耗时,所以通常都会使用内存池方案来加速。brpc内部的数据收发都使用IOBuf,为了在兼容IOBuf的情况下实现完全零拷贝,整个IOBuf所使用的内存空间整体由统一内存池接管(参见src/brpc/rdma/block_pool.cpp)。注意,由于IOBuf内存池不由用户直接控制,因此实际使用中需要注意IOBuf所消耗的总内存,建议根据实际业务需求,一次性注册足够的内存池以实现性能最大化。

RDMA是硬件相关的通信技术,有很多独特的概念,比如device、port、GID、LID、MaxSge等。这些参数在初始化时会从对应的网卡中读取出来,并且做出默认的选择(参见src/brpc/rdma/rdma_helper.cpp)。有时默认的选择并非用户的期望,则可以通过flag参数方式指定。

# 参数

可配置参数说明:
* rdma_trace_verbose: 日志中打印RDMA建连相关信息,默认false
* rdma_recv_zerocopy: 是否启用接收零拷贝,默认true
* rdma_zerocopy_min_size: 接收零拷贝最小的msg大小,默认512B
* rdma_recv_block_type: 为接收数据预准备的block类型,分为三类default(8KB)/large(64KB)/huge(2MB),默认为default
* rdma_prepared_qp_size: 程序启动预生成的QP的大小,默认128
* rdma_prepared_qp_cnt: 程序启动预生成的QP的数量,默认1024
* rdma_max_sge: 允许的最大发送SGList长度,默认为0,即采用硬件所支持的最大长度
* rdma_sq_size: SQ大小,默认128
* rdma_rq_size: RQ大小,默认128
* rdma_cqe_poll_once: 从CQ中一次性poll出的CQE数量,默认32
* rdma_gid_index: 使用本地GID表中的Index,默认为-1,即选用最大的可用GID Index
* rdma_port: 使用IB设备的port number,默认为1
* rdma_device: 使用IB设备的名称,默认为空,即使用第一个active的设备
* rdma_memory_pool_initial_size_mb: 内存池的初始大小,单位MB,默认1024
* rdma_memory_pool_increase_size_mb: 内存池每次动态增长的大小,单位MB,默认1024
* rdma_memory_pool_max_regions: 最大的内存池块数,默认16
* rdma_memory_pool_buckets: 内存池中为避免竞争采用的bucket数目,默认为4
* rdma_memory_pool_tls_cache_num: 内存池中thread local的缓存block数目,默认为128
60 changes: 60 additions & 0 deletions docs/en/rdma.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Build

Since RDMA requires driver and hardware support, the build has only been verified on Linux.

With config_brpc:
```bash
sh config_brpc.sh --with-rdma --headers="/usr/include" --libs="/usr/lib64 /usr/bin"
make

cd example/rdma_performance # example for rdma
make
```

With cmake:
```bash
mkdir bld && cd bld && cmake -DWITH_RDMA=ON ..
make

cd example/rdma_performance # example for rdma
mkdir bld && cd bld && cmake ..
make
```

# Basic Implementation

RDMA does not use the socket API like TCP does. However, the brpc::Socket class is still used. If a user sets ChannelOptions.use_rdma or ServerOptions.use_rdma to true, the created Socket has a corresponding RdmaEndpoint (see src/brpc/rdma/rdma_endpoint.cpp). When RDMA is enabled, the data to be transmitted is posted to the RDMA QP through the verbs API instead of being written to the TCP fd. For receiving, RdmaEndpoint fetches completions from the RDMA CQ through the verbs API (the events are generated from a dedicated fd that is added to the EventDispatcher; the handling function is RdmaEndpoint::PollCq) and then parses the RPC messages with InputMessenger.

brpc uses RDMA RC mode. Every RdmaEndpoint has its own QP. Before an RDMA connection is established, a TCP connection is needed to exchange information such as GID and QPN. We call this procedure the handshake. Since the handshake needs a TCP connection, the TCP fd in the corresponding Socket is still valid. The handshake is implemented with the existing AppConnect mechanism in brpc. After the RDMA connection is established, the TCP connection stays in the EST state but is not used for data transmission. Once the TCP connection is closed, the corresponding RDMA connection is set to an error state as well.

The first key feature of RdmaEndpoint data transmission is zero copy. All data to be transmitted resides in the Blocks of IOBuf, so these Blocks must not be released until the remote side has confirmed that it received them. References to these Blocks are stored in RdmaEndpoint::_sbuf. To achieve zero copy on the receive side, the receive buffers posted in advance must themselves be Blocks of IOBuf; they are stored in RdmaEndpoint::_rbuf. Note that every Block posted by the receive side has a fixed size (recv_block_size). The transmit side can only send messages no larger than that; otherwise the receive side cannot receive the data successfully.

The second key feature of RdmaEndpoint data transmission is sliding window flow control. The flow control prevents a fast transmit side from overwhelming a slow receive side. TCP has a similar mechanism in the kernel TCP stack. RdmaEndpoint implements this mechanism with explicit ACKs from the receive side. To reduce the overhead of ACKs, the ACK number can be piggybacked on an ordinary data message as immediate data.

The third key feature of RdmaEndpoint data transmission is event suppression. The size of every message is limited to recv_block_size (8KB by default). If every message generated an event, the performance would be very poor, even worse than TCP (which has optimizations such as GSO/GRO). Therefore, RdmaEndpoint selectively sets the solicited flag on each message according to the data size, window and ACKs. The flag controls whether an event is generated on the remote side.

All memory used for data transmission in RDMA must be registered, which is a very expensive operation. Generally, a memory pool is employed to avoid frequent memory registration. brpc uses IOBuf for data transmission. To achieve full zero copy while staying compatible with IOBuf, the memory used by IOBuf is taken over by the RDMA memory pool (see src/brpc/rdma/block_pool.cpp). Since the IOBuf memory pool is not controlled by the user directly, the total memory consumed by IOBuf should be managed carefully. It is suggested that the application register enough memory at one time according to its requirements.

RDMA is hardware-related and has its own concepts such as device, port, GID, LID and MaxSge. These parameters are read from the NICs at initialization, and brpc makes a default choice (see src/brpc/rdma/rdma_helper.cpp). When the default choice is not what the user expects, it can be overridden with flags.

# Parameters

Configurable parameters:
* rdma_trace_verbose: to print RDMA connection information in log, default is false
* rdma_recv_zerocopy: enable zero copy in receive side, default is true
* rdma_zerocopy_min_size: the min message size for receive zero copy (in Byte), default is 512
* rdma_recv_block_type: the block type used for receiving, can be default(8KB)/large(64KB)/huge(2MB), default is default
* rdma_prepared_qp_size: the size of QP created at the beginning of the application, default is 128
* rdma_prepared_qp_cnt: the number of QPs created at the beginning of the application, default is 1024
* rdma_max_sge: the max length of sglist, default is 0, which is the max length allowed by the device
* rdma_sq_size: the size of SQ, default is 128
* rdma_rq_size: the size of RQ, default is 128
* rdma_cqe_poll_once: the number of CQEs polled from CQ at once, default is 32
* rdma_gid_index: the index of local GID table used, default is -1, which is the maximum GID index
* rdma_port: the port number used, default is 1
* rdma_device: the IB device name, default is empty, which is the first active device
* rdma_memory_pool_initial_size_mb: the initial region size of RDMA memory pool (in MB), default is 1024
* rdma_memory_pool_increase_size_mb: the step increase region size of RDMA memory pool (in MB), default is 1024
* rdma_memory_pool_max_regions: the max number of regions in RDMA memory pool, default is 16
* rdma_memory_pool_buckets: the number of buckets for avoiding mutex contention in RDMA memory pool, default is 4
* rdma_memory_pool_tls_cache_num: the number of thread local cached blocks in RDMA memory pool, default is 128
Loading