Skip to content

Commit

Permalink
[apache#1472] fix(server): Inaccurate flow control leads to Shuffle s…
Browse files Browse the repository at this point in the history
…erver OOM when enabling Netty
  • Loading branch information
rickyma committed Feb 5, 2024
1 parent 947bbf3 commit 0ead743
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,7 @@ public void createClient() throws Exception {
RssConf rssConf = new RssConf();
rssConf.set(RssClientConf.RSS_CLIENT_TYPE, ClientType.GRPC_NETTY);
shuffleServerNettyClient =
new ShuffleServerGrpcNettyClient(
rssConf,
LOCALHOST,
SHUFFLE_SERVER_PORT,
NETTY_PORT);
new ShuffleServerGrpcNettyClient(rssConf, LOCALHOST, SHUFFLE_SERVER_PORT, NETTY_PORT);
}

@AfterEach
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.MockedStatic;
import org.mockito.Mockito;
import org.roaringbitmap.longlong.Roaring64NavigableMap;

import org.apache.uniffle.client.TestUtils;
Expand All @@ -47,6 +50,7 @@
import org.apache.uniffle.common.config.RssClientConf;
import org.apache.uniffle.common.config.RssConf;
import org.apache.uniffle.common.util.ByteBufUtils;
import org.apache.uniffle.common.util.NettyUtils;
import org.apache.uniffle.coordinator.CoordinatorConf;
import org.apache.uniffle.coordinator.CoordinatorServer;
import org.apache.uniffle.server.MockedShuffleServer;
Expand All @@ -62,17 +66,21 @@
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.mockStatic;
import static org.mockito.Mockito.when;

public class ShuffleServerFaultToleranceTest extends ShuffleReadWriteBase {

private List<ShuffleServerClient> shuffleServerClients;
private List<ShuffleServerClient> shuffleServerNettyClients;
private static MockedStatic<NettyUtils> nettyUtils;

private String remoteStoragePath = HDFS_URI + "rss/test";

@BeforeEach
public void setupServers(@TempDir File tmpDir) throws Exception {
nettyUtils = mockStatic(NettyUtils.class, Mockito.CALLS_REAL_METHODS);
nettyUtils.when(NettyUtils::getMaxDirectMemory).thenReturn(600L);
CoordinatorConf coordinatorConf = getCoordinatorConf();
createCoordinatorServer(coordinatorConf);
shuffleServers.add(createServer(0, tmpDir));
Expand All @@ -91,7 +99,9 @@ public void setupServers(@TempDir File tmpDir) throws Exception {
rssConf,
LOCALHOST,
SHUFFLE_SERVER_PORT,
shuffleServer.getShuffleServerConf().getInteger(ShuffleServerConf.NETTY_SERVER_PORT)));
shuffleServer
.getShuffleServerConf()
.getInteger(ShuffleServerConf.NETTY_SERVER_PORT)));
}
}

Expand All @@ -108,6 +118,11 @@ public void cleanEnv() throws Exception {
cleanCluster();
}

@AfterAll
public static void tearDown() {
nettyUtils.close();
}

@Test
public void testReadFaultTolerance() throws Exception {
testReadFaultTolerance(true);
Expand Down Expand Up @@ -316,6 +331,13 @@ public static MockedShuffleServer createServer(int id, File tmpDir) throws Excep
shuffleServerConf.setInteger("rss.rpc.server.port", SHUFFLE_SERVER_PORT + 20 + id);
shuffleServerConf.setInteger("rss.jetty.http.port", 19081 + id * 100);
shuffleServerConf.setString("rss.storage.basePath", basePath);
shuffleServerConf.set(
ShuffleServerConf.SERVER_PRE_ALLOCATION_RESERVED_OFF_HEAP_SIZE,
(long)
(NettyUtils.getMaxDirectMemory()
/ 100
* shuffleServerConf.getDouble(
ShuffleServerConf.SERVER_MEMORY_SHUFFLE_LOWWATERMARK_PERCENTAGE)));
return new MockedShuffleServer(shuffleServerConf);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ public static void setup() throws Exception {
ShuffleServerConf shuffleServerConf = getShuffleServerConf();
shuffleServer = new ShuffleServer(shuffleServerConf);
shuffleServer.start();

}

@AfterAll
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,14 @@

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.MockedStatic;
import org.mockito.Mockito;
import org.roaringbitmap.longlong.Roaring64NavigableMap;

import org.apache.uniffle.client.impl.grpc.ShuffleServerGrpcClient;
Expand All @@ -44,6 +47,7 @@
import org.apache.uniffle.common.config.RssClientConf;
import org.apache.uniffle.common.config.RssConf;
import org.apache.uniffle.common.util.ByteBufUtils;
import org.apache.uniffle.common.util.NettyUtils;
import org.apache.uniffle.coordinator.CoordinatorConf;
import org.apache.uniffle.server.ShuffleServerConf;
import org.apache.uniffle.server.buffer.ShuffleBuffer;
Expand All @@ -59,16 +63,20 @@
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import static org.mockito.Mockito.mockStatic;

public class ShuffleServerWithMemLocalHadoopTest extends ShuffleReadWriteBase {

private ShuffleServerGrpcClient shuffleServerClient;
private ShuffleServerGrpcNettyClient shuffleServerNettyClient;
private static String REMOTE_STORAGE = HDFS_URI + "rss/test";
private static ShuffleServerConf shuffleServerConfig;
private static MockedStatic<NettyUtils> nettyUtils;

@BeforeAll
public static void setupServers(@TempDir File tmpDir) throws Exception {
nettyUtils = mockStatic(NettyUtils.class, Mockito.CALLS_REAL_METHODS);
nettyUtils.when(NettyUtils::getMaxDirectMemory).thenReturn(500L);
CoordinatorConf coordinatorConf = getCoordinatorConf();
createCoordinatorServer(coordinatorConf);
ShuffleServerConf shuffleServerConf = getShuffleServerConf();
Expand All @@ -83,6 +91,13 @@ public static void setupServers(@TempDir File tmpDir) throws Exception {
shuffleServerConf.set(ShuffleServerConf.SERVER_MEMORY_SHUFFLE_HIGHWATERMARK_PERCENTAGE, 40.0);
shuffleServerConf.set(ShuffleServerConf.SERVER_BUFFER_CAPACITY, 500L);
shuffleServerConf.set(ShuffleServerConf.SERVER_TRIGGER_FLUSH_CHECK_INTERVAL, 500L);
shuffleServerConf.set(
ShuffleServerConf.SERVER_PRE_ALLOCATION_RESERVED_OFF_HEAP_SIZE,
(long)
(NettyUtils.getMaxDirectMemory()
/ 100
* shuffleServerConf.getDouble(
ShuffleServerConf.SERVER_MEMORY_SHUFFLE_LOWWATERMARK_PERCENTAGE)));
createShuffleServer(shuffleServerConf);
startServers();
shuffleServerConfig = shuffleServerConf;
Expand All @@ -107,6 +122,11 @@ public void closeClient() {
shuffleServerNettyClient.close();
}

@AfterAll
public static void tearDown() {
nettyUtils.close();
}

@Test
public void memoryLocalFileHadoopReadWithFilterAndSkipTest() throws Exception {
runTest(true, true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
import org.apache.uniffle.server.ShuffleServerMetrics;
import org.apache.uniffle.server.ShuffleTaskManager;

import static org.apache.uniffle.common.config.RssBaseConf.RSS_TEST_MODE_ENABLE;

public class ShuffleBufferManager {

private static final Logger LOG = LoggerFactory.getLogger(ShuffleBufferManager.class);
Expand Down Expand Up @@ -78,6 +80,7 @@ public class ShuffleBufferManager {
private long reservedOnHeapPreAllocatedBufferBytes;
private long reservedOffHeapPreAllocatedBufferBytes;
private long maxAvailableOffHeapBytes;
private boolean testMode;

protected long bufferSize = 0;
protected AtomicLong preAllocatedSize = new AtomicLong(0L);
Expand Down Expand Up @@ -122,6 +125,7 @@ public ShuffleBufferManager(ShuffleServerConf conf, ShuffleFlushManager shuffleF
conf.getSizeAsBytes(ShuffleServerConf.SERVER_PRE_ALLOCATION_RESERVED_ON_HEAP_SIZE);
this.reservedOffHeapPreAllocatedBufferBytes =
conf.getSizeAsBytes(ShuffleServerConf.SERVER_PRE_ALLOCATION_RESERVED_OFF_HEAP_SIZE);
this.testMode = conf.getBoolean(RSS_TEST_MODE_ENABLE);
if (heapSize < this.reservedOnHeapPreAllocatedBufferBytes) {
throw new IllegalArgumentException(
"The reserved on-heap memory size for pre-allocated buffer "
Expand Down Expand Up @@ -689,7 +693,13 @@ private void addPickedShuffle(String shuffleIdKey, Map<String, Set<Integer>> pic
shuffleIdSet.add(shuffleId);
}

private static long getPinnedDirectMemory() {
private long getPinnedDirectMemory() {
if (testMode) {
// In test mode, return the actual pinned direct memory immediately
long pinnedDirectMemory = NettyUtils.getNettyBufferAllocator().pinnedDirectMemory();
return pinnedDirectMemory == -1L ? 0L : pinnedDirectMemory;
}
// To optimize performance, return a periodically retrieved pinned direct memory
long pinnedDirectMemory =
(long) ShuffleServerMetrics.gaugePinnedDirectMemorySize.get() == -1L
? 0L
Expand Down

0 comments on commit 0ead743

Please sign in to comment.