Skip to content

Commit

Permalink
Fix TaskGroupCoordinator might cause OOM when there is a lot of waiti…
Browse files Browse the repository at this point in the history
…ng TaskGroupQueue (#15773)
  • Loading branch information
ruanwenjun authored Mar 30, 2024
1 parent ae1fe84 commit dc4dad1
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,18 @@ List<TaskGroupQueue> queryUsingTaskGroupQueueByGroupId(@Param("taskGroupId") Int
@Param("status") int status,
@Param("inQueue") int inQueue,
@Param("forceStart") int forceStart);

/**
* Count the rows of t_ds_task_group_queue whose group_id, status, in_queue and force_start
* all equal the given values.
*
* @param taskGroupId value matched against group_id
* @param status value matched against status
* @param inQueue value matched against in_queue
* @param forceStart value matched against force_start
* @return number of matching rows
*/
int countUsingTaskGroupQueueByGroupId(@Param("taskGroupId") Integer taskGroupId,
@Param("status") int status,
@Param("inQueue") int inQueue,
@Param("forceStart") int forceStart);

/**
* Query one page of t_ds_task_group_queue rows whose in_queue equals the given value
* and whose id is greater than minTaskGroupQueueId, ordered by id asc.
*
* @param inQueue value matched against in_queue
* @param minTaskGroupQueueId exclusive lower bound of the row id
* @param limit maximum number of rows to return
* @return matching rows ordered by id asc
*/
List<TaskGroupQueue> queryInQueueTaskGroupQueue(@Param("inQueue") int inQueue,
@Param("minTaskGroupQueueId") int minTaskGroupQueueId,
@Param("limit") int limit);

/**
* Query one page of t_ds_task_group_queue rows whose in_queue and force_start equal the given values
* and whose id is greater than minTaskGroupQueueId, ordered by id asc.
*
* @param inQueue value matched against in_queue
* @param forceStart value matched against force_start
* @param minTaskGroupQueueId exclusive lower bound of the row id
* @param limit maximum number of rows to return
* @return matching rows ordered by id asc
*/
List<TaskGroupQueue> queryWaitNotifyForceStartTaskGroupQueue(@Param("inQueue") int inQueue,
@Param("forceStart") int forceStart,
@Param("minTaskGroupQueueId") int minTaskGroupQueueId,
@Param("limit") int limit);
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,17 @@ public interface TaskGroupQueueDao extends IDao<TaskGroupQueue> {
*/
List<TaskGroupQueue> queryAllInQueueTaskGroupQueue();

/**
* Query one page of {@link TaskGroupQueue} which
* in_queue is {@link org.apache.dolphinscheduler.common.enums.Flag#YES}
* and id > minTaskGroupQueueId
* ordered by id asc
* limit #{limit}
* <p>
* To walk all matching rows, pass the id of the last returned element as the next minTaskGroupQueueId.
*
* @param minTaskGroupQueueId exclusive lower bound of the TaskGroupQueue id
* @param limit maximum number of TaskGroupQueue to return
* @return TaskGroupQueue ordered by id asc
*/
List<TaskGroupQueue> queryInQueueTaskGroupQueue(int minTaskGroupQueueId, int limit);

/**
* Query all {@link TaskGroupQueue} which in_queue is {@link org.apache.dolphinscheduler.common.enums.Flag#YES} and taskGroupId is taskGroupId
*
Expand All @@ -61,4 +72,24 @@ public interface TaskGroupQueueDao extends IDao<TaskGroupQueue> {
* @return TaskGroupQueue
*/
List<TaskGroupQueue> queryAcquiredTaskGroupQueueByGroupId(Integer taskGroupId);

/**
* Count all {@link TaskGroupQueue} of the given task group which status is TaskGroupQueueStatus.ACQUIRE_SUCCESS,
* in_queue is {@link org.apache.dolphinscheduler.common.enums.Flag#YES}
* and forceStart is {@link org.apache.dolphinscheduler.common.enums.Flag#NO}.
*
* @param taskGroupId taskGroupId
* @return the number of matching TaskGroupQueue
*/
int countUsingTaskGroupQueueByGroupId(Integer taskGroupId);

/**
* Query one page of {@link TaskGroupQueue} which
* in_queue is {@link org.apache.dolphinscheduler.common.enums.Flag#YES}
* and forceStart is {@link org.apache.dolphinscheduler.common.enums.Flag#YES}
* and id > minTaskGroupQueueId
* order by id asc
* limit #{limit}
*
* @param minTaskGroupQueueId exclusive lower bound of the TaskGroupQueue id
* @param limit maximum number of TaskGroupQueue to return
* @return TaskGroupQueue ordered by id asc
*/
List<TaskGroupQueue> queryWaitNotifyForceStartTaskGroupQueue(int minTaskGroupQueueId, int limit);
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ public List<TaskGroupQueue> queryAllInQueueTaskGroupQueue() {
return mybatisMapper.queryAllTaskGroupQueueByInQueue(Flag.YES.getCode());
}

/**
 * Fetches one page of in-queue TaskGroupQueue by delegating to the mapper with in_queue = YES.
 *
 * @param minTaskGroupQueueId forwarded to the mapper as the id lower bound
 * @param limit forwarded to the mapper as the page size
 * @return whatever the mapper returns for this page
 */
@Override
public List<TaskGroupQueue> queryInQueueTaskGroupQueue(int minTaskGroupQueueId, int limit) {
    final int inQueueCode = Flag.YES.getCode();
    return mybatisMapper.queryInQueueTaskGroupQueue(inQueueCode, minTaskGroupQueueId, limit);
}

@Override
public List<TaskGroupQueue> queryAllInQueueTaskGroupQueueByGroupId(Integer taskGroupId) {
return mybatisMapper.queryAllInQueueTaskGroupQueueByGroupId(taskGroupId, Flag.YES.getCode());
Expand All @@ -70,4 +75,21 @@ public List<TaskGroupQueue> queryAcquiredTaskGroupQueueByGroupId(Integer taskGro
Flag.YES.getCode(),
Flag.NO.getCode());
}

/**
 * Counts the TaskGroupQueue of the given task group by delegating to the mapper with
 * status = ACQUIRE_SUCCESS, in_queue = YES and force_start = NO.
 *
 * @param taskGroupId id of the task group to count for
 * @return the count reported by the mapper
 */
@Override
public int countUsingTaskGroupQueueByGroupId(Integer taskGroupId) {
    // Use getCode() like every other query in this class: the persisted flag value is the
    // enum code, whereas ordinal() only reflects the declaration order of the constants.
    return mybatisMapper.countUsingTaskGroupQueueByGroupId(taskGroupId,
            TaskGroupQueueStatus.ACQUIRE_SUCCESS.getCode(),
            Flag.YES.getCode(),
            Flag.NO.getCode());
}

/**
 * Fetches one page of TaskGroupQueue by delegating to the mapper with
 * in_queue = YES and force_start = YES.
 *
 * @param minTaskGroupQueueId forwarded to the mapper as the id lower bound
 * @param limit forwarded to the mapper as the page size
 * @return whatever the mapper returns for this page
 */
@Override
public List<TaskGroupQueue> queryWaitNotifyForceStartTaskGroupQueue(int minTaskGroupQueueId, int limit) {
    final int inQueueCode = Flag.YES.getCode();
    final int forceStartCode = Flag.YES.getCode();
    return mybatisMapper.queryWaitNotifyForceStartTaskGroupQueue(
            inQueueCode, forceStartCode, minTaskGroupQueueId, limit);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,16 @@
where in_queue = #{inQueue} order by priority desc
</select>

<!-- Paged read of t_ds_task_group_queue: rows with the given in_queue value and an id greater than
minTaskGroupQueueId, ordered by id asc and capped at limit rows. -->
<select id="queryInQueueTaskGroupQueue" resultType="org.apache.dolphinscheduler.dao.entity.TaskGroupQueue">
select
<include refid="baseSql"/>
from t_ds_task_group_queue
where in_queue = #{inQueue}
and id &gt; #{minTaskGroupQueueId}
order by id asc
limit #{limit}
</select>

<select id="queryByTaskInstanceId" resultType="org.apache.dolphinscheduler.dao.entity.TaskGroupQueue">
select
<include refid="baseSql" />
Expand All @@ -233,4 +243,21 @@
where group_id = #{taskGroupId} and status = #{status} and force_start = #{forceStart} and in_queue = #{inQueue}
</select>

<!-- Counts the rows of t_ds_task_group_queue matching the given group_id, status, force_start and in_queue. -->
<select id="countUsingTaskGroupQueueByGroupId" resultType="java.lang.Integer">
select count(1)
from t_ds_task_group_queue
where group_id = #{taskGroupId} and status = #{status} and force_start = #{forceStart} and in_queue = #{inQueue}
</select>

<!-- Paged read of t_ds_task_group_queue: rows with the given in_queue and force_start values and an id
greater than minTaskGroupQueueId, ordered by id asc and capped at limit rows. -->
<select id="queryWaitNotifyForceStartTaskGroupQueue" resultType="org.apache.dolphinscheduler.dao.entity.TaskGroupQueue">
select
<include refid="baseSql"/>
from t_ds_task_group_queue
where in_queue = #{inQueue}
and force_start = #{forceStart}
and id &gt; #{minTaskGroupQueueId}
order by id asc
limit #{limit}
</select>

</mapper>
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@
import org.apache.dolphinscheduler.dao.entity.TaskGroupQueue;
import org.apache.dolphinscheduler.dao.repository.TaskGroupQueueDao;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.RandomUtils;

import java.util.Date;
import java.util.List;

import org.assertj.core.util.Lists;
import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -55,6 +59,35 @@ void queryAllInQueueTaskGroupQueue() {
assertEquals(1, taskGroupQueueDao.queryAllInQueueTaskGroupQueue().size());
}

/**
 * Pages through queryInQueueTaskGroupQueue with a moving id lower bound and checks that
 * the pages together contain exactly as many rows as were inserted.
 */
@Test
void queryInQueueTaskGroupQueue_withMinId() {
    // Seed a random number of rows in [10000, 100000)
    final int insertCount = RandomUtils.nextInt(10000, 100000);
    final List<TaskGroupQueue> seedRows = Lists.newArrayList();
    for (int remaining = insertCount; remaining > 0; remaining--) {
        seedRows.add(createTaskGroupQueue(Flag.NO, TaskGroupQueueStatus.ACQUIRE_SUCCESS));
    }
    taskGroupQueueDao.insertBatch(seedRows);

    final int pageSize = 1000;
    int lastSeenId = -1;
    int fetched = 0;
    boolean morePages = true;
    while (morePages) {
        final List<TaskGroupQueue> page = taskGroupQueueDao.queryInQueueTaskGroupQueue(lastSeenId, pageSize);
        if (CollectionUtils.isEmpty(page)) {
            break;
        }
        fetched += page.size();
        // A short page means the data is exhausted; only a full page moves the cursor forward
        morePages = page.size() >= pageSize;
        if (morePages) {
            lastSeenId = page.get(page.size() - 1).getId();
        }
    }
    assertEquals(insertCount, fetched);
}

@Test
void queryAllInQueueTaskGroupQueueByGroupId() {
TaskGroupQueue taskGroupQueue = createTaskGroupQueue(Flag.NO, TaskGroupQueueStatus.ACQUIRE_SUCCESS);
Expand Down Expand Up @@ -91,6 +124,49 @@ void queryUsingTaskGroupQueueByGroupId() {
assertEquals(1, taskGroupQueueDao.queryAcquiredTaskGroupQueueByGroupId(1).size());
}

/**
 * Checks the count for task group 1 before any insert, after inserting a non-force-start
 * ACQUIRE_SUCCESS row, and after additionally inserting a force-start WAIT_QUEUE row.
 */
@Test
void countUsingTaskGroupQueueByGroupId() {
    final int taskGroupId = 1;

    // Nothing inserted yet
    assertEquals(0, taskGroupQueueDao.countUsingTaskGroupQueueByGroupId(taskGroupId));

    // A non-force-start row in ACQUIRE_SUCCESS status is counted
    taskGroupQueueDao.insert(createTaskGroupQueue(Flag.NO, TaskGroupQueueStatus.ACQUIRE_SUCCESS));
    assertEquals(1, taskGroupQueueDao.countUsingTaskGroupQueueByGroupId(taskGroupId));

    // A force-start row in WAIT_QUEUE status leaves the count unchanged
    taskGroupQueueDao.insert(createTaskGroupQueue(Flag.YES, TaskGroupQueueStatus.WAIT_QUEUE));
    assertEquals(1, taskGroupQueueDao.countUsingTaskGroupQueueByGroupId(taskGroupId));
}

/**
 * Pages through queryWaitNotifyForceStartTaskGroupQueue with a moving id lower bound and checks
 * that the pages together contain exactly as many force-start rows as were inserted.
 */
@Test
void queryWaitNotifyForceStartTaskGroupQueue() {
    // Seed a random number of force-start rows in [10000, 20000)
    final int insertCount = RandomUtils.nextInt(10000, 20000);
    final List<TaskGroupQueue> seedRows = Lists.newArrayList();
    for (int remaining = insertCount; remaining > 0; remaining--) {
        seedRows.add(createTaskGroupQueue(Flag.YES, TaskGroupQueueStatus.ACQUIRE_SUCCESS));
    }
    taskGroupQueueDao.insertBatch(seedRows);

    final int pageSize = 1000;
    int lastSeenId = -1;
    int fetched = 0;
    boolean morePages = true;
    while (morePages) {
        final List<TaskGroupQueue> page =
                taskGroupQueueDao.queryWaitNotifyForceStartTaskGroupQueue(lastSeenId, pageSize);
        if (CollectionUtils.isEmpty(page)) {
            break;
        }
        fetched += page.size();
        // A short page means the data is exhausted; only a full page moves the cursor forward
        morePages = page.size() >= pageSize;
        if (morePages) {
            lastSeenId = page.get(page.size() - 1).getId();
        }
    }
    assertEquals(insertCount, fetched);
}

private TaskGroupQueue createTaskGroupQueue(Flag forceStart, TaskGroupQueueStatus taskGroupQueueStatus) {
return TaskGroupQueue.builder()
.taskId(1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ public class TaskGroupCoordinator extends BaseDaemonThread {
@Autowired
private ProcessInstanceDao processInstanceDao;

// Page size used when reading TaskGroupQueue in batches; declared final because it is a constant.
private static final int DEFAULT_LIMIT = 1000;

/**
 * Creates the coordinator and passes the fixed name "TaskGroupCoordinator" to the superclass constructor
 * (presumably used as the daemon thread name; confirm against BaseDaemonThread).
 */
public TaskGroupCoordinator() {
super("TaskGroupCoordinator");
}
Expand Down Expand Up @@ -147,10 +149,10 @@ private void amendTaskGroupUseSize() {
if (CollectionUtils.isEmpty(taskGroups)) {
return;
}
StopWatch taskGroupCoordinatorRoundTimeCost = StopWatch.createStarted();

for (TaskGroup taskGroup : taskGroups) {
List<TaskGroupQueue> taskGroupQueues =
taskGroupQueueDao.queryAcquiredTaskGroupQueueByGroupId(taskGroup.getId());
int actualUseSize = taskGroupQueues.size();
int actualUseSize = taskGroupQueueDao.countUsingTaskGroupQueueByGroupId(taskGroup.getId());
if (taskGroup.getUseSize() == actualUseSize) {
continue;
}
Expand All @@ -160,13 +162,35 @@ private void amendTaskGroupUseSize() {
taskGroup.setUseSize(actualUseSize);
taskGroupDao.updateById(taskGroup);
}
log.info("Success amend TaskGroup useSize cost: {}/ms", taskGroupCoordinatorRoundTimeCost.getTime());
}

/**
 * Make sure the TaskGroupQueue status is {@link TaskGroupQueueStatus#RELEASE} when the related
 * {@link TaskInstance} does not exist or its status is finished.
 * <p>
 * The in-queue TaskGroupQueue are read page by page (at most {@code DEFAULT_LIMIT} rows per query,
 * cursor = id of the last row of the previous page) instead of being loaded all at once, so a large
 * waiting queue is never held in memory as a whole.
 */
private void amendTaskGroupQueueStatus() {
    // -1 as the initial lower bound so the first page starts from the smallest id
    int minTaskGroupQueueId = -1;
    int limit = DEFAULT_LIMIT;
    StopWatch taskGroupCoordinatorRoundTimeCost = StopWatch.createStarted();
    while (true) {
        List<TaskGroupQueue> taskGroupQueues =
                taskGroupQueueDao.queryInQueueTaskGroupQueue(minTaskGroupQueueId, limit);
        if (CollectionUtils.isEmpty(taskGroupQueues)) {
            break;
        }
        amendTaskGroupQueueStatus(taskGroupQueues);
        // A page shorter than the limit is the last one
        if (taskGroupQueues.size() < limit) {
            break;
        }
        minTaskGroupQueueId = taskGroupQueues.get(taskGroupQueues.size() - 1).getId();
    }
    log.info("Success amend TaskGroupQueue status cost: {}/ms", taskGroupCoordinatorRoundTimeCost.getTime());
}

/**
* Make sure the TaskGroupQueue status is {@link TaskGroupQueueStatus#RELEASE} when the related {@link TaskInstance} is not exist or status is finished.
*/
private void amendTaskGroupQueueStatus(List<TaskGroupQueue> taskGroupQueues) {
List<Integer> taskInstanceIds = taskGroupQueues.stream()
.map(TaskGroupQueue::getTaskId)
.collect(Collectors.toList());
Expand Down Expand Up @@ -198,10 +222,30 @@ private void dealWithForceStartTaskGroupQueue() {
// Find the force start task group queue(Which is inQueue and forceStart is YES)
// Notify the related waiting task instance
// Set the taskGroupQueue status to RELEASE and remove it from queue
List<TaskGroupQueue> taskGroupQueues = taskGroupQueueDao.queryAllInQueueTaskGroupQueue()
.stream()
.filter(taskGroupQueue -> Flag.YES.getCode() == taskGroupQueue.getForceStart())
.collect(Collectors.toList());
// We use limit here to avoid OOM, and we will retry to notify force start queue at next time
int minTaskGroupQueueId = -1;
int limit = DEFAULT_LIMIT;
StopWatch taskGroupCoordinatorRoundTimeCost = StopWatch.createStarted();
while (true) {
List<TaskGroupQueue> taskGroupQueues =
taskGroupQueueDao.queryWaitNotifyForceStartTaskGroupQueue(minTaskGroupQueueId, limit);
if (CollectionUtils.isEmpty(taskGroupQueues)) {
break;
}
dealWithForceStartTaskGroupQueue(taskGroupQueues);
if (taskGroupQueues.size() < limit) {
break;
}
minTaskGroupQueueId = taskGroupQueues.get(taskGroupQueues.size() - 1).getId();
}
log.info("Success deal with force start TaskGroupQueue cost: {}/ms",
taskGroupCoordinatorRoundTimeCost.getTime());
}

private void dealWithForceStartTaskGroupQueue(List<TaskGroupQueue> taskGroupQueues) {
// Find the force start task group queue(Which is inQueue and forceStart is YES)
// Notify the related waiting task instance
// Set the taskGroupQueue status to RELEASE and remove it from queue
for (TaskGroupQueue taskGroupQueue : taskGroupQueues) {
try {
LogUtils.setTaskInstanceIdMDC(taskGroupQueue.getTaskId());
Expand Down

0 comments on commit dc4dad1

Please sign in to comment.