
Managed mem support #1466

Merged
merged 13 commits on Aug 13, 2024
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -496,6 +496,7 @@ if(NOT BUILD_LEGION_ONLY)
     if(NOT CARGO_RESULT EQUAL 0)
       message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.")
     endif()
+    set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON)
     add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL)
     target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include)
     target_link_libraries(flexflow tokenizers_cpp)
1 change: 1 addition & 0 deletions include/flexflow/model.h
@@ -22,6 +22,7 @@
 #include "flexflow/node.h"
 #include "flexflow/operator_params.h"
 #include "flexflow/utils/hash_utils.h"
+#include "flexflow/utils/memory_allocator.h"
 #include "flexflow/utils/tuple.h"
 #include "initializer.h"
 #include "layer.h"
1 change: 1 addition & 0 deletions include/flexflow/ops/batch_norm.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_BATCH_NORM_H
 
 #include "flexflow/model.h"
+#include "flexflow/utils/memory_allocator.h"
 
 namespace FlexFlow {
 
2 changes: 2 additions & 0 deletions include/flexflow/utils/memory_allocator.h
@@ -62,6 +62,8 @@ class MemoryAllocator {
   size_t instance_total_size, instance_allocated_size;
 };
 
+Legion::Memory get_proc_mem(Legion::Machine machine, Legion::Processor proc);
+
 }; // namespace FlexFlow
 
 #endif // _FLEXFLOW_RUNTIME_H_
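For orientation, a minimal sketch of how the helper declared above is meant to be used; it mirrors the call sites changed later in this diff (the wrapper function and its name are illustrative, not part of the PR):

// Sketch: resolve the GPU framebuffer memory closest to a processor and
// wrap it in a FlexFlow MemoryAllocator, as the op init_task changes below do.
#include "flexflow/utils/memory_allocator.h"

using Legion::Machine;
using Legion::Memory;
using Legion::Processor;

void init_gpu_allocator_example(Processor target_proc) {
  Memory gpu_mem = FlexFlow::get_proc_mem(Machine::get_machine(), target_proc);
  FlexFlow::MemoryAllocator gpu_mem_allocator(gpu_mem);
  // ... pass gpu_mem_allocator to an OpMeta constructor, e.g. LayerNormMeta ...
}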
7 changes: 2 additions & 5 deletions src/mapper/mapper.cc
@@ -14,6 +14,7 @@
  */
 
 #include "flexflow/mapper.h"
+#include "flexflow/utils/memory_allocator.h"
 
 namespace FlexFlow {
 
@@ -81,11 +82,7 @@ FFMapper::FFMapper(MapperRuntime *rt,
     if (it->address_space() == node_id) {
       local_gpus.push_back(*it);
     }
-    Machine::MemoryQuery fb_query(machine);
-    fb_query.only_kind(Memory::GPU_FB_MEM);
-    fb_query.best_affinity_to(*it);
-    assert(fb_query.count() == 1);
-    proc_fbmems[*it] = *(fb_query.begin());
+    proc_fbmems[*it] = get_proc_mem(machine, *it);
     Machine::MemoryQuery zc_query(machine);
     zc_query.only_kind(Memory::Z_COPY_MEM);
     zc_query.has_affinity_to(*it);
5 changes: 1 addition & 4 deletions src/ops/add_bias_residual_layer_norm.cc
@@ -493,10 +493,7 @@ OpMeta *AddBiasResidualLayerNorm::init_task(
     Runtime *runtime) {
   AddBiasResidualLayerNorm *ln = (AddBiasResidualLayerNorm *)task->args;
   FFHandler handle = *((FFHandler const *)task->local_args);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   AddBiasResidualLayerNormMeta *meta =
       new AddBiasResidualLayerNormMeta(handle, ln, gpu_mem_allocator);
5 changes: 1 addition & 4 deletions src/ops/argmax.cc
@@ -233,10 +233,7 @@ OpMeta *ArgMax::init_task(Task const *task,
       ctx, task->regions[1].region.get_index_space());
   int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1;
   int batch_size = acc_input.domain.get_volume() / length;
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
 
   ArgMaxMeta *m = new ArgMaxMeta(handle,
5 changes: 1 addition & 4 deletions src/ops/attention.cc
@@ -514,10 +514,7 @@ OpMeta *
              acc_output.rect.hi[1] - acc_output.rect.lo[1] + 1);
   assert(attn->oProjSize == acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1);
 
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MultiHeadAttentionMeta *m =
       new MultiHeadAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads);
   m->profiling = attn->profiling;
5 changes: 1 addition & 4 deletions src/ops/batch_norm.cpp
@@ -61,10 +61,7 @@ __host__ OpMeta *
   int output_c = acc_output.rect.hi[2] - acc_output.rect.lo[2] + 1;
   int output_n = acc_output.rect.hi[3] - acc_output.rect.lo[3] + 1;
 
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   BatchNormMeta *m = new BatchNormMeta(
       handle, bm, gpu_mem, output_n, output_c, output_h, output_w);
   return m;
5 changes: 1 addition & 4 deletions src/ops/batch_norm.cu
@@ -58,10 +58,7 @@ __host__ OpMeta *
   int output_c = acc_output.rect.hi[2] - acc_output.rect.lo[2] + 1;
   int output_n = acc_output.rect.hi[3] - acc_output.rect.lo[3] + 1;
 
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   BatchNormMeta *m = new BatchNormMeta(
       handle, bm, gpu_mem, output_n, output_c, output_h, output_w);
   return m;
5 changes: 1 addition & 4 deletions src/ops/beam_topk.cc
@@ -271,10 +271,7 @@ OpMeta *BeamTopK::init_task(Task const *task,
     Runtime *runtime) {
   BeamTopK *topk = (BeamTopK *)task->args;
   FFHandler handle = *((FFHandler *)task->local_args);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator);
   m->profiling = topk->profiling;
5 changes: 1 addition & 4 deletions src/ops/dropout.cc
@@ -164,10 +164,7 @@ OpMeta *Dropout::init_task(Task const *task,
       ctx, task->regions[0].region.get_index_space());
   Domain output_domain = runtime->get_index_space_domain(
       ctx, task->regions[1].region.get_index_space());
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   assert(input_domain == output_domain);
   DropoutMeta *m = new DropoutMeta(handle, dropout, gpu_mem, output_domain);
   std::strcpy(m->op_name, dropout->name);
5 changes: 1 addition & 4 deletions src/ops/inc_multihead_self_attention.cc
@@ -698,10 +698,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task(
 
   assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);
 
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   if (attn->offload) {
     // cpu-offload enabled
5 changes: 1 addition & 4 deletions src/ops/layer_norm.cc
@@ -380,10 +380,7 @@ OpMeta *LayerNorm::init_task(Task const *task,
     Runtime *runtime) {
   LayerNorm *ln = (LayerNorm *)task->args;
   FFHandler handle = *((FFHandler const *)task->local_args);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   LayerNormMeta *meta = new LayerNormMeta(handle, ln, gpu_mem_allocator);
   std::strcpy(meta->op_name, ln->name);
5 changes: 1 addition & 4 deletions src/ops/linear.cc
@@ -480,10 +480,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task,
   //                        in_dim,
   //                        out_dim,
   //                        batch_size);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   if (linear->offload) {
     // cpu-offload enabled
5 changes: 1 addition & 4 deletions src/ops/residual_layer_norm.cc
@@ -489,10 +489,7 @@ OpMeta *ResidualLayerNorm::init_task(Task const *task,
     Runtime *runtime) {
   ResidualLayerNorm *ln = (ResidualLayerNorm *)task->args;
   FFHandler handle = *((FFHandler const *)task->local_args);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   ResidualLayerNormMeta *meta =
       new ResidualLayerNormMeta(handle, ln, gpu_mem_allocator);
5 changes: 1 addition & 4 deletions src/ops/residual_rms_norm.cc
@@ -347,10 +347,7 @@ OpMeta *ResidualRMSNorm::init_task(Task const *task,
     Runtime *runtime) {
   ResidualRMSNorm *rn = (ResidualRMSNorm *)task->args;
   FFHandler handle = *((FFHandler const *)task->local_args);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   ResidualRMSNormMeta *meta =
       new ResidualRMSNormMeta(handle, rn, gpu_mem_allocator);
5 changes: 1 addition & 4 deletions src/ops/rms_norm.cc
@@ -294,10 +294,7 @@ OpMeta *RMSNorm::init_task(Task const *task,
     Runtime *runtime) {
   RMSNorm *rn = (RMSNorm *)task->args;
   FFHandler handle = *((FFHandler const *)task->local_args);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   RMSNormMeta *meta = new RMSNormMeta(handle, rn, gpu_mem_allocator);
   std::strcpy(meta->op_name, rn->name);
5 changes: 1 addition & 4 deletions src/ops/sampling.cc
@@ -226,10 +226,7 @@ OpMeta *Sampling::init_task(Task const *task,
 
   int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1;
   int batch_size = acc_input.domain.get_volume() / length;
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   SamplingMeta *m = new SamplingMeta(
       handle, s, batch_size, length * batch_size, acc_input, gpu_mem_allocator);
5 changes: 1 addition & 4 deletions src/ops/sigmoid_silu_multi.cc
@@ -237,10 +237,7 @@ OpMeta *SigmoidSiluMulti::init_task(Task const *task,
     Runtime *runtime) {
   SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)task->args;
   FFHandler handle = *((FFHandler const *)task->local_args);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   SigmoidSiluMultiMeta *meta =
       new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator);
5 changes: 1 addition & 4 deletions src/ops/spec_inc_multihead_self_attention.cc
@@ -640,10 +640,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
   int num_kv_heads = attn->num_kv_heads;
   assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);
 
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   // We don't do offloading for SSMs (small speculative models)
   SpecIncMultiHeadSelfAttentionMeta *m =
5 changes: 1 addition & 4 deletions src/ops/tree_inc_multihead_self_attention.cc
@@ -697,10 +697,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
 
   assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);
 
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MemoryAllocator gpu_mem_allocator(gpu_mem);
   if (attn->offload) {
     // cpu-offload enabled
5 changes: 1 addition & 4 deletions src/runtime/graph.cc
@@ -1914,10 +1914,7 @@ std::pair<std::unique_ptr<Graph>, std::unordered_map<Node, MachineView>>
                                  model->config.workersPerNode,
                                  model->config.cpusPerNode,
                                  model->all_valid_views);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
+  Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
   MachineModel *machine;
   if (model->config.machine_model_version == 0) {
     machine =
12 changes: 12 additions & 0 deletions src/runtime/memory_allocator.cc
@@ -19,7 +19,9 @@ namespace FlexFlow {
 
 // declare Legion names
 using Legion::coord_t;
+using Legion::Machine;
 using Legion::Memory;
+using Legion::Processor;
 using Realm::RegionInstance;
 
 MemoryAllocator::MemoryAllocator(Memory _memory)
@@ -51,4 +53,14 @@ void MemoryAllocator::register_reserved_work_space(void *base, size_t size) {
   reserved_allocated_size = 0;
 }
 
+// For now this allocates FB memory; in the future we can
+// add more types of memory allocation if needed.
+Memory get_proc_mem(Machine machine, Processor proc) {
+  Machine::MemoryQuery proc_mem = Machine::MemoryQuery(machine)
+                                      .only_kind(Memory::GPU_FB_MEM)
+                                      .best_affinity_to(proc);
+  assert(proc_mem.count() > 0);
+  return proc_mem.first();
+}
+
 }; // namespace FlexFlow
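The comment on get_proc_mem notes that only framebuffer memory is handled for now. If other kinds were ever needed, one hypothetical direction (not part of this PR) would be to parameterize the memory kind while keeping the current behavior as the default:

// Hypothetical extension, not in this PR: let callers choose the memory kind.
// Defaults to GPU framebuffer memory so the behavior matches get_proc_mem.
Memory get_proc_mem_of_kind(Machine machine,
                            Processor proc,
                            Memory::Kind kind = Memory::GPU_FB_MEM) {
  Machine::MemoryQuery proc_mem =
      Machine::MemoryQuery(machine).only_kind(kind).best_affinity_to(proc);
  assert(proc_mem.count() > 0);
  return proc_mem.first();
}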
4 changes: 2 additions & 2 deletions src/runtime/model.cc
@@ -4273,8 +4273,8 @@ void FFConfig::parse_args(char **argv, int argc) {
       workersPerNode = atoi(argv[++i]);
       continue;
     }
-    if (!strcmp(argv[i], "-ll:fsize")) {
-      device_mem = atoi(argv[++i]);
+    if ((!strcmp(argv[i], "-ll:fsize")) || (!strcmp(argv[i], "-ll:msize"))) {
+      device_mem += atoi(argv[++i]);
       continue;
     }
     if (!strcmp(argv[i], "--nodes")) {
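Since managed memory now counts toward the per-device budget, device_mem accumulates both -ll:fsize and -ll:msize rather than taking the framebuffer size alone. A self-contained sketch of just that accumulation (parse_device_mem is an illustrative name; units follow whatever Realm expects for these flags):

// Standalone sketch of the flag handling above: sum the framebuffer size
// (-ll:fsize) and the managed memory size (-ll:msize) into one budget.
#include <cstdlib>
#include <cstring>

static int parse_device_mem(char **argv, int argc) {
  int device_mem = 0;
  for (int i = 1; i < argc - 1; i++) {
    if ((!strcmp(argv[i], "-ll:fsize")) || (!strcmp(argv[i], "-ll:msize"))) {
      device_mem += atoi(argv[++i]);
    }
  }
  return device_mem;
}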
15 changes: 3 additions & 12 deletions src/runtime/model.cpp
@@ -112,10 +112,7 @@ FFHandler
   // handle.workSpace = memFBImpl->get_direct_ptr(offset, 0);
   {
     // allocate memory for workspace
-    Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                         .only_kind(Memory::GPU_FB_MEM)
-                         .best_affinity_to(task->target_proc)
-                         .first();
+    Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
         Realm::Point<1, coord_t>(handle.workSpaceSize - 1));
@@ -133,10 +130,7 @@ FFHandler
   }
   if (handle.offload_reserve_space_size > 0) {
     // allocate memory for offload reserve space
-    Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                         .only_kind(Memory::GPU_FB_MEM)
-                         .best_affinity_to(task->target_proc)
-                         .first();
+    Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
         Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1));
@@ -157,10 +151,7 @@ FFHandler
   }
   if (handle.batch_config_metadata_size > 0) {
     // allocate memory for offload reserve space
-    Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                         .only_kind(Memory::GPU_FB_MEM)
-                         .best_affinity_to(task->target_proc)
-                         .first();
+    Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
         Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1));
15 changes: 3 additions & 12 deletions src/runtime/model.cu
@@ -108,10 +108,7 @@ FFHandler
   // handle.workSpace = memFBImpl->get_direct_ptr(offset, 0);
   {
     // allocate memory for workspace
-    Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                         .only_kind(Memory::GPU_FB_MEM)
-                         .best_affinity_to(task->target_proc)
-                         .first();
+    Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
         Realm::Point<1, coord_t>(handle.workSpaceSize - 1));
@@ -129,10 +126,7 @@ FFHandler
   }
   if (handle.offload_reserve_space_size > 0) {
     // allocate memory for offload reserve space
-    Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                         .only_kind(Memory::GPU_FB_MEM)
-                         .best_affinity_to(task->target_proc)
-                         .first();
+    Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
         Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1));
@@ -153,10 +147,7 @@ FFHandler
   }
   if (handle.batch_config_metadata_size > 0) {
     // allocate memory for offload reserve space
-    Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                         .only_kind(Memory::GPU_FB_MEM)
-                         .best_affinity_to(task->target_proc)
-                         .first();
+    Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
         Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1));