1.push master grad creation before all optimizer ops; 2.remove useless unittest; 3.use a function to create master grad states
shaojiewang committed May 15, 2023
1 parent c67eab1 commit e7623dd
Showing 5 changed files with 105 additions and 118 deletions.
5 changes: 1 addition & 4 deletions python/paddle/optimizer/adamw.py
@@ -285,10 +285,7 @@ def __init__(
self._auxiliary_vars = {}
self._already_create_accumulater = set()

# master gradients
self._already_create_master_grad = set()
self._master_grads = {}
self._master_grad = False
self._create_master_grad_states()

def _set_auxiliary_var(self, key, val):
self._auxiliary_vars[key] = val
63 changes: 6 additions & 57 deletions python/paddle/optimizer/optimizer.py
@@ -277,8 +277,11 @@ def __init__(
self._auxiliary_vars = {}
self._already_create_accumulater = set()

# master gradients
self._already_create_master_grad = set()
# create master gradients' states
self._create_master_grad_states()

def _create_master_grad_states(self):
# master gradients states
self._master_grads = {}
self._master_grad = False

@@ -677,6 +680,7 @@ def _create_master_weight(self, param):
return var

def _create_master_grad(self, grad):
assert self._is_dtype_fp16_or_bf16(grad.dtype)
if grad.name in self._master_grads:
var = self._master_grads[grad.name]
else:
@@ -1162,59 +1166,6 @@ def backward(
self._append_dgc_ops(params_grads)
return params_grads

def _append_cast_to_master_grad_op(self, param_grads):
"""
Add ops to cast gradient to master gradient
Args:
param_grads(list(tuple(Tensor, Tensor))):
A list of (parameter, gradient) pair to update.
Returns:
params_master_grads:
A list of (parameter, master_gradient) pair.
In the following grad clip step and optimizer step, params can be updated by master gradient.
main_prog will also append cast ops before grad clip ops.
"""

if not self._master_grad:
return param_grads

global_block = framework.default_main_program().global_block()
target_block = global_block
current_block = framework.default_main_program().current_block()
if current_block.idx != global_block.idx:
target_block = framework.default_main_program().blocks[
current_block.backward_block_idx
]

start = len(target_block.ops)

params_master_grads = []

assert isinstance(target_block, framework.Block)
# create
for p, g in param_grads:
if g.name not in self._already_create_master_grad:
if self._is_dtype_fp16_or_bf16(g.dtype):
master_g = self._create_master_grad(g)
params_master_grads.append((p, master_g))
self._already_create_master_grad.add(g.name)
target_block.append_op(
type="cast",
inputs={"X": [g]},
outputs={"Out": [master_g]},
attrs={
"in_dtype": g.dtype,
"out_dtype": master_g.dtype,
},
)
else:
params_master_grads.append((p, g))

return params_master_grads

def apply_gradients(self, params_grads):
"""
Second part of `minimize`, appending optimization operators for
@@ -1246,8 +1197,6 @@ def apply_gradients(self, params_grads):

# 'optimizer(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
# create master gradients
params_grads = self._append_cast_to_master_grad_op(params_grads)
params_grads = self._grad_clip(params_grads)
else:
params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)
58 changes: 54 additions & 4 deletions python/paddle/static/amp/decorator.py
@@ -81,6 +81,7 @@ class OptimizerWithMixedPrecision:
the loss scaling.
use_amp_guard(bool): Whether to use `fp16_guard` when constructing the program.
Default None, which means that its value is equal to `use_pure_fp16`.
use_master_grad(bool): Whether to use fp32 master gradients during the optimizer step. Default is False.
use_promote(bool): Whether to promote to fp32 when an op has any float32 inputs. Default is False.
"""

@@ -406,6 +407,51 @@ def run_example_code():
use_promote=self.use_promote,
)

def _append_cast_to_master_grad_op(self, param_grads):
"""
Create master gradient variables and append cast-to-master-gradient ops to the main program.
Args:
param_grads(list(tuple(Tensor, Tensor))): A list of (parameter, gradient) pairs to update.
Returns:
list: A list of (parameter, master_gradient) pairs. In the subsequent grad clip and optimizer steps, parameters are updated using the master gradients. The cast ops are appended to main_prog before the grad clip ops.
"""

if not self._use_master_grad:
return param_grads

global_block = self._train_program.global_block()
target_block = global_block
current_block = self._train_program.current_block()
if current_block.idx != global_block.idx:
target_block = self._train_program.blocks[
current_block.backward_block_idx
]
params_master_grads = []

assert isinstance(target_block, paddle.fluid.framework.Block)
# create
for p, g in param_grads:
if g.name not in self._optimizer._master_grads.keys():
if self._optimizer._is_dtype_fp16_or_bf16(g.dtype):
master_g = self._optimizer._create_master_grad(g)
params_master_grads.append((p, master_g))
target_block.append_op(
type="cast",
inputs={"X": [g]},
outputs={"Out": [master_g]},
attrs={
"in_dtype": g.dtype,
"out_dtype": master_g.dtype,
},
)
else:
params_master_grads.append((p, g))

return params_master_grads
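
A small, self-contained illustration of why gradients are cast to fp32 masters before the grad clip step (the gradient values below are made up for demonstration): squaring small fp16 gradients underflows, so a global norm computed entirely in half precision can collapse to zero, while the fp32 master-gradient path preserves it.

```python
import numpy as np

# Many small fp16 gradients: their squares (~1e-8) fall below fp16's
# smallest subnormal (~6e-8) and round to zero.
g_fp16 = np.full(4096, 1e-4, dtype=np.float16)

norm_fp16 = np.sqrt(np.sum(g_fp16 * g_fp16))                   # all-fp16 path
norm_fp32 = np.sqrt(np.sum(g_fp16.astype(np.float32) ** 2))    # master-grad path

print(norm_fp16)  # 0.0 -- the squared values underflowed in fp16
print(norm_fp32)  # ~0.0064 -- the correct global norm
```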

def apply_gradients(self, params_grads):
"""
Check scaled gradients to determine whether to update loss scaling and update
Expand All @@ -422,6 +468,9 @@ def apply_gradients(self, params_grads):
# transferred across GPUs can be FP16.
update_role_var_grad(self._train_program, params_grads)

# Create master grad and add cast op into program
params_grads = self._append_cast_to_master_grad_op(params_grads)

# When not using dynamic loss scaling and the init loss scaling value is equal to 1.0,
# the model can be optimized.
if (
@@ -662,7 +711,6 @@ def decorate(
use_pure_fp16=False,
use_fp16_guard=None,
use_bf16=False,
use_master_grad=False,
use_promote=False,
):
"""
@@ -776,7 +824,6 @@ def run_example_code():
incr_ratio=incr_ratio,
decr_ratio=decr_ratio,
use_amp_guard=use_fp16_guard,
use_master_grad=use_master_grad,
use_promote=use_promote,
)

@@ -790,6 +837,7 @@ def decorate(
level='O1',
dtype='float16',
master_weight=None,
master_grad=False,
init_loss_scaling=2**15,
incr_every_n_steps=1000,
decr_every_n_nan_or_inf=2,
@@ -798,7 +846,6 @@
use_dynamic_loss_scaling=None,
use_amp_guard=False,
use_promote=False,
use_master_grad=False,
):
"""
Decorate the given optimizer to adapt to the mixed-precision training.
@@ -818,6 +865,9 @@ def decorate(
master_weight(bool, optional): For level='O2', whether to use multi-precision
during weight updating. If master_weight is None, the optimizer uses
multi-precision in O2 level. Default is None.
master_grad(bool, optional): For level='O2', whether to use fp32 master
gradients during weight updating. If master_grad is False, the optimizer
does not use master gradients in O2 level. Default is False.
init_loss_scaling(float, optional): The initial loss scaling factor.
Default is 32768.
incr_every_n_steps(int, optional): Increases loss scaling every n
@@ -912,7 +962,7 @@ def forward(self, x):
decr_ratio=decr_ratio,
use_amp_guard=use_amp_guard,
use_promote=use_promote,
use_master_grad=use_master_grad,
use_master_grad=master_grad,
)

return mp_optimizer
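
A minimal usage sketch of the new `master_grad` flag through `paddle.static.amp.decorate` (the tiny network and hyper-parameters here are placeholders; actually executing the program would additionally need a startup run on an Executor and the usual fp16 parameter initialization):

```python
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 16], dtype='float32')
    loss = paddle.mean(paddle.static.nn.fc(x, size=1))

    optimizer = paddle.optimizer.AdamW(learning_rate=1e-3)
    # Decorate for O2 (pure fp16) training with fp32 master gradients.
    optimizer = paddle.static.amp.decorate(
        optimizer,
        level='O2',
        dtype='float16',
        master_weight=True,
        master_grad=True,  # flag added by this commit
    )
    # minimize() rewrites main_prog; with master_grad=True the
    # cast-to-master-gradient ops are appended before the grad clip
    # and optimizer ops.
    optimizer.minimize(loss)
```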
7 changes: 3 additions & 4 deletions test/amp/amp_base_models.py
@@ -72,18 +72,16 @@ def _build_optimizer(
beta2=0.836,
epsilon=1e-4,
weight_decay=0.01,
multi_precision=True,
)
if use_amp:
optimizer = paddle.static.amp.decorate(
optimizer,
amp_lists,
level=amp_level,
dtype=amp_dtype,
use_master_grad=use_master_grad,
master_grad=use_master_grad,
use_promote=use_promote,
master_weight=True,
init_loss_scaling=1,
)
return optimizer

@@ -263,6 +261,7 @@ def forward(self, x):

def build_MLP_model(
use_amp,
use_grad_clip=False,
amp_dtype="float16",
amp_level="O1",
use_promote=False,
@@ -296,7 +295,7 @@ def build_MLP_model(
amp_dtype,
amp_level,
amp_lists,
True,
use_grad_clip=use_grad_clip,
use_promote=use_promote,
use_master_grad=use_master_grad,
)
90 changes: 41 additions & 49 deletions test/amp/test_amp_master_grad_static.py
@@ -49,25 +49,6 @@ def _check_optimizer(self, program, expected_num_mp):
f"The number of optimizers with multi_precison = True is expected to be {expected_num_mp}, but recieved {actual_num_mp}.",
)

def test_amp_fp16_o1(self):
main_program, _, _, _, _ = build_embedding_model(True, "float16", "O1")
self.assertEqual(main_program.num_blocks, 1)
self._check_optimizer(main_program, 0)

amp.debugging.collect_operator_stats(main_program)
op_stats_list = amp.debugging._get_op_stats_list(main_program)
expected_fp16_calls = {
"matmul_v2": 1,
"elementwise_add": 1,
"dropout": 1,
"lookup_table_v2": 0,
"squared_l2_norm": 0,
"adamw": 0,
}
self._check_op_calls(
op_stats_list[0], expected_fp16_calls=expected_fp16_calls
)

def amp_fp16_o2(self, use_master_grad):
main_program, _, _, _, _ = build_embedding_model(
True, "float16", "O2", use_master_grad=use_master_grad
@@ -136,6 +117,7 @@ def _run(
x_np,
max_iters,
level,
use_grad_clip,
dtype="float16",
use_master_grad=False,
):
@@ -146,7 +128,11 @@
feed_vars,
fetch_vars,
) = build_MLP_model(
True, dtype, level, use_master_grad=use_master_grad
True,
use_grad_clip=use_grad_clip,
amp_dtype=dtype,
amp_level=level,
use_master_grad=use_master_grad,
)

seed = 0
Expand All @@ -173,37 +159,43 @@ def _run(
x_f32, x_f16 = self._generate_feed_x(dtype)
place = paddle.CUDAPlace(0)
exe = paddle.static.Executor(place)
losses_o1 = _run(place, exe, x_f32, max_iters, 'O1', dtype=dtype)
losses_o2_no_master_grad = _run(
place,
exe,
x_f16,
max_iters,
'O2',
dtype=dtype,
use_master_grad=False,
)
losses_o2_master_grad = _run(
place,
exe,
x_f16,
max_iters,
'O2',
dtype=dtype,
use_master_grad=True,
)
use_grad_clip_list = [False, True]
for use_grad_clip in use_grad_clip_list:
losses_o1 = _run(
place, exe, x_f32, max_iters, 'O1', use_grad_clip, dtype=dtype
)
losses_o2_no_master_grad = _run(
place,
exe,
x_f16,
max_iters,
'O2',
use_grad_clip,
dtype=dtype,
use_master_grad=False,
)
losses_o2_master_grad = _run(
place,
exe,
x_f16,
max_iters,
'O2',
use_grad_clip,
dtype=dtype,
use_master_grad=True,
)

self.assertNotEqual(
losses_o1,
losses_o2_no_master_grad,
f"dtype: {dtype}, loss of o1 and o2-wo-master_grad should not be equal, but received loss o1: {losses_o1}, loss o2: {losses_o2_no_master_grad}",
)

self.assertEqual(
losses_o1,
losses_o2_master_grad,
f"dtype: {dtype}, loss of o1 and o2-w-master_grad should be equal, but received loss o1: {losses_o1}, loss o2: {losses_o2_master_grad}",
)


if __name__ == '__main__':
