Skip to content

Commit

Permalink
[Semi Auto] Entropy SPMD Rule (PaddlePaddle#55394)
Browse files Browse the repository at this point in the history
* base rule

* add sharding merge

* add sharding axis merge

* define unified data class for inferencing dist_attr

* test wrap DistTensorSpec in dygraph mode

* matmul main logic done

* shape int64

* common cc

* define unified data class for inferencing dist_attr

* test wrap DistTensorSpec in dygraph mode

* define python api and wrap function in static mode for DistTensorSpec

* revise syntax

* map bugfix

* broadcast func

* compile 1

* add unitest

* add registry

* update unitest

* bugfix

* bugfix

* add pybind

* bugfix

* bugfix macro global namespace

* bugfix macro global namespace

* pybind

* pybind test

* pybind bugfixed1

* pybind bugfixed2

* pybind unitest

* merge dev

* merge dev

* merge dev

* fixed cmake conflict

* fixed cmake conflict

* rename get method

* revise inferforward output type

* revise comment

* replicated rule

* replicated rule 2

* revert bug deps

* add rule

* add unitest

* add rule

* add unitest

* move ut of auto_parallel

* fix ut

* bugfix

* bugfix

* bugfix

* bugfix

* bugfix

* bugfix

* bugfix

* resolute input sharding conflict maybe

* fixed comment

* add rule

* add unitest

* fixed typos

---------

Co-authored-by: Yichen Zhang <zhangyichen03@baidu.com>
Co-authored-by: zhiqiu <chenqiuliang@baidu.com>
  • Loading branch information
3 people authored and wz1qqx committed Jul 31, 2023
1 parent dd28c01 commit 58dfd0a
Show file tree
Hide file tree
Showing 4 changed files with 376 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.h"

namespace paddle {
namespace distributed {
namespace auto_parallel {

using phi::distributed::auto_parallel::str_join;

// Infers the dims_mapping of the inputs and outputs of
// cross_entropy_with_softmax from the distributed attributes of its inputs.
//
// input_specs: exactly two specs, in the order [X, Label].
// attrs: must provide "axis", "ignore_index", "numeric_stable_mode",
//        "use_softmax" and "soft_label".
//
// Returns {{x_dst, label_dst}, {softmax_out_dst, loss_dst}}.
//
// Raises InvalidArgument when:
//   * input_specs does not hold exactly two entries;
//   * axis is outside [-x_ndim, x_ndim);
//   * the softmax_normalize axis of X is sharded while soft_label is true,
//     axis is not the last axis, or use_softmax is false.
std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
CrossEntropyWithSoftmaxSPMDRule::InferForward(
    const std::vector<DistTensorSpec>& input_specs,
    const paddle::framework::AttributeMap& attrs) {
  // step0: verify input args based on cross_entropy_with_softmax logic
  auto input_specs_size = input_specs.size();
  PADDLE_ENFORCE_EQ(
      input_specs_size,
      2,
      phi::errors::InvalidArgument("The size of InputSpec of cross entropy "
                                   "with softmax should be 2, but got [%d].",
                                   input_specs_size));

  // Shapes and source dims_mappings are only read, so bind them by const
  // reference instead of copying the vectors.
  const auto& x_shape = input_specs[0].shape();
  const int x_ndim = static_cast<int>(x_shape.size());
  auto x_dist_attr_src = input_specs[0].dist_attr();
  const std::vector<int64_t>& x_dims_mapping_src =
      x_dist_attr_src.dims_mapping();

  const auto& label_shape = input_specs[1].shape();
  auto label_dist_attr_src = input_specs[1].dist_attr();
  const std::vector<int64_t>& label_dims_mapping_src =
      label_dist_attr_src.dims_mapping();

  int axis = ExtractAttr<int>("axis", attrs);
  int ignore_index = ExtractAttr<int>("ignore_index", attrs);
  bool numeric_stable_mode = ExtractAttr<bool>("numeric_stable_mode", attrs);
  bool use_softmax = ExtractAttr<bool>("use_softmax", attrs);
  bool soft_label = ExtractAttr<bool>("soft_label", attrs);

  VLOG(6) << "CrossEntropyWithSoftmaxSPMDRule InferForward Inputs: "
          << "X shape: [" << str_join(x_shape) << "], x_dims_mapping_src: ["
          << str_join(x_dims_mapping_src) << "]; Label shape: ["
          << str_join(label_shape) << "], Label dims mapping: ["
          << str_join(label_dims_mapping_src) << "]; axis: "
          << "[" << axis << "], ignore_index: [" << ignore_index
          << "], numeric_stable_mode: [" << numeric_stable_mode
          << "], use_softmax: [" << use_softmax << "], soft_label: ["
          << soft_label << "].";

  // normalize axis (keep the raw attribute value for the error message)
  const int axis_attr = axis;
  if (axis < 0) {
    axis = x_ndim + axis;
  }
  // axis is used below both as a vector index and as a string insert
  // position, so reject out-of-range values up front.
  PADDLE_ENFORCE_EQ(
      axis >= 0 && axis < x_ndim,
      true,
      phi::errors::InvalidArgument(
          "The axis of cross entropy with softmax should be in range "
          "[-%d, %d), but got [%d].",
          x_ndim,
          x_ndim,
          axis_attr));

  // trying to shard the normalize axis of softmax, BUT
  // c_softmax_with_entropy kernel does not support:
  // 1. soft label
  // 2. axis != -1
  // support above two features in future.
  if (x_dims_mapping_src[axis] > -1) {
    PADDLE_ENFORCE_EQ(
        soft_label,
        false,
        phi::errors::InvalidArgument(
            "Trying to shard the softmax_normalize axis of the input tensor, "
            "but the soft_label is set as True, which is not supported yet!"));

    PADDLE_ENFORCE_EQ(
        axis,
        x_ndim - 1,
        phi::errors::InvalidArgument(
            "Trying to shard the softmax_normalize axis of the input tensor, "
            "but the softmax_normalize axis is not the last axis, which is not "
            "supported yet! The softmax_normalize is [%d].",
            axis));

    PADDLE_ENFORCE_EQ(use_softmax,
                      true,
                      phi::errors::InvalidArgument(
                          "Trying to shard the softmax_normalize axis of the "
                          "input tensor, use_softmax must be set to True !"));
  }

  // step1: build Einsum Notation
  // 'k' is left out of the alphabet: it is reserved for the
  // softmax_normalize axis.
  std::string alphabet = "abcdefghijlmnopqrstuvwxyz";
  std::string broadcast_axes =
      GetBroadcastAxes(x_ndim - 1, x_ndim - 1, alphabet);
  std::string x_axes = broadcast_axes;
  x_axes.insert(axis, "k");
  // A hard label has extent 1 along the normalize axis, hence the "1".
  std::string label_axes;
  if (soft_label) {
    label_axes = x_axes;
  } else {
    label_axes = broadcast_axes;
    label_axes.insert(axis, "1");
  }
  std::string loss_axes = broadcast_axes;
  loss_axes.insert(axis, "1");
  // optional output: no axes when softmax is not applied
  const std::string softmax_out_axes = use_softmax ? x_axes : std::string();

  // step2: Sharding Propagation
  std::vector<std::pair<std::string, std::vector<int64_t>>> axes_sharding_info =
      GetAxesDimsMappingPair({x_axes, label_axes}, input_specs);
  std::unordered_map<std::string, int64_t> axis_to_dim_map =
      ShardingMergeForTensors(axes_sharding_info);

  // step3: Infer dst Dims Mapping.
  TensorDistAttr loss_dist_attr_dst =
      CopyTensorDistAttrForOutput(label_dist_attr_src);
  loss_dist_attr_dst.set_dims_mapping(
      GetDimsMappingForAxes(loss_axes, axis_to_dim_map));
  TensorDistAttr softmax_out_dist_attr_dst =
      CopyTensorDistAttrForOutput(x_dist_attr_src);
  softmax_out_dist_attr_dst.set_dims_mapping(
      GetDimsMappingForAxes(softmax_out_axes, axis_to_dim_map));

  TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src);
  x_dist_attr_dst.set_dims_mapping(
      GetDimsMappingForAxes(x_axes, axis_to_dim_map));
  TensorDistAttr label_dist_attr_dst =
      CopyTensorDistAttrForOutput(label_dist_attr_src);
  label_dist_attr_dst.set_dims_mapping(
      GetDimsMappingForAxes(label_axes, axis_to_dim_map));

  VLOG(4) << "CrossEntropyWithSoftmaxSPMDRule InferForward Inputs: "
          << "Einsum notation: [" << x_axes << "," << label_axes << " --> "
          << softmax_out_axes << "," << loss_axes << "]. " << std::endl
          << "X shape: [" << str_join(x_shape) << "], x_dims_mapping_src: ["
          << str_join(x_dims_mapping_src) << "], x_dims_mapping_dst: ["
          << str_join(x_dist_attr_dst.dims_mapping()) << "]; Label shape: ["
          << str_join(label_shape) << "], label_dims_mapping_src: ["
          << str_join(label_dims_mapping_src) << "], label_dims_mapping_dst: ["
          << str_join(label_dist_attr_dst.dims_mapping())
          << "]; loss_dims_mapping: ["
          << str_join(loss_dist_attr_dst.dims_mapping())
          << "], softmax_out_dims_mapping_src: ["
          << str_join(softmax_out_dist_attr_dst.dims_mapping()) << "]; axis: "
          << "[" << axis << "], ignore_index: [" << ignore_index
          << "], numeric_stable_mode: ["
          << (numeric_stable_mode ? "true" : "false") << "], use_softmax: ["
          << (use_softmax ? "true" : "false") << "], soft_label: ["
          << (soft_label ? "true" : "false") << "].";

  // TODO: if the softmax_normalize axis is sharded, notify the downstream phi
  // api to select c_softmax_with_entropy_kernel.

  // According to the phi api implementation, the softmax_out tensor will
  // always be generated no matter the value of use_softmax, so its dist attr
  // is always returned.
  return {{x_dist_attr_dst, label_dist_attr_dst},
          {softmax_out_dist_attr_dst, loss_dist_attr_dst}};
}

// Backward inference is not implemented for this rule yet: every call throws
// an Unimplemented error and nothing is returned.
//
// The first parameter is named output_specs to match the declaration of
// InferBackward in the rule's header; neither parameter is read.
std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
CrossEntropyWithSoftmaxSPMDRule::InferBackward(
    const std::vector<DistTensorSpec>& output_specs,
    const paddle::framework::AttributeMap& attrs) {
  PADDLE_THROW(phi::errors::Unimplemented(
      "InferBackward of CrossEntropyWithSoftmaxSPMDRule is NOT implemented "
      "yet."));
}

} // namespace auto_parallel
} // namespace distributed
} // namespace paddle
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"

namespace paddle {
namespace distributed {
namespace auto_parallel {

// SPMD rule that infers tensor distributed attributes for the
// cross_entropy_with_softmax op.
class CrossEntropyWithSoftmaxSPMDRule : public SPMDRuleBase {
public:
// Forward inference. input_specs must hold exactly two specs, [X, Label];
// attrs must provide "axis", "ignore_index", "numeric_stable_mode",
// "use_softmax" and "soft_label".
// Returns {{x, label}, {softmax_out, loss}} dist attrs.
std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
InferForward(const std::vector<DistTensorSpec>& input_specs,
const paddle::framework::AttributeMap& attrs) override;

// Backward inference. Not implemented yet: the current definition always
// throws an Unimplemented error.
std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
InferBackward(const std::vector<DistTensorSpec>& output_specs,
const paddle::framework::AttributeMap& attrs) override;
};
} // namespace auto_parallel
} // namespace distributed
} // namespace paddle
5 changes: 5 additions & 0 deletions paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#pragma once

#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/elementwise_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h"
Expand Down Expand Up @@ -145,6 +146,10 @@ REGISTER_SPMD_RULE(lookup_table_v2, EmbeddingSPMDRule);
REGISTER_SPMD_RULE(softmax, SoftmaxSPMDRule);
REGISTER_SPMD_RULE(log_softmax, SoftmaxSPMDRule);

// cross_entropy_with_softmax rule: both op names below are registered to the
// same rule class, CrossEntropyWithSoftmaxSPMDRule.
REGISTER_SPMD_RULE(cross_entropy_with_softmax, CrossEntropyWithSoftmaxSPMDRule);
REGISTER_SPMD_RULE(softmax_with_cross_entropy, CrossEntropyWithSoftmaxSPMDRule);

} // namespace auto_parallel
} // namespace distributed
} // namespace paddle
152 changes: 152 additions & 0 deletions test/auto_parallel/spmd_rules/test_cross_entropy_with_softmax_rule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from paddle.distributed.auto_parallel.static.completion import get_spmd_rule
from paddle.distributed.auto_parallel.static.dist_attribute import (
DistTensorSpec,
TensorDistAttr,
)
from paddle.distributed.fleet import auto


class TestCrossEntropyWithSoftmaxSPMDRule(unittest.TestCase):
    """Forward-inference tests for the cross_entropy_with_softmax SPMD rule."""

    def setUp(self):
        self.rule1 = get_spmd_rule("cross_entropy_with_softmax")

        x_shape = [8, 1024, 50304]  # [batch_size, max_seq_len, vocab_size]
        label_shape = [8, 1024, 1]

        process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
        x_tensor_dist_attr = TensorDistAttr()
        x_tensor_dist_attr.process_mesh = process_mesh
        self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr)
        label_tensor_dist_attr = TensorDistAttr()
        label_tensor_dist_attr.process_mesh = process_mesh
        self.label_dist_tensor_spec = DistTensorSpec(
            label_shape, label_tensor_dist_attr
        )

        # Default op attributes; individual cases override a copy of these.
        self.attrs = {
            'ignore_index': -1,
            'axis': -1,
            'numeric_stable_mode': True,
            'use_softmax': True,
            'soft_label': False,
        }

    def _infer_forward(self, x_dims_mapping, label_dims_mapping, **overrides):
        """Run ``infer_forward`` with the given input dims mappings.

        ``overrides`` are applied to a copy of ``self.attrs`` so that one case
        cannot leak its attribute settings into the next one.
        """
        self.x_dist_tensor_spec.set_dims_mapping(x_dims_mapping)
        self.label_dist_tensor_spec.set_dims_mapping(label_dims_mapping)
        attrs = dict(self.attrs, **overrides)
        return self.rule1.infer_forward(
            [self.x_dist_tensor_spec, self.label_dist_tensor_spec], attrs
        )

    def _check_forward(self, result, x, label, softmax_out, loss):
        """Assert the inferred input/output dims mappings of one call."""
        self.assertEqual(len(result), 2)
        inferred_input_dist_attrs = result[0]
        inferred_output_dist_attrs = result[1]

        self.assertEqual(len(inferred_input_dist_attrs), 2)
        self.assertEqual(len(inferred_output_dist_attrs), 2)

        self.assertEqual(inferred_input_dist_attrs[0].dims_mapping, x)
        self.assertEqual(inferred_input_dist_attrs[1].dims_mapping, label)

        # Outputs are ordered [softmax output, loss].
        self.assertEqual(
            inferred_output_dist_attrs[0].dims_mapping, softmax_out
        )
        self.assertEqual(inferred_output_dist_attrs[1].dims_mapping, loss)

    def test_cross_entropy_with_softmax_infer_forward(self):
        # Each entry: (name, attr overrides, x source mapping, label source
        # mapping, expected x, expected label, expected softmax output,
        # expected loss).
        valid_cases = [
            (
                "GPT DP case",
                {},
                [1, -1, -1],
                [-1, 0, -1],
                [1, 0, -1],
                [1, 0, -1],
                [1, 0, -1],
                [1, 0, -1],
            ),
            (
                "GPT MP case, shard normalized axis",
                {},
                [-1, -1, 0],
                [-1, -1, -1],
                [-1, -1, 0],
                [-1, -1, -1],
                [-1, -1, 0],
                [-1, -1, -1],
            ),
            (
                "GPT MP-DP case",
                {},
                [-1, -1, 0],
                [1, -1, -1],
                [1, -1, 0],
                [1, -1, -1],
                [1, -1, 0],
                [1, -1, -1],
            ),
            (
                "Normalized axis",
                {'axis': 1},
                [1, -1, 0],
                [-1, -1, -1],
                [1, -1, 0],
                [1, -1, 0],
                [1, -1, 0],
                [1, -1, 0],
            ),
        ]
        for name, overrides, x_src, label_src, x, label, sm, loss in valid_cases:
            # subTest keeps one failing case from hiding the remaining ones.
            with self.subTest(case=name):
                result_dist_attrs = self._infer_forward(
                    x_src, label_src, **overrides
                )
                self._check_forward(result_dist_attrs, x, label, sm, loss)

        # Sharding the normalized axis is rejected when soft_label is set or
        # when the normalized axis is not the last one.
        error_cases = [
            (
                "Soft Label Error",
                {'soft_label': True},
                [-1, -1, 0],
                [1, -1, -1],
            ),
            (
                "Soft Normalized axis Error",
                {'axis': 1},
                [-1, 0, -1],
                [1, -1, -1],
            ),
        ]
        for name, overrides, x_src, label_src in error_cases:
            with self.subTest(case=name):
                with self.assertRaises(ValueError):
                    self._infer_forward(x_src, label_src, **overrides)


if __name__ == "__main__":
unittest.main()

0 comments on commit 58dfd0a

Please sign in to comment.