PaddlePaddle · FeixLiu · Jul 26, 2023 · Aug 3, 2023 · Aug 8, 2023 · Aug 10, 2023
diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc
@@ -124,7 +124,10 @@ GradNodeAccumulation::operator()(
 
   if (!weak_grad_.expired() && !is_new_grad) {
     auto grad = weak_grad_.lock();
-    CopyOrAddTensor(grad.get(), grad_out, is_fake_empty_);
+    if (grad_out.defined() && grad_out.initialized()) {
+      CopyOrAddTensor(grad.get(), grad_out, is_fake_empty_);
+    }
+    // Skip accumulation: grad_out holds no valid (initialized) value.
     is_fake_empty_ = false;
   }
 

diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml
@@ -58,7 +58,7 @@
   support_dygraph_mode : true
 
 - op : fused_linear_param_grad_add
-  args : (Tensor x, Tensor dout, Tensor dweight, Tensor dbias, bool multi_precision = true)
+  args : (Tensor x, Tensor dout, Tensor dweight, Tensor dbias, bool multi_precision = true, bool has_bias = true)
   output : Tensor(dweight_out), Tensor(dbias_out)
   infer_meta:
     func : FusedLinearParamGradAddInferMeta

diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
@@ -1261,6 +1261,7 @@ void FusedLinearParamGradAddInferMeta(const MetaTensor& x,
                                       const MetaTensor& dweight,
                                       const MetaTensor& dbias,
                                       bool multi_precision,
+                                      bool has_bias,
                                       MetaTensor* dweight_out,
                                       MetaTensor* dbias_out) {
   const auto dtype = dout.dtype();
@@ -1304,7 +1305,7 @@ void FusedLinearParamGradAddInferMeta(const MetaTensor& x,
           ? DataType::FLOAT32
           : dtype;
 
-  if (dbias_out) {
+  if (has_bias && dbias_out) {
     dbias_out->set_dims({weight_dims[1]});
     dbias_out->set_dtype(multi_precision ? mp_dtype : dtype);
   }

diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
@@ -265,6 +265,7 @@ void FusedLinearParamGradAddInferMeta(const MetaTensor& x,
                                       const MetaTensor& dweight,
                                       const MetaTensor& dbias,
                                       bool multi_precision,
+                                      bool has_bias,
                                       MetaTensor* dweight_out,
                                       MetaTensor* dbias_out);
 

diff --git a/paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu
@@ -40,6 +40,7 @@ void FusedLinearParamGradAddImpl(const Context &ctx,
                                  int64_t K,
                                  int64_t N,
                                  bool use_addto,
+                                 bool has_bias,
                                  DenseTensor *dweight_out,
                                  DenseTensor *dbias_out) {
   constexpr bool kIsMultiPrecision = !std::is_same<T, MT>::value;
@@ -65,7 +66,7 @@ void FusedLinearParamGradAddImpl(const Context &ctx,
         use_addto);
   }
 
-  if (dbias_out == nullptr) return;
+  if (!has_bias || dbias_out == nullptr) return;  // no bias grad to write
 
   if (!fuse_bias_grad) {
     auto dout_copy = dout;
@@ -126,6 +127,7 @@ void FusedLinearParamGradAdd(const Context &ctx,
                              const paddle::optional<DenseTensor> &dweight,
                              const paddle::optional<DenseTensor> &dbias,
                              bool multi_precision,
+                             bool has_bias,
                              DenseTensor *dweight_out,
                              DenseTensor *dbias_out) {
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
@@ -159,7 +161,7 @@ void FusedLinearParamGradAdd(const Context &ctx,
     multi_precision = false;
   }
 
-  if (dbias_out) {
+  if (has_bias && dbias_out) {
     ctx.template Alloc<T>(dbias_out);
   }
 
@@ -176,18 +178,37 @@ void FusedLinearParamGradAdd(const Context &ctx,
     PrintMeta<kLogLevel>(dweight_out, "dweight_out");
     PrintMeta<kLogLevel>(dbias_out, "dbias_out");
     VLOG(kLogLevel) << "multi_precision = " << multi_precision;
+    VLOG(kLogLevel) << "has_bias = " << has_bias;
     VLOG(kLogLevel) << "use_addto = " << use_addto;
     VLOG(kLogLevel) << "M = " << M;
     VLOG(kLogLevel) << "N = " << N;
     VLOG(kLogLevel) << "K = " << K;
   }
 
   if (multi_precision) {
-    FusedLinearParamGradAddImpl<T, MT, Context>(
-        ctx, x, dout, dbias, M, K, N, use_addto, dweight_out, dbias_out);
+    FusedLinearParamGradAddImpl<T, MT, Context>(ctx,
+                                                x,
+                                                dout,
+                                                dbias,
+                                                M,
+                                                K,
+                                                N,
+                                                use_addto,
+                                                has_bias,
+                                                dweight_out,
+                                                dbias_out);
   } else {
-    FusedLinearParamGradAddImpl<T, T, Context>(
-        ctx, x, dout, dbias, M, K, N, use_addto, dweight_out, dbias_out);
+    FusedLinearParamGradAddImpl<T, T, Context>(ctx,
+                                               x,
+                                               dout,
+                                               dbias,
+                                               M,
+                                               K,
+                                               N,
+                                               use_addto,
+                                               has_bias,
+                                               dweight_out,
+                                               dbias_out);
   }
 }
 
@@ -199,6 +220,7 @@ void FusedLinearParamGradAdd(const Context &ctx,
                              const paddle::optional<DenseTensor> &dweight,
                              const paddle::optional<DenseTensor> &dbias,
                              bool multi_precision,
+                             bool has_bias,
                              DenseTensor *dweight_out,
                              DenseTensor *dbias_out) {
   PADDLE_THROW(phi::errors::Unimplemented(

diff --git a/python/paddle/distributed/fleet/utils/mix_precision_utils.py b/python/paddle/distributed/fleet/utils/mix_precision_utils.py
@@ -52,16 +52,18 @@ def param_hook(tmp_grad):
             ), "In main_grad node, param.grad should be None, but find param[{}] has grad.".format(
                 param.name
             )
-            if param.main_grad is None:
-                param.main_grad = core.eager.Tensor(
-                    value=tmp_grad.cast(paddle.float32).value(),
-                    place=tmp_grad.place,
-                    name="main_grad@" + param.name,
-                )
-            else:
-                param.main_grad.add_(tmp_grad.cast(paddle.float32))
+            if tmp_grad._is_initialized():
+                # An upstream PyLayer may return None; skip uninitialized grads.
+                if param.main_grad is None:
+                    param.main_grad = core.eager.Tensor(
+                        value=tmp_grad.cast(paddle.float32).value(),
+                        place=tmp_grad.place,
+                        name="main_grad@" + param.name,
+                    )
+                else:
+                    param.main_grad.add_(tmp_grad)
 
-            tmp_grad._clear_data()
+                tmp_grad._clear_data()
             return None
 
         return param_hook

diff --git a/test/legacy_test/test_fused_linear_param_grad_add.py b/test/legacy_test/test_fused_linear_param_grad_add.py
@@ -54,7 +54,7 @@ def recreate(x, multi_precision):
     return paddle.to_tensor(x.numpy())
 
 
-def run_ground_truth(x, dy, dweight, dbias, multi_precision):
+def run_ground_truth(x, dy, dweight, dbias, multi_precision, has_bias):
     x, dy, dweight, dbias = recreate([x, dy, dweight, dbias], multi_precision)
 
     dweight_tmp = paddle.matmul(
@@ -69,24 +69,35 @@ def run_ground_truth(x, dy, dweight, dbias, multi_precision):
         assert dweight.dtype == dweight.dtype
         dweight += dweight_tmp
 
-    dbias_tmp = dy.reshape([-1, dy.shape[-1]]).sum(axis=0)
-    if dbias is None:
-        dbias = dbias_tmp
-    else:
-        assert dbias.shape == dbias_tmp.shape
-        assert dbias.dtype == dbias_tmp.dtype
-        dbias += dbias_tmp
+    if has_bias:
+        dbias_tmp = dy.reshape([-1, dy.shape[-1]]).sum(axis=0)
+        if dbias is None:
+            dbias = dbias_tmp
+        else:
+            assert dbias.shape == dbias_tmp.shape
+            assert dbias.dtype == dbias_tmp.dtype
+            dbias += dbias_tmp
 
-    return promote_dtype(dweight).numpy(), promote_dtype(dbias).numpy()
+        return promote_dtype(dweight).numpy(), promote_dtype(dbias).numpy()
+    else:
+        return (promote_dtype(dweight).numpy(),)  # 1-tuple, like has_bias path
 
 
-def run_fused_linear_param_grad_add(x, dy, dweight, dbias, multi_precision):
+def run_fused_linear_param_grad_add(
+    x, dy, dweight, dbias, multi_precision, has_bias
+):
     dweight_new, dbias_new = _C_ops.fused_linear_param_grad_add(
-        x, dy, dweight, dbias, multi_precision
+        x, dy, dweight, dbias, multi_precision, has_bias
     )
     if dweight is not None:
         assert dweight_new.data_ptr() == dweight.data_ptr()
-    return promote_dtype(dweight_new).numpy(), promote_dtype(dbias_new).numpy()
+    if has_bias:
+        return (
+            promote_dtype(dweight_new).numpy(),
+            promote_dtype(dbias_new).numpy(),
+        )
+    else:
+        return (promote_dtype(dweight_new).numpy(),)  # keep tuple return type
 
 
 class TestMainClassBase(unittest.TestCase):
@@ -103,7 +114,9 @@ def rand(self, shape, dtype=None):
         x = paddle.to_tensor(x)
         return x.astype(dtype or self.dtype)
 
-    def generate_rand_inputs(self, has_dweight, has_dbias, multi_precision):
+    def generate_rand_inputs(
+        self, has_dweight, has_dbias, multi_precision, has_bias
+    ):
         x_shape = self.shape
         dy_shape = self.shape[:-1] + [self.output_size]
         dweight_shape = [self.shape[-1], self.output_size]
@@ -118,22 +131,23 @@ def generate_rand_inputs(self, has_dweight, has_dbias, multi_precision):
         else:
             dweight = None
 
-        if has_dbias:
+        if has_bias and has_dbias:
             dbias = self.rand(dbias_shape)
             if multi_precision:
                 dbias = promote_dtype(dbias)
         else:
             dbias = None
         return x, dy, dweight, dbias
 
-    def check_main(self, has_dweight, has_dbias, multi_precision):
-        print(has_dweight, has_dbias, multi_precision)
+    def check_main(self, has_dweight, has_dbias, multi_precision, has_bias):
         x, dy, dweight, dbias = self.generate_rand_inputs(
-            has_dweight, has_dbias, multi_precision
+            has_dweight, has_dbias, multi_precision, has_bias
+        )
+        res1 = run_ground_truth(
+            x, dy, dweight, dbias, multi_precision, has_bias
         )
-        res1 = run_ground_truth(x, dy, dweight, dbias, multi_precision)
         res2 = run_fused_linear_param_grad_add(
-            x, dy, dweight, dbias, multi_precision
+            x, dy, dweight, dbias, multi_precision, has_bias
         )
         self.assertEqual(len(res1), len(res2))
         for r1, r2 in zip(res1, res2):
@@ -153,9 +167,12 @@ def test_main(self):
             return
 
         for has_dweight in [False, True]:
-            for has_dbias in [False, True]:
-                for multi_precision in [False, True]:
-                    self.check_main(has_dweight, has_dbias, multi_precision)
+            for has_bias in [False, True]:
+                for has_dbias in [False, True]:
+                    for multi_precision in [False, True]:
+                        self.check_main(
+                            has_dweight, has_dbias, multi_precision, has_bias
+                        )
 
 
 class TestMainClassBF16(TestMainClassBase):