PaddlePaddle · zhiqiu · Mar 1, 2022 · Feb 25, 2022 · Feb 25, 2022 · Feb 28, 2022
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
@@ -2074,6 +2074,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
     }
     pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
+  VLOG(4) << "Done inputs";
 
   for (size_t i = 0; i < output_names.size(); ++i) {
     auto it = ctx.outputs.find(output_names[i]);
@@ -2118,6 +2119,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
 
     pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
   }
+  VLOG(4) << "Done outputs";
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
     if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
@@ -2226,6 +2228,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
       }
     }
   }
+  VLOG(4) << "Done attributes";
 }
 
 }  // namespace framework

diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
@@ -281,10 +281,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     paddle::operators::UniformRandomOpVarTypeInference);
 
-REGISTER_OP_CPU_KERNEL(
-    uniform_random, paddle::operators::CPUUniformRandomKernel<float>,
-    paddle::operators::CPUUniformRandomKernel<double>,
-    paddle::operators::CPUUniformRandomKernel<paddle::platform::bfloat16>);
 REGISTER_OP_CPU_KERNEL(
     uniform_random_batch_size_like,
     paddle::operators::CPUUniformRandomKernel<float>,

diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
@@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(uniform_random,
-                        paddle::operators::GPUUniformRandomKernel<float>,
-                        paddle::operators::GPUUniformRandomKernel<double>);
 REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
                         paddle::operators::GPUUniformRandomKernel<float>,
                         paddle::operators::GPUUniformRandomKernel<double>);
diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc
@@ -0,0 +1,115 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/uniform_random_kernel.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T>
+inline void UniformRealDistribution(T *data,
+                                    const int64_t &size,
+                                    const float &min,
+                                    const float &max,
+                                    std::shared_ptr<std::mt19937_64> engine) {
+  std::uniform_real_distribution<T> dist(static_cast<T>(min),
+                                         static_cast<T>(max));
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = dist(*engine);
+  }
+}
+
+template <>
+inline void UniformRealDistribution(phi::dtype::bfloat16 *data,
+                                    const int64_t &size,
+                                    const float &min,
+                                    const float &max,
+                                    std::shared_ptr<std::mt19937_64> engine) {
+  std::uniform_real_distribution<float> dist(min, max);
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = static_cast<phi::dtype::bfloat16>(dist(*engine));
+  }
+}
+
+template <typename T, typename Context>
+void UniformRandomRawKernel(const Context &dev_ctx,
+                            const ScalarArray &shape,
+                            DataType dtype,
+                            float min,
+                            float max,
+                            int seed,
+                            int diag_num,
+                            int diag_step,
+                            float diag_val,
+                            DenseTensor *out) {
+  out->Resize(phi::make_ddim(shape.GetData()));
+  VLOG(4) << out->dims();
+  T *data = dev_ctx.template Alloc<T>(out);
+  auto size = out->numel();
+  std::shared_ptr<std::mt19937_64> engine;
+  if (seed) {
+    engine = std::make_shared<std::mt19937_64>();
+    engine->seed(seed);
+  } else {
+    engine = dev_ctx.GetGenerator()->GetCPUEngine();
+  }
+  UniformRealDistribution<T>(data, size, min, max, engine);
+  if (diag_num > 0) {
+    PADDLE_ENFORCE_GT(
+        size,
+        (diag_num - 1) * (diag_step + 1),
+        phi::errors::InvalidArgument(
+            "ShapeInvalid: the diagonal's elements is equal (num-1) "
+            "* (step-1) with num %d, step %d,"
+            "It should be smaller than %d, but received %d",
+            diag_num,
+            diag_step,
+            (diag_num - 1) * (diag_step + 1),
+            size));
+    for (int64_t i = 0; i < diag_num; ++i) {
+      int64_t pos = i * diag_step + i;
+      data[pos] = diag_val;
+    }
+  }
+}
+
+template <typename T, typename Context>
+void UniformRandomKernel(const Context &dev_ctx,
+                         const ScalarArray &shape,
+                         DataType dtype,
+                         float min,
+                         float max,
+                         int seed,
+                         DenseTensor *out) {
+  UniformRandomRawKernel<T>(
+      dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(uniform_random_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::UniformRandomRawKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(uniform_random,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::UniformRandomKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.1 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.1
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/hostdevice.h"
+
+namespace phi {
+
+// Aligned vector generates vectorized load/store on CUDA.
+template <typename T, int Size>
+struct alignas(sizeof(T) * Size) AlignedVector {
+  T val[Size];
+
+  HOSTDEVICE inline const T& operator[](int i) const { return val[i]; }
+  HOSTDEVICE inline T& operator[](int i) { return val[i]; }
+};
+
+template <typename T, int Size>
+HOSTDEVICE inline void Load(const T* addr, AlignedVector<T, Size>* vec) {
+  const AlignedVector<T, Size>* addr_vec =
+      reinterpret_cast<const AlignedVector<T, Size>*>(addr);
+  *vec = *addr_vec;
+}
+
+template <typename T, int Size>
+HOSTDEVICE inline void Store(const AlignedVector<T, Size>& vec, T* addr) {
+  AlignedVector<T, Size>* addr_vec =
+      reinterpret_cast<AlignedVector<T, Size>*>(addr);
+  *addr_vec = vec;
+}
+
+/*
+* Only the address of input data is the multiplier of 1,2,4, vectorized load
+* with corresponding multiplier-value is possible. Moreover, the maximum length
+* of vectorized load is 128 bits once. Hence, valid length of vectorized load
+* shall be determined under both former constraints.
+*/
+template <typename T>
+int GetVectorizedSize(const T* pointer) {
+  constexpr int max_load_bits = 128;
+  int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
+  uint64_t address = reinterpret_cast<uint64_t>(pointer);
+  constexpr int vec8 = std::alignment_of<AlignedVector<T, 8>>::value;  // NOLINT
+  constexpr int vec4 = std::alignment_of<AlignedVector<T, 4>>::value;  // NOLINT
+  constexpr int vec2 = std::alignment_of<AlignedVector<T, 2>>::value;  // NOLINT
+  if (address % vec8 == 0) {
+    /*
+    * Currently, decide to deal with no more than 4 data once while adopting
+    * vectorization load/store, if performance test shows that dealing with
+    * 8 data once in vectorization load/store does get optimized, return code
+    * below can be changed into " return std::min(8, valid_vec_size); " .
+    */
+    return std::min(4, valid_vec_size);
+  } else if (address % vec4 == 0) {
+    return std::min(4, valid_vec_size);
+  } else if (address % vec2 == 0) {
+    return std::min(2, valid_vec_size);
+  } else {
+    return 1;
+  }
+}
+
+}  // namespace phi