diff --git a/paddle/.set_port.sh b/paddle/.set_port.sh
index 617ac79a24889e..e71f494aadf2c5 100755
--- a/paddle/.set_port.sh
+++ b/paddle/.set_port.sh
@@ -13,6 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-DIRNAME=`dirname $0`
-source $DIRNAME/.common_test_util.sh
-set_port $@
+DIRNAME="$(dirname "$0")"
+. "$DIRNAME"/.common_test_util.sh
+set_port "$@"
diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh
index 8fd58925ee4820..8da4565be617bd 100755
--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
@@ -24,12 +24,14 @@ PYPATH=""
 set -x
 while getopts "d:" opt; do
-  case $opt in
+  case "$opt" in
     d)
       PYPATH=$OPTARG
       ;;
+    *)
+      ;;
   esac
 done
-shift $(($OPTIND - 1))
+shift $((OPTIND - 1))
 export PYTHONPATH=$PYPATH:$PYTHONPATH
-$@
+"$@"
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 5ffaf28fe92f1f..20da74eca4ef87 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2101,7 +2101,8 @@ PDNode *patterns::QuantizePlacement::operator()(
 PDNode *patterns::Bfloat16Placement::operator()(
     const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
   std::unordered_set<std::string> supported_op_types =
-      std::unordered_set<std::string>({"conv2d", "fusion_gru"});
+      std::unordered_set<std::string>(
+          {"concat", "conv2d", "fusion_gru", "reshape2", "transpose2"});
   if (!bfloat16_enabled_op_types.empty()) {
     supported_op_types = bfloat16_enabled_op_types;
   }
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc
index b9797a4bfcc004..146e29249b7c61 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc
@@ -40,6 +40,10 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetInput("X", {inputs[0], inputs[1]});
   } else if (type == "pool2d") {
     op->SetInput("X", {inputs[0]});
+  } else if (type == "transpose2") {
+    op->SetInput("X", {inputs[0]});
+  } else if (type == "reshape2") {
+    op->SetInput("X", {inputs[0]});
   } else {
     FAIL() << "Unexpected operator type.";
   }
@@ -57,8 +61,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
 
 ProgramDesc BuildProgramDesc() {
   ProgramDesc prog;
-  for (auto& v :
-       std::vector<std::string>({"a", "b", "c", "f", "g", "h", "k", "l"})) {
+  for (auto& v : std::vector<std::string>(
+           {"a", "b", "c", "f", "g", "h", "k", "l", "m", "n", "o", "p"})) {
     prog.MutableBlock(0)->Var(v);
   }
 
@@ -68,6 +72,9 @@ ProgramDesc BuildProgramDesc() {
   SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"});
   SetOp(&prog, "conv2d", "conv2", {"h"}, {"k"});
   SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"});
+  SetOp(&prog, "concat", "concat2", {"l", "m"}, {"n"});
+  SetOp(&prog, "transpose2", "transpose", {"n"}, {"o"});
+  SetOp(&prog, "reshape2", "reshape", {"o"}, {"p"});
 
   return prog;
 }
@@ -115,7 +122,7 @@ void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) {
 }
 
 TEST(Bfloat16PlacementPass, enable_all) {
-  MainTest({"conv2d", "pool2d", "relu", "concat"}, 6);
+  MainTest({"conv2d", "pool2d", "relu", "concat"}, 7);
 }
 
 TEST(Bfloat16PlacementPass, enabled_conv_and_pool) {
@@ -123,7 +130,7 @@ TEST(Bfloat16PlacementPass, enabled_conv_and_pool) {
   MainTest({"conv2d", "pool2d"}, 3);
 }
 
-TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(0); }
+TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(5); }
 
 }  // namespace ir
 }  // namespace framework
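Reviewer note on the two hunks above: the new expected counts (7 for enable_all, 5 for the default attribute test) follow from the override semantics of `Bfloat16Placement` — a non-empty user-supplied set *replaces* the default supported set rather than extending it. A minimal standalone sketch of that selection logic; `SelectBfloat16Ops` is an illustrative name, not Paddle's API:

```cpp
#include <string>
#include <unordered_set>

// Sketch of the override semantics in Bfloat16Placement::operator():
// a non-empty user-provided set replaces the default set wholesale,
// it does not merge with it.
std::unordered_set<std::string> SelectBfloat16Ops(
    const std::unordered_set<std::string>& enabled_by_user) {
  std::unordered_set<std::string> supported = {
      "concat", "conv2d", "fusion_gru", "reshape2", "transpose2"};
  if (!enabled_by_user.empty()) {
    supported = enabled_by_user;  // replacement, not union
  }
  return supported;
}
```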
diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh
index 0d9f3d2aa237ac..5f603465776f1e 100755
--- a/paddle/fluid/inference/api/demo_ci/clean.sh
+++ b/paddle/fluid/inference/api/demo_ci/clean.sh
@@ -1,4 +1,5 @@
+#!/bin/bash
 set -x
-cd `dirname $0`
+cd "$(dirname "$0")" || exit
 rm -rf build/ data/
 set +x
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 6d283ca56cb652..aee013e8f36528 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -1,29 +1,29 @@
 #!/bin/bash
 set -x
-PADDLE_ROOT=$1
-TURN_ON_MKL=$2 # use MKL or Openblas
-TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
-DATA_DIR=$4 # dataset
-TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include
-TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
-MSVC_STATIC_CRT=$7
-inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
+PADDLE_ROOT="$1"
+TURN_ON_MKL="$2" # use MKL or Openblas
+TEST_GPU_CPU="$3" # test both GPU/CPU mode or only CPU mode
+DATA_DIR="$4" # dataset
+TENSORRT_INCLUDE_DIR="$5" # TensorRT header file dir, default to /usr/local/TensorRT/include
+TENSORRT_LIB_DIR="$6" # TensorRT lib file dir, default to /usr/local/TensorRT/lib
+MSVC_STATIC_CRT="$7"
+inference_install_dir="${PADDLE_ROOT}"/build/paddle_inference_install_dir
 
-cd `dirname $0`
-current_dir=`pwd`
-if [ $2 == ON ]; then
+cd "$(dirname "$0")" || exit
+current_dir=$(pwd)
+if [ "$2" == ON ]; then
   # You can export yourself if move the install path
-  MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
-  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
+  MKL_LIB="${inference_install_dir}"/third_party/install/mklml/lib
+  export LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"${MKL_LIB}"
 fi
-if [ $3 == ON ]; then
+if [ "$3" == ON ]; then
   use_gpu_list='true false'
 else
   use_gpu_list='false'
 fi
 USE_TENSORRT=OFF
-if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
+if [ -d "$TENSORRT_INCLUDE_DIR" ] && [ -d "$TENSORRT_LIB_DIR" ]; then
   USE_TENSORRT=ON
 fi
@@ -32,77 +32,79 @@ URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
 
 # download vis_demo data
 function download() {
-  dir_name=$1
-  mkdir -p $dir_name
-  cd $dir_name
+  dir_name="$1"
+  mkdir -p "$dir_name"
+  cd "$dir_name" || exit
   if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then
     echo "${PREFIX}${dir_name}.tar.gz has been downloaded."
   else
-    wget -q ${URL_ROOT}$dir_name.tar.gz
-    tar xzf *.tar.gz
+    wget -q "${URL_ROOT}""$dir_name".tar.gz
+    tar xzf ./*.tar.gz
   fi
-  cd ..
+  cd .. || exit
 }
-mkdir -p $DATA_DIR
-cd $DATA_DIR
+mkdir -p "$DATA_DIR"
+cd "$DATA_DIR" || exit
 vis_demo_list='se_resnext50 ocr mobilenet'
 for vis_demo_name in $vis_demo_list; do
-  download $vis_demo_name
+  download "$vis_demo_name"
 done
 
 # download word2vec data
 mkdir -p word2vec
-cd word2vec
+cd word2vec || exit
 if [[ -e "word2vec.inference.model.tar.gz" ]]; then
   echo "word2vec.inference.model.tar.gz has been downloaded."
 else
   wget -q http://paddle-inference-dist.bj.bcebos.com/word2vec.inference.model.tar.gz
-  tar xzf *.tar.gz
+  tar xzf ./*.tar.gz
 fi
 
 # compile and test the demo
-cd $current_dir
+cd "$current_dir" || exit
 mkdir -p build
-cd build
-rm -rf *
+cd build || exit
+rm -rf ./*
 for WITH_STATIC_LIB in ON OFF; do
-  if [ $(echo `uname` | grep "Win") != "" ]; then
+  if [ "$(uname | grep Win)" != "" ]; then
     # -----simple_on_word2vec on windows-----
-    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-      -DWITH_MKL=$TURN_ON_MKL \
+    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB="${inference_install_dir}" \
+      -DWITH_MKL="$TURN_ON_MKL" \
       -DDEMO_NAME=simple_on_word2vec \
-      -DWITH_GPU=$TEST_GPU_CPU \
-      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
+      -DWITH_GPU="$TEST_GPU_CPU" \
+      -DWITH_STATIC_LIB="$WITH_STATIC_LIB" \
+      -DMSVC_STATIC_CRT="$MSVC_STATIC_CRT"
     msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
     for use_gpu in $use_gpu_list; do
       Release/simple_on_word2vec.exe \
-        --dirname=$DATA_DIR/word2vec/word2vec.inference.model \
-        --use_gpu=$use_gpu
-      if [ $? -ne 0 ]; then
+        --dirname="$DATA_DIR"/word2vec/word2vec.inference.model \
+        --use_gpu="$use_gpu"
+      EXCODE="$?"
+      if [ "$EXCODE" -ne 0 ]; then
        echo "simple_on_word2vec demo runs fail."
        exit 1
      fi
    done
    # -----vis_demo on windows-----
-    rm -rf *
-    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-      -DWITH_MKL=$TURN_ON_MKL \
+    rm -rf ./*
+    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB="${inference_install_dir}" \
+      -DWITH_MKL="$TURN_ON_MKL" \
       -DDEMO_NAME=vis_demo \
-      -DWITH_GPU=$TEST_GPU_CPU \
-      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
+      -DWITH_GPU="$TEST_GPU_CPU" \
+      -DWITH_STATIC_LIB="$WITH_STATIC_LIB" \
+      -DMSVC_STATIC_CRT="$MSVC_STATIC_CRT"
    msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
    for use_gpu in $use_gpu_list; do
      for vis_demo_name in $vis_demo_list; do
       Release/vis_demo.exe \
-        --modeldir=$DATA_DIR/$vis_demo_name/model \
-        --data=$DATA_DIR/$vis_demo_name/data.txt \
-        --refer=$DATA_DIR/$vis_demo_name/result.txt \
-        --use_gpu=$use_gpu
-       if [ $? -ne 0 ]; then
+        --modeldir="$DATA_DIR"/"$vis_demo_name"/model \
+        --data="$DATA_DIR"/"$vis_demo_name"/data.txt \
+        --refer="$DATA_DIR"/"$vis_demo_name"/result.txt \
+        --use_gpu="$use_gpu"
+       EXCODE="$?"
+       if [ "$EXCODE" -ne 0 ]; then
        echo "vis demo $vis_demo_name runs fail."
        exit 1
       fi
@@ -110,63 +112,66 @@ for WITH_STATIC_LIB in ON OFF; do
     done
   else
    # -----simple_on_word2vec on linux/mac-----
-    rm -rf *
-    cmake .. -DPADDLE_LIB=${inference_install_dir} \
-      -DWITH_MKL=$TURN_ON_MKL \
+    rm -rf ./*
+    cmake .. -DPADDLE_LIB="${inference_install_dir}" \
+      -DWITH_MKL="$TURN_ON_MKL" \
       -DDEMO_NAME=simple_on_word2vec \
-      -DWITH_GPU=$TEST_GPU_CPU \
-      -DWITH_STATIC_LIB=$WITH_STATIC_LIB
-    make -j$(nproc)
-    word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
-    if [ -d $word2vec_model ]; then
+      -DWITH_GPU="$TEST_GPU_CPU" \
+      -DWITH_STATIC_LIB="$WITH_STATIC_LIB"
+    make -j"$(nproc)"
+    word2vec_model="$DATA_DIR"'/word2vec/word2vec.inference.model'
+    if [ -d "$word2vec_model" ]; then
      for use_gpu in $use_gpu_list; do
        ./simple_on_word2vec \
-         --dirname=$DATA_DIR/word2vec/word2vec.inference.model \
-         --use_gpu=$use_gpu
-        if [ $? -ne 0 ]; then
+         --dirname="$DATA_DIR"/word2vec/word2vec.inference.model \
+         --use_gpu="$use_gpu"
+        EXCODE="$?"
+        if [ "$EXCODE" -ne 0 ]; then
         echo "simple_on_word2vec demo runs fail."
         exit 1
        fi
      done
    fi
    # ---------vis_demo on linux/mac---------
-    rm -rf *
-    cmake .. -DPADDLE_LIB=${inference_install_dir} \
-      -DWITH_MKL=$TURN_ON_MKL \
+    rm -rf ./*
+    cmake .. -DPADDLE_LIB="${inference_install_dir}" \
+      -DWITH_MKL="$TURN_ON_MKL" \
       -DDEMO_NAME=vis_demo \
-      -DWITH_GPU=$TEST_GPU_CPU \
-      -DWITH_STATIC_LIB=$WITH_STATIC_LIB
-    make -j$(nproc)
+      -DWITH_GPU="$TEST_GPU_CPU" \
+      -DWITH_STATIC_LIB="$WITH_STATIC_LIB"
+    make -j"$(nproc)"
    for use_gpu in $use_gpu_list; do
      for vis_demo_name in $vis_demo_list; do
       ./vis_demo \
-        --modeldir=$DATA_DIR/$vis_demo_name/model \
-        --data=$DATA_DIR/$vis_demo_name/data.txt \
-        --refer=$DATA_DIR/$vis_demo_name/result.txt \
-        --use_gpu=$use_gpu
-       if [ $? -ne 0 ]; then
+        --modeldir="$DATA_DIR"/"$vis_demo_name"/model \
+        --data="$DATA_DIR"/"$vis_demo_name"/data.txt \
+        --refer="$DATA_DIR"/"$vis_demo_name"/result.txt \
+        --use_gpu="$use_gpu"
+       EXCODE="$?"
+       if [ "$EXCODE" -ne 0 ]; then
        echo "vis demo $vis_demo_name runs fail."
        exit 1
       fi
     done
    done
    # --------tensorrt mobilenet on linux/mac------
-    if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
-      rm -rf *
-      cmake .. -DPADDLE_LIB=${inference_install_dir} \
-        -DWITH_MKL=$TURN_ON_MKL \
+    if [ "$USE_TENSORRT" == ON ] && [ "$TEST_GPU_CPU" == ON ]; then
+      rm -rf ./*
+      cmake .. -DPADDLE_LIB="${inference_install_dir}" \
+        -DWITH_MKL="$TURN_ON_MKL" \
         -DDEMO_NAME=trt_mobilenet_demo \
-        -DWITH_GPU=$TEST_GPU_CPU \
-        -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-        -DUSE_TENSORRT=$USE_TENSORRT \
-        -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \
-        -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR
-      make -j$(nproc)
+        -DWITH_GPU="$TEST_GPU_CPU" \
+        -DWITH_STATIC_LIB="$WITH_STATIC_LIB" \
+        -DUSE_TENSORRT="$USE_TENSORRT" \
+        -DTENSORRT_INCLUDE_DIR="$TENSORRT_INCLUDE_DIR" \
+        -DTENSORRT_LIB_DIR="$TENSORRT_LIB_DIR"
+      make -j"$(nproc)"
      ./trt_mobilenet_demo \
-       --modeldir=$DATA_DIR/mobilenet/model \
-       --data=$DATA_DIR/mobilenet/data.txt \
-       --refer=$DATA_DIR/mobilenet/result.txt
-      if [ $? -ne 0 ]; then
+       --modeldir="$DATA_DIR"/mobilenet/model \
+       --data="$DATA_DIR"/mobilenet/data.txt \
+       --refer="$DATA_DIR"/mobilenet/result.txt
+      EXCODE="$?"
+      if [ "$EXCODE" -ne 0 ]; then
       echo "trt demo trt_mobilenet_demo runs fail."
       exit 1
      fi
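Reviewer note on the run.sh changes: two shellcheck-driven patterns recur throughout. `$?` is captured into `EXCODE` immediately after the command, before any other command can clobber it, and the deprecated `[ a -a b ]` conjunction is split into two tests joined by `&&`. A minimal sketch of both, assuming a stand-in `./some_demo` binary:

```bash
#!/bin/bash
# Capture the exit status right away; a later echo or test would overwrite $?.
./some_demo --use_gpu=false
EXCODE="$?"
if [ "$EXCODE" -ne 0 ]; then
  echo "some_demo runs fail."
  exit 1
fi

# Prefer two [ ] tests joined by && over the deprecated, non-portable -a operator.
if [ -d "$TENSORRT_INCLUDE_DIR" ] && [ -d "$TENSORRT_LIB_DIR" ]; then
  USE_TENSORRT=ON
fi
```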
diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh
index a0f64796576c85..0c66946c4b8a1e 100755
--- a/paddle/fluid/inference/check_symbol.sh
+++ b/paddle/fluid/inference/check_symbol.sh
@@ -1,12 +1,12 @@
 #!/bin/sh
 
-lib=$1
-if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
+lib="$1"
+if [ "$#" -ne 1 ]; then echo "No input library"; exit 1 ; fi
 
-num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l)
-num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep "T " | wc -l)
+num_paddle_syms=$(nm -D "${lib}" | grep -c paddle )
+num_google_syms=$(nm -D "${lib}" | grep google | grep -v paddle | grep -c "T " )
 
-if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
-if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
+if [ "$num_paddle_syms" -le 0 ]; then echo "Have no paddle symbols"; exit 1 ; fi
+if [ "$num_google_syms" -ge 1 ]; then echo "Have some google symbols"; exit 1 ; fi
 
 exit 0
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
index fd20581123c10f..0b2be0076fdb12 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
@@ -90,8 +90,6 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
   bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
   EXPECT_TRUE(quantizer_enable);
   PD_EnableMkldnnBfloat16(config);
-  bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
-  EXPECT_TRUE(bfloat16_enable);
   PD_SetMkldnnCacheCapacity(config, 0);
   PD_SetModel(config, prog_file.c_str(), params_file.c_str());
   PD_DeleteAnalysisConfig(config);
diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
index e4035c80341379..7c5757ce9d4c63 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
@@ -190,7 +190,7 @@ std::vector<double> Lexical_Test(
     // return acc_res;
   } else {
     EXPECT_GT(outputs->size(), 0UL);
-    EXPECT_EQ(outputs[0].size(), 1UL);
+    EXPECT_GT(outputs[0].size(), 0UL);
     LOG(INFO) << "No accuracy result. To get accuracy result provide a model "
                  "with accuracy layers in it and use --with_accuracy_layer "
                  "option.";
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
index f8b76f387cc195..3f03df04ea3765 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -293,8 +293,12 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f;
-    ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
-    VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
+    ScalingParamType<T> beta = 0.0f;
+
+    // NOTE(zhiqiu): inplace addto is not supported in double grad yet.
+    // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
+    // VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
+
     for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
          [&](void* workspace_ptr) {
@@ -387,6 +391,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       if (input_grad) {
         ResizeToChannelFirst<platform::CUDADeviceContext, T>(
             ctx, input_grad, &transformed_input_grad_channel);
+        // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy
+        // the data of input_grad to transformed_input_grad_channel.
+        if (ctx.Attr<bool>("use_addto")) {
+          TransToChannelFirst<platform::CUDADeviceContext, T>(
+              ctx, input_grad, &transformed_input_grad_channel);
+        }
       }
     } else {
       transformed_input_channel.ShareDataWith(*input);
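For context on the `beta` change above: cuDNN's convolution routines blend the result into the output buffer as `out = alpha * conv(in, filter) + beta * out`, so `beta = 1` implements the inplace "addto" (accumulate) strategy and `beta = 0` a plain overwrite. A scalar sketch of that blending contract — not the cuDNN call itself:

```cpp
#include <cassert>

// Illustrates cuDNN's alpha/beta scaling contract on a single element:
// with beta == 0 the previous output is discarded, with beta == 1 the new
// result is accumulated into it (the "addto" behavior disabled above).
float BlendOutput(float conv_result, float prev_out, float alpha, float beta) {
  return alpha * conv_result + beta * prev_out;
}

int main() {
  assert(BlendOutput(2.0f, 5.0f, 1.0f, 0.0f) == 2.0f);  // overwrite
  assert(BlendOutput(2.0f, 5.0f, 1.0f, 1.0f) == 7.0f);  // accumulate (addto)
  return 0;
}
```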
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index 0e835a62839b4b..7927410ef37862 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -21,6 +21,16 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
+inline std::vector<size_t> GetNmsLodFromRoisNum(const Tensor* rois_num) {
+  std::vector<size_t> rois_lod;
+  auto* rois_num_data = rois_num->data<int>();
+  rois_lod.push_back(static_cast<size_t>(0));
+  for (int i = 0; i < rois_num->numel(); ++i) {
+    rois_lod.push_back(rois_lod.back() + static_cast<size_t>(rois_num_data[i]));
+  }
+  return rois_lod;
+}
+
 class MultiClassNMSOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -321,6 +331,8 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     auto* outs = ctx.Output<LoDTensor>("Out");
     bool return_index = ctx.HasOutput("Index") ? true : false;
     auto index = ctx.Output<LoDTensor>("Index");
+    bool has_roisnum = ctx.HasInput("RoisNum") ? true : false;
+    auto rois_num = ctx.Input<Tensor>("RoisNum");
     auto score_dims = scores->dims();
     auto score_size = score_dims.size();
     auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
@@ -332,7 +344,12 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     int64_t out_dim = box_dim + 2;
     int num_nmsed_out = 0;
     Tensor boxes_slice, scores_slice;
-    int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1;
+    int n = 0;
+    if (has_roisnum) {
+      n = score_size == 3 ? batch_size : rois_num->numel();
+    } else {
+      n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1;
+    }
     for (int i = 0; i < n; ++i) {
       std::map<int, std::vector<int>> indices;
       if (score_size == 3) {
@@ -341,7 +358,12 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
         boxes_slice = boxes->Slice(i, i + 1);
         boxes_slice.Resize({score_dims[2], box_dim});
       } else {
-        auto boxes_lod = boxes->lod().back();
+        std::vector<size_t> boxes_lod;
+        if (has_roisnum) {
+          boxes_lod = GetNmsLodFromRoisNum(rois_num);
+        } else {
+          boxes_lod = boxes->lod().back();
+        }
         if (boxes_lod[i] == boxes_lod[i + 1]) {
           all_indices.push_back(indices);
           batch_starts.push_back(batch_starts.back());
@@ -380,7 +402,12 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
           offset = i * score_dims[2];
         }
       } else {
-        auto boxes_lod = boxes->lod().back();
+        std::vector<size_t> boxes_lod;
+        if (has_roisnum) {
+          boxes_lod = GetNmsLodFromRoisNum(rois_num);
+        } else {
+          boxes_lod = boxes->lod().back();
+        }
         if (boxes_lod[i] == boxes_lod[i + 1]) continue;
         scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]);
         boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]);
@@ -403,6 +430,15 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
         }
       }
     }
+    if (ctx.HasOutput("NmsRoisNum")) {
+      auto* nms_rois_num = ctx.Output<Tensor>("NmsRoisNum");
+      nms_rois_num->mutable_data<int>({n}, ctx.GetPlace());
+      int* num_data = nms_rois_num->data<int>();
+      for (int i = 1; i <= n; i++) {
+        num_data[i - 1] = batch_starts[i] - batch_starts[i - 1];
+      }
+      nms_rois_num->Resize({n});
+    }
 
     framework::LoD lod;
     lod.emplace_back(batch_starts);
@@ -535,6 +571,34 @@ class MultiClassNMS2OpMaker : public MultiClassNMSOpMaker {
   }
 };
 
+class MultiClassNMS3Op : public MultiClassNMS2Op {
+ public:
+  MultiClassNMS3Op(const std::string& type,
+                   const framework::VariableNameMap& inputs,
+                   const framework::VariableNameMap& outputs,
+                   const framework::AttributeMap& attrs)
+      : MultiClassNMS2Op(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    MultiClassNMS2Op::InferShape(ctx);
+
+    ctx->SetOutputDim("NmsRoisNum", {-1});
+  }
+};
+
+class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker {
+ public:
+  void Make() override {
+    MultiClassNMS2OpMaker::Make();
+    AddInput("RoisNum",
+             "(Tensor) The number of RoIs in shape (B),"
+             "B is the number of images")
+        .AsDispensable();
+    AddOutput("NmsRoisNum", "(Tensor), The number of NMS RoIs in each image")
+        .AsDispensable();
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -551,3 +615,10 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(multiclass_nms2, ops::MultiClassNMSKernel<float>,
                        ops::MultiClassNMSKernel<double>);
+
+REGISTER_OPERATOR(
+    multiclass_nms3, ops::MultiClassNMS3Op, ops::MultiClassNMS3OpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(multiclass_nms3, ops::MultiClassNMSKernel<float>,
+                       ops::MultiClassNMSKernel<double>);
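Reviewer note: `GetNmsLodFromRoisNum` above is a plain exclusive prefix sum — per-image RoI counts become LoD offsets, so image `i` owns rows `[lod[i], lod[i+1])`. A self-contained sketch of the same computation over a `std::vector<int>` instead of a `Tensor`:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Same computation as GetNmsLodFromRoisNum, but on a plain vector:
// counts {3, 0, 2} -> offsets {0, 3, 3, 5}. An empty image yields an
// empty slice, which is why the kernel skips lod[i] == lod[i + 1] above.
std::vector<size_t> LodFromCounts(const std::vector<int>& counts) {
  std::vector<size_t> lod;
  lod.push_back(0);
  for (int c : counts) {
    lod.push_back(lod.back() + static_cast<size_t>(c));
  }
  return lod;
}

int main() {
  for (size_t offset : LodFromCounts({3, 0, 2})) {
    std::cout << offset << " ";  // prints: 0 3 3 5
  }
  std::cout << "\n";
  return 0;
}
```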
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index b2815cbdc65b53..bb475b4e543660 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -221,5 +221,6 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::ConcatMKLDNNOpKernel<float>,
+                   ops::ConcatMKLDNNOpKernel<paddle::platform::bfloat16>,
                    ops::ConcatMKLDNNOpKernel<int8_t>,
                    ops::ConcatMKLDNNOpKernel<uint8_t>);
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
index 398bdb01b5c240..28cdd8413ab134 100644
--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -142,6 +142,11 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN,
                                     ops::kTransposeMKLDNNINT8,
                                     ops::TransposeMKLDNNOpKernel<int8_t>);
 
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
+    transpose2, MKLDNN, ::paddle::platform::CPUPlace, BF16,
+    ops::kTransposeMKLDNNFP32,
+    ops::TransposeMKLDNNOpKernel<paddle::platform::bfloat16>);
+
 REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::TransposeMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index aa8e39037062e5..7cf85420c579b6 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -622,7 +622,9 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
                                ops::ReshapeKernel, int8_t, ops::ReshapeKernel,
                                uint8_t, ops::ReshapeKernel, int,
                                ops::ReshapeKernel, int64_t, ops::ReshapeKernel,
-                               bool, ops::ReshapeKernel);
+                               bool, ops::ReshapeKernel,
+                               paddle::platform::bfloat16, ops::ReshapeKernel);
+
 REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
                                double, ops::ReshapeGradKernel, int,
                                ops::ReshapeGradKernel, int64_t,
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 7f2736a9b1d414..92006bff2cc16c 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -52,6 +52,8 @@ std::map<std::string, std::vector<std::string>> op_ins_map = {
     {"hierarchical_sigmoid",
      {"X", "W", "Label", "PathTable", "PathCode", "Bias"}},
     {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}},
+    {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}},
+    {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
@@ -78,6 +80,7 @@ std::map<std::string, std::vector<std::string>> op_outs_map = {
     {"distribute_fpn_proposals",
      {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}},
     {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}},
+    {"multiclass_nms3", {"Out", "NmsRoisNum"}},
 };
 
 // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
diff --git a/paddle/fluid/train/demo/clean.sh b/paddle/fluid/train/demo/clean.sh
index a2064492c08b84..192bdf8752c159 100755
--- a/paddle/fluid/train/demo/clean.sh
+++ b/paddle/fluid/train/demo/clean.sh
@@ -15,6 +15,6 @@
 # limitations under the License.
 
 set -x
-cd "$(dirname "$0")"
+cd "$(dirname "$0")" || exit
 rm -rf build/
 set +x
diff --git a/paddle/fluid/train/demo/run.sh b/paddle/fluid/train/demo/run.sh
index 2955e7574daa2d..a9c0ed4ac68a2a 100755
--- a/paddle/fluid/train/demo/run.sh
+++ b/paddle/fluid/train/demo/run.sh
@@ -14,14 +14,14 @@ function download() {
 download
 
 # build demo trainer
-paddle_install_dir=${PADDLE_ROOT}/build/paddle_install_dir
+paddle_install_dir="${PADDLE_ROOT}"/build/paddle_install_dir
 
 mkdir -p build
-cd build
-rm -rf *
-cmake .. -DPADDLE_LIB=$paddle_install_dir \
-         -DWITH_MKLDNN=$TURN_ON_MKL \
-         -DWITH_MKL=$TURN_ON_MKL
+cd build || exit
+rm -rf ./*
+cmake .. -DPADDLE_LIB="$paddle_install_dir" \
+         -DWITH_MKLDNN="$TURN_ON_MKL" \
+         -DWITH_MKL="$TURN_ON_MKL"
 make
 
 cd ..
diff --git a/paddle/fluid/train/imdb_demo/run.sh b/paddle/fluid/train/imdb_demo/run.sh
index f71b4bac602a9e..8a585c614e53fe 100644
--- a/paddle/fluid/train/imdb_demo/run.sh
+++ b/paddle/fluid/train/imdb_demo/run.sh
@@ -1,3 +1,3 @@
-
+#!/bin/bash
 set -exu
 build/demo_trainer --flagfile="train.cfg"
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index d6b639d0da2a54..fdd0d490a6fdb7 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -15,14 +15,14 @@
 # limitations under the License.
 
 function start_build_docker() {
-  docker pull $IMG
+  docker pull "$IMG"
 
   apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g'
   DOCKER_ENV=$(cat <<EOL
diff --git a/tools/dockerfile/Dockerfile.cinn b/tools/dockerfile/Dockerfile.cinn
new file mode 100644
--- /dev/null
+++ b/tools/dockerfile/Dockerfile.cinn
@@ -0,0 +1,152 @@
+# A image for building paddle binaries
+# Use cuda devel base image for both cpu and gpu environment
+# When you modify it, please be aware of cudnn-runtime version
+FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+# ENV variables
+ARG WITH_GPU
+ARG WITH_AVX
+
+ENV WITH_GPU=${WITH_GPU:-ON}
+ENV WITH_AVX=${WITH_AVX:-ON}
+ENV DEBIAN_FRONTEND=noninteractive
+
+ENV HOME /root
+# Add bash enhancements
+COPY paddle/scripts/docker/root/ /root/
+
+RUN apt-get update && \
+    apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y curl wget vim git unzip unrar tar xz-utils bzip2 gzip \
+    coreutils ntp language-pack-zh-hans python-qt4 libsm6 libxext6 libxrender-dev
+
+
+# Downgrade gcc&&g++
+WORKDIR /usr/bin
+  RUN apt-get update --fix-missing
+  COPY tools/dockerfile/build_scripts /build_scripts
+  RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts
+  RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
+  RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc
+  RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++
+  RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc
+  RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++
+  ENV PATH=/usr/local/gcc-8.2/bin:$PATH
+
+RUN apt-get update && \
+    apt-get install -y python2.7 python2.7-dev \
+    python3.5 python3.5-dev \
+    python3.6 python3.6-dev \
+    python3.7 python3.7-dev \
+    python3.8 python3.8-dev && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python2.7 && easy_install pip && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.5 && easy_install pip && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && easy_install pip && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.7 && easy_install pip && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.8 && easy_install pip && \
+    rm /usr/bin/python && ln -s /usr/bin/python2.7 /usr/bin/python && \
+    rm /usr/bin/python3 && ln -s /usr/bin/python3.5 /usr/bin/python3 && \
+    rm /usr/local/bin/pip && ln -s /usr/local/bin/pip2.7 /usr/local/bin/pip && \
+    rm /usr/local/bin/pip3 && ln -s /usr/local/bin/pip3.5 /usr/local/bin/pip3
+
+
+# install cmake
+WORKDIR /home
+RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz
+ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH
+
+
+# remove them when apt-get support 2.27 and higher version
+RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \
+    tar -xzf binutils-2.33.1.tar.gz && \
+    cd binutils-2.33.1 && \
+    ./configure && make -j && make install && cd .. && rm -rf binutils-2.33.1 binutils-2.33.1.tar.gz
+
+
+# Install Go and glide
+RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+# install glide
+RUN curl -s -q https://glide.sh/get | sh
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+RUN pip3 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
+    pip3 --no-cache-dir install ipykernel==4.6.0 wheel && \
+    pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
+    pip3.6 --no-cache-dir install ipykernel==4.6.0 wheel && \
+    pip3.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
+    pip3.7 --no-cache-dir install ipykernel==4.6.0 wheel && \
+    pip3.8 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
+    pip3.8 --no-cache-dir install ipykernel==4.6.0 wheel && \
+    pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
+    pip --no-cache-dir install ipykernel==4.6.0 wheel
+
+#For docstring checker
+RUN pip3 --no-cache-dir install pylint pytest astroid isort && \
+    pip3.6 --no-cache-dir install pylint pytest astroid isort && \
+    pip3.7 --no-cache-dir install pylint pytest astroid isort && \
+    pip3.8 --no-cache-dir install pylint pytest astroid isort && \
+    pip --no-cache-dir install pylint pytest astroid isort
+
+COPY ./python/requirements.txt /root/
+RUN pip3 --no-cache-dir install -r /root/requirements.txt && \
+    pip3.6 --no-cache-dir install -r /root/requirements.txt && \
+    pip3.7 --no-cache-dir install -r /root/requirements.txt && \
+    pip3.8 --no-cache-dir install -r /root/requirements.txt && \
+    pip --no-cache-dir install -r /root/requirements.txt
+
+
+# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
+# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
+# So install a newer version here.
+RUN wget -q http://mirrors.kernel.org/ubuntu/pool/universe/p/patchelf/patchelf_0.10-2_amd64.deb && \
+    dpkg -i patchelf_0.10-2_amd64.deb
+
+# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+#CMD source ~/.bashrc
+
+# ccache 3.7.9
+RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
+    tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \
+    ./configure -prefix=/usr/local/ccache-3.7.9 && \
+    make -j8 && make install && \
+    ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
+
+# For CINN environment
+RUN apt update --fix-missing
+RUN apt-get install -y autoconf autogen
+RUN apt-get install -y libtool
+RUN apt-get install -y zlib1g-dev
+RUN apt install libginac-dev -y
+RUN apt install clang cmake -y
+RUN python3 -m pip install numpy
+RUN python3 -m pip install pybind11
+
+
+# Install LLVM
+RUN echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" >> /etc/apt/sources.list
+RUN echo "deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" >> /etc/apt/sources.list
+RUN echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main" >> /etc/apt/sources.list
+RUN echo "deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main" >> /etc/apt/sources.list
+RUN ln -s /usr/bin/llvm-config-6.0 /usr/bin/llvm-config
+RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add -
+
+RUN apt update
+RUN apt install libclang-dev llvm-10 llvm-10-dev libclang-10-dev -y
+
+
+EXPOSE 22
diff --git a/tools/dockerfile/build_scripts/install_nccl2.sh b/tools/dockerfile/build_scripts/install_nccl2.sh
index 0c9bf1409d90d8..2708f4f976d232 100644
--- a/tools/dockerfile/build_scripts/install_nccl2.sh
+++ b/tools/dockerfile/build_scripts/install_nccl2.sh
@@ -24,8 +24,8 @@ wget -q -O $DIR/$DEB $URL
 cd $DIR && ar x $DEB && tar xf data.tar.xz
 DEBS=$(find ./var/ -name "*.deb")
 for sub_deb in $DEBS; do
-  echo $sub_deb
-  ar x $sub_deb && tar xf data.tar.xz
+  echo "$sub_deb"
+  ar x "$sub_deb" && tar xf data.tar.xz
 done
 mv -f usr/include/nccl.h /usr/local/include/
 mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/
diff --git a/tools/gen_alias_mapping.sh b/tools/gen_alias_mapping.sh
index 3ab1e68b375574..d199c535f96737 100755
--- a/tools/gen_alias_mapping.sh
+++ b/tools/gen_alias_mapping.sh
@@ -31,9 +31,9 @@
 #         <real_api>\t<alias_1>,<alias_2>,<alias_3>,...
 
 
-PADDLE_ROOT="$(dirname $(readlink -f ${BASH_SOURCE[0]}))/.."
+PADDLE_ROOT="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")/.."
 
-find ${PADDLE_ROOT}/python/ -name '*.py' \
+find "${PADDLE_ROOT}"/python/ -name '*.py' \
     | xargs grep -v '^#' \
     | grep 'DEFINE_ALIAS' \
     | perl -ne '
diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh
index 0c9bf1409d90d8..c2adf6a79de4bb 100644
--- a/tools/manylinux1/build_scripts/install_nccl2.sh
+++ b/tools/manylinux1/build_scripts/install_nccl2.sh
@@ -1,4 +1,19 @@
 #!/bin/bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //")
 if [ "$VERSION" == "10.0" ]; then
   DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb"
@@ -24,10 +39,10 @@ wget -q -O $DIR/$DEB $URL
 cd $DIR && ar x $DEB && tar xf data.tar.xz
 DEBS=$(find ./var/ -name "*.deb")
 for sub_deb in $DEBS; do
-  echo $sub_deb
-  ar x $sub_deb && tar xf data.tar.xz
+  echo "$sub_deb"
+  ar x "$sub_deb" && tar xf data.tar.xz
 done
 mv -f usr/include/nccl.h /usr/local/include/
 mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/
 rm /usr/include/nccl.h
-rm -rf $DIR
+rm -rf "$DIR"
diff --git a/tools/static_mode_white_list.pyc b/tools/static_mode_white_list.pyc
new file mode 100644
index 00000000000000..7d2a45c248ce27
Binary files /dev/null and b/tools/static_mode_white_list.pyc differ