From c95660174b104acfdb2027be051464bec5fb48b3 Mon Sep 17 00:00:00 2001
From: Richard Diamond <wichard@vitalitystudios.com>
Date: Mon, 11 Apr 2016 18:30:40 -0500
Subject: [PATCH 1/3] refactor/cuda: implement the plugin traits generically
 against the Cuda backend

This allows the tests/downstream crates to be generic. For Cuda, this
doesn't expand the set of supported types, so only `f32` and `f64` are
accepted (just as before).
---
 src/frameworks/cuda/helper.rs |  968 --------------------------
 src/frameworks/cuda/mod.rs    | 1227 ++++++++++++++++++++++++++++----
 2 files changed, 1066 insertions(+), 1129 deletions(-)

diff --git a/src/frameworks/cuda/helper.rs b/src/frameworks/cuda/helper.rs
index a4416a8..a598bde 100644
--- a/src/frameworks/cuda/helper.rs
+++ b/src/frameworks/cuda/helper.rs
@@ -21,971 +21,3 @@ pub unsafe fn receive_memory_ptr_mut<T>(x: &mut ::co::tensor::SharedTensor<T>, d
 ).id_c()
 ))
 }
-
-#[macro_export]
-macro_rules! impl_oconf_for_cc(($($t: ident), +) => (
-    $(
-        impl<'a> NNOperationConfig<$t> for utils::ConvolutionConfig { }
-    )+
-));
-
-#[macro_export]
-macro_rules! impl_oconf_for_clrn(($($t: ident), +) => (
-    $(
-        impl NNOperationConfig<$t> for utils::NormalizationConfig { }
-    )+
-));
-
-#[macro_export]
-macro_rules! impl_oconf_for_pooling(($($t: ident), +) => (
-    $(
-        impl NNOperationConfig<$t> for utils::PoolingConfig { }
-    )+
-));
-
-#[macro_export]
-macro_rules! impl_ops_sigmoid_for {
-    ($t:ident, $b:ty) => (
-        impl ::plugin::Sigmoid<$t> for $b {
-            fn sigmoid(
-                &self,
-                x: &mut ::co::tensor::SharedTensor<$t>,
-                result: &mut ::co::tensor::SharedTensor<$t>
-            ) -> Result<(), ::co::error::Error> {
-                match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match result.add_device(self.device()) { _ => () }
-
-                self.sigmoid_plain(x, result)
-            }
-
-            fn sigmoid_plain(
-                &self,
-                x: &::co::tensor::SharedTensor<$t>,
-                result: &mut ::co::tensor::SharedTensor<$t>
-            ) -> Result<(), ::co::error::Error> {
-                let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default();
-
-                Ok(try!(match CUDNN.sigmoid_forward(
-                    &try!(x.cudnn_tensor_desc_flat()), // src_desc
-                    try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data
-                    &try!(result.cudnn_tensor_desc_flat()), // dest_desc
-                    try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data
-                    scal_params
-                ) {
-                    Ok(_) => Ok(()),
-                    Err(_) => {
-                        Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation Sigmoid Forward."))
-                    }
-                }))
-            }
-
-            fn sigmoid_grad(
-                &self,
-                x: &mut ::co::tensor::SharedTensor<$t>,
-                x_diff: &mut ::co::tensor::SharedTensor<$t>,
-                result: &mut ::co::tensor::SharedTensor<$t>,
-                result_diff: &mut ::co::tensor::SharedTensor<$t>
-            ) -> Result<(), ::co::error::Error> {
-                match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match result.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match result_diff.add_device(self.device()) { _ => () }
-
-                self.sigmoid_grad_plain(x, x_diff, result, result_diff)
-            }
-
-            fn sigmoid_grad_plain(
-                &self,
-                x: &::co::tensor::SharedTensor<$t>,
-                x_diff: &::co::tensor::SharedTensor<$t>,
-                result: &::co::tensor::SharedTensor<$t>,
-                result_diff: &mut ::co::tensor::SharedTensor<$t>
-            ) -> Result<(), ::co::error::Error> {
-                let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default();
-
-                Ok(try!(match CUDNN.sigmoid_backward(
-
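// --- Editor's sketch (not part of the patch) ---
// A minimal, standalone illustration of the refactor the commit message
// describes: the per-type macros being removed below are replaced by one
// generic impl bounded on a small type-info trait. The names `DataType` and
// `DataTypeInfo` mirror the rust-cudnn items the patch uses, but this toy
// version is self-contained and only shows the pattern.
#[derive(Clone, Copy, Debug, PartialEq)]
enum DataType { Float, Double }

trait DataTypeInfo {
    fn cudnn_data_type() -> DataType;
}

impl DataTypeInfo for f32 {
    fn cudnn_data_type() -> DataType { DataType::Float }
}
impl DataTypeInfo for f64 {
    fn cudnn_data_type() -> DataType { DataType::Double }
}

// One generic function now covers what previously needed a macro expansion
// per element type; `f32` and `f64` remain the only types with impls.
fn cudnn_type_for<T: DataTypeInfo>() -> DataType {
    <T as DataTypeInfo>::cudnn_data_type()
}

fn main() {
    assert_eq!(cudnn_type_for::<f32>(), DataType::Float);
    assert_eq!(cudnn_type_for::<f64>(), DataType::Double);
}
// --- end sketch ---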
&try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation Sigmoid Backward.")) - } - })) - } - } - ) -} - -#[macro_export] -macro_rules! impl_ops_sigmoid_pointwise_for { - ($t:ident, $b:ty) => ( - impl ::plugin::SigmoidPointwise<$t> for $b { - fn sigmoid_pointwise( - &self, - x: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.sigmoid_pointwise_plain(x) - } - - fn sigmoid_pointwise_plain( - &self, - x: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.sigmoid_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Sigmoid Pointwise forward.")) - } - })) - } - - fn sigmoid_pointwise_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.sigmoid_pointwise_grad_plain(x, x_diff) - } - - fn sigmoid_pointwise_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.sigmoid_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), // dest_data - &try!(x_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Sigmoid Pointwise backward.")) - } - })) - } - } - ) -} - -#[macro_export] -macro_rules! 
impl_ops_relu_for { - ($t:ident, $b:ty) => ( - impl ::plugin::Relu<$t> for $b { - fn relu( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.relu_plain(x, result) - } - - fn relu_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.relu_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation relu Forward.")) - } - })) - } - - fn relu_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.relu_grad_plain(x, x_diff, result, result_diff) - } - - fn relu_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.relu_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation relu Backward.")) - } - })) - } - } - ) -} - -#[macro_export] -macro_rules! 
impl_ops_relu_pointwise_for { - ($t:ident, $b:ty) => ( - impl ::plugin::ReluPointwise<$t> for $b { - fn relu_pointwise( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.relu_pointwise_plain(x) - } - - fn relu_pointwise_plain( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.relu_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN ReLU Pointwise forward.")) - } - })) - } - - fn relu_pointwise_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.relu_pointwise_grad_plain(x, x_diff) - } - - fn relu_pointwise_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.relu_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), // dest_data - &try!(x_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN ReLU Pointwise backward.")) - } - })) - } - } - ) -} - -#[macro_export] -macro_rules! 
impl_ops_tanh_for { - ($t:ident, $b:ty) => ( - impl ::plugin::Tanh<$t> for $b { - fn tanh( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.tanh_plain(x, result) - } - - fn tanh_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.tanh_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation tanh Forward.")) - } - })) - } - - fn tanh_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.tanh_grad_plain(x, x_diff, result, result_diff) - } - - fn tanh_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.tanh_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation tanh Backward.")) - } - })) - } - } - ) -} - -#[macro_export] -macro_rules! 
impl_ops_tanh_pointwise_for { - ($t:ident, $b:ty) => ( - impl ::plugin::TanhPointwise<$t> for $b { - fn tanh_pointwise( - &self, - x: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.tanh_pointwise_plain(x) - } - - fn tanh_pointwise_plain( - &self, - x: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.tanh_forward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Tanh Pointwise forward.")) - } - })) - } - - fn tanh_pointwise_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - - self.tanh_pointwise_grad_plain(x, x_diff) - } - - fn tanh_pointwise_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.tanh_backward( - &try!(x.cudnn_tensor_desc_flat()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(x.cudnn_tensor_desc_flat()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), // dest_data - &try!(x_diff.cudnn_tensor_desc_flat()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Tanh Pointwise backward.")) - } - })) - } - } - ) -} - -#[macro_export] -macro_rules! 
impl_ops_convolution_for { - ($t:ty, $b:ty) => ( - fn convolution( - &self, - filter: &mut ::co::tensor::SharedTensor<$t>, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor<u8>, - config: &Self::CC //::frameworks::cuda::CC - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - match workspace.add_device(self.device()) { _ => () } - - self.convolution_plain(filter, x, result, workspace, config) - } - - fn convolution_plain( - &self, - filter: &::co::tensor::SharedTensor<$t>, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor<u8>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.convolution_forward( - config, - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(filter, self.device()) }), - &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation convolution Forward.")) - } - })) - } - - #[allow(unused_variables)] - fn convolution_grad_filter( - &self, - src_data: &mut ::co::tensor::SharedTensor<$t>, - dest_diff: &mut ::co::tensor::SharedTensor<$t>, - filter_diff: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor<u8>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - match src_data.add_device(self.device()) { _ => try!(src_data.sync(self.device())) } - match dest_diff.add_device(self.device()) { _ => try!(dest_diff.sync(self.device())) } - match filter_diff.add_device(self.device()) { _ => try!(filter_diff.sync(self.device())) } - match workspace.add_device(self.device()) { _ => () } - - self.convolution_grad_filter_plain(src_data, dest_diff, filter_diff, workspace, config) - } - - #[allow(unused_variables)] - fn convolution_grad_filter_plain( - &self, - src_data: &::co::tensor::SharedTensor<$t>, - dest_diff: &::co::tensor::SharedTensor<$t>, - filter_diff: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor<u8>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.convolution_backward_filter( - config, - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }), - &try!(src_data.cudnn_tensor_desc()), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(src_data, self.device()) }), - &try!(dest_diff.cudnn_tensor_desc()), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(dest_diff, self.device()) }), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(filter_diff, self.device()) }), - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation convolution Backward.")) - } - })) - } - - 
#[allow(unused_variables)] - fn convolution_grad_data( - &self, - filter: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor<u8>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - match filter.add_device(self.device()) { _ => try!(filter.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => try!(result_diff.sync(self.device())) } - match workspace.add_device(self.device()) { _ => () } - - self.convolution_grad_data_plain(filter, x_diff, result_diff, workspace, config) - } - - #[allow(unused_variables)] - fn convolution_grad_data_plain( - &self, - filter: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - workspace: &mut ::co::tensor::SharedTensor<u8>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.convolution_backward_data( - config, - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(filter, self.device()) }), - &try!(x_diff.cudnn_tensor_desc()), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), - &try!(result_diff.cudnn_tensor_desc()), - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation convolution Backward.")) - } - })) - } - ) -} - -#[macro_export] -macro_rules! 
impl_ops_softmax_for { - ($t:ident, $b:ty) => ( - impl ::plugin::Softmax<$t> for $b { - fn softmax( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.softmax_plain(x, result) - } - - fn softmax_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.softmax_forward( - &try!(x.cudnn_tensor_desc_softmax()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc_softmax()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN softmax Forward.")) - } - })) - } - - fn softmax_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.softmax_grad_plain(x, x_diff, result_diff) - } - - fn softmax_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.softmax_backward( - &try!(x.cudnn_tensor_desc_softmax()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_softmax()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result_diff.cudnn_tensor_desc_softmax()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN softmax Backward.")) - } - })) - } - } - ) -} - -#[macro_export] -macro_rules! 
impl_ops_log_softmax_for { - ($t:ident, $b:ty) => ( - impl ::plugin::LogSoftmax<$t> for $b { - fn log_softmax( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.log_softmax_plain(x, result) - } - - fn log_softmax_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.log_softmax_forward( - &try!(x.cudnn_tensor_desc_softmax()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc_softmax()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN logarithmic softmax Forward.")) - } - })) - } - - fn log_softmax_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.log_softmax_grad_plain(x, x_diff, result_diff) - } - - fn log_softmax_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t> - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.log_softmax_backward( - &try!(x.cudnn_tensor_desc_softmax()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc_softmax()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result_diff.cudnn_tensor_desc_softmax()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN logarithmic softmax Backward.")) - } - })) - } - } - ) -} - -#[macro_export] -macro_rules! 
impl_ops_lrn_for { - ($t:ident, $b:ty) => ( - impl ::plugin::LRN<$t> for $b { - fn new_lrn_config( - &self, - n: u32, - alpha: f64, - beta: f64, - k: f64 - ) -> Result<Self::CLRN, ::co::error::Error> { - Ok(CUDNN.init_normalization(n, alpha, beta, k).unwrap()) - } - - fn lrn( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CLRN //::frameworks::cuda::CC - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.lrn_plain(x, result, config) - } - - fn lrn_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CLRN - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.lrn_forward( - config, - &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation lrn Forward.")) - } - })) - } - - #[allow(unused_variables)] - fn lrn_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CLRN - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.lrn_grad_plain(x, x_diff, result, result_diff, config) - } - - #[allow(unused_variables)] - fn lrn_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CLRN - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.lrn_backward( - config, - &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation lrn Backward.")) - } - })) - } - } - ) -} - -#[macro_export] -macro_rules! 
impl_ops_pooling_for { - ($t:ident, $b:ty) => ( - impl ::plugin::Pooling<$t> for $b { - fn new_pooling_config( - &self, - window: &[i32], - padding: &[i32], - stride: &[i32], - ) -> Result<Self::CPOOL, ::co::error::Error> { - let pooling_avg = ::cudnn::PoolingDescriptor::new(::cudnn::cudnnPoolingMode_t::CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING, window, padding, stride).unwrap(); - let pooling_max = ::cudnn::PoolingDescriptor::new(::cudnn::cudnnPoolingMode_t::CUDNN_POOLING_MAX, window, padding, stride).unwrap(); - Ok(::cudnn::utils::PoolingConfig::new(pooling_avg, pooling_max)) - } - - fn pooling_max( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CPOOL - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => () } - - self.pooling_max_plain(x, result, config) - } - - fn pooling_max_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CPOOL - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.pooling_max_forward( - config, - &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { - Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation pooling Forward.")) - } - })) - } - - #[allow(unused_variables)] - fn pooling_max_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CPOOL - ) -> Result<(), ::co::error::Error> { - match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } - match result_diff.add_device(self.device()) { _ => () } - - self.pooling_max_grad_plain(x, x_diff, result, result_diff, config) - } - - #[allow(unused_variables)] - fn pooling_max_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CPOOL - ) -> Result<(), ::co::error::Error> { - let scal_params: ::cudnn::utils::ScalParams<$t> = ::cudnn::utils::ScalParams::default(); - - Ok(try!(match CUDNN.pooling_max_backward( - config, - &try!(x.cudnn_tensor_desc()), // src_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data - &try!(x_diff.cudnn_tensor_desc()), // src_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data - &try!(result.cudnn_tensor_desc()), // dest_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data - &try!(result_diff.cudnn_tensor_desc()), // dest_diff_desc - try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data - scal_params - ) { - Ok(_) => Ok(()), - Err(_) => { 
- Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation pooling Backward.")) - } - })) - } - } - ) -} diff --git a/src/frameworks/cuda/mod.rs b/src/frameworks/cuda/mod.rs index baa5459..6b9b51b 100644 --- a/src/frameworks/cuda/mod.rs +++ b/src/frameworks/cuda/mod.rs @@ -3,8 +3,12 @@ use ::plugin::*; use co::prelude::*; use co::plugin::Error as PluginError; +use co::plugin::numeric_helpers::Float; use cudnn::*; + +pub use cudnn::utils::DataTypeInfo; + #[macro_use] pub mod helper; @@ -29,87 +33,6 @@ pub trait ICudnnDesc<T> { fn cudnn_convolution_desc(&self, filter: &SharedTensor<T>) -> Result<ConvolutionDescriptor, PluginError>; } -macro_rules! impl_icudnndesc_for_sharedtensor { - ($t:ty, $cutype:path) => ( - impl ICudnnDesc<$t> for SharedTensor<$t> { - fn cudnn_tensor_desc(&self) -> Result<TensorDescriptor, PluginError> { - match TensorDescriptor::new(&self.desc().dims_i32().clone(), &self.desc().default_stride_i32().clone(), $cutype) { - Ok(desc) => Ok(desc), - Err(_) => { - Err(PluginError::Plugin("Unable to create CuDNN TensorDescriptor.")) - } - } - } - - fn cudnn_tensor_desc_softmax(&self) -> Result<TensorDescriptor, PluginError> { - let actual_desc = self.desc().clone(); - let override_desc = match actual_desc.len() { - // not batched and single dimension softmax - 1 => vec![1, actual_desc[0], 1, 1], - // batched and single dimension softmax - 2 => vec![actual_desc[0], actual_desc[1], 1, 1], - // neither batched nor single dimension - 3 => vec![1, actual_desc[0], actual_desc[1], actual_desc[2]], - _ => actual_desc - }; - match TensorDescriptor::new(&override_desc.dims_i32().clone(), - &override_desc.default_stride_i32().clone(), - $cutype) { - Ok(desc) => Ok(desc), - Err(_) => { - Err(PluginError::Plugin("Unable to create CuDNN TensorDescriptor.")) - } - } - } - - fn cudnn_tensor_desc_flat(&self) -> Result<TensorDescriptor, PluginError> { - let actual_desc = self.desc().clone(); - let mut override_desc = match actual_desc.len() { - 1 => vec![1, 1], - 2 => vec![1], - _ => vec![] - }; - for dim in actual_desc { - override_desc.push(dim); - } - match TensorDescriptor::new(&override_desc.dims_i32().clone(), - &override_desc.default_stride_i32().clone(), - $cutype) { - Ok(desc) => Ok(desc), - Err(_) => { - Err(PluginError::Plugin("Unable to create CuDNN TensorDescriptor.")) - } - } - } - - fn cudnn_filter_desc(&self) -> Result<FilterDescriptor, PluginError> { - match FilterDescriptor::new(&self.desc().dims_i32().clone(), $cutype) { - Ok(desc) => Ok(desc), - Err(_) => { - Err(PluginError::Plugin("Unable to create CuDNN FilterDescriptor.")) - } - } - } - - fn cudnn_convolution_desc(&self, filter: &SharedTensor<$t>) -> Result<ConvolutionDescriptor, PluginError> { - match ConvolutionDescriptor::new(&self.desc().dims_i32().clone(), &filter.desc().default_stride_i32().clone(), $cutype) { - Ok(desc) => Ok(desc), - Err(_) => { - Err(PluginError::Plugin("Unable to create CuDNN ConvolutionDescriptor.")) - } - } - } - } - ) -} - -impl_icudnndesc_for_sharedtensor!(f32, ::cudnn::utils::DataType::Float); -impl_icudnndesc_for_sharedtensor!(f64, ::cudnn::utils::DataType::Double); - -impl_oconf_for_cc!(f32, f64); -impl_oconf_for_clrn!(f32, f64); -impl_oconf_for_pooling!(f32, f64); - impl ConvForwardAlgo { /// Tries to return the matching cuDNN type for the enum value. fn as_cudnn(&self) -> Result<cudnnConvolutionFwdAlgo_t, ::co::error::Error> { @@ -240,65 +163,86 @@ impl ConvBackwardDataAlgo { } } -macro_rules! 
impl_convolution_for_cuda_backend { - ($t:ty, $cutype:path) => ( - impl ConvolutionConfig<$t> for ::cudnn::utils::ConvolutionConfig { - fn workspace_size(&self) -> usize { - *self.largest_workspace_size() +impl<T> ICudnnDesc<T> for SharedTensor<T> + where T: Float + DataTypeInfo, +{ + fn cudnn_tensor_desc(&self) -> Result<TensorDescriptor, PluginError> { + match TensorDescriptor::new(&self.desc().dims_i32().clone(), + &self.desc().default_stride_i32().clone(), + <T as DataTypeInfo>::cudnn_data_type()) { + Ok(desc) => Ok(desc), + Err(_) => { + Err(PluginError::Plugin("Unable to create CuDNN TensorDescriptor.")) } } + } - impl Convolution<$t> for Backend<Cuda> { - fn new_convolution_config( - &self, - src: &::co::tensor::SharedTensor<$t>, - dest: &::co::tensor::SharedTensor<$t>, - filter: &mut ::co::tensor::SharedTensor<$t>, - algo_fwd: ConvForwardAlgo, - algo_bwd_filter: ConvBackwardFilterAlgo, - algo_bwd_data: ConvBackwardDataAlgo, - stride: &[i32], - zero_padding: &[i32], - ) -> Result<Self::CC, ::co::error::Error> { - let src_desc = try!(src.cudnn_tensor_desc()); - let dest_desc = try!(dest.cudnn_tensor_desc()); - let filter_desc = try!(filter.cudnn_filter_desc()); - let conv_desc = ::cudnn::ConvolutionDescriptor::new(zero_padding, stride, $cutype).unwrap(); - - let useable_algo_fwd = try!(algo_fwd.find_cudnn_algo(&filter_desc, &conv_desc, &src_desc, &dest_desc)); - let useable_algo_bwd_filter = try!(algo_bwd_filter.find_cudnn_algo(&filter_desc, &conv_desc, &src_desc, &dest_desc)); - let useable_algo_bwd_data = try!(algo_bwd_data.find_cudnn_algo(&filter_desc, &conv_desc, &src_desc, &dest_desc)); - - let mut workspace_size_fwd = API::get_convolution_forward_workspace_size(*CUDNN.id_c(), useable_algo_fwd.as_cudnn().unwrap(), *filter_desc.id_c(), *conv_desc.id_c(), *src_desc.id_c(), *dest_desc.id_c()).unwrap(); - let mut workspace_size_bwd_filter = API::get_convolution_backward_filter_workspace_size(*CUDNN.id_c(), useable_algo_bwd_filter.as_cudnn().unwrap(), *filter_desc.id_c(), *conv_desc.id_c(), *src_desc.id_c(), *dest_desc.id_c()).unwrap(); - let mut workspace_size_bwd_data = API::get_convolution_backward_data_workspace_size(*CUDNN.id_c(), useable_algo_bwd_data.as_cudnn().unwrap(), *filter_desc.id_c(), *conv_desc.id_c(), *src_desc.id_c(), *dest_desc.id_c()).unwrap(); - - if workspace_size_fwd == 0 { - workspace_size_fwd = 8; - } - if workspace_size_bwd_filter == 0 { - workspace_size_bwd_filter = 8; - } - if workspace_size_bwd_data == 0 { - workspace_size_bwd_data = 8; - } - - Ok( - ::cudnn::utils::ConvolutionConfig::new( - useable_algo_fwd.as_cudnn().unwrap(), workspace_size_fwd, - useable_algo_bwd_filter.as_cudnn().unwrap(), workspace_size_bwd_filter, - useable_algo_bwd_data.as_cudnn().unwrap(), workspace_size_bwd_data, - conv_desc, filter_desc - ) - ) + fn cudnn_tensor_desc_softmax(&self) -> Result<TensorDescriptor, PluginError> { + let actual_desc = self.desc().clone(); + let override_desc = match actual_desc.len() { + // not batched and single dimension softmax + 1 => vec![1, actual_desc[0], 1, 1], + // batched and single dimension softmax + 2 => vec![actual_desc[0], actual_desc[1], 1, 1], + // neither batched nor single dimension + 3 => vec![1, actual_desc[0], actual_desc[1], actual_desc[2]], + _ => actual_desc + }; + match TensorDescriptor::new(&override_desc.dims_i32().clone(), + &override_desc.default_stride_i32().clone(), + <T as DataTypeInfo>::cudnn_data_type()) { + Ok(desc) => Ok(desc), + Err(_) => { + Err(PluginError::Plugin("Unable to create CuDNN TensorDescriptor.")) + } + 
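// --- Editor's sketch (not part of the patch) ---
// Standalone restatement of the shape normalization `cudnn_tensor_desc_softmax`
// performs above: every input is padded out to a 4-D N x C x H x W layout
// before the cuDNN tensor descriptor is built (the dimension names are an
// assumption for illustration; the match arms mirror the patch exactly).
fn softmax_shape(dims: &[usize]) -> Vec<usize> {
    match dims.len() {
        1 => vec![1, dims[0], 1, 1],             // single sample, C classes
        2 => vec![dims[0], dims[1], 1, 1],       // batch of N samples, C classes
        3 => vec![1, dims[0], dims[1], dims[2]], // unbatched 3-D input
        _ => dims.to_vec(),                      // 4-D and higher pass through
    }
}

fn main() {
    assert_eq!(softmax_shape(&[10]), vec![1, 10, 1, 1]);
    assert_eq!(softmax_shape(&[32, 10]), vec![32, 10, 1, 1]);
    assert_eq!(softmax_shape(&[3, 8, 8]), vec![1, 3, 8, 8]);
}
// --- end sketch ---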
} + } + + fn cudnn_tensor_desc_flat(&self) -> Result<TensorDescriptor, PluginError> { + let actual_desc = self.desc().clone(); + let mut override_desc = match actual_desc.len() { + 1 => vec![1, 1], + 2 => vec![1], + _ => vec![] + }; + for dim in actual_desc { + override_desc.push(dim); + } + match TensorDescriptor::new(&override_desc.dims_i32().clone(), + &override_desc.default_stride_i32().clone(), + <T as DataTypeInfo>::cudnn_data_type()) { + Ok(desc) => Ok(desc), + Err(_) => { + Err(PluginError::Plugin("Unable to create CuDNN TensorDescriptor.")) + } + } + } + + fn cudnn_filter_desc(&self) -> Result<FilterDescriptor, PluginError> { + match FilterDescriptor::new(&self.desc().dims_i32().clone(), + <T as DataTypeInfo>::cudnn_data_type()) { + Ok(desc) => Ok(desc), + Err(_) => { + Err(PluginError::Plugin("Unable to create CuDNN FilterDescriptor.")) } + } + } - impl_ops_convolution_for!($t, Backend<Cuda>); + fn cudnn_convolution_desc(&self, filter: &SharedTensor<T>) -> Result<ConvolutionDescriptor, PluginError> { + match ConvolutionDescriptor::new(&self.desc().dims_i32().clone(), + &filter.desc().default_stride_i32().clone(), + <T as DataTypeInfo>::cudnn_data_type()) { + Ok(desc) => Ok(desc), + Err(_) => { + Err(PluginError::Plugin("Unable to create CuDNN ConvolutionDescriptor.")) + } } - ) + } } -impl NN<f32> for Backend<Cuda> { +impl<T> NN<T> for Backend<Cuda> + where T: Float + DataTypeInfo, +{ type CC = utils::ConvolutionConfig; type CLRN = utils::NormalizationConfig; type CPOOL = utils::PoolingConfig; @@ -306,38 +250,999 @@ impl NN<f32> for Backend<Cuda> { fn init_nn() { let _ = CUDNN.id_c(); } fn device(&self) -> &DeviceType { self.device() } } +impl<'a, T> NNOperationConfig<T> for utils::ConvolutionConfig + where T: Float + DataTypeInfo, +{ } +impl<T> NNOperationConfig<T> for utils::NormalizationConfig + where T: Float + DataTypeInfo, +{ } +impl<T> NNOperationConfig<T> for utils::PoolingConfig + where T: Float + DataTypeInfo, +{ } -impl_convolution_for_cuda_backend!(f32, ::cudnn::utils::DataType::Float); -impl_ops_sigmoid_for!(f32, Backend<Cuda>); -impl_ops_relu_for!(f32, Backend<Cuda>); -impl_ops_tanh_for!(f32, Backend<Cuda>); -impl_ops_softmax_for!(f32, Backend<Cuda>); -impl_ops_log_softmax_for!(f32, Backend<Cuda>); -impl_ops_lrn_for!(f32, Backend<Cuda>); -impl_ops_pooling_for!(f32, Backend<Cuda>); +impl<T> Sigmoid<T> for Backend<Cuda> + where T: Float + DataTypeInfo + Default, +{ + fn sigmoid( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result.add_device(self.device()) { _ => () } -impl_ops_sigmoid_pointwise_for!(f32, Backend<Cuda>); -impl_ops_relu_pointwise_for!(f32, Backend<Cuda>); -impl_ops_tanh_pointwise_for!(f32, Backend<Cuda>); + self.sigmoid_plain(x, result) + } -impl NN<f64> for Backend<Cuda> { - type CC = utils::ConvolutionConfig; - type CLRN = utils::NormalizationConfig; - type CPOOL = utils::PoolingConfig; + fn sigmoid_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); - fn init_nn() { let _ = CUDNN.id_c(); } - fn device(&self) -> &DeviceType { self.device() } + Ok(try!(match CUDNN.sigmoid_forward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), 
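// --- Editor's sketch (not part of the patch) ---
// Standalone restatement of `cudnn_tensor_desc_flat` above: low-rank shapes
// are left-padded with 1s before the descriptor is created, so the flattened
// descriptor always carries at least three axes (that minimum-rank intent is
// an inference from the code, not stated in the patch).
fn flat_shape(dims: &[usize]) -> Vec<usize> {
    let mut out = match dims.len() {
        1 => vec![1, 1], // a bare vector becomes [1, 1, n]
        2 => vec![1],    // a matrix becomes [1, rows, cols]
        _ => vec![],     // rank >= 3 passes through unchanged
    };
    out.extend_from_slice(dims);
    out
}

fn main() {
    assert_eq!(flat_shape(&[5]), vec![1, 1, 5]);
    assert_eq!(flat_shape(&[4, 5]), vec![1, 4, 5]);
    assert_eq!(flat_shape(&[2, 3, 4]), vec![2, 3, 4]);
}
// --- end sketch ---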
//src_data + &try!(result.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation Sigmoid Forward.")) + } + })) + } + + fn sigmoid_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result_diff.add_device(self.device()) { _ => () } + + self.sigmoid_grad_plain(x, x_diff, result, result_diff) + } + + fn sigmoid_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &::co::tensor::SharedTensor<T>, + result: &::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.sigmoid_backward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(result.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data + &try!(result_diff.cudnn_tensor_desc_flat()), // dest_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation Sigmoid Backward.")) + } + })) + } +} + +impl<T> ConvolutionConfig<T> for ::cudnn::utils::ConvolutionConfig + where T: Float + DataTypeInfo, +{ + fn workspace_size(&self) -> usize { + *self.largest_workspace_size() + } +} + +impl<T> Convolution<T> for Backend<Cuda> + where T: Float + DataTypeInfo, +{ + fn new_convolution_config(&self, + src: &SharedTensor<T>, + dest: &SharedTensor<T>, + filter: &mut SharedTensor<T>, + algo_fwd: ConvForwardAlgo, + algo_bwd_filter: ConvBackwardFilterAlgo, + algo_bwd_data: ConvBackwardDataAlgo, + stride: &[i32], + zero_padding: &[i32]) -> + Result<Self::CC, ::co::error::Error> + { + let src_desc = try!(src.cudnn_tensor_desc()); + let dest_desc = try!(dest.cudnn_tensor_desc()); + let filter_desc = try!(filter.cudnn_filter_desc()); + let conv_desc = ::cudnn::ConvolutionDescriptor::new(zero_padding, stride, + <T as DataTypeInfo>::cudnn_data_type()).unwrap(); + + let useable_algo_fwd = try!(algo_fwd.find_cudnn_algo(&filter_desc, &conv_desc, + &src_desc, &dest_desc)); + let useable_algo_bwd_filter = try!(algo_bwd_filter.find_cudnn_algo(&filter_desc, &conv_desc, + &src_desc, &dest_desc)); + let useable_algo_bwd_data = try!(algo_bwd_data.find_cudnn_algo(&filter_desc, &conv_desc, + &src_desc, &dest_desc)); + + let mut workspace_size_fwd = + API::get_convolution_forward_workspace_size(*CUDNN.id_c(), + useable_algo_fwd.as_cudnn().unwrap(), + *filter_desc.id_c(), + *conv_desc.id_c(), + *src_desc.id_c(), + 
*dest_desc.id_c()) + .unwrap(); + let mut workspace_size_bwd_filter = + API::get_convolution_backward_filter_workspace_size(*CUDNN.id_c(), + useable_algo_bwd_filter.as_cudnn().unwrap(), + *filter_desc.id_c(), + *conv_desc.id_c(), + *src_desc.id_c(), + *dest_desc.id_c()) + .unwrap(); + let mut workspace_size_bwd_data = + API::get_convolution_backward_data_workspace_size(*CUDNN.id_c(), + useable_algo_bwd_data.as_cudnn().unwrap(), + *filter_desc.id_c(), + *conv_desc.id_c(), + *src_desc.id_c(), + *dest_desc.id_c()) + .unwrap(); + + if workspace_size_fwd == 0 { + workspace_size_fwd = 8; + } + if workspace_size_bwd_filter == 0 { + workspace_size_bwd_filter = 8; + } + if workspace_size_bwd_data == 0 { + workspace_size_bwd_data = 8; + } + + Ok( + ::cudnn::utils::ConvolutionConfig::new( + useable_algo_fwd.as_cudnn().unwrap(), workspace_size_fwd, + useable_algo_bwd_filter.as_cudnn().unwrap(), workspace_size_bwd_filter, + useable_algo_bwd_data.as_cudnn().unwrap(), workspace_size_bwd_data, + conv_desc, filter_desc + ) + ) + } + fn convolution(&self, + filter: &mut SharedTensor<T>, + x: &mut SharedTensor<T>, + result: &mut SharedTensor<T>, + workspace: &mut SharedTensor<u8>, + config: &Self::CC) -> //::frameworks::cuda::CC + Result<(), ::co::error::Error> + { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result.add_device(self.device()) { _ => () } + match workspace.add_device(self.device()) { _ => () } + + self.convolution_plain(filter, x, result, workspace, config) + } + + fn convolution_plain(&self, + filter: &::co::tensor::SharedTensor<T>, + x: &::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + workspace: &mut ::co::tensor::SharedTensor<u8>, + config: &Self::CC) -> + Result<(), ::co::error::Error> + { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.convolution_forward( + config, + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }), + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(filter, self.device()) }), + &try!(x.cudnn_tensor_desc()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(result.cudnn_tensor_desc()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation convolution Forward.")) + } + })) + } + + fn convolution_grad_filter(&self, + src_data: &mut SharedTensor<T>, + dest_diff: &mut SharedTensor<T>, + filter_diff: &mut SharedTensor<T>, + workspace: &mut SharedTensor<u8>, + config: &Self::CC) -> + Result<(), ::co::error::Error> + { + match src_data.add_device(self.device()) { _ => try!(src_data.sync(self.device())) } + match dest_diff.add_device(self.device()) { _ => try!(dest_diff.sync(self.device())) } + match filter_diff.add_device(self.device()) { _ => try!(filter_diff.sync(self.device())) } + match workspace.add_device(self.device()) { _ => () } + + self.convolution_grad_filter_plain(src_data, dest_diff, filter_diff, workspace, config) + } + + fn convolution_grad_filter_plain(&self, + src_data: &SharedTensor<T>, + dest_diff: &SharedTensor<T>, + filter_diff: &mut SharedTensor<T>, + workspace: &mut SharedTensor<u8>, + config: &Self::CC) -> + Result<(), ::co::error::Error> + { + let scal_params: ::cudnn::utils::ScalParams<T> = 
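// --- Editor's sketch (not part of the patch) ---
// The zero-to-8 clamps in `new_convolution_config` above condensed into one
// helper: cuDNN may report that an algorithm needs no scratch space, but the
// config still records a small non-zero size, presumably so the shared
// `SharedTensor<u8>` workspace is never a zero-length allocation (that
// rationale is an assumption; the patch does not state it).
fn effective_workspace_size(reported: usize) -> usize {
    if reported == 0 { 8 } else { reported }
}

fn main() {
    assert_eq!(effective_workspace_size(0), 8);
    assert_eq!(effective_workspace_size(4096), 4096);
}
// --- end sketch ---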
::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.convolution_backward_filter( + config, + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }), + &try!(src_data.cudnn_tensor_desc()), + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(src_data, self.device()) }), + &try!(dest_diff.cudnn_tensor_desc()), + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(dest_diff, self.device()) }), + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(filter_diff, self.device()) }), + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation convolution Backward.")) + } + })) + } + + fn convolution_grad_data(&self, + filter: &mut SharedTensor<T>, + x_diff: &mut SharedTensor<T>, + result_diff: &mut SharedTensor<T>, + workspace: &mut SharedTensor<u8>, + config: &Self::CC) -> + Result<(), ::co::error::Error> + { + match filter.add_device(self.device()) { _ => try!(filter.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + match result_diff.add_device(self.device()) { _ => try!(result_diff.sync(self.device())) } + match workspace.add_device(self.device()) { _ => () } + + self.convolution_grad_data_plain(filter, x_diff, result_diff, workspace, config) + } + + fn convolution_grad_data_plain(&self, + filter: &SharedTensor<T>, + x_diff: &SharedTensor<T>, + result_diff: &mut SharedTensor<T>, + workspace: &mut SharedTensor<u8>, + config: &Self::CC) -> + Result<(), ::co::error::Error> + { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.convolution_backward_data( + config, + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(workspace, self.device()) }), + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(filter, self.device()) }), + &try!(x_diff.cudnn_tensor_desc()), + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), + &try!(result_diff.cudnn_tensor_desc()), + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(PluginError::Operation("Unable to execute CUDA cuDNN Activation convolution Backward.")) + } + })) + } } -impl_convolution_for_cuda_backend!(f64, ::cudnn::utils::DataType::Double); -impl_ops_sigmoid_for!(f64, Backend<Cuda>); -impl_ops_relu_for!(f64, Backend<Cuda>); -impl_ops_tanh_for!(f64, Backend<Cuda>); -impl_ops_softmax_for!(f64, Backend<Cuda>); -impl_ops_log_softmax_for!(f64, Backend<Cuda>); -impl_ops_lrn_for!(f64, Backend<Cuda>); -impl_ops_pooling_for!(f64, Backend<Cuda>); - -impl_ops_sigmoid_pointwise_for!(f64, Backend<Cuda>); -impl_ops_relu_pointwise_for!(f64, Backend<Cuda>); -impl_ops_tanh_pointwise_for!(f64, Backend<Cuda>); +impl<T> SigmoidPointwise<T> for Backend<Cuda> + where T: Float + Default + DataTypeInfo, +{ + fn sigmoid_pointwise( + &self, + x: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + + self.sigmoid_pointwise_plain(x) + } + + fn sigmoid_pointwise_plain( + &self, + x: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.sigmoid_forward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { 
::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Sigmoid Pointwise forward.")) + } + })) + } + + fn sigmoid_pointwise_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T>, + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + + self.sigmoid_pointwise_grad_plain(x, x_diff) + } + + fn sigmoid_pointwise_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T>, + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.sigmoid_backward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(x.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), // dest_data + &try!(x_diff.cudnn_tensor_desc_flat()), // dest_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x_diff, self.device()) }), // dest_diff_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Sigmoid Pointwise backward.")) + } + })) + } +} + +impl<T> Relu<T> for Backend<Cuda> + where T: Float + Default + DataTypeInfo, +{ + fn relu( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result.add_device(self.device()) { _ => () } + + self.relu_plain(x, result) + } + + fn relu_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.relu_forward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(result.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation relu Forward.")) + } + })) + } + + fn relu_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } + match result_diff.add_device(self.device()) { _ => () } + +
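// x, x_diff and result have been registered and synced on the CUDA device above; result_diff is only written, so it is merely registered before delegating to the non-syncing variant: +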
self.relu_grad_plain(x, x_diff, result, result_diff) + } + + fn relu_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &::co::tensor::SharedTensor<T>, + result: &::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.relu_backward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(result.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data + &try!(result_diff.cudnn_tensor_desc_flat()), // dest_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation relu Backward.")) + } + })) + } +} + +impl<T> ReluPointwise<T> for Backend<Cuda> + where T: Float + Default + DataTypeInfo, +{ + fn relu_pointwise( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + + self.relu_pointwise_plain(x) + } + + fn relu_pointwise_plain( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.relu_forward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN ReLU Pointwise forward.")) + } + })) + } + + fn relu_pointwise_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + + self.relu_pointwise_grad_plain(x, x_diff) + } + + fn relu_pointwise_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.relu_backward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(x.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), // dest_data + &try!(x_diff.cudnn_tensor_desc_flat()), // dest_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x_diff, self.device()) }), // dest_diff_data +
scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN ReLU Pointwise backward.")) + } + })) + } +} + +impl<T> Tanh<T> for Backend<Cuda> + where T: Float + Default + DataTypeInfo, +{ + fn tanh( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result.add_device(self.device()) { _ => () } + + self.tanh_plain(x, result) + } + + fn tanh_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.tanh_forward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(result.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation tanh Forward.")) + } + })) + } + + fn tanh_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } + match result_diff.add_device(self.device()) { _ => () } + + self.tanh_grad_plain(x, x_diff, result, result_diff) + } + + fn tanh_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &::co::tensor::SharedTensor<T>, + result: &::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.tanh_backward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(result.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data + &try!(result_diff.cudnn_tensor_desc_flat()), // dest_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation tanh Backward.")) + } + })) + } +} + +impl<T> TanhPointwise<T> for Backend<Cuda> + where T: Float + Default + DataTypeInfo, +{ + fn tanh_pointwise( + &self, + x: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + + self.tanh_pointwise_plain(x) + } + + fn tanh_pointwise_plain( + &self, + x: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params:
::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.tanh_forward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Tanh Pointwise forward.")) + } + })) + } + + fn tanh_pointwise_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + + self.tanh_pointwise_grad_plain(x, x_diff) + } + + fn tanh_pointwise_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.tanh_backward( + &try!(x.cudnn_tensor_desc_flat()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc_flat()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(x.cudnn_tensor_desc_flat()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), // dest_data + &try!(x_diff.cudnn_tensor_desc_flat()), // dest_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(x_diff, self.device()) }), // dest_diff_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Tanh Pointwise backward.")) + } + })) + } +} +impl<T> Softmax<T> for Backend<Cuda> + where T: Float + Default + DataTypeInfo, +{ + fn softmax( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result.add_device(self.device()) { _ => () } + + self.softmax_plain(x, result) + } + + fn softmax_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.softmax_forward( + &try!(x.cudnn_tensor_desc_softmax()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(result.cudnn_tensor_desc_softmax()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN softmax Forward.")) + } + })) + } + + fn softmax_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + match
result_diff.add_device(self.device()) { _ => () } + + self.softmax_grad_plain(x, x_diff, result_diff) + } + + fn softmax_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.softmax_backward( + &try!(x.cudnn_tensor_desc_softmax()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc_softmax()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(result_diff.cudnn_tensor_desc_softmax()), // dest_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN softmax Backward.")) + } + })) + } +} + +impl<T> LogSoftmax<T> for Backend<Cuda> + where T: Float + Default + DataTypeInfo, +{ + fn log_softmax( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result.add_device(self.device()) { _ => () } + + self.log_softmax_plain(x, result) + } + + fn log_softmax_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.log_softmax_forward( + &try!(x.cudnn_tensor_desc_softmax()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(result.cudnn_tensor_desc_softmax()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN logarithmic softmax Forward.")) + } + })) + } + + fn log_softmax_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + match result_diff.add_device(self.device()) { _ => () } + + self.log_softmax_grad_plain(x, x_diff, result_diff) + } + + fn log_softmax_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T> + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.log_softmax_backward( + &try!(x.cudnn_tensor_desc_softmax()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc_softmax()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(result_diff.cudnn_tensor_desc_softmax()), // dest_diff_desc + try!(unsafe {
::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN logarithmic softmax Backward.")) + } + })) + } +} + +impl<T> LRN<T> for Backend<Cuda> + where T: Float + Default + DataTypeInfo, +{ + fn new_lrn_config( + &self, + n: u32, + alpha: f64, + beta: f64, + k: f64 + ) -> Result<Self::CLRN, ::co::error::Error> { + Ok(CUDNN.init_normalization(n, alpha, beta, k).unwrap()) + } + + fn lrn( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + config: &Self::CLRN + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result.add_device(self.device()) { _ => () } + + self.lrn_plain(x, result, config) + } + + fn lrn_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + config: &Self::CLRN + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.lrn_forward( + config, + &try!(x.cudnn_tensor_desc()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(result.cudnn_tensor_desc()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation lrn Forward.")) + } + })) + } + + #[allow(unused_variables)] + fn lrn_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T>, + config: &Self::CLRN + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } + match result_diff.add_device(self.device()) { _ => () } + + self.lrn_grad_plain(x, x_diff, result, result_diff, config) + } + + #[allow(unused_variables)] + fn lrn_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &::co::tensor::SharedTensor<T>, + result: &::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T>, + config: &Self::CLRN + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.lrn_backward( + config, + &try!(x.cudnn_tensor_desc()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(result.cudnn_tensor_desc()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data + &try!(result_diff.cudnn_tensor_desc()), // dest_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation lrn Backward.")) + } + })) + } +}
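+ +// Usage sketch (hypothetical caller code; the parameter values are only +// illustrative, and `x` and `result` are assumed to be f32 tensors of +// matching shape on this backend): +//     let conf = backend.new_lrn_config(5, 0.0001, 0.75, 2.0).unwrap(); +//     backend.lrn(&mut x, &mut result, &conf).unwrap();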
+ +impl<T> Pooling<T> for Backend<Cuda> + where T: Float + Default + DataTypeInfo, +{ + fn new_pooling_config( + &self, + window: &[i32], + padding: &[i32], + stride: &[i32], + ) -> Result<Self::CPOOL, ::co::error::Error> { + let pooling_avg = ::cudnn::PoolingDescriptor::new(::cudnn::cudnnPoolingMode_t::CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING, window, padding, stride).unwrap(); + let pooling_max = ::cudnn::PoolingDescriptor::new(::cudnn::cudnnPoolingMode_t::CUDNN_POOLING_MAX, window, padding, stride).unwrap(); + Ok(::cudnn::utils::PoolingConfig::new(pooling_avg, pooling_max)) + } + + fn pooling_max( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + config: &Self::CPOOL + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match result.add_device(self.device()) { _ => () } + + self.pooling_max_plain(x, result, config) + } + + fn pooling_max_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + config: &Self::CPOOL + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.pooling_max_forward( + config, + &try!(x.cudnn_tensor_desc()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(result.cudnn_tensor_desc()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result, self.device()) }), // dest_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => { + Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation pooling Forward.")) + } + })) + } + + #[allow(unused_variables)] + fn pooling_max_grad( + &self, + x: &mut ::co::tensor::SharedTensor<T>, + x_diff: &mut ::co::tensor::SharedTensor<T>, + result: &mut ::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T>, + config: &Self::CPOOL + ) -> Result<(), ::co::error::Error> { + match x.add_device(self.device()) { _ => try!(x.sync(self.device())) } + match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) } + match result.add_device(self.device()) { _ => try!(result.sync(self.device())) } + match result_diff.add_device(self.device()) { _ => () } + + self.pooling_max_grad_plain(x, x_diff, result, result_diff, config) + } + + #[allow(unused_variables)] + fn pooling_max_grad_plain( + &self, + x: &::co::tensor::SharedTensor<T>, + x_diff: &::co::tensor::SharedTensor<T>, + result: &::co::tensor::SharedTensor<T>, + result_diff: &mut ::co::tensor::SharedTensor<T>, + config: &Self::CPOOL + ) -> Result<(), ::co::error::Error> { + let scal_params: ::cudnn::utils::ScalParams<T> = ::cudnn::utils::ScalParams::default(); + + Ok(try!(match CUDNN.pooling_max_backward( + config, + &try!(x.cudnn_tensor_desc()), // src_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x, self.device()) }), //src_data + &try!(x_diff.cudnn_tensor_desc()), // src_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(x_diff, self.device()) }), //src_diff_data + &try!(result.cudnn_tensor_desc()), // dest_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr(result, self.device()) }), // dest_data + &try!(result_diff.cudnn_tensor_desc()), // dest_diff_desc + try!(unsafe { ::frameworks::cuda::helper::receive_memory_ptr_mut(result_diff, self.device()) }), // dest_diff_data + scal_params + ) { + Ok(_) => Ok(()), + Err(_) => {
Err(::co::plugin::Error::Operation("Unable to execute CUDA cuDNN Activation pooling Backward.")) + } + })) + } +} From 23f70c3e42591ad4e15e4ac8b8dcf25f188eda12 Mon Sep 17 00:00:00 2001 From: Richard Diamond <wichard@vitalitystudios.com> Date: Mon, 11 Apr 2016 18:40:54 -0500 Subject: [PATCH 2/3] feat/native: add an N-dimensional implicit gemm convolution algo for the native backend This algorithm allocates a few small index vectors, but it requires no additional workspace memory. --- src/frameworks/native/helper.rs | 69 +------ src/frameworks/native/mod.rs | 334 ++++++++++++++++++++++++++++++-- 2 files changed, 320 insertions(+), 83 deletions(-) diff --git a/src/frameworks/native/helper.rs b/src/frameworks/native/helper.rs index d411978..49cd522 100644 --- a/src/frameworks/native/helper.rs +++ b/src/frameworks/native/helper.rs @@ -3,9 +3,6 @@ use co::plugin::numeric_helpers::Float; use co::memory::MemoryType; -#[derive(Debug, Copy, Clone)] -#[allow(missing_docs)] -pub struct ConvolutionConfig; #[derive(Debug, Copy, Clone)] #[allow(missing_docs)] pub struct NormalizationConfig; @@ -278,66 +275,12 @@ macro_rules! impl_ops_tanh_for { ); } -#[macro_export] -macro_rules! impl_ops_convolution_for { - ($t:ident, $b:ty) => ( - impl ::plugin::Convolution<$t> for $b { - fn new_convolution_config( - &self, - src: &::co::tensor::SharedTensor<$t>, - dest: &::co::tensor::SharedTensor<$t>, - filter: &mut ::co::tensor::SharedTensor<$t>, - stride: &[i32], - zero_padding: &[i32] - ) -> Result<Self::CC, ::co::error::Error> { - unimplemented!(); - Ok(helper::ConvolutionConfig) - } - fn convolution( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - unimplemented!(); - Ok(()) - } - - fn convolution_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - unimplemented!(); - Ok(()) - } - - fn convolution_grad( - &self, - x: &mut ::co::tensor::SharedTensor<$t>, - x_diff: &mut ::co::tensor::SharedTensor<$t>, - result: &mut ::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - unimplemented!(); - Ok(()) - } - - fn convolution_grad_plain( - &self, - x: &::co::tensor::SharedTensor<$t>, - x_diff: &::co::tensor::SharedTensor<$t>, - result: &::co::tensor::SharedTensor<$t>, - result_diff: &mut ::co::tensor::SharedTensor<$t>, - config: &Self::CC - ) -> Result<(), ::co::error::Error> { - unimplemented!(); - Ok(()) - } - } - ); +#[derive(Debug, Clone)] +#[allow(missing_docs)] +pub struct ConvolutionConfig { + pub filter_shape: Vec<usize>, + pub stride: Vec<i32>, + pub padding: Vec<i32>, } #[macro_export] diff --git a/src/frameworks/native/mod.rs b/src/frameworks/native/mod.rs index 1e70e9a..bf2c3c7 100644 --- a/src/frameworks/native/mod.rs +++ b/src/frameworks/native/mod.rs @@ -9,14 +9,14 @@ use co::prelude::*; use co::Error; use co::plugin::Error as PluginError; +use std::ops::*; + #[macro_use] pub mod helper; -impl_oconf_for_cc!(f32, f64); -impl_oconf_for_clrn!(f32, f64); -impl_oconf_for_pooling!(f32, f64); - -impl NN<f32> for Backend<Native> { +impl<T> NN<T> for Backend<Native> + where T: Add<T, Output = T> + Mul<T, Output = T> + Default + Copy, +{ type CC = helper::ConvolutionConfig; type CLRN = helper::NormalizationConfig; type CPOOL = helper::PoolingConfig; @@ -24,30 +24,324 @@ impl NN<f32> for Backend<Native> { fn init_nn() {
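/* nothing to initialize for the native backend */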
} fn device(&self) -> &DeviceType { self.device() } } +impl<'a, T> NNOperationConfig<T> for helper::ConvolutionConfig + where T: Add<T, Output = T> + Mul<T, Output = T> + Default + Copy, +{ } +impl<'a, T> ConvolutionConfig<T> for helper::ConvolutionConfig + where T: Add<T, Output = T> + Mul<T, Output = T> + Default + Copy, +{ } +impl<T> NNOperationConfig<T> for helper::NormalizationConfig + where T: Add<T, Output = T> + Mul<T, Output = T> + Default + Copy, +{ } +impl<T> NNOperationConfig<T> for helper::PoolingConfig + where T: Add<T, Output = T> + Mul<T, Output = T> + Default + Copy, +{ } + +impl<T> ::plugin::Convolution<T> for Backend<Native> + where T: Add<T, Output = T> + Mul<T, Output = T> + Default + Copy, +{ + fn new_convolution_config(&self, + src: &SharedTensor<T>, + dest: &SharedTensor<T>, + filter: &mut SharedTensor<T>, + algo_fwd: ConvForwardAlgo, + algo_bwd_filter: ConvBackwardFilterAlgo, + algo_bwd_data: ConvBackwardDataAlgo, + stride: &[i32], + zero_padding: &[i32]) -> Result<Self::CC, Error> { + match algo_fwd { + ConvForwardAlgo::Auto | ConvForwardAlgo::ImplicitGEMM => { + }, + _ => { + return Err(Error::Plugin(PluginError::Plugin("Unimplemented."))); + }, + } + match algo_bwd_filter { + ConvBackwardFilterAlgo::Auto | + ConvBackwardFilterAlgo::ImplicitGEMM => { + }, + _ => { + return Err(Error::Plugin(PluginError::Plugin("Unimplemented."))); + }, + } + match algo_bwd_data { + ConvBackwardDataAlgo::Auto | + ConvBackwardDataAlgo::ImplicitGEMM => { + }, + _ => { + return Err(Error::Plugin(PluginError::Plugin("Unimplemented."))); + }, + } + + Ok(helper::ConvolutionConfig { + filter_shape: filter.desc().clone(), + stride: stride.to_vec(), + padding: zero_padding.to_vec(), + }) + } + fn convolution(&self, filter: &mut SharedTensor<T>, + input: &mut SharedTensor<T>, + output: &mut SharedTensor<T>, + scratch: &mut SharedTensor<u8>, + config: &Self::CC) -> Result<(), Error> + { + let dev = self.device(); + let _ = input.add_device(dev); + try!(input.sync(dev)); + let _ = filter.add_device(dev); + try!(filter.sync(dev)); + let _ = output.add_device(dev); + try!(output.sync(dev)); + let _ = scratch.add_device(dev); + try!(scratch.sync(dev)); + + self.convolution_plain(filter, input, output, scratch, config) + } + + fn convolution_plain(&self, filter: &SharedTensor<T>, + x: &SharedTensor<T>, + result: &mut SharedTensor<T>, + _scratch: &mut SharedTensor<u8>, + config: &Self::CC) -> Result<(), Error> + { + let dev = self.device(); + + let input_dim = x.desc(); + let input = x.get(dev).unwrap() + .as_native().unwrap() + .as_slice::<T>(); + let input_stride = input_dim.default_stride(); + + let output_dim = result.desc().clone(); + let output = result.get_mut(dev).unwrap() + .as_mut_native().unwrap() + .as_mut_slice::<T>(); + let output_stride = output_dim.default_stride(); + + { + for o in output.iter_mut() { + *o = Default::default(); + } + } + + let filter_dim = filter.desc(); + let filter = filter.get(dev).unwrap() + .as_native().unwrap() + .as_slice::<T>(); + let filter_stride = filter_dim.default_stride(); + + + // sanity check + assert!(input_dim[0] == output_dim[0]); + assert!(filter_dim[0] == output_dim[1]); + assert!(input_dim[1] == filter_dim[1]); + + println!("input_dim = `{:?}`", input_dim); + println!("filter_dim = `{:?}`", filter_dim); + println!("output_dim = `{:?}`", output_dim); + + // TODO: specializations for spatial input. + + // recursively sum up elementwise multiplication of the hyperplanes. 
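+        // `filter_` handles one spatial axis per recursion level: it slides the +        // filter window along that axis, reads `Default::default()` (i.e. zero) +        // for indices that fall inside the zero-padding region, and at the +        // innermost level accumulates the input * filter products into `acc`.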
+ fn filter_<T>(input: &[T], input_stride: &[usize], input_dim: &[usize], + input_offset: usize, input_idx_base: &[usize], + + filter: &[T], filter_stride: &[usize], filter_dim: &[usize], + filter_offset: usize, + + padding: &[i32], + depth: usize, depth_end: usize, + acc: Option<T>) -> T + where T: Add<T, Output = T> + Mul<T, Output = T> + Default + Copy, + { + let mut acc = acc.unwrap_or_default(); + + let p = padding[0] as usize; + let input_idx_end = input_dim[0] + 2 * p; + + let mut input_idx = input_idx_base[0]; + let mut filter_idx = 0; + while filter_idx < filter_dim[0] { + let i_offset = input_offset + (input_idx - p) * input_stride[0]; + let f_offset = filter_offset + filter_idx * filter_stride[0]; + + let v = if input_idx < p || input_idx + 1 > input_idx_end { + Default::default() + } else if depth + 1 >= depth_end { + input[i_offset] * filter[f_offset] + } else { + filter_(input, &input_stride[1..], &input_dim[1..], + i_offset, &input_idx_base[1..], + filter, &filter_stride[1..], &filter_dim[1..], + f_offset, + &padding[1..], depth + 1, depth_end, + None) + }; + + acc = acc + v; + + input_idx += 1; + filter_idx += 1; + } + + return acc; + } + + + // depth == 0 is the first level + fn conv<T>(input: &[T], input_stride: &[usize], input_dim: &[usize], + top_input_offset: usize, input_offset: usize, + input_idx_base: &mut [usize], + + filter: &[T], filter_stride: &[usize], filter_dim: &[usize], + filter_offset: usize, + + depth: usize, + padding: &[i32], stride: &[i32], + + output: &mut [T], output_stride: &[usize], + output_dim: &[usize], + output_offset: usize) + where T: Add<T, Output = T> + Mul<T, Output = T> + Default + Copy, + { + let p = padding[depth] as usize; + let input_end = input_dim[depth] + 2 * p - (filter_dim[depth]); + + let mut input_i = 0; + + let mut output_idx = 0; + while output_idx < output_dim[0] { + input_idx_base[depth] = input_i; + let input_offset = input_offset + input_i * input_stride[depth]; + let output_offset = output_offset + output_idx * output_stride[0]; + + if depth + 1 < input_dim.len() { + conv(input, input_stride, input_dim, top_input_offset, + input_offset, input_idx_base, + filter, filter_stride, filter_dim, filter_offset, + depth + 1, + padding, &stride[1..], output, &output_stride[1..], + &output_dim[1..], output_offset); + } else { + let v = filter_(input, input_stride, input_dim, + top_input_offset, &input_idx_base[..], + filter, filter_stride, filter_dim, filter_offset, + padding, 0, input_dim.len(), + None); + output[output_offset] = output[output_offset] + v; + } + + input_i += stride[0] as usize; + output_idx += 1; + } + } + + fn conv_k_d1<T>(_batch: usize, + input: &[T], input_stride: &[usize], input_dim: &[usize], + input_offset: usize, input_idx_base: &mut [usize], + + filter: &[T], filter_stride: &[usize], filter_dim: &[usize], + + padding: &[i32], stride: &[i32], + + output: &mut [T], output_stride: &[usize], + output_dim: &[usize], output_offset: usize) + where T: Add<T, Output = T> + Mul<T, Output = T> + Default + Copy, + { + for k in 0..filter_dim[0] { + let output_offset = output_offset + k * output_stride[0]; + let filter_offset = k * filter_stride[0]; + for d1 in 0..input_dim[0] { + let input_offset = input_offset + d1 * input_stride[0]; + let filter_offset = filter_offset + d1 * filter_stride[1]; + + conv(input, &input_stride[1..], &input_dim[1..], + input_offset, input_offset, input_idx_base, + filter, &filter_stride[2..], &filter_dim[2..], filter_offset, + 0, padding, stride, output, &output_stride[1..], + 
&output_dim[1..], + output_offset); + } + } + } + + let mut input_idx = Vec::new(); + input_idx.resize(input_dim.len() - 2, 0); + let mut output_idx = Vec::new(); + output_idx.resize(output_dim.len(), 0); + + let batches = input_dim[0]; + let mut batch = 0; + while batch < batches { + let input_offset = batch * input_stride[0]; + let output_offset = batch * output_stride[0]; + + conv_k_d1(batch, input, &input_stride[1..], &input_dim[1..], input_offset, + &mut input_idx[..], + filter, &filter_stride[..], &filter_dim[..], + &config.padding[..], &config.stride[..], + output, &output_stride[1..], &output_dim[1..], + output_offset); + + batch += 1; + } + + Ok(()) + } + + fn convolution_grad_filter(&self, src_data: &mut SharedTensor<T>, + dest_diff: &mut SharedTensor<T>, + filter_diff: &mut SharedTensor<T>, + workspace: &mut SharedTensor<u8>, + config: &Self::CC) -> + Result<(), ::co::error::Error> + { + unimplemented!() + } + + fn convolution_grad_filter_plain(&self, src_data: &SharedTensor<T>, + dest_diff: &SharedTensor<T>, + filter_diff: &mut SharedTensor<T>, + workspace: &mut SharedTensor<u8>, + config: &Self::CC) -> + Result<(), ::co::error::Error> + { + unimplemented!() + } + + fn convolution_grad_data(&self, filter: &mut SharedTensor<T>, + x_diff: &mut SharedTensor<T>, + result_diff: &mut SharedTensor<T>, + workspace: &mut SharedTensor<u8>, + config: &Self::CC) -> + Result<(), ::co::error::Error> + { + unimplemented!() + } + + fn convolution_grad_data_plain(&self, filter: &SharedTensor<T>, + x_diff: &SharedTensor<T>, + result_diff: &mut SharedTensor<T>, + workspace: &mut SharedTensor<u8>, + config: &Self::CC) -> + Result<(), ::co::error::Error> + { + unimplemented!() + } +} impl_ops_sigmoid_for!(f32, Backend<Native>); impl_ops_relu_for!(f32, Backend<Native>); impl_ops_tanh_for!(f32, Backend<Native>); -// impl_ops_convolution_for!(f32, Backend<Native>); - impl_ops_softmax_for!(f32, Backend<Native>); - impl_ops_log_softmax_for!(f32, Backend<Native>); +impl_ops_softmax_for!(f32, Backend<Native>); +impl_ops_log_softmax_for!(f32, Backend<Native>); // impl_ops_lrn_for!(f32, Backend<Native>); -// impl_ops_pooling_for!(f32, Backend<Native>); - -impl NN<f64> for Backend<Native> { - type CC = helper::ConvolutionConfig; - type CLRN = helper::NormalizationConfig; - type CPOOL = helper::PoolingConfig; - - fn init_nn() { } - fn device(&self) -> &DeviceType { self.device() } -} +impl_ops_pooling_for!(f32, Backend<Native>); impl_ops_sigmoid_for!(f64, Backend<Native>); impl_ops_relu_for!(f64, Backend<Native>); impl_ops_tanh_for!(f64, Backend<Native>); -// impl_ops_convolution_for!(f64, Backend<Native>); impl_ops_softmax_for!(f64, Backend<Native>); impl_ops_log_softmax_for!(f64, Backend<Native>); // impl_ops_lrn_for!(f64, Backend<Native>); -// impl_ops_pooling_for!(f64, Backend<Native>); +impl_ops_pooling_for!(f64, Backend<Native>); From d5ab0d01de20cf8c2aa22dd4106020898cd6f9d1 Mon Sep 17 00:00:00 2001 From: Richard Diamond <wichard@vitalitystudios.com> Date: Mon, 11 Apr 2016 18:41:54 -0500 Subject: [PATCH 3/3] fix/tests: fix the convolution test for both native and Cuda backends FIX #45 --- tests/convolution_specs.rs | 598 ++++++++++++++++--------------------- 1 file changed, 260 insertions(+), 338 deletions(-) diff --git a/tests/convolution_specs.rs b/tests/convolution_specs.rs index 434b207..5839029 100644 --- a/tests/convolution_specs.rs +++ b/tests/convolution_specs.rs @@ -1,200 +1,241 @@ extern crate collenchyma_nn as co_nn; extern crate collenchyma as co; -#[cfg(test)] -#[cfg(feature = "cuda")] -mod 
convolution_spec_cuda { +use std::iter::repeat; - use co::prelude::*; - use co_nn::*; - use co::plugin::numeric_helpers::{cast, Float}; +use co::prelude::*; +use co_nn::*; +use co::plugin::numeric_helpers::{cast, Float}; + +pub fn get_native_backend() -> Backend<Native> { + Backend::<Native>::default().unwrap() +} - fn get_native_backend() -> Backend<Native> { - Backend::<Native>::default().unwrap() +pub fn write_to_memory<T: Copy>(mem: &mut MemoryType, data: &[T]) { + match mem { + &mut MemoryType::Native(ref mut mem) => { + let mut mem_buffer = mem.as_mut_slice::<T>(); + for (index, datum) in data.iter().enumerate() { + mem_buffer[index] = *datum; + } + }, + #[cfg(any(feature = "opencl", feature = "cuda"))] + _ => {} } +} - fn get_cuda_backend() -> Backend<Cuda> { - Backend::<Cuda>::default().unwrap() +pub fn write_to<T, B, C>(tensor: &mut SharedTensor<T>, + payload: &[T], + backend: Option<&Backend<B>>, + native: &Backend<C>) + where T: Copy, + B: IFramework + Clone, + C: IFramework + Clone, +{ + + let _ = tensor.add_device(native.device()); + tensor.sync(native.device()).unwrap(); + write_to_memory(tensor.get_mut(native.device()).unwrap(), payload); + if let Some(backend) = backend { + let _ = tensor.add_device(backend.device()); + tensor.sync(backend.device()).unwrap(); + } else { + tensor.sync(native.device()).unwrap(); } +} - fn write_to_memory<T: Copy>(mem: &mut MemoryType, data: &[T]) { - match mem { - &mut MemoryType::Native(ref mut mem) => { - let mut mem_buffer = mem.as_mut_slice::<T>(); - for (index, datum) in data.iter().enumerate() { - mem_buffer[index] = *datum; - } - }, - #[cfg(any(feature = "opencl", feature = "cuda"))] - _ => {} - } +pub fn get_memory<T, B, C>(backend: Option<&Backend<B>>, + native: &Backend<C>) -> (SharedTensor<T>, SharedTensor<T>, + SharedTensor<T>, SharedTensor<u8>) + where T: Float, + B: IFramework + Clone, + C: IFramework + Clone, +{ + let val = cast::<f64, T>(1.0).unwrap(); + let val2 = cast::<f64, T>(2.0).unwrap(); + let batch = 4; + let w1 = 9; + let h1 = 9; + let d1 = 3; + let k = 6; + let f = 3; + let w2 = (w1 - f + 0) / 1 + 1; + let h2 = (h1 - f + 0) / 1 + 1; + let mut x = SharedTensor::<T>::new(native.device(), &(batch, d1, h1, w1)).unwrap(); + let mut payload: &mut [T] = &mut repeat(val).take(x.capacity()).collect::<Vec<T>>(); + payload[0] = val2; + + write_to(&mut x, payload, backend, native); + + let mut filter = SharedTensor::<T>::new(native.device(), &(k, d1, f, f)).unwrap(); + let payload: &[T] = &repeat(val).take(filter.capacity()).collect::<Vec<T>>(); + + write_to(&mut filter, payload, backend, native); + + let mut result = SharedTensor::<T>::new(native.device(), &(batch, k, h2, w2)).unwrap(); + let payload: &[T] = &repeat(val2).take(result.capacity()).collect::<Vec<T>>(); + + write_to(&mut result, payload, backend, native); + + let workspace = if let Some(cuda) = backend { + SharedTensor::<u8>::new(cuda.device(), &(4)).unwrap() + } else { + SharedTensor::<u8>::new(native.device(), &(4)).unwrap() + }; + + (x, result, filter, workspace) +} + +#[allow(dead_code)] +pub fn get_grad_memory<T, B, C>(backend: Option<&Backend<B>>, + native: &Backend<C>) -> (SharedTensor<T>, SharedTensor<T>, + SharedTensor<T>, SharedTensor<T>, + SharedTensor<T>) + where T: Float, + B: IFramework + Clone, + C: IFramework + Clone, +{ + let val = cast::<f64, T>(1f64).unwrap(); + let val2 = cast::<f64, T>(2f64).unwrap(); + let batch = 4; + let w1 = 9; + let h1 = 9; + let d1 = 3; + let k = 6; + let f = 3; + let w2 = (w1 - f + 0) / 1 + 1; + let h2 = (h1 - f + 0) / 1 
+ 1; + + let mut x = SharedTensor::<T>::new(native.device(), &(batch, d1, h1, w1)).unwrap(); + let mut payload: &mut [T] = &mut repeat(val).take(x.capacity()).collect::<Vec<T>>(); + payload[0] = val2; + + write_to(&mut x, payload, backend, native); + + let mut x_diff = SharedTensor::<T>::new(native.device(), &(batch, k, h2, w2)).unwrap(); + let mut payload: &mut [T] = &mut repeat(val).take(x_diff.capacity()).collect::<Vec<T>>(); + payload[0] = val2; + + write_to(&mut x_diff, payload, backend, native); + + let mut filter = SharedTensor::<T>::new(native.device(), &(k, d1, f, f)).unwrap(); + let payload: &[T] = &repeat(val).take(filter.capacity()).collect::<Vec<T>>(); + + write_to(&mut filter, payload, backend, native); + + let mut result = SharedTensor::<T>::new(native.device(), &(batch, k, h2, w2)).unwrap(); + let payload: &[T] = &repeat(val).take(result.capacity()).collect::<Vec<T>>(); + + write_to(&mut result, payload, backend, native); + + let mut result_diff = SharedTensor::<T>::new(native.device(), &(batch, k, h2, w2)).unwrap(); + if let Some(cuda) = backend { + result_diff.add_device(cuda.device()).unwrap(); } - fn get_memory<T: Float, B: IFramework + Clone, C: IFramework + Clone>(backend: &Backend<B>, native: &Backend<C>) -> (SharedTensor<T>, SharedTensor<T>, SharedTensor<T>, SharedTensor<u8>){ - let val = cast::<f64, T>(1f64).unwrap(); - let val2 = cast::<f64, T>(2f64).unwrap(); - let batch = 4; - let w1 = 9; - let h1 = 9; - let d1 = 3; - let k = 6; - let f = 3; - let w2 = (w1 - f + 0) / 1; - let h2 = (h1 - f + 0) / 1; - let mut x = SharedTensor::<T>::new(backend.device(), &(batch, d1, h1, w1)).unwrap(); - let mut payload: &mut [T] = &mut ::std::iter::repeat(val).take(x.capacity()).collect::<Vec<T>>(); - payload[0] = val2; - x.add_device(native.device()).unwrap(); - x.sync(native.device()).unwrap(); - write_to_memory(x.get_mut(native.device()).unwrap(), payload); - x.sync(backend.device()).unwrap(); - - let mut filter = SharedTensor::<T>::new(backend.device(), &(k, d1, f, f)).unwrap(); - let payload: &[T] = &::std::iter::repeat(val).take(filter.capacity()).collect::<Vec<T>>(); - filter.add_device(native.device()).unwrap(); - filter.sync(native.device()).unwrap(); - write_to_memory(filter.get_mut(native.device()).unwrap(), payload); - filter.sync(backend.device()).unwrap(); - - let mut result = SharedTensor::<T>::new(backend.device(), &(batch, k, h2, w2)).unwrap(); - let payload: &[T] = &::std::iter::repeat(val2).take(result.capacity()).collect::<Vec<T>>(); - result.add_device(native.device()).unwrap(); - result.sync(native.device()).unwrap(); - write_to_memory(result.get_mut(native.device()).unwrap(), payload); - result.sync(backend.device()).unwrap(); - - let workspace = SharedTensor::<u8>::new(backend.device(), &(4)).unwrap(); - - (x, result, filter, workspace) + (x, x_diff, result, result_diff, filter) +} + +pub fn create_conv_config<T, B>(backend: &B, x: &SharedTensor<T>, result: &SharedTensor<T>, + filter: &mut SharedTensor<T>) -> Result<<B as co_nn::NN<T>>::CC, + co::error::Error> + where B: co_nn::Convolution<T>, +{ + backend.new_convolution_config(x, result, filter, + ConvForwardAlgo::ImplicitGEMM, + ConvBackwardFilterAlgo::ImplicitGEMM, + ConvBackwardDataAlgo::ImplicitGEMM, + &vec!(1,1), &vec!(0,0)) +} + +pub fn check_conv<T>(device: &DeviceType, mut result: SharedTensor<T>) + where T: Float + ::std::fmt::Debug, +{ + use std::iter::repeat; + + result.sync(device).unwrap(); + + let mem = result + .get(device) + .unwrap() + .as_native() + .unwrap(); + let mut payload: &mut 
[T] = &mut repeat(cast::<f64, T>(27.0f64).unwrap()) + .take(result.capacity()) + .collect::<Vec<T>>(); + + let desc = result.desc(); + for i in 0..desc[desc.len() - 2] - 1 { + let idx = i * desc[desc.len() - 1] * desc[desc.len() - 2]; + println!("payload offset @ i = {:?}: {:?}", + i, idx); + payload[idx] = + cast::<f64, T>(28.0).unwrap(); } - #[allow(dead_code)] - fn get_grad_memory<T: Float, B: IFramework + Clone, C: IFramework + Clone>(backend: &Backend<B>, native: &Backend<C>) -> (SharedTensor<T>, SharedTensor<T>, SharedTensor<T>, SharedTensor<T>, SharedTensor<T>){ - let val = cast::<f64, T>(1f64).unwrap(); - let val2 = cast::<f64, T>(2f64).unwrap(); - let batch = 4; - let w1 = 9; - let h1 = 9; - let d1 = 3; - let k = 6; - let f = 3; - let w2 = (w1 - f + 0) / 1; - let h2 = (h1 - f + 0) / 1; - - let mut x = SharedTensor::<T>::new(backend.device(), &(batch, d1, h1, w1)).unwrap(); - let mut payload: &mut [T] = &mut ::std::iter::repeat(val).take(x.capacity()).collect::<Vec<T>>(); - payload[0] = val2; - x.add_device(native.device()).unwrap(); - x.sync(native.device()).unwrap(); - write_to_memory(x.get_mut(native.device()).unwrap(), payload); - x.sync(backend.device()).unwrap(); - - let mut x_diff = SharedTensor::<T>::new(backend.device(), &(batch, k, h2, w2)).unwrap(); - let mut payload: &mut [T] = &mut ::std::iter::repeat(val).take(x_diff.capacity()).collect::<Vec<T>>(); - payload[0] = val2; - x_diff.add_device(native.device()).unwrap(); - x_diff.sync(native.device()).unwrap(); - write_to_memory(x_diff.get_mut(native.device()).unwrap(), payload); - x_diff.sync(backend.device()).unwrap(); - - let mut filter = SharedTensor::<T>::new(backend.device(), &(k, d1, f, f)).unwrap(); - let payload: &[T] = &::std::iter::repeat(val).take(filter.capacity()).collect::<Vec<T>>(); - filter.add_device(native.device()).unwrap(); - filter.sync(native.device()).unwrap(); - write_to_memory(filter.get_mut(native.device()).unwrap(), payload); - filter.sync(backend.device()).unwrap(); - - let mut result = SharedTensor::<T>::new(backend.device(), &(batch, k, h2, w2)).unwrap(); - let payload: &[T] = &::std::iter::repeat(val).take(result.capacity()).collect::<Vec<T>>(); - result.add_device(native.device()).unwrap(); - result.sync(native.device()).unwrap(); - write_to_memory(result.get_mut(native.device()).unwrap(), payload); - result.sync(backend.device()).unwrap(); - - let mut result_diff = SharedTensor::<T>::new(backend.device(), &(batch, k, h2, w2)).unwrap(); - result_diff.add_device(native.device()).unwrap(); - - (x, x_diff, result, result_diff, filter) + /*for (i, (v1, v2)) in mem.as_slice::<f32>().iter().zip(payload.iter()).enumerate() { + assert!(*v1 == *v2, "i = {:?}", i); + }*/ + + assert_eq!(payload, mem.as_slice::<T>()); +} + +#[cfg(test)] +#[cfg(feature = "cuda")] +mod convolution_spec_cuda { + + use super::*; + + use co::prelude::*; + use co_nn::*; + use co::plugin::numeric_helpers::{Float}; + + pub fn get_cuda_backend() -> Backend<Cuda> { + Backend::<Cuda>::default().unwrap() } - #[test] - fn it_computes_correct_convolution_on_cuda_for_f32() { + + fn convolution<T>(plain: bool) + where T: Float + ::std::fmt::Debug + frameworks::cuda::DataTypeInfo, + { let backend = get_cuda_backend(); let native = get_native_backend(); - let (mut x, mut result, mut filter, mut workspace) = get_memory::<f32, Cuda, Native>(&backend, &native); - - let conf = backend.new_convolution_config(&x, &result, &mut filter, ConvForwardAlgo::ImplicitGEMM, ConvBackwardFilterAlgo::ImplicitGEMM, ConvBackwardDataAlgo::ImplicitGEMM, 
&vec!(1,1), &vec!(0,0)).unwrap(); - match backend.convolution(&mut filter, &mut x, &mut result, &mut workspace, &conf) { - Ok(_) => { - result.sync(native.device()).unwrap(); - if let Some(mem) = result.get(native.device()).unwrap().as_native() { - let mut payload: &mut [f32] = &mut ::std::iter::repeat(27f32).take(result.capacity()).collect::<Vec<f32>>(); - payload[0] = 28f32; - assert_eq!(payload, mem.as_slice::<f32>()); - } - }, - Err(err) => { println!("{:?}", err); assert!(false) } + let (mut x, mut result, mut filter, mut workspace) = + get_memory::<T, Cuda, Native>(Some(&backend), &native); + + let conf = create_conv_config(&backend, &x, &result, &mut filter) + .unwrap(); + if !plain { + backend.convolution(&mut filter, &mut x, &mut result, + &mut workspace, &conf) + .unwrap(); + } else { + backend.convolution_plain(&mut filter, &mut x, &mut result, + &mut workspace, &conf) + .unwrap(); } + + check_conv(native.device(), result); } #[test] - fn it_computes_correct_convolution_on_cuda_for_f64() { - let backend = get_cuda_backend(); - let native = get_native_backend(); - let (mut x, mut result, mut filter, mut workspace) = get_memory::<f64, Cuda, Native>(&backend, &native); - - let conf = backend.new_convolution_config(&x, &result, &mut filter, ConvForwardAlgo::ImplicitGEMM, ConvBackwardFilterAlgo::ImplicitGEMM, ConvBackwardDataAlgo::ImplicitGEMM, &vec!(1,1), &vec!(0,0)).unwrap(); - match backend.convolution(&mut filter, &mut x, &mut result, &mut workspace, &conf) { - Ok(_) => { - result.sync(native.device()).unwrap(); - if let Some(mem) = result.get(native.device()).unwrap().as_native() { - let mut payload: &mut [f64] = &mut ::std::iter::repeat(27f64).take(result.capacity()).collect::<Vec<f64>>(); - payload[0] = 28f64; - assert_eq!(payload, mem.as_slice::<f64>()); - } - }, - Err(err) => { println!("{:?}", err); assert!(false) } - } + fn convolution_f32() { + convolution::<f32>(false); } #[test] - fn it_computes_correct_convolution_on_cuda_for_f32_plain() { - let backend = get_cuda_backend(); - let native = get_native_backend(); - let (mut x, mut result, mut filter, mut workspace) = get_memory::<f32, Cuda, Native>(&backend, &native); - - let conf = backend.new_convolution_config(&x, &result, &mut filter, ConvForwardAlgo::ImplicitGEMM, ConvBackwardFilterAlgo::ImplicitGEMM, ConvBackwardDataAlgo::ImplicitGEMM, &vec!(1,1), &vec!(0,0)).unwrap(); - match backend.convolution_plain(&mut filter, &mut x, &mut result, &mut workspace, &conf) { - Ok(_) => { - result.sync(native.device()).unwrap(); - if let Some(mem) = result.get(native.device()).unwrap().as_native() { - let mut payload: &mut [f32] = &mut ::std::iter::repeat(27f32).take(result.capacity()).collect::<Vec<f32>>(); - payload[0] = 28f32; - assert_eq!(payload, mem.as_slice::<f32>()); - } - }, - Err(err) => { println!("{:?}", err); assert!(false) } - } + fn convolution_f64() { + convolution::<f64>(false); } #[test] - fn it_computes_correct_convolution_on_cuda_for_f64_plain() { - let backend = get_cuda_backend(); - let native = get_native_backend(); - let (mut x, mut result, mut filter, mut workspace) = get_memory::<f64, Cuda, Native>(&backend, &native); - - let conf = backend.new_convolution_config(&x, &result, &mut filter, ConvForwardAlgo::ImplicitGEMM, ConvBackwardFilterAlgo::ImplicitGEMM, ConvBackwardDataAlgo::ImplicitGEMM, &vec!(1,1), &vec!(0,0)).unwrap(); - match backend.convolution_plain(&mut filter, &mut x, &mut result, &mut workspace, &conf) { - Ok(_) => { - result.sync(native.device()).unwrap(); - if let Some(mem) = 
result.get(native.device()).unwrap().as_native() { - let mut payload: &mut [f64] = &mut ::std::iter::repeat(27f64).take(result.capacity()).collect::<Vec<f64>>(); - payload[0] = 28f64; - assert_eq!(payload, mem.as_slice::<f64>()); - } - }, - Err(err) => { println!("{:?}", err); assert!(false) } - } + fn unsynced_convolution_f32() { + convolution::<f32>(true); + } + + #[test] + fn unsynced_convolution_f64() { + convolution::<f64>(true); } /* @@ -274,172 +315,53 @@ Err(err) => { println!("{:?}", err); assert!(false) } #[cfg(test)] #[cfg(feature = "native")] -mod convolution_spec_native{ - - // use co::backend::{Backend, BackendConfig}; - // use co::framework::IFramework; - // use co::frameworks::Native; - // use co_nn::*; - // use co::memory::MemoryType; - // use co::tensor::SharedTensor; - // use co::plugin::numeric_helpers::{cast, Float}; - // - // fn get_native_backend() -> Backend<Native> { - // let framework = Native::new(); - // let hardwares = framework.hardwares(); - // let backend_config = BackendConfig::new(framework, hardwares); - // Backend::new(backend_config).unwrap() - // } - // - // fn write_to_memory<T: Copy>(mem: &mut MemoryType, data: &[T]) { - // match mem { - // &mut MemoryType::Native(ref mut mem) => { - // let mut mem_buffer = mem.as_mut_slice::<T>(); - // for (index, datum) in data.iter().enumerate() { - // mem_buffer[index] = *datum; - // } - // }, - // #[cfg(any(feature = "opencl", feature = "cuda"))] - // _ => {} - // } - // } - // - // fn get_memory<T: Float, B: IFramework + Clone>(backend: &Backend<B>) -> (SharedTensor<T>, SharedTensor<T>, SharedTensor<T>){ - // let val = cast::<f64, T>(1f64).unwrap(); - // let val2 = cast::<f64, T>(2f64).unwrap(); - // let batch = 4; - // let w1 = 9; - // let h1 = 9; - // let d1 = 3; - // let k = 6; - // let f = 3; - // let w2 = (w1 - f + 0) / 1; - // let h2 = (h1 - f + 0) / 1; - // let mut x = SharedTensor::<T>::new(backend.device(), &(batch, d1, h1, w1)).unwrap(); - // let mut payload: &mut [T] = &mut ::std::iter::repeat(val).take(x.capacity()).collect::<Vec<T>>(); - // payload[0] = val2; - // write_to_memory(x.get_mut(backend.device()).unwrap(), payload); - // - // let mut filter = SharedTensor::<T>::new(backend.device(), &(k, d1, f, f)).unwrap(); - // let payload: &[T] = &::std::iter::repeat(val).take(filter.capacity()).collect::<Vec<T>>(); - // write_to_memory(filter.get_mut(backend.device()).unwrap(), payload); - // - // let mut result = SharedTensor::<T>::new(backend.device(), &(batch, k, h2, w2)).unwrap(); - // let payload: &[T] = &::std::iter::repeat(val2).take(result.capacity()).collect::<Vec<T>>(); - // write_to_memory(result.get_mut(backend.device()).unwrap(), payload); - // - // (x, result, filter) - // } - // - // #[allow(dead_code)] - // fn get_grad_memory<T: Float, B: IFramework + Clone>(backend: &Backend<B>) -> (SharedTensor<T>, SharedTensor<T>, SharedTensor<T>, SharedTensor<T>, SharedTensor<T>){ - // let val = cast::<f64, T>(1f64).unwrap(); - // let val2 = cast::<f64, T>(2f64).unwrap(); - // let batch = 4; - // let w1 = 9; - // let h1 = 9; - // let d1 = 3; - // let k = 6; - // let f = 3; - // let w2 = (w1 - f + 0) / 1; - // let h2 = (h1 - f + 0) / 1; - // - // let mut x = SharedTensor::<T>::new(backend.device(), &(batch, d1, h1, w1)).unwrap(); - // let mut payload: &mut [T] = &mut ::std::iter::repeat(val).take(x.capacity()).collect::<Vec<T>>(); - // payload[0] = val2; - // write_to_memory(x.get_mut(backend.device()).unwrap(), payload); - // - // let mut x_diff = SharedTensor::<T>::new(backend.device(), &(batch, 
k, h2, w2)).unwrap(); - // let mut payload: &mut [T] = &mut ::std::iter::repeat(val).take(x_diff.capacity()).collect::<Vec<T>>(); - // payload[0] = val2; - // write_to_memory(x_diff.get_mut(backend.device()).unwrap(), payload); - // - // let mut filter = SharedTensor::<T>::new(backend.device(), &(k, d1, f, f)).unwrap(); - // let payload: &[T] = &::std::iter::repeat(val).take(filter.capacity()).collect::<Vec<T>>(); - // write_to_memory(filter.get_mut(backend.device()).unwrap(), payload); - // - // let mut result = SharedTensor::<T>::new(backend.device(), &(batch, k, h2, w2)).unwrap(); - // let payload: &[T] = &::std::iter::repeat(val).take(result.capacity()).collect::<Vec<T>>(); - // write_to_memory(result.get_mut(backend.device()).unwrap(), payload); - // - // let result_diff = SharedTensor::<T>::new(backend.device(), &(batch, k, h2, w2)).unwrap(); - // - // (x, x_diff, result, result_diff, filter) - // } - - // #[test] - // #[ignore] - // fn it_computes_correct_convolution_on_native_for_f32() { - // let backend = get_native_backend(); - // let (mut x, mut result, mut filter) = get_memory::<f32, Native>(&backend); - // - // let conf = backend.new_convolution_config(&x, &result, &mut filter, &vec!(1,1), &vec!(0,0)).unwrap(); - // match backend.convolution(&mut x, &mut result, &conf) { - // Ok(_) => { - // if let Some(mem) = result.get(backend.device()).unwrap().as_native() { - // let mut payload: &mut [f32] = &mut ::std::iter::repeat(27f32).take(result.capacity()).collect::<Vec<f32>>(); - // payload[0] = 28f32; - // assert_eq!(payload, mem.as_slice::<f32>()); - // } - // }, - // Err(err) => { println!("{:?}", err); assert!(false) } - // } - // } - // - // #[test] - // #[ignore] - // fn it_computes_correct_convolution_on_native_for_f64() { - // let backend = get_native_backend(); - // let (mut x, mut result, mut filter) = get_memory::<f64, Native>(&backend); - // - // let conf = backend.new_convolution_config(&x, &result, &mut filter, &vec!(1,1), &vec!(0,0)).unwrap(); - // match backend.convolution(&mut x, &mut result, &conf) { - // Ok(_) => { - // if let Some(mem) = result.get(backend.device()).unwrap().as_native() { - // let mut payload: &mut [f64] = &mut ::std::iter::repeat(27f64).take(result.capacity()).collect::<Vec<f64>>(); - // payload[0] = 28f64; - // assert_eq!(payload, mem.as_slice::<f64>()); - // } - // }, - // Err(err) => { println!("{:?}", err); assert!(false) } - // } - // } - // - // #[test] - // #[ignore] - // fn it_computes_correct_convolution_on_native_for_f32_plain() { - // let backend = get_native_backend(); - // let (mut x, mut result, mut filter) = get_memory::<f32, Native>(&backend); - // - // let conf = backend.new_convolution_config(&x, &result, &mut filter, &vec!(1,1), &vec!(0,0)).unwrap(); - // match backend.convolution_plain(&mut x, &mut result, &conf) { - // Ok(_) => { - // if let Some(mem) = result.get(backend.device()).unwrap().as_native() { - // let mut payload: &mut [f32] = &mut ::std::iter::repeat(27f32).take(result.capacity()).collect::<Vec<f32>>(); - // payload[0] = 28f32; - // assert_eq!(payload, mem.as_slice::<f32>()); - // } - // }, - // Err(err) => { println!("{:?}", err); assert!(false) } - // } - // } - // - // #[test] - // #[ignore] - // fn it_computes_correct_convolution_on_native_for_f64_plain() { - // let backend = get_native_backend(); - // let (mut x, mut result, mut filter) = get_memory::<f64, Native>(&backend); - // - // let conf = backend.new_convolution_config(&x, &result, &mut filter, &vec!(1,1), &vec!(0,0)).unwrap(); - // match 
backend.convolution_plain(&mut x, &mut result, &conf) { - // Ok(_) => { - // if let Some(mem) = result.get(backend.device()).unwrap().as_native() { - // let mut payload: &mut [f64] = &mut ::std::iter::repeat(27f64).take(result.capacity()).collect::<Vec<f64>>(); - // payload[0] = 28f64; - // assert_eq!(payload, mem.as_slice::<f64>()); - // } - // }, - // Err(err) => { println!("{:?}", err); assert!(false) } - // } - // } +mod convolution_spec_native { + use super::*; + + use co::backend::{IBackend}; + use co::frameworks::Native; + use co_nn::*; + use co::plugin::numeric_helpers::{Float}; + + fn convolution<T>(plain: bool) + where T: Float + ::std::fmt::Debug + frameworks::cuda::DataTypeInfo + Default, + { + let native = super::get_native_backend(); + let (mut x, mut result, mut filter, mut workspace) = + get_memory::<T, Native, Native>(None, &native); + + let conf = create_conv_config(&native, &x, &result, &mut filter) + .unwrap(); + if !plain { + native.convolution(&mut filter, &mut x, &mut result, + &mut workspace, &conf) + .unwrap(); + } else { + native.convolution_plain(&mut filter, &mut x, &mut result, + &mut workspace, &conf) + .unwrap(); + } + + check_conv(native.device(), result); + } + + #[test] + fn convolution_f32() { + convolution::<f32>(false); + } + + #[test] + fn convolution_f64() { + convolution::<f64>(false); + } + + #[test] + fn unsynced_convolution_f32() { + convolution::<f32>(true); + } + + #[test] + fn unsynced_convolution_f64() { + convolution::<f64>(true); + } }
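+ +// A minimal end-to-end sketch of the native convolution path exercised above +// (hypothetical driver code; shapes follow get_memory(): x (4, 3, 9, 9), +// filter (6, 3, 3, 3), result (4, 6, 7, 7)): +// +//     let native = get_native_backend(); +//     let (mut x, mut result, mut filter, mut workspace) = +//         get_memory::<f32, Native, Native>(None, &native); +//     let conf = create_conv_config(&native, &x, &result, &mut filter).unwrap(); +//     native.convolution_plain(&mut filter, &mut x, &mut result, +//                              &mut workspace, &conf).unwrap(); +//     check_conv(native.device(), result);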