diff --git a/benches/network_benches.rs b/benches/network_benches.rs
index e27253d7..fe6909a0 100644
--- a/benches/network_benches.rs
+++ b/benches/network_benches.rs
@@ -69,7 +69,7 @@ mod cuda {
    #[bench]
    #[ignore]
    #[cfg(feature = "cuda")]
-    fn bench_mnsit_forward_1(b: &mut Bencher) {
+    fn bench_mnsit_forward_1(_b: &mut Bencher) {
        let mut cfg = SequentialConfig::default();
        // set up input
        cfg.add_input("in", &vec![1, 30, 30]);
@@ -96,7 +96,7 @@ mod cuda {
            backend.clone(),
            &LayerConfig::new("network", LayerType::Sequential(cfg)));
        let _ = timeit_loops!(10, {
-            let inp = SharedTensor::<f32>::new(backend.device(), &vec![1, 30, 30]).unwrap();
+            let inp = SharedTensor::<f32>::new(&[1, 30, 30]);
            let inp_lock = Arc::new(RwLock::new(inp));

            network.forward(&[inp_lock]);
@@ -260,7 +260,7 @@ mod cuda {

        let func = || {
            let forward_time = timeit_loops!(1, {
-                let inp = SharedTensor::<f32>::new(backend.device(), &vec![128, 3, 112, 112]).unwrap();
+                let inp = SharedTensor::new(&[128, 3, 112, 112]);
                let inp_lock = Arc::new(RwLock::new(inp));

                network.forward(&[inp_lock]);
@@ -416,7 +416,7 @@ mod cuda {
            backend.clone(),
            &LayerConfig::new("network", LayerType::Sequential(cfg)));
        let mut func = || {
-            let inp = SharedTensor::<f32>::new(backend.device(), &vec![128, 3, 112, 112]).unwrap();
+            let inp = SharedTensor::<f32>::new(&[128, 3, 112, 112]);
            let inp_lock = Arc::new(RwLock::new(inp));

            network.forward(&[inp_lock]);
diff --git a/examples/benchmarks.rs b/examples/benchmarks.rs
index 5075a63f..41e83b62 100644
--- a/examples/benchmarks.rs
+++ b/examples/benchmarks.rs
@@ -160,8 +160,7 @@ fn bench_alexnet() {
    let func = || {
        let forward_time = timeit_loops!(1, {
            {
-                let inp = SharedTensor::<f32>::new(backend.device(), &vec![128, 3, 224, 224]).unwrap();
-
+                let inp = SharedTensor::<f32>::new(&[128, 3, 224, 224]);
                let inp_lock = Arc::new(RwLock::new(inp));
                network.forward(&[inp_lock.clone()]);
            }
@@ -242,8 +241,7 @@ fn bench_overfeat() {
    let func = || {
        let forward_time = timeit_loops!(1, {
            {
-                let inp = SharedTensor::<f32>::new(backend.device(), &vec![128, 3, 231, 231]).unwrap();
-
+                let inp = SharedTensor::new(&[128, 3, 231, 231]);
                let inp_lock = Arc::new(RwLock::new(inp));
                network.forward(&[inp_lock.clone()]);
            }
@@ -339,7 +337,7 @@ fn bench_vgg_a() {
    let func = || {
        let forward_time = timeit_loops!(1, {
            {
-                let inp = SharedTensor::<f32>::new(backend.device(), &vec![64, 3, 224, 224]).unwrap();
+                let inp = SharedTensor::new(&[64, 3, 224, 224]);
                let inp_lock = Arc::new(RwLock::new(inp));

                network.forward(&[inp_lock.clone()]);
diff --git a/src/layer.rs b/src/layer.rs
index 8cf8517b..afb5687a 100644
--- a/src/layer.rs
+++ b/src/layer.rs
@@ -210,8 +210,8 @@ impl<B: IBackend> Layer<B> {
            }

            let backend: Rc<IBackend<F=B::F>> = self.backend.clone();
-            blob_data = Arc::new(RwLock::new(SharedTensor::new(backend.device(), &vec![1,1,1]).unwrap())); // [1,1,1] for CUDA
-            blob_gradient = Arc::new(RwLock::new(SharedTensor::new(backend.device(), &vec![1,1,1]).unwrap())); // [1,1,1] for CUDA
+            blob_data = Arc::new(RwLock::new(SharedTensor::new(&[1,1,1]))); // [1,1,1] for CUDA
+            blob_gradient = Arc::new(RwLock::new(SharedTensor::new(&[1,1,1]))); // [1,1,1] for CUDA
        }
        self.output_blob_names.push(blob_name.clone());
        self.output_blobs_data.push(blob_data.clone());
@@ -234,8 +234,8 @@ impl<B: IBackend> Layer<B> {
            info!("{} -> {}", self.name, blob_name);

            let backend: Rc<IBackend<F=B::F>> = self.backend.clone();
-            let output_data = Arc::new(RwLock::new(SharedTensor::new(backend.device(), &vec![1,1,1]).unwrap())); // [1,1,1] for CUDA
-            let output_gradient = Arc::new(RwLock::new(SharedTensor::new(backend.device(), &vec![1,1,1]).unwrap())); // [1,1,1] for CUDA
+            let output_data = Arc::new(RwLock::new(SharedTensor::new(&[1,1,1]))); // [1,1,1] for CUDA
+            let output_gradient = Arc::new(RwLock::new(SharedTensor::new(&[1,1,1]))); // [1,1,1] for CUDA
            self.output_blobs_data.push(output_data);
            self.output_blobs_gradient.push(output_gradient);
        }
@@ -264,8 +264,8 @@ impl<B: IBackend> Layer<B> {
            let net_weight_id = weights_len;
            let output_data = self.output_blobs_data[weight_id].read().unwrap();
            debug!("Layer {} - creating weight and gradient of size {:?}", &layer_config.name, output_data.desc());
-            let weight_data = Arc::new(RwLock::new(SharedTensor::<f32>::new(output_data.latest_device(), output_data.desc()).unwrap()));
-            let weight_gradient = Arc::new(RwLock::new(SharedTensor::<f32>::new(output_data.latest_device(), output_data.desc()).unwrap()));
+            let weight_data = Arc::new(RwLock::new(SharedTensor::new(output_data.desc())));
+            let weight_gradient = Arc::new(RwLock::new(SharedTensor::new(output_data.desc())));
            self.weights_data.push(weight_data.clone());
            self.weights_gradient.push(weight_gradient.clone());
@@ -460,11 +460,6 @@ impl<B: IBackend> Layer<B> {
            self.input_blobs_data[input_i].write().unwrap().reshape(&reshaped_shape).unwrap();
        }

-        self.worker.sync(&self.backend,
-                         &mut self.input_blobs_data, &mut self.input_blobs_gradient,
-                         &mut self.weights_data, &mut self.weights_gradient,
-                         &mut self.output_blobs_data, &mut self.output_blobs_gradient);
-
        let forward_time = timeit_loops!(1, {
            if self.is_using_in_place() {
                self.worker.forward(&self.backend, &vec![], &self.weights_data, &mut self.output_blobs_data);
@@ -497,11 +492,6 @@ impl<B: IBackend> Layer<B> {
            self.output_blobs_gradient[output_i] = output.clone();
        }

-        self.worker.sync(&self.backend,
-                         &mut self.input_blobs_data, &mut self.input_blobs_gradient,
-                         &mut self.weights_data, &mut self.weights_gradient,
-                         &mut self.output_blobs_data, &mut self.output_blobs_gradient);
-
        if self.is_using_in_place() {
            self.worker.backward_input(&self.backend,
                                       &self.weights_data,
@@ -527,11 +517,6 @@ impl<B: IBackend> Layer<B> {
    ///
    /// This method is mostly used when doing backpropagation.
    pub fn backward_parameters(&mut self) {
-        self.worker.sync(&self.backend,
-                         &mut self.input_blobs_data, &mut self.input_blobs_gradient,
-                         &mut self.weights_data, &mut self.weights_gradient,
-                         &mut self.output_blobs_data, &mut self.output_blobs_gradient);
-
        self.worker.backward_parameters(&self.backend,
                                        &self.output_blobs_data,
                                        &self.output_blobs_gradient,
@@ -553,13 +538,11 @@ impl<B: IBackend> Layer<B> {
    ///
    /// [3]: ../solver/enum.LRPolicy.html
    pub fn update_weights<SolverB: IBackend + ::util::SolverOps<f32>>(&mut self, backend: &SolverB) {
-        let mut shared_a = ::util::native_scalar(-1f32);
-        let _ = shared_a.add_device(IBackend::device(backend));
-        shared_a.sync(IBackend::device(backend)).unwrap();
+        // PERF: allocate this scalar once
+        let shared_a = ::util::native_scalar(-1f32);
        for (weight_gradient, weight_data) in self.learnable_weights_gradients().iter().zip(&mut self.learnable_weights_data()) {
-            weight_gradient.write().unwrap().sync(IBackend::device(backend)).unwrap();
-            weight_data.write().unwrap().sync(IBackend::device(backend)).unwrap();
-            backend.axpy_plain(&shared_a, &weight_gradient.read().unwrap(), &mut weight_data.write().unwrap()).unwrap();
+            backend.axpy(&shared_a, &weight_gradient.read().unwrap(),
+                         &mut weight_data.write().unwrap()).unwrap();
        }
    }
@@ -695,7 +678,6 @@ impl<B: IBackend> Layer<B> {
                    }

                    let mut weight_lock = weight.write().unwrap();
-                    weight_lock.sync(native_backend.device()).unwrap();

                    let capnp_tensor = capnp_weight.get_tensor().unwrap();
                    let mut shape = Vec::new();
@@ -705,7 +687,7 @@ impl<B: IBackend> Layer<B> {
                    }
                    weight_lock.reshape(&shape).unwrap();

-                    let mut native_slice = weight_lock.get_mut(native_backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice::<f32>();
+                    let mut native_slice = weight_lock.write_only(native_backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice::<f32>();
                    let data = capnp_tensor.get_data().unwrap();
                    for k in 0..data.len() {
                        native_slice[k as usize] = data.get(k);
@@ -814,8 +796,7 @@ impl<'a, B: IBackend> CapnpWrite<'a> for Layer<B> {
                let mut capnp_weight = weights.borrow().get(i as u32);
                capnp_weight.set_name(name);

-                let mut weight_lock = weight.write().unwrap();
-                weight_lock.sync(native_backend.device()).unwrap();
+                let weight_lock = weight.write().unwrap();

                let mut tensor = capnp_weight.init_tensor();
                {
@@ -825,7 +806,8 @@ impl<'a, B: IBackend> CapnpWrite<'a> for Layer<B> {
                    }
                }
                {
-                    let native_slice = weight_lock.get(native_backend.device()).unwrap().as_native().unwrap().as_slice::<f32>();
+                    let native_slice = weight_lock.read(native_backend.device())
+                        .unwrap().as_native().unwrap().as_slice::<f32>();
                    let mut tensor_data = tensor.borrow().init_data(native_slice.len() as u32);
                    for (i, datum) in native_slice.iter().enumerate() {
                        tensor_data.set(i as u32, *datum);
@@ -1025,74 +1007,6 @@ pub trait ILayer<B: IBackend> : ComputeOutput<f32, B> + ComputeInputGradient<f32
        self.compute_parameters_gradient(backend, &output_data_, &output_gradients_, &input_data_, &mut weights_gradients_);
    }

-    /// Synchronize the blobs before doing a forward or backward operation.
-    ///
-    /// This is necessary because the forward_layer and backward_layer methods only immutably
-    /// borrow the corresponding input blobs and weights which they are not supposed to change.
-    /// However synchronizing all blobs to the same device may be neccessary for some computations,
-    /// which can only be done with a mutable borrow.
-    fn sync(&self,
-            backend: &B,
-            input_data: &mut [ArcLock<SharedTensor<f32>>],
-            input_gradients: &mut [ArcLock<SharedTensor<f32>>],
-            weights_data: &mut [ArcLock<SharedTensor<f32>>],
-            weights_gradients: &mut [ArcLock<SharedTensor<f32>>],
-            output_data: &mut Vec<ArcLock<SharedTensor<f32>>>,
-            output_gradients: &mut Vec<ArcLock<SharedTensor<f32>>>) {
-        if self.sync_native() {
-            let backend = native_backend();
-            for tensor in input_data {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in input_gradients {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in weights_data {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in weights_gradients {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in output_data {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in output_gradients {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-        } else {
-            for tensor in input_data {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in input_gradients {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in weights_data {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in weights_gradients {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in output_data {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-            for tensor in output_gradients {
-                let mut sync = tensor.write().unwrap();
-                match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() }
-            }
-        }
-    }
-
    /// Return whether "anonymous" output blobs are created automatically for the layer.
    ///
    /// If this method returns true, Network::init will create enough "anonymous" output
diff --git a/src/layers/activation/relu.rs b/src/layers/activation/relu.rs
index ecb266d6..c995eae3 100644
--- a/src/layers/activation/relu.rs
+++ b/src/layers/activation/relu.rs
@@ -56,8 +56,8 @@ impl<B: IBackend + Relu<f32> + ReluPointwise<f32>> ComputeOutput<f32, B> for ReL
                      input_data: &[&SharedTensor<f32>],
                      output_data: &mut [&mut SharedTensor<f32>]) {
        match input_data.get(0) {
-            Some(input) => backend.relu_plain(input, output_data[0]).unwrap(),
-            None => backend.relu_pointwise_plain(output_data[0]).unwrap(),
+            Some(input) => backend.relu(input, output_data[0]).unwrap(),
+            None => backend.relu_pointwise(output_data[0]).unwrap(),
        }
    }
}
@@ -72,8 +72,8 @@ impl<B: IBackend + Relu<f32> + ReluPointwise<f32>> ComputeInputGradient<f32, B>
                              input_data: &[&SharedTensor<f32>],
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
        match output_data.get(0) {
-            Some(_) => backend.relu_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(),
-            None => backend.relu_pointwise_grad_plain(input_data[0], input_gradients[0]).unwrap(),
+            Some(_) => backend.relu_grad(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(),
+            None => backend.relu_pointwise_grad(input_data[0], input_gradients[0]).unwrap(),
        }
    }
}
@@ -115,7 +115,7 @@ impl<B: IBackend + Relu<f32>> ComputeOutput<f32, B> for ReLU {
                      input_data: &[&SharedTensor<f32>],
                      output_data: &mut [&mut SharedTensor<f32>]) {
        match input_data.get(0) {
-            Some(input) => backend.relu_plain(input, output_data[0]).unwrap(),
+            Some(input) => backend.relu(input, output_data[0]).unwrap(),
            None => panic!("No input provided for ReLU layer."),
        }
    }
@@ -131,7 +131,7 @@ impl<B: IBackend + Relu<f32>> ComputeInputGradient<f32, B> for ReLU {
                              input_data: &[&SharedTensor<f32>],
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
        match output_data.get(0) {
-            Some(_) => backend.relu_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(),
+            Some(_) => backend.relu_grad(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(),
            None => panic!("No output_data provided for ReLU layer backward."),
        }
    }
diff --git a/src/layers/activation/sigmoid.rs b/src/layers/activation/sigmoid.rs
index fb5a051c..4d4a6253 100644
--- a/src/layers/activation/sigmoid.rs
+++ b/src/layers/activation/sigmoid.rs
@@ -60,8 +60,8 @@ impl<B: IBackend + conn::Sigmoid<f32> + conn::SigmoidPointwise<f32>> ComputeOutp
                      input_data: &[&SharedTensor<f32>],
                      output_data: &mut [&mut SharedTensor<f32>]) {
        match input_data.get(0) {
-            Some(input) => backend.sigmoid_plain(input, output_data[0]).unwrap(),
-            None => backend.sigmoid_pointwise_plain(output_data[0]).unwrap(),
+            Some(input) => backend.sigmoid(input, output_data[0]).unwrap(),
+            None => backend.sigmoid_pointwise(output_data[0]).unwrap(),
        }
    }
}
@@ -76,8 +76,9 @@ impl<B: IBackend + conn::Sigmoid<f32> + conn::SigmoidPointwise<f32>> ComputeInpu
                              input_data: &[&SharedTensor<f32>],
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
        match output_data.get(0) {
-            Some(_) => backend.sigmoid_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(),
-            None => backend.sigmoid_pointwise_grad_plain(input_data[0], input_gradients[0]).unwrap(),
+            Some(_) => backend.sigmoid_grad(output_data[0], output_gradients[0],
+                                            input_data[0], input_gradients[0]).unwrap(),
+            None => backend.sigmoid_pointwise_grad(input_data[0], input_gradients[0]).unwrap(),
        }
    }
}
@@ -119,7 +120,7 @@ impl<B: IBackend + conn::Sigmoid<f32>> ComputeOutput<f32, B> for Sigmoid {
                      input_data: &[&SharedTensor<f32>],
                      output_data: &mut [&mut SharedTensor<f32>]) {
        match input_data.get(0) {
-            Some(input) => backend.sigmoid_plain(input, output_data[0]).unwrap(),
+            Some(input) => backend.sigmoid(input, output_data[0]).unwrap(),
            None => panic!("No input provided for Sigmoid layer."),
        }
    }
@@ -135,7 +136,8 @@ impl<B: IBackend + conn::Sigmoid<f32>> ComputeInputGradient<f32, B> for Sigmoid
                              input_data: &[&SharedTensor<f32>],
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
        match output_data.get(0) {
-            Some(_) => backend.sigmoid_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(),
+            Some(_) => backend.sigmoid_grad(output_data[0], output_gradients[0],
+                                            input_data[0], input_gradients[0]).unwrap(),
            None => panic!("No output_data provided for Sigmoid layer backward."),
        }
    }
diff --git a/src/layers/activation/tanh.rs b/src/layers/activation/tanh.rs
index 5ec2ad85..78544d94 100644
--- a/src/layers/activation/tanh.rs
+++ b/src/layers/activation/tanh.rs
@@ -57,8 +57,8 @@ impl<B: IBackend + conn::Tanh<f32> + conn::TanhPointwise<f32>> ComputeOutput<f32
                      input_data: &[&SharedTensor<f32>],
                      output_data: &mut [&mut SharedTensor<f32>]) {
        match input_data.get(0) {
-            Some(input) => backend.tanh_plain(input, output_data[0]).unwrap(),
-            None => backend.tanh_pointwise_plain(output_data[0]).unwrap(),
+            Some(input) => backend.tanh(input, output_data[0]).unwrap(),
+            None => backend.tanh_pointwise(output_data[0]).unwrap(),
        }
    }
}
@@ -73,8 +73,9 @@ impl<B: IBackend + conn::Tanh<f32> + conn::TanhPointwise<f32>> ComputeInputGradi
                              input_data: &[&SharedTensor<f32>],
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
        match output_data.get(0) {
-            Some(_) => backend.tanh_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(),
-            None => backend.tanh_pointwise_grad_plain(input_data[0], input_gradients[0]).unwrap(),
+            Some(_) => backend.tanh_grad(output_data[0], output_gradients[0],
+                                         input_data[0], input_gradients[0]).unwrap(),
+            None => backend.tanh_pointwise_grad(input_data[0], input_gradients[0]).unwrap(),
        }
    }
}
@@ -116,7 +117,7 @@ impl<B: IBackend + conn::Tanh<f32>> ComputeOutput<f32, B> for TanH {
                      input_data: &[&SharedTensor<f32>],
                      output_data: &mut [&mut SharedTensor<f32>]) {
        match input_data.get(0) {
-            Some(input) => backend.tanh_plain(input, output_data[0]).unwrap(),
+            Some(input) => backend.tanh(input, output_data[0]).unwrap(),
            None => panic!("No input provided for TanH layer."),
        }
    }
@@ -132,7 +133,8 @@ impl<B: IBackend + conn::Tanh<f32>> ComputeInputGradient<f32, B> for TanH {
                              input_data: &[&SharedTensor<f32>],
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
        match output_data.get(0) {
-            Some(_) => backend.tanh_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(),
+            Some(_) => backend.tanh_grad(output_data[0], output_gradients[0],
+                                         input_data[0], input_gradients[0]).unwrap(),
            None => panic!("No output_data provided for TanH layer backward."),
        }
    }
diff --git a/src/layers/common/convolution.rs b/src/layers/common/convolution.rs
index 4016b861..6f9b9534 100644
--- a/src/layers/common/convolution.rs
+++ b/src/layers/common/convolution.rs
@@ -64,7 +64,7 @@ impl<B: conn::Convolution<f32>> Convolution<B> {
    fn create_filter(&self, device: &DeviceType, input_shape: &[usize]) -> SharedTensor<f32> {
        let filter_shape = self.calculate_filter_shape(input_shape);

-        SharedTensor::<f32>::new(device, &filter_shape).unwrap()
+        SharedTensor::<f32>::new(&filter_shape)
    }
}
@@ -156,12 +156,12 @@ impl<B: IBackend + conn::Convolution<f32>> ILayer<B> for Convolution<B> {
    fn resize_shared_workspace(&mut self, backend: Rc<B>, workspace: Option<ArcLock<SharedTensor<u8>>>) -> Option<ArcLock<SharedTensor<u8>>> {
        let required_size = self.convolution_config.as_ref().unwrap().workspace_size();
        let new_workspace = if workspace.is_none() {
-            Arc::new(RwLock::new(SharedTensor::<u8>::new(IBackend::device(&*backend), &(required_size)).unwrap()))
+            Arc::new(RwLock::new(SharedTensor::<u8>::new(&[required_size])))
        } else {
            let old_workspace = workspace.as_ref().unwrap().clone();
            let old_workspace_size = old_workspace.read().unwrap().capacity();
            if old_workspace_size < required_size {
-                Arc::new(RwLock::new(SharedTensor::<u8>::new(IBackend::device(&*backend), &(required_size)).unwrap()))
+                Arc::new(RwLock::new(SharedTensor::<u8>::new(&[required_size])))
            } else {
                workspace.unwrap()
            }
@@ -181,7 +181,8 @@ impl<B: IBackend + conn::Convolution<f32>> ComputeOutput<f32, B> for Convolution
        let filter_data = weights[0];
        let conv_config = self.convolution_config.as_ref().unwrap();
        let mut workspace = self.workspace.as_ref().unwrap().write().unwrap();
-        backend.convolution_plain(filter_data, input_data[0], output_data[0], &mut workspace, conv_config).unwrap();
+        backend.convolution(filter_data, input_data[0], output_data[0],
+                            &mut workspace, conv_config).unwrap();
    }
}
@@ -197,7 +198,9 @@ impl<B: IBackend + conn::Convolution<f32>> ComputeInputGradient<f32, B> for Conv
        let conv_config = self.convolution_config.as_ref().unwrap();
        let mut workspace = self.workspace.as_ref().unwrap().write().unwrap();
        // compute gradient w.r.t. input
-        backend.convolution_grad_data_plain(filter_data, output_gradients[0], input_gradients[0], &mut workspace, conv_config).unwrap();
+        backend.convolution_grad_data(filter_data,
+                                      output_gradients[0], input_gradients[0],
+                                      &mut workspace, conv_config).unwrap();
    }
}
@@ -213,7+216,9 @@ impl<B: IBackend + conn::Convolution<f32>> ComputeParametersGradient<f32, B> for
        let conv_config = self.convolution_config.as_ref().unwrap();
        let mut workspace = self.workspace.as_ref().unwrap().write().unwrap();
        // compute gradient w.r.t. filter
-        backend.convolution_grad_filter_plain(input_data[0], output_gradients[0], filter_gradient, &mut workspace, conv_config).unwrap();
+        backend.convolution_grad_filter(input_data[0], output_gradients[0],
+                                        filter_gradient, &mut workspace,
+                                        conv_config).unwrap();
    }
}
diff --git a/src/layers/common/linear.rs b/src/layers/common/linear.rs
index c7ec6391..47a71cca 100644
--- a/src/layers/common/linear.rs
+++ b/src/layers/common/linear.rs
@@ -18,7 +18,6 @@
 //!
 //! In the context of convolutional neural networks this layer is also
 //! called a "fully-connected layer" if it is used at the end of the network.
-use std::rc::Rc;
 use co::backend::IBackend;
 use co::tensor::SharedTensor;
 use coblas::transpose::Transpose;
@@ -75,14 +74,6 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
        true
    }

-    fn init(&mut self, backend: Rc<B>) {
-        let device = <B as IBackend>::device(&backend);
-        let _ = self.one.add_device(device);
-        self.one.sync(device).unwrap();
-        let _ = self.zero.add_device(device);
-        self.zero.sync(device).unwrap();
-    }
-
    fn reshape(&mut self,
               backend: ::std::rc::Rc<B>,
               input_data: &mut Vec<ArcLock<SharedTensor<f32>>>,
@@ -106,10 +97,6 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
                output_size: self.output_size,
            };
            filler.fill(&mut weight.write().unwrap());
-
-            let native_backend = ::util::native_backend();
-            let bound_weight = weight.read().unwrap();
-            let native_output = bound_weight.get(native_backend.device()).unwrap().as_native().unwrap();
        }
        if let Some(weight) = weights_gradient.get(0) {
            weight.write().unwrap().resize(&weight_shape).unwrap();
@@ -123,12 +110,20 @@ impl<B: IBackend + LayerOps<f32>> ComputeOutput<f32, B> for Linear {
                      weights: &[&SharedTensor<f32>],
                      input_data: &[&SharedTensor<f32>],
                      output_data: &mut [&mut SharedTensor<f32>]) {
-        backend.gemm_plain(&self.one, Transpose::NoTrans, input_data[0], Transpose::Trans, weights[0], &self.zero, output_data[0]).unwrap();
+        backend.gemm(&self.one,
+                     Transpose::NoTrans, input_data[0],
+                     Transpose::Trans, weights[0],
+                     &self.zero,
+                     output_data[0]).unwrap();
        let has_bias_term = false; // TODO: implement bias term
        if has_bias_term {
            let bias_multiplier = unimplemented!();
            let bias_data = unimplemented!();
-            backend.gemm_plain(&self.one, Transpose::NoTrans, bias_multiplier, Transpose::NoTrans, bias_data, &self.one, output_data[0]).unwrap();
+            backend.gemm(&self.one,
+                         Transpose::NoTrans, bias_multiplier,
+                         Transpose::NoTrans, bias_data,
+                         &self.one,
+                         output_data[0]).unwrap();
        }
    }
}
@@ -142,7 +137,11 @@ impl<B: IBackend + LayerOps<f32>> ComputeInputGradient<f32, B> for Linear {
                              input_data: &[&SharedTensor<f32>],
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
        // Gradient with respect to input data
-        backend.gemm_plain(&self.one, Transpose::NoTrans, output_gradients[0], Transpose::NoTrans, weights_data[0], &self.zero, input_gradients[0]).unwrap();
+        backend.gemm(&self.one,
+                     Transpose::NoTrans, output_gradients[0],
+                     Transpose::NoTrans, weights_data[0],
+                     &self.zero,
+                     input_gradients[0]).unwrap();
    }
}
@@ -154,7 +153,11 @@ impl<B: IBackend + LayerOps<f32>> ComputeParametersGradient<f32, B> for Linear {
                                   input_data: &[&SharedTensor<f32>],
                                   parameters_gradients: &mut [&mut SharedTensor<f32>]) {
        // gradient w.r.t. weights
-        backend.gemm_plain(&self.one, Transpose::Trans, output_gradients[0], Transpose::NoTrans, input_data[0], &self.zero, parameters_gradients[0]).unwrap();
+        backend.gemm(&self.one,
+                     Transpose::Trans, output_gradients[0],
+                     Transpose::NoTrans, input_data[0],
+                     &self.zero,
+                     parameters_gradients[0]).unwrap();

        // TODO: implement gradient w.r.t bias
        // if (bias_term_ && this->param_propagate_down_[1]) {
diff --git a/src/layers/common/log_softmax.rs b/src/layers/common/log_softmax.rs
index 476f2fb5..d7c06363 100644
--- a/src/layers/common/log_softmax.rs
+++ b/src/layers/common/log_softmax.rs
@@ -32,7 +32,7 @@ impl<B: IBackend + conn::LogSoftmax<f32>> ComputeOutput<f32, B> for LogSoftmax {
                      _weights: &[&SharedTensor<f32>],
                      input_data: &[&SharedTensor<f32>],
                      output_data: &mut [&mut SharedTensor<f32>]) {
-        backend.log_softmax_plain(input_data[0], output_data[0]).unwrap();
+        backend.log_softmax(input_data[0], output_data[0]).unwrap();
    }
}
@@ -44,7 +44,8 @@ impl<B: IBackend + conn::LogSoftmax<f32>> ComputeInputGradient<f32, B> for LogSo
                              output_data: &[&SharedTensor<f32>],
                              output_gradients: &[&SharedTensor<f32>],
                              input_data: &[&SharedTensor<f32>],
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
-        backend.log_softmax_grad_plain(output_data[0], output_gradients[0], input_gradients[0]).unwrap();
+        backend.log_softmax_grad(output_data[0], output_gradients[0],
+                                 input_gradients[0]).unwrap();
    }
}
diff --git a/src/layers/common/pooling.rs b/src/layers/common/pooling.rs
index 1c1e91fa..ca033660 100644
--- a/src/layers/common/pooling.rs
+++ b/src/layers/common/pooling.rs
@@ -123,7 +123,8 @@ impl<B: IBackend + conn::Pooling<f32>> ComputeOutput<f32, B> for Pooling<f32, B>
                      output_data: &mut [&mut SharedTensor<f32>]) {
        let config = &self.pooling_configs[0];
        match self.mode {
-            PoolingMode::Max => backend.pooling_max_plain(input_data[0], output_data[0], &*config).unwrap(),
+            PoolingMode::Max => backend.pooling_max(input_data[0], output_data[0],
+                                                    &*config).unwrap(),
            // TODO: implement average pooling
            // PoolingMode::Average => unimplemented!(),
        }
@@ -140,7 +141,9 @@ impl<B: IBackend + conn::Pooling<f32>> ComputeInputGradient<f32, B> for Pooling<
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
        let config = &self.pooling_configs[0];
        match self.mode {
-            PoolingMode::Max => backend.pooling_max_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0], config).unwrap()
+            PoolingMode::Max => backend.pooling_max_grad(
+                output_data[0], output_gradients[0],
+                input_data[0], input_gradients[0], config).unwrap()
        }
    }
}
diff --git a/src/layers/common/softmax.rs b/src/layers/common/softmax.rs
index 0ab38c6e..dfac9cdf 100644
--- a/src/layers/common/softmax.rs
+++ b/src/layers/common/softmax.rs
@@ -33,7 +33,7 @@ impl<B: IBackend + conn::Softmax<f32>> ComputeOutput<f32, B> for Softmax {
                      _weights: &[&SharedTensor<f32>],
                      input_data: &[&SharedTensor<f32>],
                      output_data: &mut [&mut SharedTensor<f32>]) {
-        backend.softmax_plain(input_data[0], output_data[0]).unwrap();
+        backend.softmax(input_data[0], output_data[0]).unwrap();
    }
}
@@ -45,7 +45,8 @@ impl<B: IBackend + conn::Softmax<f32>> ComputeInputGradient<f32, B> for Softmax
                              output_data: &[&SharedTensor<f32>],
                              output_gradients: &[&SharedTensor<f32>],
                              input_data: &[&SharedTensor<f32>],
                              input_gradients: &mut [&mut SharedTensor<f32>]) {
-        backend.softmax_grad_plain(output_data[0], output_gradients[0], input_gradients[0]).unwrap();
+        backend.softmax_grad(output_data[0], output_gradients[0],
+                             input_gradients[0]).unwrap();
    }
}
diff --git a/src/layers/container/sequential.rs b/src/layers/container/sequential.rs
index 90b005bb..175493ba 100644
--- a/src/layers/container/sequential.rs
+++ b/src/layers/container/sequential.rs
@@ -158,8 +158,10 @@ impl<B: IBackend + LayerOps<f32> + 'static> Sequential<B> {
        info!("Input {} -> {}", self.input_data_tensors.len(), tensor_name);

        let ibackend: Rc<IBackend<F=B::F>> = backend;
-        let data_tensor: ArcLock<SharedTensor<f32>> = Arc::new(RwLock::new(SharedTensor::new(ibackend.device(), &input_shape).unwrap()));
-        let gradient_tensor: ArcLock<SharedTensor<f32>> = Arc::new(RwLock::new(SharedTensor::new(ibackend.device(), &input_shape).unwrap()));
+        let data_tensor: ArcLock<SharedTensor<f32>> = Arc::new(RwLock::new(
+            SharedTensor::new(&input_shape)));
+        let gradient_tensor: ArcLock<SharedTensor<f32>> = Arc::new(RwLock::new(
+            SharedTensor::new(&input_shape)));
        self.input_data_tensors.push(data_tensor.clone());
        self.input_gradient_tensors.push(gradient_tensor.clone());
diff --git a/src/layers/loss/negative_log_likelihood.rs b/src/layers/loss/negative_log_likelihood.rs
index 6082299c..f0c080f0 100644
--- a/src/layers/loss/negative_log_likelihood.rs
+++ b/src/layers/loss/negative_log_likelihood.rs
@@ -73,8 +73,10 @@ impl<B: IBackend> ComputeOutput<f32, B> for NegativeLogLikelihood {
        let batch_size = Self::batch_size(labels.desc());

        let native = native_backend();
-        let native_labels = labels.get(native.device()).unwrap().as_native().unwrap().as_slice::<f32>();
-        let native_probabilities = probabilities.get(native.device()).unwrap().as_native().unwrap().as_slice::<f32>();
+        let native_labels = labels.read(native.device()).unwrap()
+            .as_native().unwrap().as_slice::<f32>();
+        let native_probabilities = probabilities.read(native.device()).unwrap()
+            .as_native().unwrap().as_slice::<f32>();

        let mut writable_loss = Vec::<f32>::new();
        for &label_value in native_labels {
@@ -86,7 +88,8 @@ impl<B: IBackend> ComputeOutput<f32, B> for NegativeLogLikelihood {

        loss = loss / (batch_size as f32);
        writable_loss = vec![loss];
-        ::util::write_to_memory(output_data[0].get_mut(native.device()).unwrap(), &writable_loss);
+        ::util::write_to_memory(output_data[0].write_only(native.device()).unwrap(),
+                                &writable_loss);
    }
}
@@ -103,15 +106,16 @@ impl<B: IBackend> ComputeInputGradient<f32, B> for NegativeLogLikelihood {
        let num_classes = self.num_classes;

        let native = native_backend();
-        let native_labels = labels.get(native.device()).unwrap().as_native().unwrap().as_slice::<f32>();
+        let native_labels = labels.read(native.device()).unwrap()
+            .as_native().unwrap().as_slice::<f32>();
        let mut writable_gradient = vec![0f32; input_gradients[0].desc().size()];

        for (batch_n, &label_value) in native_labels.iter().enumerate() {
            let index = (num_classes * batch_n) + label_value as usize;
            writable_gradient[index] = -1f32;
        }
-        input_gradients[0].sync(native.device()).unwrap();
-        ::util::write_to_memory(input_gradients[0].get_mut(native.device()).unwrap(), &writable_gradient);
+        ::util::write_to_memory(input_gradients[0].write_only(native.device()).unwrap(),
+                                &writable_gradient);
    }
}
diff --git a/src/solver/confusion_matrix.rs b/src/solver/confusion_matrix.rs
index b5b7e349..0b32bc74 100644
--- a/src/solver/confusion_matrix.rs
+++ b/src/solver/confusion_matrix.rs
@@ -48,7 +48,8 @@ impl ConfusionMatrix {
    /// The prediction for each sample of the batch is found by
    /// determining which output value had the smallest loss.
    pub fn get_predictions(&self, network_out: &mut SharedTensor<f32>) -> Vec<usize> {
-        let native_infered = network_out.get(native_backend().device()).unwrap().as_native().unwrap();
+        let native_infered = network_out.read(native_backend().device()).unwrap()
+            .as_native().unwrap();
        let predictions_slice = native_infered.as_slice::<f32>();

        let mut predictions = Vec::<usize>::new();
diff --git a/src/solvers/mod.rs b/src/solvers/mod.rs
index fcda65f0..e0e70c65 100644
--- a/src/solvers/mod.rs
+++ b/src/solvers/mod.rs
@@ -68,13 +68,14 @@ trait SGDSolver<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f3
        let backend = self.backend();
        for net_gradient in net_gradients.clone() {
            let gradient = net_gradient.read().unwrap();
-            let mut result = SharedTensor::<f32>::new(IBackend::device(backend), &1).unwrap();
+            // PERF: preallocate tensor once
+            let mut result = SharedTensor::new(&[1]);
            // gradient.sumsq_diff(self.backend(), &mut result);
-            self.backend().dot_plain(&gradient, &gradient, &mut result);
+            self.backend().dot(&gradient, &gradient, &mut result);

-            let mut result = SharedTensor::<f32>::new(IBackend::device(backend), &1).unwrap();
-            match result.add_device(native.device()) { _ => result.sync(native.device()).unwrap() }
-            match result.get(native.device()).unwrap() {
+            // FIXME: I've removed redefinition of `result` that was here.
+            // Code was invalid. Not sure what it meant. It may explode.
+            match result.read(native.device()).unwrap() {
                &MemoryType::Native(ref sumsq_result) => {
                    let sumsq_diff_slice = sumsq_result.as_slice::<f32>();
                    sumsq_diff += sumsq_diff_slice[0];
diff --git a/src/solvers/sgd/mod.rs b/src/solvers/sgd/mod.rs
index 64cab199..6159ecb9 100644
--- a/src/solvers/sgd/mod.rs
+++ b/src/solvers/sgd/mod.rs
@@ -31,8 +31,7 @@ macro_rules! impl_isolver_sgd {
            for weight_gradient in net.learnable_weights_gradients() {
                let shape = weight_gradient.read().unwrap().desc().clone();
-                let mut tensor = SharedTensor::new(IBackend::device(&*self.backend),
-                                                   &shape).unwrap();
+                let mut tensor = SharedTensor::new(&shape);

                let filler = ::weight::FillerType::Constant { value: 0f32 };
                filler.fill(&mut tensor);
diff --git a/src/solvers/sgd/momentum.rs b/src/solvers/sgd/momentum.rs
index 15dbb759..334e7d4b 100644
--- a/src/solvers/sgd/momentum.rs
+++ b/src/solvers/sgd/momentum.rs
@@ -46,19 +46,12 @@ impl<SolverB: IBackend + SolverOps<f32>> Momentum<SolverB> {
    ///
    /// [2]: ../../../solver/struct.Solver.html#method.from_config
    pub fn new(backend: Rc<SolverB>) -> Momentum<SolverB> {
-        let (lr, momentum) = {
-            let device = IBackend::device(backend.as_ref());
-
-            (SharedTensor::<f32>::new(device, &1).unwrap(),
-             SharedTensor::<f32>::new(device, &1).unwrap())
-        };
-
        Momentum {
            history: Vec::new(),
            backend: backend,
-            lr: lr,
-            momentum: momentum,
+            lr: SharedTensor::<f32>::new(&[1]),
+            momentum: SharedTensor::<f32>::new(&[1]),
        }
    }
@@ -71,6 +64,7 @@ impl<B: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32> + 'static> SGD
                              history_blob_id: usize,
                              global_lr: &f32,
                              blob_lr: &f32) {
+        // PERF: check if value is changed before writing it
        ::weight::FillerType::Constant {
            value: global_lr * blob_lr
        }.fill(&mut self.lr);
@@ -83,20 +77,14 @@ impl<B: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32> + 'static> SGD
        let device = IBackend::device(backend);

        let history_blob = &self.history[history_blob_id];
+        Axpby::axpby(backend,
+                     &self.lr,
+                     &weight_gradient.read().unwrap(),
+                     &self.momentum,
+                     &mut history_blob.write().unwrap()).unwrap();

-        let _ = weight_gradient.write().unwrap().add_device(device);
-        weight_gradient.write().unwrap().sync(device).unwrap();
-        let _ = history_blob.write().unwrap().add_device(device);
-        history_blob.write().unwrap().sync(device).unwrap();
-
-        Axpby::axpby_plain(backend,
-                           &self.lr,
-                           &weight_gradient.read().unwrap(),
-                           &self.momentum,
-                           &mut history_blob.write().unwrap()).unwrap();
-
-        backend.copy_plain(
-            &history_blob.read().unwrap(), &mut weight_gradient.write().unwrap()).unwrap();
+        backend.copy(&history_blob.read().unwrap(),
+                     &mut weight_gradient.write().unwrap()).unwrap();
    }
}
diff --git a/src/util.rs b/src/util.rs
index 91f4c3ef..b7a29cf0 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -51,16 +51,16 @@ pub fn write_batch_sample<T: NumCast + ::std::marker::Copy>(tensor: &mut SharedT
    let batch_size = tensor.desc().size();
    let sample_size = batch_size / tensor.desc()[0];

-    let _ = tensor.add_device(native_backend.device());
-    tensor.sync(native_backend.device()).unwrap();
-    write_to_memory_offset(tensor.get_mut(native_backend.device()).unwrap(), &data, i * sample_size);
+    write_to_memory_offset(tensor.write_only(native_backend.device()).unwrap(),
+                           &data,
+                           i * sample_size);
}

/// Create a Collenchyma SharedTensor for a scalar value.
pub fn native_scalar<T: NumCast + ::std::marker::Copy>(scalar: T) -> SharedTensor<T> {
    let native = native_backend();
-    let mut shared_scalar = SharedTensor::<T>::new(native.device(), &vec![1]).unwrap();
-    write_to_memory(shared_scalar.get_mut(native.device()).unwrap(), &[scalar]);
+    let mut shared_scalar = SharedTensor::<T>::new(&[1]);
+    write_to_memory(shared_scalar.write_only(native.device()).unwrap(), &[scalar]);
    shared_scalar
}
@@ -79,20 +79,12 @@ pub trait Axpby<F> : Axpy<F> + Scal<F> {
    /// Performs the operation y := a*x + b*y .
    ///
    /// Consists of a scal(b, y) followed by a axpby(a,x,y).
-    fn axpby(&self, a: &mut SharedTensor<F>, x: &mut SharedTensor<F>, b: &mut SharedTensor<F>, y: &mut SharedTensor<F>) -> Result<(), ::co::error::Error> {
+    fn axpby(&self, a: &SharedTensor<F>, x: &SharedTensor<F>, b: &SharedTensor<F>,
+             y: &mut SharedTensor<F>) -> Result<(), ::co::error::Error> {
        try!(self.scal(b, y));
        try!(self.axpy(a, x, y));
        Ok(())
    }
-
-    /// Performs the operation y := a*x + b*y .
-    ///
-    /// Consists of a scal(b, y) followed by a axpby(a,x,y).
-    fn axpby_plain(&self, a: &SharedTensor<F>, x: &SharedTensor<F>, b: &SharedTensor<F>, y: &mut SharedTensor<F>) -> Result<(), ::co::error::Error> {
-        try!(self.scal_plain(b, y));
-        try!(self.axpy_plain(a, x, y));
-        Ok(())
-    }
}

impl<T: Axpy<f32> + Scal<f32>> Axpby<f32> for T {}
diff --git a/src/weight.rs b/src/weight.rs
index 09fc631e..e4a8a114 100644
--- a/src/weight.rs
+++ b/src/weight.rs
@@ -169,23 +169,20 @@ impl FillerType {
    pub fn fill(&self, weight: &mut SharedTensor<f32>) {
        let native = native_backend();
        let native_device = native.device();
-        let actual_device = weight.latest_device().clone();
-        // sync to native so we can fill
-        match weight.add_device(native_device) { _ => weight.sync(native_device).unwrap() }

        match *self {
-            FillerType::Constant { value } => Self::fill_constant(weight, value),
-            FillerType::Glorot { input_size, output_size } => Self::fill_glorot(weight, input_size, output_size),
+            FillerType::Constant { value } =>
+                Self::fill_constant(weight, value),
+            FillerType::Glorot { input_size, output_size } =>
+                Self::fill_glorot(weight, input_size, output_size),
        }
-
-        // sync back to the actual device
-        weight.sync(&actual_device).unwrap();
    }

    /// Directly use the [Constant Filler](#variant.Constant).
    pub fn fill_constant(weight: &mut SharedTensor<f32>, value: f32) {
        let native = native_backend();
-        let native_weight = weight.get_mut(native.device()).unwrap().as_mut_native().unwrap();
+        let native_weight = weight.write_only(native.device()).unwrap()
+            .as_mut_native().unwrap();

        for e in native_weight.as_mut_slice::<f32>() {
            *e = value;
@@ -195,7 +192,8 @@ impl FillerType {

    /// Directly use the [Glorot Filler](#variant.Glorot).
    pub fn fill_glorot(weight: &mut SharedTensor<f32>, num_inputs: usize, num_outputs: usize) {
        let native = native_backend();
-        let native_weight = weight.get_mut(native.device()).unwrap().as_mut_native().unwrap();
+        let native_weight = weight.write_only(native.device()).unwrap()
+            .as_mut_native().unwrap();

        let init_range = (6.0f32 / (num_inputs as f32 + num_outputs as f32)).sqrt();
diff --git a/tests/layer_specs.rs b/tests/layer_specs.rs
index d368a172..94b4bb5e 100644
--- a/tests/layer_specs.rs
+++ b/tests/layer_specs.rs
@@ -66,8 +66,10 @@ mod layer_spec {
        let loaded_weights = loaded_layer.learnable_weights_data();
        let loaded_weight_lock = loaded_weights[0].read().unwrap();

-        let original_weight = original_weight_lock.get(native_backend().device()).unwrap().as_native().unwrap().as_slice::<f32>();
-        let loaded_weight = loaded_weight_lock.get(native_backend().device()).unwrap().as_native().unwrap().as_slice::<f32>();
+        let original_weight = original_weight_lock.read(native_backend().device())
+            .unwrap().as_native().unwrap().as_slice::<f32>();
+        let loaded_weight = loaded_weight_lock.read(native_backend().device())
+            .unwrap().as_native().unwrap().as_slice::<f32>();

        assert_eq!(original_weight, loaded_weight);
    }
@@ -131,27 +133,28 @@ mod layer_spec {
        let mut reshape_network = Layer::from_config(cuda_backend.clone(), &LayerConfig::new("reshape_model", LayerType::Sequential(reshape_model)));

        let input = vec![1f32, 1f32, 2f32];
-        let mut normal_tensor = SharedTensor::<f32>::new(native_backend.device(), &(3)).unwrap();
+        let mut normal_tensor = SharedTensor::<f32>::new(&[3]);
        // let mut normal_tensor_output = SharedTensor::<f32>::new(native_backend.device(), &(3)).unwrap();
-        let mut reshape_tensor = SharedTensor::<f32>::new(native_backend.device(), &(3)).unwrap();
+        let mut reshape_tensor = SharedTensor::<f32>::new(&[3]);
        // let mut reshape_tensor_output = SharedTensor::<f32>::new(native_backend.device(), &(3)).unwrap();

-        write_to_memory(normal_tensor.get_mut(native_backend.device()).unwrap(), &input);
-        write_to_memory(reshape_tensor.get_mut(native_backend.device()).unwrap(), &input);
+        write_to_memory(normal_tensor.write_only(native_backend.device()).unwrap(), &input);
+        write_to_memory(reshape_tensor.write_only(native_backend.device()).unwrap(), &input);

        let normal_tensor_output = normal_network.forward(&[Arc::new(RwLock::new(normal_tensor))])[0].clone();
-        let _ = normal_tensor_output.write().unwrap().add_device(native_backend.device());
-        normal_tensor_output.write().unwrap().sync(native_backend.device()).unwrap();
        let normal_tensor_output_native_ = normal_tensor_output.read().unwrap();
-        let normal_tensor_output_native = normal_tensor_output_native_.get(native_backend.device()).unwrap().as_native().unwrap();
-        assert_eq!(&[0.7310585786f32, 0.7310586f32, 0.880797f32], normal_tensor_output_native.as_slice::<f32>());
+        let normal_tensor_output_native = normal_tensor_output_native_
+            .read(native_backend.device()).unwrap().as_native().unwrap();
+        assert_eq!(&[0.7310585786f32, 0.7310586f32, 0.880797f32],
+                   normal_tensor_output_native.as_slice::<f32>());

        let reshape_tensor_output = reshape_network.forward(&[Arc::new(RwLock::new(reshape_tensor))])[0].clone();
-        let _ = reshape_tensor_output.write().unwrap().add_device(native_backend.device());
-        reshape_tensor_output.write().unwrap().sync(native_backend.device()).unwrap();
        let reshape_tensor_output_native_ = reshape_tensor_output.read().unwrap();
-        let reshape_tensor_output_native = reshape_tensor_output_native_.get(native_backend.device()).unwrap().as_native().unwrap();
-        assert_eq!(&[0.7310585786f32, 0.7310586f32, 0.880797f32], reshape_tensor_output_native.as_slice::<f32>());
-        assert_eq!(normal_tensor_output_native.as_slice::<f32>(), reshape_tensor_output_native.as_slice::<f32>());
+        let reshape_tensor_output_native = reshape_tensor_output_native_
+            .read(native_backend.device()).unwrap().as_native().unwrap();
+        assert_eq!(&[0.7310585786f32, 0.7310586f32, 0.880797f32],
+                   reshape_tensor_output_native.as_slice::<f32>());
+        assert_eq!(normal_tensor_output_native.as_slice::<f32>(),
+                   reshape_tensor_output_native.as_slice::<f32>());
    }
}
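
For reference, a minimal sketch of the memory-access pattern this changeset migrates to. It only uses calls that appear in the diff above (SharedTensor::new(&shape), write_only(device), read(device), and the util::native_backend / util::write_to_memory helpers); the wrapper function name, the tensor shape, and the value written are illustrative, not part of the change itself.

fn scalar_roundtrip() {
    let native = ::util::native_backend();
    // Allocation no longer takes a device and no longer returns a Result.
    let mut tensor = SharedTensor::<f32>::new(&[1]);
    // write_only() replaces the old add_device()/sync()/get_mut() sequence for filling a tensor.
    ::util::write_to_memory(tensor.write_only(native.device()).unwrap(), &[-1f32]);
    // read() replaces get(); the explicit synchronization calls are gone.
    let slice = tensor.read(native.device()).unwrap()
        .as_native().unwrap().as_slice::<f32>();
    assert_eq!(slice, &[-1f32]);
}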