Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dfdx-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ num-traits = { workspace = true }
safetensors = { workspace = true, optional = true }
memmap2 = { workspace = true, optional = true }
half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
gemm = { version = "0.16.14", default-features = false, optional = true, features = ["rayon"] }
gemm = { version = "0.17.1", default-features = false, optional = true, features = ["rayon"] }
rayon = { version = "1.7.0", optional = true }
libm = { workspace = true }
wgpu = { version = "0.18.0", features = ["glsl", "spirv"], optional = true }
Expand Down
1 change: 1 addition & 0 deletions dfdx-core/src/data/collate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ impl<A, B> Collate for Vec<(A, B)> {
impl<'a, A, B> Collate for Vec<&'a (A, B)> {
type Collated = (Vec<&'a A>, Vec<&'a B>);
fn collated(self) -> Self::Collated {
#[allow(clippy::map_identity)]
self.into_iter().map(|(a, b)| (a, b)).unzip()
}
}
Expand Down
38 changes: 0 additions & 38 deletions dfdx-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,44 +128,6 @@ pub mod prelude {
pub use crate::tensor_ops::*;
}

/// Sets a CPU `sse` flag to flush denormal floating point numbers to zero. The opposite of this is [keep_denormals()].
///
/// Some resources:
/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
pub fn flush_denormals_to_zero() {
#[cfg(all(target_arch = "x86", target_feature = "sse"))]
{
use std::arch::x86::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
{
use std::arch::x86_64::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
}
}

/// Sets a CPU flag to keep denormal floating point numbers. The opposite of this is [flush_denormals_to_zero()].
///
/// Some resources:
/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
pub fn keep_denormals() {
#[cfg(all(target_arch = "x86", target_feature = "sse"))]
{
use std::arch::x86::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
{
use std::arch::x86_64::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
}
}

#[cfg(test)]
pub(crate) mod tests {
pub use num_traits::{Float, NumCast, Zero};
Expand Down
24 changes: 24 additions & 0 deletions dfdx-core/src/shapes/shape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,30 @@ where
}
}

impl<const N: usize> core::ops::Sub<Const<N>> for usize {
type Output = usize;
fn sub(self, _: Const<N>) -> Self::Output {
self.size() - N
}
}
impl<const N: usize> core::ops::Sub<usize> for Const<N> {
type Output = usize;
fn sub(self, rhs: usize) -> Self::Output {
N - rhs.size()
}
}

#[cfg(feature = "nightly")]
impl<const N: usize, const M: usize> core::ops::Sub<Const<N>> for Const<M>
where
Const<{ M - N }>: Sized,
{
type Output = Const<{ M - N }>;
fn sub(self, _: Const<N>) -> Self::Output {
Const
}
}

impl<const N: usize> core::ops::Mul<Const<N>> for usize {
type Output = usize;
fn mul(self, _: Const<N>) -> Self::Output {
Expand Down
2 changes: 1 addition & 1 deletion dfdx-core/src/tensor/gradients.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ impl<E, D: Storage<E>> Gradients<E, D> {
#[inline]
pub(crate) fn many_and_ref<L: Shape, R: Shape>(
&mut self,
ls: &Vec<impl Tensorlike<L, E, D>>,
ls: &[impl Tensorlike<L, E, D>],
r: &impl Tensorlike<R, E, D>,
) -> (Vec<&mut D::Vec>, &D::Vec) {
for i in 0..ls.len() {
Expand Down
50 changes: 39 additions & 11 deletions dfdx-core/src/tensor_ops/utilities/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,33 +114,61 @@ pub trait Device<E: Dtype>:
+ crate::tensor_ops::axpy::AxpyKernel<E>

// conv1d
+ super::super::conv1d::Conv1DKernel<E>
+ NonCudnnCuda<E>
{
}

#[cfg(feature = "cudnn")]
pub trait NonCudnnCuda<E: Dtype> {}

#[cfg(not(feature = "cudnn"))]
pub trait NonCudnnCuda<E: Dtype>:
// conv1d
super::super::conv1d::Conv1DKernel<E>
{
}

#[cfg(feature = "f16")]
impl Device<f16> for crate::tensor::Cpu {}
#[cfg(feature = "f16")]
impl Device<AMP<f16>> for crate::tensor::Cpu {}
mod f16_ {
use super::*;
impl Device<f16> for crate::tensor::Cpu {}
impl NonCudnnCuda<f16> for crate::tensor::Cpu {}
impl Device<AMP<f16>> for crate::tensor::Cpu {}
impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cpu {}
}
impl Device<f32> for crate::tensor::Cpu {}
impl NonCudnnCuda<f32> for crate::tensor::Cpu {}
impl Device<f64> for crate::tensor::Cpu {}
impl NonCudnnCuda<f64> for crate::tensor::Cpu {}

#[cfg(all(feature = "cuda", feature = "f16"))]
impl Device<f16> for crate::tensor::Cuda {}
#[cfg(all(feature = "cuda", feature = "f16"))]
impl Device<AMP<f16>> for crate::tensor::Cuda {}
#[cfg(feature = "cuda")]
impl Device<f32> for crate::tensor::Cuda {}
mod cuda_f16 {
use super::*;
impl Device<f16> for crate::tensor::Cuda {}
impl NonCudnnCuda<f16> for crate::tensor::Cuda {}
impl Device<AMP<f16>> for crate::tensor::Cuda {}
impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cuda {}
}
#[cfg(feature = "cuda")]
impl Device<f64> for crate::tensor::Cuda {}
mod cuda {
use super::*;
impl Device<f32> for crate::tensor::Cuda {}
impl NonCudnnCuda<f32> for crate::tensor::Cuda {}
impl Device<f64> for crate::tensor::Cuda {}
impl NonCudnnCuda<f64> for crate::tensor::Cuda {}
}

// TODO: How can we implement this for f16 when WGSL doesn't support f16 yet?
// #[cfg(all(feature = "webgpu", feature = "f16"))]
// impl Device<f16> for crate::tensor::Webgpu {}
// #[cfg(all(feature = "webgpu", feature = "f16"))]
// impl Device<AMP<f16>> for crate::tensor::Webgpu {}
#[cfg(feature = "webgpu")]
impl Device<f32> for crate::tensor::Webgpu {}
mod webgpu {
use super::*;
impl Device<f32> for crate::tensor::Webgpu {}
impl NonCudnnCuda<f32> for crate::tensor::Webgpu {}
}

// TODO: How can we implement this for f64 when WGSL doesn't support f64 yet?
// #[cfg(feature = "webgpu")]
Expand Down
3 changes: 0 additions & 3 deletions dfdx/examples/12-mnist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,6 @@ type Mlp = (
const BATCH_SIZE: usize = 32;

fn main() {
// ftz substantially improves performance
dfdx::flush_denormals_to_zero();

let mnist_path = std::env::args()
.nth(1)
.unwrap_or_else(|| "./datasets/MNIST/raw".to_string());
Expand Down