diff --git a/REQUIRE b/REQUIRE
index bc97a8b..91d46e2 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,5 +1,4 @@
 julia 0.6
-Compat 0.8.6
 StatsBase 0.15.0
 Reexport
 SpecialFunctions
diff --git a/src/DataArrays.jl b/src/DataArrays.jl
index 9627bd8..5cd9d19 100644
--- a/src/DataArrays.jl
+++ b/src/DataArrays.jl
@@ -2,7 +2,7 @@ __precompile__()
 
 module DataArrays
     using Base: promote_op
-    using Base.Cartesian, Compat, Reexport
+    using Base.Cartesian, Reexport
     @reexport using StatsBase
     using SpecialFunctions
 
diff --git a/src/reducedim.jl b/src/reducedim.jl
index 4b9a13f..bf7369b 100644
--- a/src/reducedim.jl
+++ b/src/reducedim.jl
@@ -5,23 +5,25 @@ using Base.check_reducedims
 # This is a substantially faster implementation of the "all" reduction
 # across dimensions for reducing a BitArray to an Array{Bool}. We use
 # this below for implementing MaxFun and MinFun with skipna=true.
-@ngenerate N typeof(R) function Base._mapreducedim!{N}(f, op::typeof(&), R::Array{Bool}, A::BitArray{N})
-    lsiz = check_reducedims(R, A)
-    isempty(A) && return R
-    @nextract N sizeR d->size(R, d)
-    @nexprs 1 d->(state_0 = state_{N} = 1)
-    @nexprs N d->(skip_d = sizeR_d == 1)
-    Achunks = A.chunks
-    k = 1
-    @nloops(N, i, A,
-        d->(state_{d-1} = state_d),
-        d->(skip_d || (state_d = state_0)),
-        begin
-            @inbounds R[state_0] &= f(Base.unsafe_bitgetindex(Achunks, k))
-            state_0 += 1
-            k += 1
-        end)
-    R
+@generated function Base._mapreducedim!(f, op::typeof(&), R::Array{Bool}, A::BitArray{N}) where {N}
+    quote
+        lsiz = check_reducedims(R, A)
+        isempty(A) && return R
+        @nextract $N sizeR d->size(R, d)
+        @nexprs 1 d->(state_0 = state_{$N} = 1)
+        @nexprs $N d->(skip_d = sizeR_d == 1)
+        Achunks = A.chunks
+        k = 1
+        @nloops($N, i, A,
+            d->(state_{d-1} = state_d),
+            d->(skip_d || (state_d = state_0)),
+            begin
+                @inbounds R[state_0] &= f(Base.unsafe_bitgetindex(Achunks, k))
+                state_0 += 1
+                k += 1
+            end)
+        R
+    end
 end
 
 # Determine if there are any true values in a BitArray in a given
@@ -60,103 +62,107 @@ function _count(B::BitArray, istart::Int, iend::Int)
 end
 
 ## NA-preserving
-@ngenerate N typeof(R) function _mapreducedim!{T,N}(f::SafeMapFuns, op::SafeReduceFuns,
-                                                    R::DataArray, A::DataArray{T,N})
-    data = A.data
-    na = A.na
-
-    lsiz = check_reducedims(R, data)
-    isempty(data) && return R
-
-    if lsiz > 16
-        # use mapreduce_impl, which is probably better tuned to achieve higher performance
-        nslices = div(length(A), lsiz)
-        ibase = 0
-        extr = daextract(R)
-        for i = 1:nslices
-            if _any(na, ibase+1, ibase+lsiz)
-                unsafe_setna!(R, extr, i)
-            else
-                v = Base.mapreduce_impl(f, op, data, ibase+1, ibase+lsiz)
-                @inbounds unsafe_dasetindex!(R, extr, v, i)
-            end
-            ibase += lsiz
-        end
-    else
-        @nextract N sizeR d->size(R,d)
-        na_chunks = A.na.chunks
-
-        new_data = R.data
-
-        # If reducing to a DataArray, skip strides with NAs.
-        # In my benchmarks, it is actually faster to compute a new NA
-        # array and BitArray it than to operate on the BitArray
-        # directly.
-        new_na = fill(false, size(new_data))
-
-        @nexprs 1 d->(state_0 = state_{N} = 1)
-        @nexprs N d->(skip_d = sizeR_d == 1)
-        k = 1
-        @nloops(N, i, A,
-            d->(state_{d-1} = state_d),
-            d->(skip_d || (state_d = state_0)), begin
-                @inbounds vna = new_na[state_0] | Base.unsafe_bitgetindex(na_chunks, k)
-                if vna
-                    @inbounds new_na[state_0] = true
+@generated function _mapreducedim!(f::SafeMapFuns, op::SafeReduceFuns,
+                                   R::DataArray, A::DataArray{T,N} where {T}) where {N}
+    quote
+        data = A.data
+        na = A.na
+
+        lsiz = check_reducedims(R, data)
+        isempty(data) && return R
+
+        if lsiz > 16
+            # use mapreduce_impl, which is probably better tuned to achieve higher performance
+            nslices = div(length(A), lsiz)
+            ibase = 0
+            extr = daextract(R)
+            for i = 1:nslices
+                if _any(na, ibase+1, ibase+lsiz)
+                    unsafe_setna!(R, extr, i)
                 else
-                    @inbounds x = data[k]
-                    v = f(x)
-                    @inbounds v0 = new_data[state_0]
-                    nv = op(v0, v)
-                    @inbounds new_data[state_0] = nv
+                    v = Base.mapreduce_impl(f, op, data, ibase+1, ibase+lsiz)
+                    @inbounds unsafe_dasetindex!(R, extr, v, i)
                 end
-
-                state_0 += 1
-                k += 1
-            end)
-
-        R.na = BitArray(new_na)
+                ibase += lsiz
+            end
+        else
+            @nextract $N sizeR d->size(R,d)
+            na_chunks = A.na.chunks
+
+            new_data = R.data
+
+            # If reducing to a DataArray, skip strides with NAs.
+            # In my benchmarks, it is actually faster to compute a new NA
+            # array and BitArray it than to operate on the BitArray
+            # directly.
+            new_na = fill(false, size(new_data))
+
+            @nexprs 1 d->(state_0 = state_{$N} = 1)
+            @nexprs $N d->(skip_d = sizeR_d == 1)
+            k = 1
+            @nloops($N, i, A,
+                d->(state_{d-1} = state_d),
+                d->(skip_d || (state_d = state_0)), begin
+                    @inbounds vna = new_na[state_0] | Base.unsafe_bitgetindex(na_chunks, k)
+                    if vna
+                        @inbounds new_na[state_0] = true
+                    else
+                        @inbounds x = data[k]
+                        v = f(x)
+                        @inbounds v0 = new_data[state_0]
+                        nv = op(v0, v)
+                        @inbounds new_data[state_0] = nv
+                    end
+
+                    state_0 += 1
+                    k += 1
+                end)
+
+            R.na = BitArray(new_na)
+        end
+        return R
     end
-    return R
 end
 
 ## NA-preserving to array
-@ngenerate N typeof(R) function _mapreducedim!{T,N}(f::SafeMapFuns, op::SafeReduceFuns,
-                                                    R::AbstractArray, A::DataArray{T,N})
-    data = A.data
-    na = A.na
-
-    lsiz = check_reducedims(R, data)
-    isempty(data) && return R
-
-    if lsiz > 16
-        # use mapreduce_impl, which is probably better tuned to achieve higher performance
-        nslices = div(length(A), lsiz)
-        ibase = 0
-        extr = daextract(R)
-        for i = 1:nslices
-            if _any(na, ibase+1, ibase+lsiz)
-                throw(NAException("cannot reduce a DataArray containing NAs to an AbstractArray"))
-            else
-                v = Base.mapreduce_impl(f, op, data, ibase+1, ibase+lsiz)
-                @inbounds unsafe_dasetindex!(R, extr, v, i)
+@generated function _mapreducedim!(f::SafeMapFuns, op::SafeReduceFuns,
+                                   R::AbstractArray, A::DataArray{T,N} where {T}) where {N}
+    quote
+        data = A.data
+        na   = A.na
+
+        lsiz = check_reducedims(R, data)
+        isempty(data) && return R
+
+        if lsiz > 16
+            # use mapreduce_impl, which is probably better tuned to achieve higher performance
+            nslices = div(length(A), lsiz)
+            ibase = 0
+            extr = daextract(R)
+            for i = 1:nslices
+                if _any(na, ibase+1, ibase+lsiz)
+                    throw(NAException("cannot reduce a DataArray containing NAs to an AbstractArray"))
+                else
+                    v = Base.mapreduce_impl(f, op, data, ibase+1, ibase+lsiz)
+                    @inbounds unsafe_dasetindex!(R, extr, v, i)
+                end
+                ibase += lsiz
+            end
+        else
+            @nextract $N sizeR d->size(R,d)
+
+            # If reducing to a non-DataArray, throw an error at the start on NA
+            any(isna, A) && throw(NAException("cannot reduce a DataArray containing NAs to an AbstractArray"))
+            @nloops $N i data d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                @inbounds x = (@nref $N data i)
+                v = f(x)
+                @inbounds v0 = (@nref $N R j)
+                nv = op(v0, v)
+                @inbounds (@nref $N R j) = nv
             end
-            ibase += lsiz
-        end
-    else
-        @nextract N sizeR d->size(R,d)
-
-        # If reducing to a non-DataArray, throw an error at the start on NA
-        any(isna, A) && throw(NAException("cannot reduce a DataArray containing NAs to an AbstractArray"))
-        @nloops N i data d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-            @inbounds x = (@nref N data i)
-            v = f(x)
-            @inbounds v0 = (@nref N R j)
-            nv = op(v0, v)
-            @inbounds (@nref N R j) = nv
         end
+        return R
     end
-    return R
 end
 _mapreducedim!(f, op, R, A) = Base._mapreducedim!(f, op, R, A)
 
@@ -166,60 +172,63 @@ _getdata(A::DataArray) = A.data
 
 # mapreduce across a dimension. If specified, C contains the number of
 # non-NA values reduced into each element of R.
-@ngenerate N typeof(R) function _mapreducedim_skipna_impl!{T,N}(f, op, R::AbstractArray,
-                                                                C::Union{Array{Int}, Void},
-                                                                A::DataArray{T,N})
-    data = A.data
-    na = A.na
-    na_chunks = na.chunks
-    new_data = _getdata(R)
-
-    C === nothing || size(R) == size(C) || throw(DimensionMismatch("R and C must have same size"))
-    lsiz = check_reducedims(new_data, data)
-    isempty(data) && return R
-
-    if lsiz > 16
-        # keep the accumulator as a local variable when reducing along the first dimension
-        nslices = div(length(A), lsiz)
-        ibase = 0
-        for i = 1:nslices
-            # TODO: use pairwise impl for sum
-            @inbounds v = new_data[i]
-            @inbounds C !== nothing && (C[i] = lsiz - _count(na, ibase+1, ibase+lsiz))
-            for k = ibase+1:ibase+lsiz
-                @inbounds Base.unsafe_bitgetindex(na_chunks, k) && continue
-                @inbounds x = data[k]
-                v = convert(typeof(v), op(f(x), v))::typeof(v)
-            end
-            @inbounds new_data[i] = v
-            ibase += lsiz
-        end
-    else
-        # general implementation
-        @nextract N sizeR d->size(new_data,d)
-        @nexprs 1 d->(state_0 = state_{N} = 1)
-        @nexprs N d->(skip_d = sizeR_d == 1)
-        k = 1
-        C !== nothing && fill!(C, div(length(A), length(R)))
-        @nloops(N, i, A,
-            d->(state_{d-1} = state_d),
-            d->(skip_d || (state_d = state_0)), begin
-                @inbounds xna = Base.unsafe_bitgetindex(na_chunks, k)
-                if xna
-                    C !== nothing && @inbounds C[state_0] -= 1
-                else
+@generated function _mapreducedim_skipna_impl!(f, op, R::AbstractArray,
+                                                      C::Union{Array{Int}, Void},
+                                                      A::DataArray{T,N} where {T}) where {N}
+    quote
+
+        data = A.data
+        na = A.na
+        na_chunks = na.chunks
+        new_data = _getdata(R)
+
+        C === nothing || size(R) == size(C) || throw(DimensionMismatch("R and C must have same size"))
+        lsiz = check_reducedims(new_data, data)
+        isempty(data) && return R
+
+        if lsiz > 16
+            # keep the accumulator as a local variable when reducing along the first dimension
+            nslices = div(length(A), lsiz)
+            ibase = 0
+            for i = 1:nslices
+                # TODO: use pairwise impl for sum
+                @inbounds v = new_data[i]
+                @inbounds C !== nothing && (C[i] = lsiz - _count(na, ibase+1, ibase+lsiz))
+                for k = ibase+1:ibase+lsiz
+                    @inbounds Base.unsafe_bitgetindex(na_chunks, k) && continue
                     @inbounds x = data[k]
-                    v = f(x)
-                    @inbounds v0 = new_data[state_0]
-                    nv = op(v0, v)
-                    @inbounds new_data[state_0] = nv
+                    v = convert(typeof(v), op(f(x), v))::typeof(v)
                 end
-
-                state_0 += 1
-                k += 1
-        end)
+                @inbounds new_data[i] = v
+                ibase += lsiz
+            end
+        else
+            # general implementation
+            @nextract $N sizeR d->size(new_data,d)
+            @nexprs 1 d->(state_0 = state_{$N} = 1)
+            @nexprs $N d->(skip_d = sizeR_d == 1)
+            k = 1
+            C !== nothing && fill!(C, div(length(A), length(R)))
+            @nloops($N, i, A,
+                d->(state_{d-1} = state_d),
+                d->(skip_d || (state_d = state_0)), begin
+                    @inbounds xna = Base.unsafe_bitgetindex(na_chunks, k)
+                    if xna
+                        C !== nothing && @inbounds C[state_0] -= 1
+                    else
+                        @inbounds x = data[k]
+                        v = f(x)
+                        @inbounds v0 = new_data[state_0]
+                        nv = op(v0, v)
+                        @inbounds new_data[state_0] = nv
+                    end
+
+                    state_0 += 1
+                    k += 1
+            end)
+        end
+        return R
     end
-    return R
 end
 
 _mapreducedim_skipna!(f, op, R::AbstractArray, A::DataArray) =
@@ -344,138 +353,142 @@ end
 
 # A version of _mapreducedim! that accepts an array S of the same size
 # as R, the elements of which are passed as a second argument to f.
-@ngenerate N typeof(R) function _mapreducedim_2arg!{T,N}(f, op, R::DataArray,
-                                                         A::DataArray{T,N},
-                                                         S::AbstractArray)
-    data = A.data
-    na = A.na
-    Sextr = daextract(S)
-
-    lsiz = check_reducedims(R, data)
-    size(R) == size(S) || throw(DimensionMismatch("R and S must have same size"))
-    isempty(data) && return R
-
-    if lsiz > 16
-        # use mapreduce_impl, which is probably better tuned to achieve higher performance
-        nslices = div(length(A), lsiz)
-        ibase = 0
-        extr = daextract(R)
-        for i = 1:nslices
-            if unsafe_isna(S, Sextr, i) || _any(na, ibase+1, ibase+lsiz)
-                unsafe_setna!(R, extr, i)
-            else
-                @inbounds s = unsafe_getindex_notna(S, Sextr, i)
-                v = Base.mapreduce_impl(MapReduceDim2ArgHelperFun(f, s), op, data, ibase+1, ibase+lsiz)
-                @inbounds unsafe_dasetindex!(R, extr, v, i)
-            end
-            ibase += lsiz
-        end
-    else
-        @nextract N sizeR d->size(R,d)
-        na_chunks = A.na.chunks
-        new_data = R.data
-        new_na = isa(S, DataArray) ? Array(S.na) : fill(false, size(S))
-
-        @nexprs 1 d->(state_0 = state_{N} = 1)
-        @nexprs N d->(skip_d = sizeR_d == 1)
-        k = 1
-        @nloops(N, i, A,
-            d->(state_{d-1} = state_d),
-            d->(skip_d || (state_d = state_0)), begin
-                @inbounds vna = new_na[state_0] | Base.unsafe_bitgetindex(na_chunks, k)
-                if vna
-                    @inbounds new_na[state_0] = true
+@generated function _mapreducedim_2arg!(f, op, R::DataArray,
+                                                    A::DataArray{T,N} where {T},
+                                                    S::AbstractArray) where {N}
+    quote
+        data = A.data
+        na = A.na
+        Sextr = daextract(S)
+
+        lsiz = check_reducedims(R, data)
+        size(R) == size(S) || throw(DimensionMismatch("R and S must have same size"))
+        isempty(data) && return R
+
+        if lsiz > 16
+            # use mapreduce_impl, which is probably better tuned to achieve higher performance
+            nslices = div(length(A), lsiz)
+            ibase = 0
+            extr = daextract(R)
+            for i = 1:nslices
+                if unsafe_isna(S, Sextr, i) || _any(na, ibase+1, ibase+lsiz)
+                    unsafe_setna!(R, extr, i)
                 else
-                    @inbounds s = unsafe_getindex_notna(S, Sextr, state_0)
-                    @inbounds x = data[k]
-                    v = f(x, s)
-                    @inbounds v0 = new_data[state_0]
-                    nv = op(v0, v)
-                    @inbounds new_data[state_0] = nv
+                    @inbounds s = unsafe_getindex_notna(S, Sextr, i)
+                    v = Base.mapreduce_impl(MapReduceDim2ArgHelperFun(f, s), op, data, ibase+1, ibase+lsiz)
+                    @inbounds unsafe_dasetindex!(R, extr, v, i)
                 end
-
-                state_0 += 1
-                k += 1
-            end)
-
-        R.na = BitArray(new_na)
+                ibase += lsiz
+            end
+        else
+            @nextract $N sizeR d->size(R,d)
+            na_chunks = A.na.chunks
+            new_data = R.data
+            new_na = isa(S, DataArray) ? Array(S.na) : fill(false, size(S))
+
+            @nexprs 1 d->(state_0 = state_{$N} = 1)
+            @nexprs $N d->(skip_d = sizeR_d == 1)
+            k = 1
+            @nloops($N, i, A,
+                d->(state_{d-1} = state_d),
+                d->(skip_d || (state_d = state_0)), begin
+                    @inbounds vna = new_na[state_0] | Base.unsafe_bitgetindex(na_chunks, k)
+                    if vna
+                        @inbounds new_na[state_0] = true
+                    else
+                        @inbounds s = unsafe_getindex_notna(S, Sextr, state_0)
+                        @inbounds x = data[k]
+                        v = f(x, s)
+                        @inbounds v0 = new_data[state_0]
+                        nv = op(v0, v)
+                        @inbounds new_data[state_0] = nv
+                    end
+
+                    state_0 += 1
+                    k += 1
+                end)
+
+            R.na = BitArray(new_na)
+        end
+        return R
     end
-    return R
 end
 
 # A version of _mapreducedim_skipna! that accepts an array S of the same size
 # as R, the elements of which are passed as a second argument to f.
-@ngenerate N typeof(R) function _mapreducedim_skipna_2arg!{T,N}(f, op, R::AbstractArray,
-                                                                C::Union{Array{Int}, Void},
-                                                                A::DataArray{T,N}, S::AbstractArray)
-    data = A.data
-    na = A.na
-    na_chunks = na.chunks
-    new_data = _getdata(R)
-    Sextr = daextract(S)
-
-    lsiz = check_reducedims(new_data, data)
-    C === nothing || size(R) == size(C) || throw(DimensionMismatch("R and C must have same size"))
-    size(R) == size(S) || throw(DimensionMismatch("R and S must have same size"))
-    isempty(data) && return R
-    @nextract N sizeR d->size(new_data,d)
-    sizA1 = size(data, 1)
-
-    # If there are any NAs in S, assume these will produce NAs in R
-    if isa(S, DataArray)
-        copy!(R.na, S.na)
-    end
+@generated function _mapreducedim_skipna_2arg!(f, op, R::AbstractArray,
+                                                      C::Union{Array{Int}, Void},
+                                                      A::DataArray{T,N} where {T}, S::AbstractArray) where {N}
+    quote
+        data = A.data
+        na = A.na
+        na_chunks = na.chunks
+        new_data = _getdata(R)
+        Sextr = daextract(S)
+
+        lsiz = check_reducedims(new_data, data)
+        C === nothing || size(R) == size(C) || throw(DimensionMismatch("R and C must have same size"))
+        size(R) == size(S) || throw(DimensionMismatch("R and S must have same size"))
+        isempty(data) && return R
+        @nextract $N sizeR d->size(new_data,d)
+        sizA1 = size(data, 1)
+
+        # If there are any NAs in S, assume these will produce NAs in R
+        if isa(S, DataArray)
+            copy!(R.na, S.na)
+        end
 
-    if lsiz > 16
-        # keep the accumulator as a local variable when reducing along the first dimension
-        nslices = div(length(A), lsiz)
-        ibase = 0
-        for i = 1:nslices
-            @inbounds v = new_data[i]
-            !isa(C, Void) && (C[i] = lsiz - _count(na, ibase+1, ibase+lsiz))
-
-            # If S[i] is NA, skip this iteration
-            @inbounds sna = unsafe_isna(S, Sextr, i)
-            if !sna
-                @inbounds s = unsafe_getindex_notna(S, Sextr, i)
-                # TODO: use pairwise impl for sum
-                for k = ibase+1:ibase+lsiz
-                    @inbounds Base.unsafe_bitgetindex(na_chunks, k) && continue
-                    @inbounds x = data[k]
-                    v = convert(typeof(v), op(f(x, s), v))::typeof(v)
+        if lsiz > 16
+            # keep the accumulator as a local variable when reducing along the first dimension
+            nslices = div(length(A), lsiz)
+            ibase = 0
+            for i = 1:nslices
+                @inbounds v = new_data[i]
+                !isa(C, Void) && (C[i] = lsiz - _count(na, ibase+1, ibase+lsiz))
+
+                # If S[i] is NA, skip this iteration
+                @inbounds sna = unsafe_isna(S, Sextr, i)
+                if !sna
+                    @inbounds s = unsafe_getindex_notna(S, Sextr, i)
+                    # TODO: use pairwise impl for sum
+                    for k = ibase+1:ibase+lsiz
+                        @inbounds Base.unsafe_bitgetindex(na_chunks, k) && continue
+                        @inbounds x = data[k]
+                        v = convert(typeof(v), op(f(x, s), v))::typeof(v)
+                    end
+
+                    @inbounds new_data[i] = v
                 end
 
-                @inbounds new_data[i] = v
+                ibase += lsiz
             end
-
-            ibase += lsiz
+        else
+            # general implementation
+            @nexprs 1 d->(state_0 = state_{$N} = 1)
+            @nexprs $N d->(skip_d = sizeR_d == 1)
+            k = 1
+            !isa(C, Void) && fill!(C, div(length(A), length(R)))
+            @nloops($N, i, A,
+                d->(state_{d-1} = state_d),
+                d->(skip_d || (state_d = state_0)), begin
+                    @inbounds xna = Base.unsafe_bitgetindex(na_chunks, k) | unsafe_isna(S, Sextr, state_0)
+                    if xna
+                        !isa(C, Void) && @inbounds C[state_0] -= 1
+                    else
+                        @inbounds s = unsafe_getindex_notna(S, Sextr, state_0)
+                        @inbounds x = data[k]
+                        v = f(x, s)
+                        @inbounds v0 = new_data[state_0]
+                        nv = op(v0, v)
+                        @inbounds new_data[state_0] = nv
+                    end
+
+                    state_0 += 1
+                    k += 1
+            end)
         end
-    else
-        # general implementation
-        @nexprs 1 d->(state_0 = state_{N} = 1)
-        @nexprs N d->(skip_d = sizeR_d == 1)
-        k = 1
-        !isa(C, Void) && fill!(C, div(length(A), length(R)))
-        @nloops(N, i, A,
-            d->(state_{d-1} = state_d),
-            d->(skip_d || (state_d = state_0)), begin
-                @inbounds xna = Base.unsafe_bitgetindex(na_chunks, k) | unsafe_isna(S, Sextr, state_0)
-                if xna
-                    !isa(C, Void) && @inbounds C[state_0] -= 1
-                else
-                    @inbounds s = unsafe_getindex_notna(S, Sextr, state_0)
-                    @inbounds x = data[k]
-                    v = f(x, s)
-                    @inbounds v0 = new_data[state_0]
-                    nv = op(v0, v)
-                    @inbounds new_data[state_0] = nv
-                end
-
-                state_0 += 1
-                k += 1
-        end)
+        return R
     end
-    return R
 end
 
 struct Abs2MinusFun end