JuliaLang
diff --git a/‎base/abstractarray.jl
+3 b/‎base/abstractarray.jl
+3
diff --git a/‎base/array.jl
+3-3 b/‎base/array.jl
+3-3
diff --git a/‎base/darray.jl
+1-1 b/‎base/darray.jl
+1-1
diff --git a/‎base/multidimensional.jl
+100-30 b/‎base/multidimensional.jl
+100-30
@@ -67,6 +67,9 @@ immutable LinearSlow <: LinearIndexing end
 linearindexing(::AbstractArray) = LinearSlow()
 linearindexing(::Array) = LinearFast()
 linearindexing(::Range) = LinearFast()
+linearindexing{A<:AbstractArray}(::Type{A}) = LinearSlow()
+linearindexing{A<:Array}(::Type{A}) = LinearFast()
+linearindexing{A<:Range}(::Type{A}) = LinearFast()
 
 ## Bounds checking ##
 checkbounds(sz::Int, i::Int) = 1 <= i <= sz || throw(BoundsError())
 
@@ -8,9 +8,9 @@ typealias DenseVector{T} DenseArray{T,1}
 typealias DenseMatrix{T} DenseArray{T,2}
 typealias DenseVecOrMat{T} Union(DenseVector{T}, DenseMatrix{T})
 
-typealias StridedArray{T,N,A<:DenseArray} Union(DenseArray{T,N}, SubArray{T,N,A})
-typealias StridedVector{T,A<:DenseArray}  Union(DenseArray{T,1}, SubArray{T,1,A})
-typealias StridedMatrix{T,A<:DenseArray}  Union(DenseArray{T,2}, SubArray{T,2,A})
+typealias StridedArray{T,N,A<:DenseArray,I<:(RangeIndex...)} Union(DenseArray{T,N}, SubArray{T,N,A,I})
+typealias StridedVector{T,A<:DenseArray,I<:(RangeIndex...)}  Union(DenseArray{T,1}, SubArray{T,1,A,I})
+typealias StridedMatrix{T,A<:DenseArray,I<:(RangeIndex...)}  Union(DenseArray{T,2}, SubArray{T,2,A,I})
 typealias StridedVecOrMat{T} Union(StridedVector{T}, StridedMatrix{T})
 
 ## Basic functions ##
 
@@ -243,7 +243,7 @@ function getindex_tuple{T}(d::DArray{T}, I::(Int...))
 end
 
 getindex(d::DArray) = d[1]
-getindex(d::DArray, I::Union(Int,UnitRange{Int})...) = sub(d,I)
+getindex(d::DArray, I::Union(Int,UnitRange{Int})...) = sub(d,I...)
 
 copy(d::SubOrDArray) = d
 
 
@@ -8,6 +8,7 @@ export eachindex
 
 # Traits for linear indexing
 linearindexing(::BitArray) = LinearFast()
+linearindexing{A<:BitArray}(::Type{A}) = LinearFast()
 
 # Iterator/state
 abstract CartesianIndex{N}       # the state for all multidimensional iterators
@@ -134,11 +135,7 @@ using .IteratorsMD
     nothing
 end
 
-unsafe_getindex(v::Real, ind::Int) = v
-unsafe_getindex(v::Range, ind::Int) = first(v) + (ind-1)*step(v)
 unsafe_getindex(v::BitArray, ind::Int) = Base.unsafe_bitgetindex(v.chunks, ind)
-unsafe_getindex(v::AbstractArray, ind::Int) = v[ind]
-unsafe_getindex(v, ind::Real) = unsafe_getindex(v, to_index(ind))
 
 unsafe_setindex!{T}(v::AbstractArray{T}, x::T, ind::Int) = (v[ind] = x; v)
 unsafe_setindex!(v::BitArray, x::Bool, ind::Int) = (Base.unsafe_bitsetindex!(v.chunks, x, ind); v)
@@ -226,42 +223,115 @@ end
 
 ### subarray.jl
 
-# Here we want to skip creating the dict-based cached version,
-# so use the ngenerate function
-function gen_getindex_body(N::Int)
+function gen_setindex_body(N::Int)
     quote
-        strd_1 = 1
-        @nexprs $N d->(@inbounds strd_{d+1} = strd_d*s.dims[d])
-        ind -= 1
-        indp = s.first_index
-        @nexprs $N d->begin
-            i = div(ind, strd_{$N-d+1})
-            @inbounds indp += i*s.strides[$N-d+1]
-            ind -= i*strd_{$N-d+1}
+        Base.Cartesian.@nexprs $N d->(J_d = J[d])
+        Base.Cartesian.@ncall $N checkbounds V J
+        Base.Cartesian.@nexprs $N d->(I_d = Base.to_index(J_d))
+        if !isa(x, AbstractArray)
+            Base.Cartesian.@nloops $N i d->(1:length(I_d)) d->(@inbounds j_d = Base.unsafe_getindex(I_d, i_d)) begin
+                @inbounds (Base.Cartesian.@nref $N V j) = x
+            end
+        else
+            X = x
+            Base.Cartesian.@ncall $N Base.setindex_shape_check X I
+            k = 1
+            Base.Cartesian.@nloops $N i d->(1:length(I_d)) d->(@inbounds j_d = Base.unsafe_getindex(I_d, i_d)) begin
+                @inbounds (Base.Cartesian.@nref $N V j) = X[k]
+                k += 1
+            end
         end
-        s.parent[indp]
+        V
     end
 end
 
-eval(ngenerate(:N, nothing, :(getindex{T}(s::SubArray{T,N}, ind::Integer)), gen_getindex_body, 2:5, false))
-
+## SubArray index merging
+# A view created like V = A[2:3:8, 5:2:17] can later be indexed as V[2:7],
+# creating a new 1d view.
+# In such cases we have to collapse the 2d space spanned by the ranges.
+#
+# API:
+#    merge_indexes(V, indexes::NTuple, dims::Dims, linindex)
+# where dims encodes the trailing sizes of the parent array,
+# indexes encodes the view's trailing indexes into the parent array,
+# and linindex encodes the subset of these elements that we'll select.
+#
+# The generic algorithm makes use of div to convert elements
+# of linindex into a cartesian index into indexes, looks up
+# the corresponding cartesian index into the parent, and then uses
+# dims to convert back to a linear index into the parent array.
+#
+# However, a common case is linindex::UnitRange.
+# Since div is slow and in(j::Int, linindex::UnitRange) is fast,
+# it can be much faster to generate all possibilities and
+# then test whether the corresponding linear index is in linindex.
+# One exception occurs when only a small subset of the total
+# is desired, in which case we fall back to the div-based algorithm.
+stagedfunction merge_indexes(V, indexes::NTuple, dims::Dims, linindex::UnitRange{Int})
+    N = length(indexes)
+    N > 0 || error("Cannot merge empty indexes")
+    quote
+        n = length(linindex)
+        Base.Cartesian.@nexprs $N d->(I_d = indexes[d])
+        L = 1
+        dimoffset = ndims(V.parent) - length(dims)
+        Base.Cartesian.@nexprs $N d->(L *= dimsize(V.parent, d+dimoffset, I_d))
+        if n < 0.1L   # this has not been carefully tuned
+            return merge_indexes_div(V, indexes, dims, linindex)
+        end
+        Pstride_1 = 1   # parent strides
+        Base.Cartesian.@nexprs $(N-1) d->(Pstride_{d+1} = Pstride_d*dims[d])
+        Istride_1 = 1   # indexes strides
+        Base.Cartesian.@nexprs $(N-1) d->(Istride_{d+1} = Istride_d*dimsize(V, d+dimoffset, I_d))
+        Base.Cartesian.@nexprs $N d->(counter_d = 1) # counter_0 is a linear index into indexes
+        Base.Cartesian.@nexprs $N d->(offset_d = 1)  # offset_0 is a linear index into parent
+        k = 0
+        index = Array(Int, n)
+        Base.Cartesian.@nloops $N i d->(1:dimsize(V, d+dimoffset, I_d)) d->(offset_{d-1} = offset_d + (I_d[i_d]-1)*Pstride_d; counter_{d-1} = counter_d + (i_d-1)*Istride_d) begin
+            if in(counter_0, linindex)
+                index[k+=1] = offset_0
+            end
+        end
+        index
+    end
+end
+merge_indexes(V, indexes::NTuple, dims::Dims, linindex) = merge_indexes_div(V, indexes, dims, linindex)
 
-function gen_setindex!_body(N::Int)
+# This could be written as a regular function, but performance
+# will be better using Cartesian macros to avoid the heap and
+# an extra loop.
+stagedfunction merge_indexes_div(V, indexes::NTuple, dims::Dims, linindex)
+    N = length(indexes)
+    N > 0 || error("Cannot merge empty indexes")
+    Istride_N = symbol("Istride_$N")
     quote
-        strd_1 = 1
-        @nexprs $N d->(@inbounds strd_{d+1} = strd_d*s.dims[d])
-        ind -= 1
-        indp = s.first_index
-        @nexprs $N d->begin
-            i = div(ind, strd_{$N-d+1})
-            @inbounds indp += i*s.strides[$N-d+1]
-            ind -= i*strd_{$N-d+1}
+        Base.Cartesian.@nexprs $N d->(I_d = indexes[d])
+        Pstride_1 = 1   # parent strides
+        Base.Cartesian.@nexprs $(N-1) d->(Pstride_{d+1} = Pstride_d*dims[d])
+        Istride_1 = 1   # indexes strides
+        dimoffset = ndims(V.parent) - length(dims)
+        Base.Cartesian.@nexprs $(N-1) d->(Istride_{d+1} = Istride_d*dimsize(V.parent, d+dimoffset, I_d))
+        n = length(linindex)
+        L = $(Istride_N) * dimsize(V.parent, $N+dimoffset, indexes[end])
+        index = Array(Int, n)
+        for i = 1:n
+            k = linindex[i] # k is the indexes-centered linear index
+            1 <= k <= L || throw(BoundsError())
+            k -= 1
+            j = 0  # j will be the new parent-centered linear index
+            Base.Cartesian.@nexprs $N d->(d < $N ?
+                begin
+                    c, k = divrem(k, Istride_{$N-d+1})
+                    j += (Base.unsafe_getindex(I_{$N-d+1}, c+1)-1)*Pstride_{$N-d+1}
+                end : begin
+                    j += Base.unsafe_getindex(I_1, k+1)
+                end)
+            index[i] = j
         end
-        s.parent[indp] = v
+        index
     end
 end
 
-eval(ngenerate(:N, nothing, :(setindex!{T}(s::SubArray{T,N}, v, ind::Integer)), gen_setindex!_body, 2:5, false))
 
  cumsum(A::AbstractArray, axis::Integer=1) =  cumsum!(similar(A, Base._cumsum_type(A)), A, axis)
 cumprod(A::AbstractArray, axis::Integer=1) = cumprod!(similar(A), A, axis)
@@ -622,7 +692,7 @@ for (V, PT, BT) in [((:N,), BitArray, BitArray), ((:T,:N), Array, StridedArray)]
         offset = 1 - sum(@ntuple N d->strides_{d+1})
 
         if isa(B, SubArray)
-            offset += B.first_index - 1
+            offset += first_index(B) - 1
             B = B.parent
         end