The naive Julia code made a few pretty fundamental mistakes (Complex vs Complex{...

The naive Julia code made a few pretty fundamental mistakes (using the abstract element type `Complex` instead of the concrete `Complex{Float64}`, and treating arrays as row-major when Julia is column-major). The following is non-optimized Julia code that is roughly 6x faster (and much simpler) than the "optimized" code in the blog post. Some further optimizations (like using StaticArrays) would give another 2-4x over this, but I'll leave that as an exercise for the reader.

    """
        apply_filter(x, y)

    Return the scalar `vec(y)' * vec(x)`: both arguments are flattened in column-major
    order and the elements of `x` are summed after weighting by the complex conjugates
    of the matching elements of `y` (`'` is the adjoint, so `y` is conjugated).
    `x` and `y` must have the same number of elements.
    """
    apply_filter(x, y) = vec(y)' * vec(x)

    """
        cma!(wxy, E, mu, R, os, ntaps) -> (wxy, err)

    Update the filter taps `wxy` in place from the samples in `E` and return
    `(wxy, err)`.

    With `L = size(E, 1)` and `N = (L ÷ os ÷ ntaps - 1) * ntaps`, the function visits
    every column index `k` of `E` and every step `i in 1:N`. At each step it

    1. takes the `ntaps`-row window `X` of `E` starting at row `(i - 1) * os + 1`,
    2. computes `Xest = apply_filter(X, wxy[:, :, k])`,
    3. stores `err[i, k] = (R - abs2(Xest)) * Xest`,
    4. adds `mu * conj(err[i, k]) .* X` to `wxy[:, :, k]`.

    # Arguments
    - `wxy`: tap array, modified in place; each slice `wxy[:, :, k]` is expected to
      have the same size as a window of `E`, i.e. `(ntaps, size(E, 2))`.
    - `E`: `L × pols` matrix of input samples (only read).
    - `mu`: scalar factor applied to every tap update.
    - `R`: scalar that `abs2(Xest)` is subtracted from.
    - `os`: positive integer row stride between consecutive windows.
    - `ntaps`: positive integer number of rows per window.

    # Returns
    - `wxy`: the same array that was passed in, after all updates.
    - `err`: array shaped like `E`; `err[1:N, k]` holds the computed errors and every
      other entry is zero.

    # Throws
    - `ArgumentError` if `os` or `ntaps` is not positive.
    - `DimensionMismatch` if `wxy` has no slice for some column index of `E`.
    """
    function cma!(wxy, E, mu, R, os, ntaps)
        os > 0 || throw(ArgumentError("os must be positive, got $os"))
        ntaps > 0 || throw(ArgumentError("ntaps must be positive, got $ntaps"))
        L, pols = size(E)
        # The loops below run under @inbounds, so make sure every k has a slice in wxy.
        issubset(axes(E, 2), axes(wxy, 3)) || throw(DimensionMismatch(
            "wxy needs a slice for each of the $pols columns of E, got size $(size(wxy))"))
        N = (L ÷ os ÷ ntaps - 1) * ntaps  # ÷ or div are integer division
        err = zero(E)  # zero-filled (not `similar`) so rows beyond N are well defined
        @inbounds for k in axes(E, 2)
            @views for i in 1:N  # every slice in this block is a view, not a copy
                first_row = (i - 1) * os + 1  # equals i*os - 1 when os == 2
                X = E[first_row:first_row+ntaps-1, :]
                Xest = apply_filter(X, wxy[:, :, k])
                err[i, k] = (R - abs2(Xest)) * Xest  # abs2 avoids needless extra work
                wxy[:, :, k] .+= (mu * conj(err[i, k])) .* X  # fused in-place update
            end
        end
        return wxy, err  # mutated argument first
    end