The naive Julia code made a few pretty fundamental mistakes (Complex vs Complex{Float64}, and row- vs column-major order). The following is non-optimized Julia code that is roughly 6x faster (and much simpler) than the "optimized" code in the blog post. Some further optimizations (like using StaticArrays) would give another 2-4x over this, but I'll leave that as an exercise for the reader.
"""
    apply_filter(x, y)

Return the inner product of `y` and `x` after flattening both with `vec`:
the conjugate transpose of `vec(y)` multiplied by `vec(x)`, which is a scalar.
`x` and `y` must contain the same number of elements.
"""
apply_filter(x, y) = vec(y)' * vec(x)
"""
    cma!(wxy, E, mu, R, os, ntaps) -> (wxy, err)

Adapt the filter taps `wxy` in place over sliding windows of `E`, and return the
updated taps together with the error value computed at every step.

With `L, pols = size(E)` and `N = (L ÷ os ÷ ntaps - 1) * ntaps`, the following is
done for every column `k` of `E` and every step `i in 1:N`:

1. `X` is the `ntaps`-row window of `E` that starts at row `i*os - 1` (all columns).
2. `Xest = apply_filter(X, wxy[:, :, k])`.
3. `err[i, k] = (R - abs2(Xest)) * Xest`.
4. `wxy[:, :, k] .+= mu * conj(err[i, k]) .* X`.

The error term has the shape used by the constant-modulus algorithm, which is
presumably what the name abbreviates.

# Arguments
- `wxy`: tap array, mutated in place. Each slice `wxy[:, :, k]` must be
  `ntaps × pols`, and there must be a slice for every column of `E`.
- `E`: input matrix with `L` rows and `pols` columns.
- `mu`: scalar factor applied to each tap update.
- `R`: scalar that `abs2(Xest)` is compared against.
- `os`: row stride between consecutive windows; must be at least 2, because the
  first window starts at row `os - 1`.
- `ntaps`: number of rows in each window.

# Returns
The tuple `(wxy, err)`. `err` has the shape and element type of `E`; rows past `N`
are never written by the loop and are returned as zeros.

# Throws
- `ArgumentError` if steps would be run with `ntaps < 1` or `os < 2`.
- `DimensionMismatch` if steps would be run with a `wxy` of incompatible size.
"""
function cma!(wxy, E, mu, R, os, ntaps)
    L, pols = size(E)
    N = (L ÷ os ÷ ntaps - 1) * ntaps  # ÷ (or div) is integer division

    # Only rows 1:N are written below but the whole array is returned, so start
    # from zeros instead of handing uninitialized memory back to the caller.
    err = fill!(similar(E), zero(eltype(E)))
    N >= 1 || return wxy, err

    # The loops run under @inbounds, so prove every index valid up front.
    ntaps >= 1 || throw(ArgumentError("ntaps must be positive, got $ntaps"))
    os >= 2 || throw(ArgumentError(
        "os must be at least 2 (the first window starts at row os - 1), got $os"))
    if size(wxy, 1) != ntaps || size(wxy, 2) != pols || size(wxy, 3) < pols
        throw(DimensionMismatch(
            "wxy has size $(size(wxy)); expected ($ntaps, $pols, n) with n >= $pols"))
    end

    @inbounds for k in axes(E, 2)
        @views for i in 1:N  # every slice in this loop is a view, not a copy
            X = E[i*os-1:i*os+ntaps-2, :]
            Xest = apply_filter(X, wxy[:, :, k])
            err[i, k] = (R - abs2(Xest)) * Xest  # abs2 avoids a needless sqrt
            wxy[:, :, k] .+= (mu * conj(err[i, k])) .* X  # fused in-place update
        end
    end
    return wxy, err
end