// Reference (scalar) 3x3 box blur, done as two separable 3-tap passes:
// a horizontal average into a scratch image, then a vertical average of
// that scratch image into `blurred`.
//
// in      - source image; read at (x-1, y), (x, y) and (x+1, y).
// blurred - destination image; written at every (x, y) of in's extent.
//
// NOTE(review): the loops visit every pixel of `in`, so the taps reach
// x = -1, x = width, y = -1 and y = height. Presumably Image's accessor
// tolerates those coordinates (padding or clamping) - confirm against the
// Image definition.
void blur(const Image &in, Image &blurred) {
    const auto cols = in.width();
    const auto rows = in.height();

    // Scratch image holding the horizontally blurred result.
    Image tmp(cols, rows);

    // Pass 1: average each pixel with its left and right neighbours.
    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            tmp(col, row) = (in(col-1, row) + in(col, row) + in(col+1, row))/3;
        }
    }

    // Pass 2: average each horizontally blurred pixel with the ones above
    // and below it.
    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            blurred(col, row) = (tmp(col, row-1) + tmp(col, row) + tmp(col, row+1))/3;
        }
    }
}
void fast_blur(const Image &in, Image &blurred) { m128i one_third = _mm_set1_epi16(21846); #pragma omp parallel for for (int yTile = 0; yTile < in.height(); yTile += 32) { m128i a, b, c, sum, avg; m128i tmp[(256/8)*(32+2)]; for (int xTile = 0; xTile < in.width(); xTile += 256) { m128i *tmpPtr = tmp; for (int y = -1; y < 32+1; y++) { const uint16_t *inPtr = &(in(xTile, yTile+y)); for (int x = 0; x < 256; x += 8) { a = _mm_loadu_si128(( m128i*)(inPtr-1)); b = _mm_loadu_si128(( m128i*)(inPtr+1)); c = _mm_load_si128(( m128i*)(inPtr)); sum = _mm_add_epi16(_mm_add_epi16(a, b), c); avg = _mm_mulhi_epi16(sum, one_third); _mm_store_si128(tmpPtr++, avg); inPtr += 8; } } tmpPtr = tmp; for (int y = 0; y < 32; y++) { m128i *outPtr = ( m128i *)(&(blurred(xTile, yTile+y))); for (int x = 0; x < 256; x += 8) { a = _mm_load_si128(tmpPtr+(2*256)/8); b = _mm_load_si128(tmpPtr+256/8); c = _mm_load_si128(tmpPtr++); sum = _mm_add_epi16(_mm_add_epi16(a, b), c); avg = _mm_mulhi_epi16(sum, one_third); _mm_store_si128(outPtr++, avg); } } } } }
And, just for comparison, here is Halide code that runs as fast as the second version:
// Halide formulation of the same 3x3 box blur: the algorithm (what is
// computed) is stated separately from the schedule (how it is computed).
//
// in - input pipeline stage, sampled at (x-1, y), (x, y) and (x+1, y).
// Returns the `blurred` stage with its schedule applied.
Func halide_blur(Func in) {
    Func tmp, blurred;
    Var x, y, xi, yi;

    // The algorithm
    tmp(x, y) = (in(x-1, y) + in(x, y) + in(x+1, y))/3;
    blurred(x, y) = (tmp(x, y-1) + tmp(x, y) + tmp(x, y+1))/3;

    // The schedule
    // 256x32 tiles, 8-wide vectors and parallelism over y - the same
    // constants the hand-written fast_blur uses.
    blurred.tile(x, y, xi, yi, 256, 32).vectorize(xi, 8).parallel(y);
    // NOTE(review): `chunk` is kept exactly as written; confirm it is
    // available in the Halide version this is built against.
    tmp.chunk(x).vectorize(x, 8);

    return blurred;
}
(this is kind of a weird coincidence; last time I replied to you I mentioned Halide[1] as well)
[0] http://people.csail.mit.edu/jrk/halide12/halide12.pdf
[1] http://halide-lang.org/
(defun language-choice (developer)
  "Pick a language family for DEVELOPER.
When DEVELOPER's hipness exceeds their experience, the result of
LANG-DU-JOUR is returned. Otherwise the result is C-FAMILY-LANGUAGE if
DEVELOPER is scared of the symbol PARENTHESIS, and LISP-FAMILY-LANGUAGE
if not."
  (cond
    ;; Hipness outweighs experience: follow the current fashion.
    ((> (developer-hipness developer) (developer-experience developer))
     (lang-du-jour))
    ;; Experienced enough, but put off by parentheses.
    ((developer-scared-of developer 'parenthesis)
     (c-family-language))
    ;; Everyone else.
    (t
     (lisp-family-language))))
And, just for comparison, here is Halide code that runs as fast as the second version:
(this is kind of a weird coincidence; last time I replied to you I mentioned Halide[1] as well)
[0] http://people.csail.mit.edu/jrk/halide12/halide12.pdf
[1] http://halide-lang.org/