'How to make use of SIMD capability for sum of squared differences between 8-bit components of RGBA pixels?
The below code is trying to extract the red, green and blue channel of a pixel value and performing an arithmetic with another set of RGB values. It seems that code is slow around the logic where its trying to perform the squaring and addition.
What would be the possibility to replace it with a faster version as this logic doesn't seems to be using SIMD capabilities at all.
typedef struct {
unsigned char b, g, r, a;
} pixel;
register pixel *pPixel;
register int i, red1, green1, blue1, alpha1;
register int red2, green2, blue2, alpha2;
register long oldD, newD;
red1 = GetRed( *pPixel );
green1 = GetGreen( *pPixel );
blue1 = GetBlue( *pPixel );
alpha1 = GetAlpha( *pPixel );
oldD = 2000000000;
for ( i = 0; i < newcolors; ++i ) {
red2 = GetRed( mycolormap[i].acolor );
green2 = GetGreen( mycolormap[i].acolor );
blue2 = GetBlue( mycolormap[i].acolor );
alpha2 = GetAlpha( mycolormap[i].acolor );
newD = ( red1 - red2 ) * ( red1 - red2 ) +
( green1 - green2 ) * ( green1 - green2 ) +
( blue1 - blue2 ) * ( blue1 - blue2 ) +
( alpha1 - alpha2 ) * ( alpha1 - alpha2 );
if ( newD < oldD ) {
oldD = newD;
}
}
Below section of code seems to be requiring improvement
newD = ( red1 - red2 ) * ( red1 - red2 ) +
( green1 - green2 ) * ( green1 - green2 ) +
( blue1 - blue2 ) * ( blue1 - blue2 ) +
( alpha1 - alpha2 ) * ( alpha1 - alpha2 );
Solution 1:[1]
It’s harder than it seems. Unfortunately for you, automatic vectorizers in C++ compilers are very rarely doing a good job for integer arithmetic, like you have there.
The following implementation only needs SSE4.1. If you have AVX2 possible to improve substantially by upgrading all these vectors to 32-byte ones, however this will complicate a couple things, remainder and final reduction.
I assumed not only you want the minimum dot product, also the index of the pixel. If you only want the minimum dot product, remove bestIndices
field and the code which handles that field.
struct alignas( 4 ) Pixel
{
uint8_t b, g, r, a;
};
// Define __SSE4_1__ macro when building with MSVC for AVX1 or newer ISA
#if defined( _MSC_VER ) && defined( __AVX__ ) && !defined( __SSE4_1__ )
#define __SSE4_1__ 1
#endif
size_t findClosestPixel( const Pixel& ref, const Pixel* rsi, size_t length, int& bestValue )
{
if( 0 == length )
{
bestValue = INT_MAX;
return ~(size_t)0;
}
class Acc
{
// The reference pixel we're after, broadcasted and split into low/high pieces in 16-bit lanes
__m128i lowRef, highRef;
// The best dot product so far
__m128i bestSquares = _mm_set1_epi32( INT_MAX );
// Index of the pixels currently in bestSquares
__m128i bestIndices = _mm_set1_epi32( -1 );
const __m128i lowMask = _mm_set1_epi16( 0xFF );
// For lanes where dp < bestSquares, update bestSquares and bestIndices vectors
void updateFields( __m128i dp, __m128i indices )
{
const __m128i lt = _mm_cmplt_epi32( dp, bestSquares );
#ifndef __SSE4_1__
bestSquares = _mm_or_si128( _mm_and_si128( lt, dp ), _mm_andnot_si128( lt, bestSquares ) );
bestIndices = _mm_or_si128( _mm_and_si128( lt, indices ), _mm_andnot_si128( lt, bestIndices ) );
#else
bestSquares = _mm_min_epi32( dp, bestSquares );
bestIndices = _mm_blendv_epi8( bestIndices, indices, lt );
#endif
}
public:
Acc( const Pixel& ref )
{
__m128i tmp = _mm_set1_epi32( *(const int*)( &ref ) );
lowRef = _mm_and_si128( tmp, lowMask );
highRef = _mm_srli_epi16( tmp, 8 );
}
// Update the accumulator with another 4 pixels
void update( __m128i pixels, __m128i indices )
{
// Split into two vectors with 16-bit lanes:
// low contains blue and red channels, high contains green and alpha
__m128i low = _mm_and_si128( pixels, lowMask );
__m128i high = _mm_srli_epi16( pixels, 8 );
// Compute difference with the reference value we're after
low = _mm_sub_epi16( low, lowRef );
high = _mm_sub_epi16( high, highRef );
// Compute squares as 32-bit numbers, add adjacent pairs
low = _mm_madd_epi16( low, low );
high = _mm_madd_epi16( high, high );
// Adding them results in the dot product (sum of squares) for all 4 channels
__m128i dp = _mm_add_epi32( low, high );
// Update the state
updateFields( dp, indices );
}
// Compute horizontal minimum across lanes in these accumulators
uint32_t reduce( int& bestDp )
{
// Swap low/high halves
__m128i s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 1, 0, 3, 2 ) );
__m128i i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 1, 0, 3, 2 ) );
updateFields( s2, i2 );
// Swap even/odd lanes
s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 2, 3, 0, 1 ) );
i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 2, 3, 0, 1 ) );
updateFields( s2, i2 );
// Return lowest lanes from both vectors
bestDp = _mm_cvtsi128_si32( bestSquares );
return (uint32_t)_mm_cvtsi128_si32( bestIndices );
}
};
Acc impl{ ref };
const size_t lengthAligned = ( length / 4 ) * 4;
size_t i;
__m128i currentIndices = _mm_setr_epi32( 0, 1, 2, 3 );
for( i = 0; i < lengthAligned; i += 4 )
{
// Load 4 source pixels
__m128i src = _mm_loadu_si128( ( const __m128i* )( rsi + i ) );
// Update things
impl.update( src, currentIndices );
// Increment index vector by 4 pixels
currentIndices = _mm_add_epi32( currentIndices, _mm_set1_epi32( 4 ) );
}
const size_t remainder = length % 4;
if( remainder == 0 )
{
// The input was a multiple of 4 pixels
return impl.reduce( bestValue );
}
const int* const pi = (const int*)( rsi + i );
__m128i rv;
if( lengthAligned > 0 )
{
// We had at least 4 elements on input, can do unaligned load with negative offset
size_t offset = 4 - remainder;
currentIndices = _mm_sub_epi32( currentIndices, _mm_set1_epi32( (int)offset ) );
rv = _mm_loadu_si128( ( const __m128i* )( pi - offset ) );
}
else
{
// Less than 4 elements on input, doing partial load and broadcasting the last element
const size_t remainder = length % 4;
switch( remainder )
{
case 1:
rv = _mm_set1_epi32( pi[ 0 ] );
break;
case 2:
rv = _mm_loadl_epi64( ( const __m128i* )pi );
rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 1, 1, 1, 0 ) );
break;
case 3:
rv = _mm_loadl_epi64( ( const __m128i* )pi );
#ifndef __SSE4_1__
rv = _mm_unpacklo_epi64( rv, _mm_set1_epi32( pi[ 2 ] ) );
#else
rv = _mm_insert_epi32( rv, pi[ 2 ], 2 );
rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 2, 2, 1, 0 ) );
#endif
break;
}
}
impl.update( rv, currentIndices );
return impl.reduce( bestValue );
}
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 |