Hi all,
I've written a simple boxcar scaling routine for monochrome (8 bits/pixel) images, and it is too slow. The following algorithm takes 9.3 ms for a 1280x960 => 640x480 conversion (at 500 MHz that is ~4.6 million cycles for ~1.23M input pixels, i.e. about 3.8 cycles/pixel). Is there an official library that does this faster? (For example, I cannot find any similar algorithm in IMGLIB.)
/*
 * Forward declaration of the row kernel used by scale(). The kernel is
 * defined later in the file as a static function; without a prototype in
 * scope the call below would be an implicit declaration (not valid since
 * C99) that also clashes with the later static definition.
 */
static void scale_inner(UInt32 width, char *restrict xo, const char *restrict x0, const char *restrict x1);

/*
 * Halve an 8-bit-per-pixel image in both directions.
 *
 * The image is walked two input rows at a time; each pair of rows is handed
 * to scale_inner(), which writes one output row.
 *
 * width   input row length in pixels (also used as the input row stride)
 * height  number of input rows; height >> 1 row pairs are processed, so a
 *         trailing odd row is ignored
 * out     destination buffer, advanced by width >> 1 bytes per output row
 * in      source buffer, advanced by 2 * width bytes per row pair
 */
void scale(UInt32 width, UInt32 height, char *restrict out, const char *restrict in)
{
    /* Input advances two rows per iteration, output advances one half-width row. */
    const UInt32 IN_INC = 2*width;
    const UInt32 OUT_INC = width>>1;
    const UInt32 CYCLE = height >> 1;
    const char *restrict x0 = in;        /* even row pointer */
    const char *restrict x1 = in+width;  /* odd row pointer */
    UInt32 y;                            /* unsigned, same type as CYCLE */

    /* NOTE(review): trip-count hint for the TI compiler; presumably a promise
     * of at least 90 iterations (height >= 180) - confirm against callers. */
#pragma MUST_ITERATE(90)
    for (y = 0; y < CYCLE; ++y)
    {
        scale_inner(width, out, x0, x1);
        x0 += IN_INC;
        x1 += IN_INC;
        out += OUT_INC;
    }
}
/*
 * Produce one output row from two adjacent input rows.
 *
 * Each output pixel is the truncated average (sum >> 2) of a 2x2 cell:
 * two horizontally adjacent pixels from x0 and the two pixels below them
 * from x1. The loop is unrolled to consume 8 input columns and emit 4
 * output pixels per iteration; width >> 3 iterations are run, so columns
 * beyond the last full group of 8 are not processed.
 *
 * width  number of pixels in each input row
 * xo     output row, receives 4 * (width >> 3) pixels
 * x0     upper (even) input row
 * x1     lower (odd) input row
 */
static void scale_inner(UInt32 width, char *restrict xo, const char *restrict x0, const char *restrict x1)
{
    /*
     * Access the pixels as unsigned bytes. Plain char may be signed, in
     * which case a pixel value >= 128 would be sign-extended before the
     * addition and corrupt the average whenever a 2x2 cell mixes values
     * below and above 128. The views are deliberately not restrict-qualified:
     * they are derived from the restrict parameters in the same scope.
     */
    const unsigned char *row0 = (const unsigned char *)x0;
    const unsigned char *row1 = (const unsigned char *)x1;
    unsigned char *dst = (unsigned char *)xo;
    const UInt32 CYCLE = width >> 3; /* 8 pixel unrolling */
    UInt32 i;                        /* unsigned, same type as CYCLE */

    /* NOTE(review): trip-count hint for the TI compiler; presumably a promise
     * of at least 20 iterations (width >= 160) - confirm against callers. */
#pragma MUST_ITERATE(20)
    for (i = 0; i < CYCLE; i++)
    {
        /* Four unsigned bytes sum to at most 1020, which fits in int. */
        dst[4*i+0] = (unsigned char)((row0[8*i+0] + row1[8*i+0] + row0[8*i+1] + row1[8*i+1]) >> 2);
        dst[4*i+1] = (unsigned char)((row0[8*i+2] + row1[8*i+2] + row0[8*i+3] + row1[8*i+3]) >> 2);
        dst[4*i+2] = (unsigned char)((row0[8*i+4] + row1[8*i+4] + row0[8*i+5] + row1[8*i+5]) >> 2);
        dst[4*i+3] = (unsigned char)((row0[8*i+6] + row1[8*i+6] + row0[8*i+7] + row1[8*i+7]) >> 2);
    }
}