=== modified file 'adler32_arm.c' --- adler32_arm.c 2011-04-15 06:48:42 +0000 +++ adler32_arm.c 2011-05-01 21:23:42 +0000 @@ -8,8 +8,7 @@ /* @(#) $Id$ */ -#if defined(__ARM_NEON__) -// TODO: need byte order define +#if defined(__ARM_NEON__) && defined(__ARMEL__) /* * Big endian NEON qwords are kind of broken. * They are big endian within the dwords, but WRONG @@ -18,15 +17,13 @@ * * This is madness and unsupportable. For this reason * GCC wants to disable qword endian specific patterns. - * We would need a Preprocessor define which endian we - * have to disable this code. */ # include # define SOVUCQ sizeof(uint8x16_t) # define SOVUC sizeof(uint8x8_t) -/* since we do not have the 64bit psadbw sum, we could prop. do a little more */ -# define VNMAX (6*NMAX) +/* since we do not have the 64bit psadbw sum, we could still go a little higher (we are at 0xc) */ +# define VNMAX (8*NMAX) # define HAVE_ADLER32_VEC # define MIN_WORK 32 @@ -86,11 +83,7 @@ s1 = adler & 0xffff; s2 = (adler >> 16) & 0xffff; -// TODO: big endian mask is prop. 
wrong - if (host_is_bigendian()) - vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}; - else - vord = (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; + vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}; if (likely(len >= 2*SOVUCQ)) { unsigned f, n; @@ -130,14 +123,8 @@ /* get input data */ in16 = *(const uint8x16_t *)buf; /* mask out excess data */ - if(host_is_bigendian()) { - in16 = neon_simple_alignq(v0, in16, n); - vord_a = neon_simple_alignq(v0, vord, n); - } else { - in16 = neon_simple_alignq(in16, v0, f); - vord_a = neon_simple_alignq(vord, v0, f); - } - + in16 = neon_simple_alignq(in16, v0, f); + vord_a = neon_simple_alignq(vord, v0, f); /* pairwise add bytes and long, pairwise add word long acc */ vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); /* apply order, add words, pairwise add word long acc */ @@ -154,33 +141,33 @@ if (likely(k >= SOVUCQ)) do { uint32x4_t vs1_r = v0_32; do { - /* add vs1 for this round */ - vs1_r = vaddq_u32(vs1_r, vs1); - - /* get input data */ - in16 = *(const uint8x16_t *)buf; - -// TODO: make work in inner loop more tight - /* - * decompose partial sums, so we do less instructions and - * build loops around it to do acc and so on only from time - * to time. - * This is hard with NEON, because the instruction are nice: - * we have the stuff in widening and with acc (practicaly - * for free...) - */ - /* pairwise add bytes and long, pairwise add word long acc */ - vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); - /* apply order, add words, pairwise add word long acc */ - vs2 = vpadalq_u16(vs2, - vmlal_u8( - vmull_u8(vget_low_u8(in16), vget_low_u8(vord)), - vget_high_u8(in16), vget_high_u8(vord) - ) - ); - - buf += SOVUCQ; - k -= SOVUCQ; + uint16x8_t vs2_lo = (uint16x8_t)v0_32, vs2_hi = (uint16x8_t)v0_32; + unsigned j; + + j = (k/16) > 16 ? 
16 : k/16; + k -= j * 16; + do { + /* GCC does not create the most pretty inner loop, + * with extra moves and stupid scheduling, but + * i am not in the mood for inline ASM, keep it + * compatible. + */ + /* get input data */ + in16 = *(const uint8x16_t *)buf; + buf += SOVUCQ; + + /* add vs1 for this round */ + vs1_r = vaddq_u32(vs1_r, vs1); + + /* pairwise add bytes and long, pairwise add word long acc */ + vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); + /* apply order, word long and acc */ + vs2_lo = vmlal_u8(vs2_lo, vget_low_u8(in16), vget_low_u8(vord)); + vs2_hi = vmlal_u8(vs2_hi, vget_high_u8(in16), vget_high_u8(vord)); + } while(--j); + /* pair wise add long and acc */ + vs2 = vpadalq_u16(vs2, vs2_lo); + vs2 = vpadalq_u16(vs2, vs2_hi); } while (k >= SOVUCQ); /* reduce vs1 round sum before multiplying by 16 */ vs1_r = vector_reduce(vs1_r); @@ -193,7 +180,7 @@ len += k; k = len < VNMAX ? (unsigned) len : VNMAX; len -= k; - } while (likely(k >= SOVUC)); + } while (likely(k >= SOVUCQ)); if (likely(k)) { /* @@ -206,10 +193,7 @@ /* get input data */ in16 = *(const uint8x16_t *)buf; /* masks out bad data */ - if(host_is_bigendian()) - in16 = neon_simple_alignq(in16, v0, f); - else - in16 = neon_simple_alignq(v0, in16, k); + in16 = neon_simple_alignq(v0, in16, k); /* pairwise add bytes and long, pairwise add word long acc */ vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); @@ -445,6 +429,7 @@ do { /* get input data */ __m64 in = *(const __m64 *)buf; + buf += SOV8; /* add vs1 for this round */ vs1_r = _mm_add_pi32(vs1_r, vs1); @@ -480,7 +465,6 @@ /* widen bytes to words and acc */ vs2_l = _mm_add_pi16(vs2_l, _mm_unpackel_pu8(in)); vs2_h = _mm_add_pi16(vs2_h, _mm_unpackeh_pu8(in)); - buf += SOV8; } while (--j); /* shake and roll vs1_r, so both 32 bit sums get some input */ vs1_r = _mm_shuffle_pi16(vs1_r, 0x4e); @@ -521,14 +505,23 @@ } /* inline asm, so only on GCC (or compatible) && ARM v6 or better */ -#elif defined(__GNUC__) && ( \ +#elif 0 && defined(__GNUC__) && ( \ 
defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ defined(__ARM_ARCH_7A__) \ ) +/* This code is disabled, since it is not faster, only for reference. + * We are at speedup: 0.952830 + * Again counting instructions is futile, 5 instructions per 4 bytes + * against at least 3 per byte (loop overhead excluded) is no win. + * And split sums also does not save us. + */ # define SOU32 (sizeof(unsigned int)) # define HAVE_ADLER32_VEC # define MIN_WORK 16 +// TODO: maybe 2*NMAX is possible, but that's very thin +/* this way we are at 0xda */ +# define VNMAX (NMAX+((NMAX*9)/10)) /* ========================================================================= */ local noinline uLong adler32_vec(adler, buf, len) @@ -553,7 +546,6 @@ unsigned int vs1 = s1, vs2 = s2; unsigned int order_lo, order_hi; -// TODO: byte order? if (host_is_bigendian()) { order_lo = 0x00030001; order_hi = 0x00040002; @@ -561,62 +553,44 @@ order_lo = 0x00020004; order_hi = 0x00010003; } -// TODO: we could go over NMAX, since we have split the vs2 sum - /* something around (NMAX+(NMAX/3)+302) */ - k = len < NMAX ? len : NMAX; + k = len < VNMAX ? len : VNMAX; len -= k; do { unsigned int vs1_r = 0; do { - unsigned int t21, t22, in; - - /* get input data */ - in = *(const unsigned int *)buf; - - /* add vs1 for this round */ - vs1_r += vs1; - - /* add horizontal and acc */ - asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1)); - /* widen bytes to words, apply order, add and acc */ - asm ("uxtb16 %0, %1" : "=r" (t21) : "r" (in)); - asm ("uxtb16 %0, %1, ror #8" : "=r" (t22) : "r" (in)); -// TODO: instruction result latency - /* - * The same problem like the classic serial sum: - * Chip maker sell us 1-cycle instructions, but that is not the - * whole story. Nearly all 1-cycle chips are pipelined, so - * you can get one result per cycle, but only if _they_ (plural) - * are independent. 
- * If you are depending on the result of an preciding instruction, - * in the worst case you hit the instruction latency which is worst - * case >= pipeline length. On the other hand there are result-fast-paths. - * This could all be a wash with the classic sum (4 * 2 instructions, - * + dependence), since smald is: - * - 2 cycle issue - * - needs the acc in pipeline step E1, instead of E2 - * But the Cortex has a fastpath for acc. - * I don't know. - * We can not even unroll, we would need 4 order vars, return ENOREGISTER. - */ - asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t21) , "r" (order_lo), "r" (vs2)); - asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t22) , "r" (order_hi), "r" (vs2)); - - buf += SOU32; - k -= SOU32; + unsigned int j; + unsigned int vs2_lo = 0, vs2_hi = 0; + + j = (k/4) >= 128 ? 128 : (k/4); + k -= j * 4; + do { + /* get input data */ + unsigned int in = *(const unsigned int *)buf; + buf += SOU32; + /* add vs1 for this round */ + vs1_r += vs1; + /* add horizontal and acc */ + asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1)); + /* widen bytes to words and acc */ + asm ("uxtab16 %0, %1, %2" : "=r" (vs2_lo) : "r" (vs2_lo), "r" (in)); + asm ("uxtab16 %0, %1, %2, ror #8" : "=r" (vs2_hi) : "r" (vs2_hi), "r" (in)); + } while (--j); + /* apply order and acc */ + asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (vs2_lo) , "r" (order_lo), "r" (vs2)); + asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (vs2_hi) , "r" (order_hi), "r" (vs2)); } while (k >= SOU32); /* reduce vs1 round sum before multiplying by 4 */ reduce(vs1_r); /* add vs1 for this round (4 times) */ vs2 += vs1_r * 4; - /* reduce both sums to something within 16 bit */ + /* reduce both sums */ reduce(vs2); reduce(vs1); len += k; - k = len < NMAX ? len : NMAX; + k = len < VNMAX ? 
len : VNMAX; len -= k; - } while (likely(k >= 4 * SOU32)); + } while (likely(k >= SOU32)); len += k; s1 = vs1; s2 = vs2; @@ -626,7 +600,7 @@ s1 += *buf++; s2 += s1; } while (--len); - /* at this point we should no have so big s1 & s2 */ + /* at this point we should not have so big s1 & s2 */ reduce_x(s1); reduce_x(s2);