=== modified file 'adler32_arm.c'
--- adler32_arm.c	2011-04-15 06:48:42 +0000
+++ adler32_arm.c	2011-05-01 21:23:42 +0000
@@ -8,8 +8,7 @@
 
 /* @(#) $Id$ */
 
-#if defined(__ARM_NEON__)
-// TODO: need byte order define
+#if defined(__ARM_NEON__) && defined(__ARMEL__)
 /*
  * Big endian NEON qwords are kind of broken.
  * They are big endian within the dwords, but WRONG
@@ -18,15 +17,13 @@
  *
  * This is madness and unsupportable. For this reason
  * GCC wants to disable qword endian specific patterns.
- * We would need a Preprocessor define which endian we
- * have to disable this code.
  */
 #  include <arm_neon.h>
 
 #  define SOVUCQ sizeof(uint8x16_t)
 #  define SOVUC sizeof(uint8x8_t)
-/* since we do not have the 64bit psadbw sum, we could prop. do a little more */
-#  define VNMAX (6*NMAX)
+/* since we do not have the 64bit psadbw sum, we could still go a little higher (we are at 0xc) */
+#  define VNMAX (8*NMAX)
 #  define HAVE_ADLER32_VEC
 #  define MIN_WORK 32
 
@@ -86,11 +83,7 @@
     s1 = adler & 0xffff;
     s2 = (adler >> 16) & 0xffff;
 
-// TODO: big endian mask is prop. wrong
-    if (host_is_bigendian())
-        vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
-    else
-        vord = (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+    vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
 
     if (likely(len >= 2*SOVUCQ)) {
         unsigned f, n;
@@ -130,14 +123,8 @@
         /* get input data */
         in16 = *(const uint8x16_t *)buf;
         /* mask out excess data */
-        if(host_is_bigendian()) {
-            in16 = neon_simple_alignq(v0, in16, n);
-            vord_a = neon_simple_alignq(v0, vord, n);
-        } else {
-            in16 = neon_simple_alignq(in16, v0, f);
-            vord_a = neon_simple_alignq(vord, v0, f);
-        }
-
+        in16 = neon_simple_alignq(in16, v0, f);
+        vord_a = neon_simple_alignq(vord, v0, f);
         /* pairwise add bytes and long, pairwise add word long acc */
         vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
         /* apply order, add words, pairwise add word long acc */
@@ -154,33 +141,33 @@
         if (likely(k >= SOVUCQ)) do {
             uint32x4_t vs1_r = v0_32;
             do {
-                /* add vs1 for this round */
-                vs1_r = vaddq_u32(vs1_r, vs1);
-
-                /* get input data */
-                in16 = *(const uint8x16_t *)buf;
-
-// TODO: make work in inner loop more tight
-                /*
-                 * decompose partial sums, so we do less instructions and
-                 * build loops around it to do acc and so on only from time
-                 * to time.
-                 * This is hard with NEON, because the instruction are nice:
-                 * we have the stuff in widening and with acc (practicaly
-                 * for free...)
-                 */
-                /* pairwise add bytes and long, pairwise add word long acc */
-                vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
-                /* apply order, add words, pairwise add word long acc */
-                vs2 = vpadalq_u16(vs2,
-                        vmlal_u8(
-                            vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
-                            vget_high_u8(in16), vget_high_u8(vord)
-                            )
-                        );
-
-                buf += SOVUCQ;
-                k -= SOVUCQ;
+                uint16x8_t vs2_lo = (uint16x8_t)v0_32, vs2_hi = (uint16x8_t)v0_32;
+                unsigned j;
+
+                j  = (k/16) > 16 ? 16 : k/16;
+                k -= j * 16;
+                do {
+                    /* GCC does not create the prettiest inner loop
+                     * (extra moves and poor scheduling), but I am
+                     * not in the mood for inline ASM, so keep it
+                     * compatible.
+                     */
+                    /* get input data */
+                    in16 = *(const uint8x16_t *)buf;
+                    buf += SOVUCQ;
+
+                    /* add vs1 for this round */
+                    vs1_r = vaddq_u32(vs1_r, vs1);
+
+                    /* pairwise add bytes and long, pairwise add word long acc */
+                    vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+                    /* apply order, word long and acc */
+                    vs2_lo = vmlal_u8(vs2_lo, vget_low_u8(in16), vget_low_u8(vord));
+                    vs2_hi = vmlal_u8(vs2_hi, vget_high_u8(in16), vget_high_u8(vord));
+                } while(--j);
+                /* pairwise add long and acc */
+                vs2 = vpadalq_u16(vs2, vs2_lo);
+                vs2 = vpadalq_u16(vs2, vs2_hi);
             } while (k >= SOVUCQ);
             /* reduce vs1 round sum before multiplying by 16 */
             vs1_r = vector_reduce(vs1_r);
@@ -193,7 +180,7 @@
             len += k;
             k = len < VNMAX ? (unsigned) len : VNMAX;
             len -= k;
-        } while (likely(k >= SOVUC));
+        } while (likely(k >= SOVUCQ));
 
         if (likely(k)) {
             /*
@@ -206,10 +193,7 @@
             /* get input data */
             in16 = *(const uint8x16_t *)buf;
             /* masks out bad data */
-            if(host_is_bigendian())
-                in16 = neon_simple_alignq(in16, v0, f);
-            else
-                in16 = neon_simple_alignq(v0, in16, k);
+            in16 = neon_simple_alignq(v0, in16, k);
 
             /* pairwise add bytes and long, pairwise add word long acc */
             vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
@@ -445,6 +429,7 @@
                 do {
                     /* get input data */
                     __m64 in = *(const __m64 *)buf;
+                    buf += SOV8;
 
                     /* add vs1 for this round */
                     vs1_r = _mm_add_pi32(vs1_r, vs1);
@@ -480,7 +465,6 @@
                     /* widen bytes to words and acc */
                     vs2_l = _mm_add_pi16(vs2_l, _mm_unpackel_pu8(in));
                     vs2_h = _mm_add_pi16(vs2_h, _mm_unpackeh_pu8(in));
-                    buf += SOV8;
                 } while (--j);
                 /* shake and roll vs1_r, so both 32 bit sums get some input */
                 vs1_r = _mm_shuffle_pi16(vs1_r, 0x4e);
@@ -521,14 +505,23 @@
 }
 
 /* inline asm, so only on GCC (or compatible) && ARM v6 or better */
-#elif defined(__GNUC__) && ( \
+#elif 0 && defined(__GNUC__) && ( \
         defined(__ARM_ARCH_6__)  || defined(__ARM_ARCH_6J__)  || \
         defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
         defined(__ARM_ARCH_7A__) \
       )
+/* This code is disabled since it is not faster; it is kept only for reference.
+ * We are at speedup: 0.952830
+ * Again, counting instructions is futile: 5 instructions per 4 bytes
+ * against at least 3 per byte (loop overhead excluded) is no win.
+ * And splitting the sums does not save us either.
+ */
 #  define SOU32 (sizeof(unsigned int))
 #  define HAVE_ADLER32_VEC
 #  define MIN_WORK 16
+// TODO: maybe 2*NMAX is possible, but that's very thin
+/* this way we are at 0xda */
+#  define VNMAX (NMAX+((NMAX*9)/10))
 
 /* ========================================================================= */
 local noinline uLong adler32_vec(adler, buf, len)
@@ -553,7 +546,6 @@
         unsigned int vs1 = s1, vs2 = s2;
         unsigned int order_lo, order_hi;
 
-// TODO: byte order?
         if (host_is_bigendian()) {
             order_lo = 0x00030001;
             order_hi = 0x00040002;
@@ -561,62 +553,44 @@
             order_lo = 0x00020004;
             order_hi = 0x00010003;
         }
-// TODO: we could go over NMAX, since we have split the vs2 sum
-        /* something around (NMAX+(NMAX/3)+302) */
-        k = len < NMAX ? len : NMAX;
+        k = len < VNMAX ? len : VNMAX;
         len -= k;
 
         do {
             unsigned int vs1_r = 0;
             do {
-                unsigned int t21, t22, in;
-
-                /* get input data */
-                in = *(const unsigned int *)buf;
-
-                /* add vs1 for this round */
-                vs1_r += vs1;
-
-                /* add horizontal and acc */
-                asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1));
-                /* widen bytes to words, apply order, add and acc */
-                asm ("uxtb16 %0, %1" : "=r" (t21) : "r" (in));
-                asm ("uxtb16 %0, %1, ror #8" : "=r" (t22) : "r" (in));
-// TODO: instruction result latency
-                /*
-                 * The same problem like the classic serial sum:
-                 * Chip maker sell us 1-cycle instructions, but that is not the
-                 * whole story. Nearly all 1-cycle chips are pipelined, so
-                 * you can get one result per cycle, but only if _they_ (plural)
-                 * are independent.
-                 * If you are depending on the result of an preciding instruction,
-                 * in the worst case you hit the instruction latency which is worst
-                 * case >= pipeline length. On the other hand there are result-fast-paths.
-                 * This could all be a wash with the classic sum (4 * 2 instructions,
-                 * + dependence), since smald is:
-                 * - 2 cycle issue
-                 * - needs the acc in pipeline step E1, instead of E2
-                 * But the Cortex has a fastpath for acc.
-                 * I don't know.
-                 * We can not even unroll, we would need 4 order vars, return ENOREGISTER.
-                 */
-                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t21) , "r" (order_lo), "r" (vs2));
-                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t22) , "r" (order_hi), "r" (vs2));
-
-                buf += SOU32;
-                k -= SOU32;
+                unsigned int j;
+                unsigned int vs2_lo = 0, vs2_hi = 0;
+
+                j  = (k/4) >= 128 ? 128 : (k/4);
+                k -= j * 4;
+                do {
+                    /* get input data */
+                    unsigned int in = *(const unsigned int *)buf;
+                    buf += SOU32;
+                    /* add vs1 for this round */
+                    vs1_r += vs1;
+                    /* add horizontal and acc */
+                    asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1));
+                    /* widen bytes to words and acc */
+                    asm ("uxtab16 %0, %1, %2" : "=r" (vs2_lo) : "r" (vs2_lo), "r" (in));
+                    asm ("uxtab16 %0, %1, %2, ror #8" : "=r" (vs2_hi) : "r" (vs2_hi), "r" (in));
+                } while (--j);
+                /* apply order and acc */
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (vs2_lo) , "r" (order_lo), "r" (vs2));
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (vs2_hi) , "r" (order_hi), "r" (vs2));
             } while (k >= SOU32);
             /* reduce vs1 round sum before multiplying by 4 */
             reduce(vs1_r);
             /* add vs1 for this round (4 times) */
             vs2 += vs1_r * 4;
-            /* reduce both sums to something within 16 bit */
+            /* reduce both sums */
             reduce(vs2);
             reduce(vs1);
             len += k;
-            k = len < NMAX ? len : NMAX;
+            k = len < VNMAX ? len : VNMAX;
             len -= k;
-        } while (likely(k >= 4 * SOU32));
+        } while (likely(k >= SOU32));
         len += k;
         s1 = vs1;
         s2 = vs2;
@@ -626,7 +600,7 @@
         s1 += *buf++;
         s2 += s1;
     } while (--len);
-    /* at this point we should no have so big s1 & s2 */
+    /* at this point we should not have so big s1 & s2 */
     reduce_x(s1);
     reduce_x(s2);