=== modified file 'adler32_mips.c' --- adler32_mips.c 2011-04-07 20:46:26 +0000 +++ adler32_mips.c 2011-05-01 15:54:37 +0000 @@ -18,11 +18,11 @@ # include # ifdef __mips_loongson_vector_rev # define HAVE_ADLER32_VEC -# define MIN_WORK 16 +# define MIN_WORK 64 # include # define SOV8 (sizeof(uint8x8_t)) -# define VNMAX (5*NMAX) +# define VNMAX (4*NMAX) /* GCCs loongson port looks like a quick hack. * It can output some simple vector instruction sequences, @@ -59,8 +59,8 @@ if(likely(len >= 2*SOV8)) { /* Loongsons and their ST MMX foo are little endian */ - static const int16x4_t vord_lo = {5,6,7,8}; - static const int16x4_t vord_hi = {1,2,3,4}; + static const int16x4_t vord_lo = {8,7,6,5}; + static const int16x4_t vord_hi = {4,3,2,1}; uint32x2_t vs2, vs1; int16x4_t in_lo, in_hi; uint8x8_t v0 = {0}; @@ -119,43 +119,49 @@ if (likely(k >= SOV8)) do { uint32x2_t vs1_r; - int t; + uint16x4_t vs2_lo, vs2_hi; + int t, j; - /* gcc generates horible loop code... */ + /* + * GCC generates horrible loop code, so write + * the core loop by hand... 
+ */ __asm__ __volatile__ ( ".set noreorder\n\t" - "xor %3, %3, %3\n\t" - "1:\n\t" - "ldc1 %0, (%6)\n\t" - "addiu %7, %7, -8\n\t" - "paddw %3, %1, %3\n\t" - "biadd %5, %0\n\t" - "punpcklbh %4, %0, %9\n\t" - "paddw %1, %5, %1\n\t" - "punpckhbh %5, %0, %9\n\t" - "pmaddhw %4, %4, %10\n\t" - "pmaddhw %5, %5, %11\n\t" - "sltiu %8, %7, 8\n\t" - "paddw %2, %2, %4\n\t" - "paddw %2, %2, %5\n\t" - "beqz %8, 2f\n\t" - SZPRFX"addiu %6, %6, 8\n\t" - "ldc1 %0, (%6)\n\t" - "addiu %7, %7, -8\n\t" - "paddw %3, %1, %3\n\t" - "biadd %5, %0\n\t" - "punpcklbh %4, %0, %9\n\t" - "dsll %5, %5, %12\n\t" - "paddw %1, %5, %1\n\t" - "punpckhbh %5, %0, %9\n\t" - "pmaddhw %4, %4, %10\n\t" - "pmaddhw %5, %5, %11\n\t" - "sltiu %8, %7, 8\n\t" - "paddw %2, %2, %4\n\t" - "paddw %2, %2, %5\n\t" - "bnez %8, 1b\n\t" - SZPRFX"addiu %6, %6, 8\n\t" + "b 5f\n\t" + "xor %3, %3, %3\n" "2:\n\t" + "xor %6, %6, %6\n\t" + "sll %10, %11, 3\n\t" + "xor %7, %7, %7\n\t" + "subu %9, %9, %10\n" + "1:\n\t" + "ldc1 %0, (%8)\n\t" + SZPRFX"addiu %8, %8, 8\n\t" + "addiu %11, %11, -1\n\t" + "paddw %3, %1, %3\n\t" + "biadd %5, %0\n\t" + "punpcklbh %4, %0, %12\n\t" + "paddw %1, %5, %1\n\t" + "punpckhbh %5, %0, %12\n\t" + "paddh %6, %6, %4\n\t" + "bnez %11, 1b\n\t" + "paddh %7, %7, %5\n\t" + /* loop bottom */ + "pshufh %1, %1, %15\n\t" + "sltiu %10, %9, 8\n\t" + "pmaddhw %4, %6, %13\n\t" + "pmaddhw %5, %7, %14\n\t" + "paddw %2, %2, %4\n\t" + "bnez %10, 4f\n\t" + "paddw %2, %2, %5\n" + "5:\n\t" + "sltiu %10, %9, 1032\n\t" + "beqz %10, 2b\n\t" + "li %11, 128\n\t" + "b 2b\n\t" + "srl %11, %9, 3\n" + "4:\n\t" ".set reorder\n\t" : /* %0 */ "=&f" (in8), /* %1 */ "=f" (vs1), @@ -163,25 +169,34 @@ /* %3 */ "=&f" (vs1_r), /* %4 */ "=&f" (in_lo), /* %5 */ "=&f" (in_hi), - /* %6 */ "=d" (buf), - /* %7 */ "=r" (k), - /* %8 */ "=r" (t) - : /* %9 */ "f" (v0), - /* %10 */ "f" (vord_lo), - /* %11 */ "f" (vord_hi), - /* %12 */ "f" (32), - /* %12 */ "1" (vs1), - /* %13 */ "2" (vs2), - /* %15 */ "6" (buf), - /* %16 */ "7" (k) + /* %6 */ "=&f" 
(vs2_lo), + /* %7 */ "=&f" (vs2_hi), + /* %8 */ "=d" (buf), + /* %9 */ "=r" (k), + /* %10 */ "=r" (t), + /* %11 */ "=r" (j) + : /* %12 */ "f" (v0), + /* %13 */ "f" (vord_lo), + /* %14 */ "f" (vord_hi), + /* %15 */ "f" (0x4e), + /* %16 */ "1" (vs1), + /* %17 */ "2" (vs2), + /* %18 */ "8" (buf), + /* %19 */ "9" (k) ); - /* - * and the rest of the generated code also looks awful, - * looks like gcc does not know he can shift and and in - * the copro regs + is a little lost with reg allocation - * in the copro... - * But besides of some extra moves & stuff, let the compiler - * handle this. + /* And the rest of the generated code also looks awful. + * Looks like GCC is missing instruction patterns for: + * - 64 bit shifts in loongson copro regs + * - logic in loongson copro regs + * and to make things much worse, GCC seems to be missing + * a loongson copro register <-> copro register move + * pattern (for example using an or instruction), instead + * GCC always moves over the GPR. + * + * But still, let the compiler handle this, we get some + * extra moves between copro regs and GPR, but save us + * a lot of work. + * And maybe some day someone will fix this... 
*/ /* reduce vs1 round sum before multiplying by 8 */ @@ -198,9 +213,7 @@ if (likely(k)) { uint32x2_t vk; - /* - * handle trailer - */ + /* handle trailer */ f = SOV8 - k; vk = (uint32x2_t)(unsigned long long)k; @@ -210,12 +223,14 @@ /* add k times vs1 for this trailer */ /* apply order, add 4 byte horizontal and add to old dword */ __asm__ ( - "ldc1 %0, %10\n\t" + "ldc1 %0, %11\n\t" + "pmuluw %3, %1, %6\n\t" + "pshufh %1, %1, %10\n\t" + "paddw %2, %2, %3\n\t" "pmuluw %3, %1, %6\n\t" "dsll %0, %0, %5\n\t" - "dsrl %0, %0, %5\n\t" - "paddw %2, %2, %3\n\t" "biadd %4, %0\n\t" + "paddw %2, %2, %3\n\t" "punpcklbh %3, %0, %7\n\t" "paddw %1, %1, %4\n\t" "punpckhbh %4, %0, %7\n\t" @@ -233,9 +248,10 @@ /* %7 */ "f" (v0), /* %8 */ "f" (vord_lo), /* %9 */ "f" (vord_hi), - /* %10 */ "m" (*buf), - /* %11 */ "1" (vs1), - /* %12 */ "2" (vs2) + /* %10 */ "f" (0x4e), + /* %11 */ "m" (*buf), + /* %12 */ "1" (vs1), + /* %13 */ "2" (vs2) ); buf += k; @@ -243,8 +259,8 @@ } /* add horizontal */ - vs1 = paddw_u(vs1, (uint32x2_t)pshufh_u((uint16x4_t)v0, (uint16x4_t)vs1, 0xE)); - vs2 = paddw_u(vs2, (uint32x2_t)pshufh_u((uint16x4_t)v0, (uint16x4_t)vs2, 0xE)); + vs1 = paddw_u(vs1, (uint32x2_t)pshufh_u((uint16x4_t)v0, (uint16x4_t)vs1, 0x4E)); + vs2 = paddw_u(vs2, (uint32x2_t)pshufh_u((uint16x4_t)v0, (uint16x4_t)vs2, 0x4E)); /* shake and roll */ s1 = (unsigned int)(unsigned long long)vs1; s2 = (unsigned int)(unsigned long long)vs2;