[Zlib-devel] [PATCH 08/12] add SSE4.2 optimized hash function

Jim Kukunas james.t.kukunas at linux.intel.com
Fri Dec 13 18:13:17 EST 2013


For systems supporting SSE4.2, use the crc32 instruction as a fast
hash function. Also, provide a better fallback hash.

For both new hash functions, we hash 4 bytes, instead of 3, for certain
levels. This shortens the hash chains, and also improves the quality
of each hash entry.
---
 configure |    6 ++++
 deflate.c |   77 +++++++++++++++++++++++++++++++++++++++++++++++++------------
 deflate.h |   15 ++++++++++++
 3 files changed, 83 insertions(+), 15 deletions(-)

diff --git a/configure b/configure
index 9755cbe..bf88e45 100755
--- a/configure
+++ b/configure
@@ -803,6 +803,9 @@ case "${ARCH}" in
             FILL_WINDOW_SSE_o=""
             FILL_WINDOW_SSE_lo=""
         fi
+
+        CFLAGS="${CFLAGS} -DUSE_SSE4_2_CRC_HASH"
+        SFLAGS="${SFLAGS} -DUSE_SSE4_2_CRC_HASH"
     ;;
     i386 | i486 | i586 | i686)
         OBJC="${OBJC} x86.o"
@@ -828,6 +831,9 @@ case "${ARCH}" in
             FILL_WINDOW_SSE_o=""
             FILL_WINDOW_SSE_lo=""
         fi
+
+        CFLAGS="${CFLAGS} -DUSE_SSE4_2_CRC_HASH"
+        SFLAGS="${SFLAGS} -DUSE_SSE4_2_CRC_HASH"
     ;;
 esac
 
diff --git a/deflate.c b/deflate.c
index 32df211..be9665d 100644
--- a/deflate.c
+++ b/deflate.c
@@ -169,17 +169,57 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
  *    input characters and the first MIN_MATCH bytes of str are valid
  *    (except for the last MIN_MATCH-1 bytes of the input file).
  */
+#ifdef USE_SSE4_2_CRC_HASH
+#include "x86.h"
+local inline Pos insert_string_sse(deflate_state *z_const s, z_const Pos str)
+{
+    Pos ret;
+    unsigned *ip, val, h = 0;
+
+    ip = (unsigned *)&s->window[str];
+    val = *ip;
+
+    if (s->level >= 6)
+        val &= 0xFFFFFF;
+
+    __asm__ __volatile__ (
+        "crc32 %1,%0\n\t"
+    : "+r" (h)
+    : "r" (val)
+    :
+    );
+
+    ret = s->head[h & s->hash_mask];
+    s->head[h & s->hash_mask] = str;
+    s->prev[str & s->w_mask] = ret;
+    return ret;
+}
+#endif
+
+local inline Pos insert_string_c(deflate_state *z_const s, z_const Pos str)
+{
+    Pos ret;
+
+    UPDATE_HASH(s, s->ins_h, str);
 #ifdef FASTEST
-#define INSERT_STRING(s, str, match_head) \
-   (UPDATE_HASH(s, s->ins_h, (str)), \
-    match_head = s->head[s->ins_h], \
-    s->head[s->ins_h] = (Pos)(str))
+    ret = s->head[s->ins_h];
 #else
-#define INSERT_STRING(s, str, match_head) \
-   (UPDATE_HASH(s, s->ins_h, (str)), \
-    match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
-    s->head[s->ins_h] = (Pos)(str))
+    ret = s->prev[str & s->w_mask] = s->head[s->ins_h];
 #endif
+    s->head[s->ins_h] = str;
+
+    return ret;
+}
+
+local inline Pos insert_string(deflate_state *z_const s, z_const Pos str)
+{
+#ifdef USE_SSE4_2_CRC_HASH
+    if (x86_cpu_has_sse42)
+        return insert_string_sse(s, str);
+#endif
+    return insert_string_c(s, str);
+}
+
 
 /* ===========================================================================
  * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
@@ -226,7 +266,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
      * output size for (length,distance) codes is <= 24 bits.
      */
 
-#ifdef CHECK_SSE2
+#if defined(CHECK_SSE2) || defined(USE_SSE4_2_CRC_HASH)
     x86_check_features();
 #endif
 
@@ -285,7 +325,13 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
     s->w_size = 1 << s->w_bits;
     s->w_mask = s->w_size - 1;
 
+#ifdef USE_SSE4_2_CRC_HASH
+    if (x86_cpu_has_sse42)
+        s->hash_bits = 15;
+    else
+#endif
     s->hash_bits = memLevel + 7;
+
     s->hash_size = 1 << s->hash_bits;
     s->hash_mask = s->hash_size - 1;
     s->hash_shift =  ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
@@ -1278,7 +1324,8 @@ local void fill_window_c(s)
         if (s->lookahead + s->insert >= MIN_MATCH) {
             uInt str = s->strstart - s->insert;
             s->ins_h = s->window[str];
-            UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
+            if (str >= 1)
+                UPDATE_HASH(s, s->ins_h, str + 2 - (MIN_MATCH-1));
 #if MIN_MATCH != 3
             Call UPDATE_HASH() MIN_MATCH-3 more times
 #endif
@@ -1458,7 +1505,7 @@ local block_state deflate_fast(s, flush)
          */
         hash_head = NIL;
         if (s->lookahead >= MIN_MATCH) {
-            INSERT_STRING(s, s->strstart, hash_head);
+            hash_head = insert_string(s, s->strstart);
         }
 
         /* Find the longest match, discarding those <= prev_length.
@@ -1489,7 +1536,7 @@ local block_state deflate_fast(s, flush)
                 s->match_length--; /* string at strstart already in table */
                 do {
                     s->strstart++;
-                    INSERT_STRING(s, s->strstart, hash_head);
+                    hash_head = insert_string(s, s->strstart);
                     /* strstart never exceeds WSIZE-MAX_MATCH, so there are
                      * always MIN_MATCH bytes ahead.
                      */
@@ -1501,7 +1548,7 @@ local block_state deflate_fast(s, flush)
                 s->strstart += s->match_length;
                 s->match_length = 0;
                 s->ins_h = s->window[s->strstart];
-                UPDATE_HASH(s, s->ins_h, s->strstart+1 - (MIN_MATCH-1));
+                UPDATE_HASH(s, s->ins_h, s->strstart+2 - (MIN_MATCH));
 #if MIN_MATCH != 3
                 Call UPDATE_HASH() MIN_MATCH-3 more times
 #endif
@@ -1561,7 +1608,7 @@ local block_state deflate_slow(s, flush)
          */
         hash_head = NIL;
         if (s->lookahead >= MIN_MATCH) {
-            INSERT_STRING(s, s->strstart, hash_head);
+            hash_head = insert_string(s, s->strstart);
         }
 
         /* Find the longest match, discarding those <= prev_length.
@@ -1612,7 +1659,7 @@ local block_state deflate_slow(s, flush)
             s->prev_length -= 2;
             do {
                 if (++s->strstart <= max_insert) {
-                    INSERT_STRING(s, s->strstart, hash_head);
+                    hash_head = insert_string(s, s->strstart);
                 }
             } while (--s->prev_length != 0);
             s->match_available = 0;
diff --git a/deflate.h b/deflate.h
index f1c1ed9..fd9c80f 100644
--- a/deflate.h
+++ b/deflate.h
@@ -349,6 +349,21 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
  *    input characters, so that a running hash key can be computed from the
  *    previous key instead of complete recalculation each time.
  */
+#ifdef USE_SSE4_2_CRC_HASH
+#define UPDATE_HASH(s,h,i) (\
+    {\
+        if (s->level < 6) \
+            h = (3483 * (s->window[i]) +\
+                 23081* (s->window[i+1]) +\
+                 6954 * (s->window[i+2]) +\
+                 20947* (s->window[i+3])) & s->hash_mask;\
+        else\
+            h = (25881* (s->window[i]) +\
+                 24674* (s->window[i+1]) +\
+                 25811* (s->window[i+2])) & s->hash_mask;\
+    })
+#else
 #define UPDATE_HASH(s,h,i) (h = (((h)<<s->hash_shift) ^ (s->window[i + (MIN_MATCH-1)])) & s->hash_mask)
+#endif
 
 #endif /* DEFLATE_H */
-- 
1.7.1





More information about the Zlib-devel mailing list