123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335 |
- /*-----------------------------------------------------------------------------
- * MurmurHash3 was written by Austin Appleby, and is placed in the public
- * domain.
- *
- * This implementation was written by Shane Day, and is also public domain.
- *
- * This is a portable ANSI C implementation of MurmurHash3_x86_32 (Murmur3A)
- * with support for progressive processing.
- */
- /*-----------------------------------------------------------------------------
-
- If you want to understand the MurmurHash algorithm you would be much better
- off reading the original source. Just point your browser at:
- http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
- What this version provides?
- 1. Progressive data feeding. Useful when the entire payload to be hashed
- does not fit in memory or when the data is streamed through the application.
- Also useful when hashing a number of strings with a common prefix. A partial
- hash of a prefix string can be generated and reused for each suffix string.
- 2. Portability. Plain old C so that it should compile on any old compiler.
- Both CPU endian and access-alignment neutral, but avoiding inefficient code
- when possible depending on CPU capabilities.
- 3. Drop in. I personally like nice self contained public domain code, making it
- easy to pilfer without loads of refactoring to work properly in the existing
- application code & makefile structure and mucking around with licence files.
- Just copy PMurHash.h and PMurHash.c and you're ready to go.
- How does it work?
- We can only process entire 32 bit chunks of input, except for the very end
- that may be shorter. So along with the partial hash we need to give back to
- the caller a carry containing up to 3 bytes that we were unable to process.
- This carry also needs to record the number of bytes the carry holds. I use
- the low 2 bits as a count (0..3) and the carry bytes are shifted into the
- high byte in stream order.
- To handle endianess I simply use a macro that reads a uint32_t and define
- that macro to be a direct read on little endian machines, a read and swap
- on big endian machines, or a byte-by-byte read if the endianess is unknown.
- -----------------------------------------------------------------------------*/
- #include "PMurHash.h"
- /* I used ugly type names in the header to avoid potential conflicts with
- * application or system typedefs & defines. Since I'm not including any more
- * headers below here I can rename these so that the code reads like C99 */
- #undef uint32_t
- #define uint32_t MH_UINT32
- #undef uint8_t
- #define uint8_t MH_UINT8
- /* MSVC warnings we choose to ignore */
- #if defined(_MSC_VER)
- #pragma warning(disable: 4127) /* conditional expression is constant */
- #endif
- /*-----------------------------------------------------------------------------
- * Endianess, misalignment capabilities and util macros
- *
- * The following 3 macros are defined in this section. The other macros defined
- * are only needed to help derive these 3.
- *
- * READ_UINT32(x) Read a little endian unsigned 32-bit int
- * UNALIGNED_SAFE Defined if READ_UINT32 works on non-word boundaries
- * ROTL32(x,r) Rotate x left by r bits
- */
- /* Convention is to define __BYTE_ORDER == to one of these values */
- #if !defined(__BIG_ENDIAN)
- #define __BIG_ENDIAN 4321
- #endif
- #if !defined(__LITTLE_ENDIAN)
- #define __LITTLE_ENDIAN 1234
- #endif
- /* I386 */
- #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(i386)
- #define __BYTE_ORDER __LITTLE_ENDIAN
- #define UNALIGNED_SAFE
- #endif
- /* gcc 'may' define __LITTLE_ENDIAN__ or __BIG_ENDIAN__ to 1 (Note the trailing __),
- * or even _LITTLE_ENDIAN or _BIG_ENDIAN (Note the single _ prefix) */
- #if !defined(__BYTE_ORDER)
- #if defined(__LITTLE_ENDIAN__) && __LITTLE_ENDIAN__==1 || defined(_LITTLE_ENDIAN) && _LITTLE_ENDIAN==1
- #define __BYTE_ORDER __LITTLE_ENDIAN
- #elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__==1 || defined(_BIG_ENDIAN) && _BIG_ENDIAN==1
- #define __BYTE_ORDER __BIG_ENDIAN
- #endif
- #endif
- /* gcc (usually) defines xEL/EB macros for ARM and MIPS endianess */
- #if !defined(__BYTE_ORDER)
- #if defined(__ARMEL__) || defined(__MIPSEL__)
- #define __BYTE_ORDER __LITTLE_ENDIAN
- #endif
- #if defined(__ARMEB__) || defined(__MIPSEB__)
- #define __BYTE_ORDER __BIG_ENDIAN
- #endif
- #endif
- /* Now find best way we can to READ_UINT32 */
- #if __BYTE_ORDER==__LITTLE_ENDIAN
- /* CPU endian matches murmurhash algorithm, so read 32-bit word directly */
- #define READ_UINT32(ptr) (*((uint32_t*)(ptr)))
- #elif __BYTE_ORDER==__BIG_ENDIAN
- /* TODO: Add additional cases below where a compiler provided bswap32 is available */
- #if defined(__GNUC__) && (__GNUC__>4 || (__GNUC__==4 && __GNUC_MINOR__>=3))
- #define READ_UINT32(ptr) (__builtin_bswap32(*((uint32_t*)(ptr))))
- #else
- /* Without a known fast bswap32 we're just as well off doing this */
- #define READ_UINT32(ptr) (ptr[0]|ptr[1]<<8|ptr[2]<<16|ptr[3]<<24)
- #define UNALIGNED_SAFE
- #endif
- #else
- /* Unknown endianess so last resort is to read individual bytes */
- #define READ_UINT32(ptr) (ptr[0]|ptr[1]<<8|ptr[2]<<16|ptr[3]<<24)
- /* Since we're not doing word-reads we can skip the messing about with realignment */
- #define UNALIGNED_SAFE
- #endif
- /* Find best way to ROTL32 */
- #if defined(_MSC_VER)
- #include <stdlib.h> /* Microsoft put _rotl declaration in here */
- #define ROTL32(x,r) _rotl(x,r)
- #else
- /* gcc recognises this code and generates a rotate instruction for CPUs with one */
- #define ROTL32(x,r) (((uint32_t)x << r) | ((uint32_t)x >> (32 - r)))
- #endif
- /*-----------------------------------------------------------------------------
- * Core murmurhash algorithm macros */
- #define C1 (0xcc9e2d51)
- #define C2 (0x1b873593)
- /* This is the main processing body of the algorithm. It operates
- * on each full 32-bits of input. */
- #define DOBLOCK(h1, k1) do{ \
- k1 *= C1; \
- k1 = ROTL32(k1,15); \
- k1 *= C2; \
- \
- h1 ^= k1; \
- h1 = ROTL32(h1,13); \
- h1 = h1*5+0xe6546b64; \
- }while(0)
- /* Append unaligned bytes to carry, forcing hash churn if we have 4 bytes */
- /* cnt=bytes to process, h1=name of h1 var, c=carry, n=bytes in c, ptr/len=payload */
- #define DOBYTES(cnt, h1, c, n, ptr, len) do{ \
- int _i = cnt; \
- while(_i--) { \
- c = c>>8 | *ptr++<<24; \
- n++; len--; \
- if(n==4) { \
- DOBLOCK(h1, c); \
- n = 0; \
- } \
- } }while(0)
- /*---------------------------------------------------------------------------*/
- /* Main hashing function. Initialise carry to 0 and h1 to 0 or an initial seed
- * if wanted. Both ph1 and pcarry are required arguments. */
- void PMurHash32_Process(uint32_t *ph1, uint32_t *pcarry, const void *key, int len)
- {
- uint32_t h1 = *ph1;
- uint32_t c = *pcarry;
- const uint8_t *ptr = (uint8_t*)key;
- const uint8_t *end;
- /* Extract carry count from low 2 bits of c value */
- int n = c & 3;
- #if defined(UNALIGNED_SAFE)
- /* This CPU handles unaligned word access */
- /* Consume any carry bytes */
- int i = (4-n) & 3;
- if(i && i <= len) {
- DOBYTES(i, h1, c, n, ptr, len);
- }
- /* Process 32-bit chunks */
- end = ptr + len/4*4;
- for( ; ptr < end ; ptr+=4) {
- uint32_t k1 = READ_UINT32(ptr);
- DOBLOCK(h1, k1);
- }
- #else /*UNALIGNED_SAFE*/
- /* This CPU does not handle unaligned word access */
- /* Consume enough so that the next data byte is word aligned */
- int i = -(long)ptr & 3;
- if(i && i <= len) {
- DOBYTES(i, h1, c, n, ptr, len);
- }
- /* We're now aligned. Process in aligned blocks. Specialise for each possible carry count */
- end = ptr + len/4*4;
- switch(n) { /* how many bytes in c */
- case 0: /* c=[----] w=[3210] b=[3210]=w c'=[----] */
- for( ; ptr < end ; ptr+=4) {
- uint32_t k1 = READ_UINT32(ptr);
- DOBLOCK(h1, k1);
- }
- break;
- case 1: /* c=[0---] w=[4321] b=[3210]=c>>24|w<<8 c'=[4---] */
- for( ; ptr < end ; ptr+=4) {
- uint32_t k1 = c>>24;
- c = READ_UINT32(ptr);
- k1 |= c<<8;
- DOBLOCK(h1, k1);
- }
- break;
- case 2: /* c=[10--] w=[5432] b=[3210]=c>>16|w<<16 c'=[54--] */
- for( ; ptr < end ; ptr+=4) {
- uint32_t k1 = c>>16;
- c = READ_UINT32(ptr);
- k1 |= c<<16;
- DOBLOCK(h1, k1);
- }
- break;
- case 3: /* c=[210-] w=[6543] b=[3210]=c>>8|w<<24 c'=[654-] */
- for( ; ptr < end ; ptr+=4) {
- uint32_t k1 = c>>8;
- c = READ_UINT32(ptr);
- k1 |= c<<24;
- DOBLOCK(h1, k1);
- }
- }
- #endif /*UNALIGNED_SAFE*/
- /* Advance over whole 32-bit chunks, possibly leaving 1..3 bytes */
- len -= len/4*4;
- /* Append any remaining bytes into carry */
- DOBYTES(len, h1, c, n, ptr, len);
- /* Copy out new running hash and carry */
- *ph1 = h1;
- *pcarry = (c & ~0xff) | n;
- }
- /*---------------------------------------------------------------------------*/
- /* Finalize a hash. To match the original Murmur3A the total_length must be provided */
- uint32_t PMurHash32_Result(uint32_t h, uint32_t carry, uint32_t total_length)
- {
- uint32_t k1;
- int n = carry & 3;
- if(n) {
- k1 = carry >> (4-n)*8;
- k1 *= C1; k1 = ROTL32(k1,15); k1 *= C2; h ^= k1;
- }
- h ^= total_length;
- /* fmix */
- h ^= h >> 16;
- h *= 0x85ebca6b;
- h ^= h >> 13;
- h *= 0xc2b2ae35;
- h ^= h >> 16;
- return h;
- }
- /*---------------------------------------------------------------------------*/
- /* Murmur3A compatable all-at-once */
- uint32_t PMurHash32(uint32_t seed, const void *key, int len)
- {
- uint32_t h1=seed, carry=0;
- PMurHash32_Process(&h1, &carry, key, len);
- return PMurHash32_Result(h1, carry, len);
- }
- /*---------------------------------------------------------------------------*/
- /* Provide an API suitable for smhasher */
- void PMurHash32_test(const void *key, int len, uint32_t seed, void *out)
- {
- uint32_t h1=seed, carry=0;
- const uint8_t *ptr = (uint8_t*)key;
- const uint8_t *end = ptr + len;
- #if 0 /* Exercise the progressive processing */
- while(ptr < end) {
- //const uint8_t *mid = ptr + rand()%(end-ptr)+1;
- const uint8_t *mid = ptr + (rand()&0xF);
- mid = mid<end?mid:end;
- PMurHash32_Process(&h1, &carry, ptr, mid-ptr);
- ptr = mid;
- }
- #else
- PMurHash32_Process(&h1, &carry, ptr, (int)(end-ptr));
- #endif
- h1 = PMurHash32_Result(h1, carry, len);
- *(uint32_t*)out = h1;
- }
- /*---------------------------------------------------------------------------*/
- #ifdef TEST
- int main() {
- // http://www.cprover.org/cbmc/
- // cbmc PMurHash.c --function PMurHash32 --unwind 255 --bounds-check --pointer-check
- //=> seed=308736u (00000000000001001011011000000000)
- // key=INVALID-128 (1000000011111111111111111111111111111111111111111111110101100111)
- // len=640
- // Violated property:
- //file PMurHash.c line 201 function PMurHash32_Process
- //dereference failure: object bounds
- //!(POINTER_OFFSET(ptr) < 0) && OBJECT_SIZE(ptr) >= 1 + POINTER_OFFSET(ptr) || DYNAMIC_OBJECT(ptr)
- uint32_t seed = 308736;
- unsigned long long key = 0x80fffffffffffd67ULL;
- PMurHash32(seed, &key, sizeof(key));
- }
- #endif
|