feat(libs): add incremental version of murmurhash2 calculation

This makes two passes over a given file, which is somewhat slow, but I don't know how else to get the size excluding the filtered-out characters. Signed-off-by: flow <flowlnlnln@gmail.com>
2022-07-23 23:14:49 -03:00
parent 15ec1abb6a
commit f95bcf45ad
2 changed files with 106 additions and 82 deletions
@@ -1,86 +1,110 @@
 //-----------------------------------------------------------------------------
 // MurmurHash2 was written by Austin Appleby, and is placed in the public
 // domain. The author hereby disclaims copyright to this source code.
-
-// Note - This code makes a few assumptions about how your machine behaves -
-
-// 1. We can read a 4-byte value from any address without crashing
-// 2. sizeof(int) == 4
-
-// And it has a few limitations -
-
-// 1. It will not work incrementally.
-// 2. It will not produce the same results on little-endian and big-endian
-//    machines.
+//
+// This was modified to make incremental usage possible.
+// Those modifications are also placed in the public domain, and the author of
+// such modifications hereby disclaims copyright to this source code.

 #include "MurmurHash2.h"
+
+#include <cstddef>
+#include <vector>

 //-----------------------------------------------------------------------------
-// Platform-specific functions and macros

-// Microsoft Visual Studio
+// Mixing constants shared by every round of the hash. They were generated
+// offline; there is nothing 'magic' about them, they simply work well.
+constexpr uint32_t m = 0x5bd1e995u;
+constexpr int r = 24;

-#if defined(_MSC_VER)
-
-#define BIG_CONSTANT(x) (x)
-
-// Other compilers
-
-#else	// defined(_MSC_VER)
-
-#define BIG_CONSTANT(x) (x##LLU)
-
-#endif // !defined(_MSC_VER)
-
-//-----------------------------------------------------------------------------
-
-uint64_t MurmurHash2 ( const void* key, int len, uint32_t seed )
+// Computes the MurmurHash2 of the contents of 'file_stream' with a fixed seed
+// of 1, ignoring every byte for which 'filter_out' returns true.
+//
+// The stream is read twice in chunks of 'buffer_size' bytes (4096 is used if
+// 'buffer_size' is 0): the first pass counts the bytes that survive the filter,
+// because that count is part of the initial hash value, and the second pass
+// feeds those bytes to the hash four at a time. The stream is closed on return.
+uint32_t MurmurHash2(std::ifstream&& file_stream, std::size_t buffer_size, std::function<bool(char)> filter_out)
 {
-  // 'm' and 'r' are mixing constants generated offline.
-  // They're not really 'magic', they just happen to work well.
+    // A zero-sized buffer would never consume input, so the read loops could not end.
+    std::vector<char> buffer(buffer_size > 0 ? buffer_size : 4096);
+    unsigned char data[4] = { 0, 0, 0, 0 };

-  const uint32_t m = 0x5bd1e995;
-  const int r = 24;
+    std::size_t read = 0;
+    uint32_t size = 0;

-  // Initialize the hash to a 'random' value
+    // We need the size without the filtered out characters before actually calculating the hash,
+    // to setup the initial value for the hash.
+    // Loop on good() rather than !eof(): a stream that fails without reaching
+    // EOF (e.g. one that was never opened) must not spin forever.
+    while (file_stream.good()) {
+        file_stream.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
+        read = static_cast<std::size_t>(file_stream.gcount());
+        for (std::size_t i = 0; i < read; i++) {
+            if (!filter_out(buffer[i]))
+                size += 1;
+        }
+    }

-  uint32_t h = seed ^ len;
+    file_stream.clear();
+    file_stream.seekg(0, file_stream.beg);

-  // Mix 4 bytes at a time into the hash
-  const auto* data = (const unsigned char*) key;
-  while(len >= 4)
-  {
-    uint32_t k = *(uint32_t*)data;
+    int index = 0;

-    k *= m;
-    k ^= k >> r;
-    k *= m;
+    // This forces a seed of 1.
+    IncrementalHashInfo info{ static_cast<uint32_t>(1) ^ size, size };
+    while (file_stream.good()) {
+        file_stream.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
+        read = static_cast<std::size_t>(file_stream.gcount());
+        for (std::size_t i = 0; i < read; i++) {
+            const char c = buffer[i];

-    h *= m;
-    h ^= k;
+            if (filter_out(c))
+                continue;

-    data += 4*sizeof(char);
-    len -= 4;
-  }
+            data[index] = static_cast<unsigned char>(c);
+            index = (index + 1) % 4;

-  // Handle the last few bytes of the input array
+            // Mix 4 bytes at a time into the hash
+            if (index == 0)
+                FourBytes_MurmurHash2(data, info);
+        }
+    }

-  switch(len)
-  {
-  case 3: h ^= data[2] << 16;
-  case 2: h ^= data[1] << 8;
-  case 1: h ^= data[0];
-      h *= m;
-  };
+    // Do one last bit shuffle in the hash
+    FourBytes_MurmurHash2(data, info);

-  // Do a few final mixes of the hash to ensure the last few
-  // bytes are well-incorporated.
+    // 'buffer' frees its storage automatically when it goes out of scope.

-  h ^= h >> 13;
-  h *= m;
-  h ^= h >> 15;
+    file_stream.close();
+    return info.h;
+}

-  return h;
-} 
+// Feeds the next bytes at 'data' into the running hash state 'prev'.
+//
+// While at least four bytes remain (prev.len >= 4) this performs one regular
+// mixing round on data[0..3] and decreases prev.len by four. Otherwise it mixes
+// in the remaining prev.len (0 to 3) tail bytes, applies the final mixes to
+// prev.h and sets prev.len to 0.
+void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev)
+{
+    if (prev.len >= 4) {
+        // Not the final mix
+
+        // Assemble the word byte by byte, least significant byte first, instead
+        // of reading a uint32_t through the byte pointer: this avoids unaligned
+        // and aliasing-violating access, and matches the byte order used for
+        // the tail bytes below regardless of the host's endianness.
+        uint32_t k = static_cast<uint32_t>(data[0]) | (static_cast<uint32_t>(data[1]) << 8) |
+                     (static_cast<uint32_t>(data[2]) << 16) | (static_cast<uint32_t>(data[3]) << 24);
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        prev.h *= m;
+        prev.h ^= k;
+
+        prev.len -= 4;
+    } else {
+        // The final mix
+
+        // Handle the last few bytes of the input array
+        switch (prev.len) {
+            case 3:
+                prev.h ^= static_cast<uint32_t>(data[2]) << 16;
+                // fallthrough
+            case 2:
+                prev.h ^= static_cast<uint32_t>(data[1]) << 8;
+                // fallthrough
+            case 1:
+                prev.h ^= static_cast<uint32_t>(data[0]);
+                prev.h *= m;
+                break;
+            default:
+                // No tail bytes left to mix in.
+                break;
+        }
+
+        // Do a few final mixes of the hash to ensure the last few
+        // bytes are well-incorporated.
+
+        prev.h ^= prev.h >> 13;
+        prev.h *= m;
+        prev.h ^= prev.h >> 15;
+
+        prev.len = 0;
+    }
+}

 //-----------------------------------------------------------------------------
@@ -1,30 +1,30 @@
 //-----------------------------------------------------------------------------
-// MurmurHash2 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
+// The original MurmurHash2 was written by Austin Appleby, and is placed in the
+// public domain. The author hereby disclaims copyright to this source code.
+//
+// This was modified to make incremental usage possible.
+// Those modifications are also placed in the public domain, and the author of
+// such modifications hereby disclaims copyright to this source code.

 #pragma once

-//-----------------------------------------------------------------------------
-// Platform-specific functions and macros
+#include <cstdint>
+#include <fstream>

-// Microsoft Visual Studio
-
-#if defined(_MSC_VER) && (_MSC_VER < 1600)
-
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned __int64 uint64_t;
-
-// Other compilers
-
-#else	// defined(_MSC_VER)
-
-#include <stdint.h>
-
-#endif // !defined(_MSC_VER)
+#include <functional>

 //-----------------------------------------------------------------------------

-uint64_t MurmurHash2        ( const void* key, int len, uint32_t seed = 1 );
+// Computes the MurmurHash2 of the contents of 'file_stream' and closes the
+// stream before returning.
+//
+// 'buffer_size' is the size, in bytes, of the chunks read from the stream.
+// 'filter_out' is called for every byte read; bytes for which it returns true
+// are excluded from the hash. By default no byte is excluded.
+uint32_t MurmurHash2(
+    std::ifstream&& file_stream,
+    std::size_t buffer_size = 4096,
+    std::function<bool(char)> filter_out = [](char) { return false; });
+
+// Running state of an incremental MurmurHash2 calculation.
+struct IncrementalHashInfo {
+    // Current hash value.
+    uint32_t h;
+    // Number of input bytes that still have to be mixed into 'h'.
+    uint32_t len;
+};
+
+// Mixes the next bytes at 'data' into 'prev': a regular round over four bytes
+// while prev.len >= 4, otherwise the remaining tail bytes plus the final mix.
+void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev);

 //-----------------------------------------------------------------------------