From f95bcf45ad0e537a124f5aa0c7b9e97a78811289 Mon Sep 17 00:00:00 2001 From: flow Date: Sat, 23 Jul 2022 23:14:49 -0300 Subject: [PATCH] feat(libs): add incremental version of murmurhash2 calculation This does two passes for a given file, which is kinda slow, but I don't know how else to get the size excluding the filtered ones :< Signed-off-by: flow --- libraries/murmur2/src/MurmurHash2.cpp | 148 +++++++++++++++----------- libraries/murmur2/src/MurmurHash2.h | 40 +++---- 2 files changed, 106 insertions(+), 82 deletions(-) diff --git a/libraries/murmur2/src/MurmurHash2.cpp b/libraries/murmur2/src/MurmurHash2.cpp index 3e52e6d1..b625efb1 100644 --- a/libraries/murmur2/src/MurmurHash2.cpp +++ b/libraries/murmur2/src/MurmurHash2.cpp @@ -1,86 +1,110 @@ //----------------------------------------------------------------------------- // MurmurHash2 was written by Austin Appleby, and is placed in the public // domain. The author hereby disclaims copyright to this source code. - -// Note - This code makes a few assumptions about how your machine behaves - - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 - -// And it has a few limitations - - -// 1. It will not work incrementally. -// 2. It will not produce the same results on little-endian and big-endian -// machines. +// +// This was modified as to possibilitate it's usage incrementally. +// Those modifications are also placed in the public domain, and the author of +// such modifications hereby disclaims copyright to this source code. #include "MurmurHash2.h" //----------------------------------------------------------------------------- -// Platform-specific functions and macros -// Microsoft Visual Studio +// 'm' and 'r' are mixing constants generated offline. +// They're not really 'magic', they just happen to work well. +const uint32_t m = 0x5bd1e995; +const int r = 24; -#if defined(_MSC_VER) - -#define BIG_CONSTANT(x) (x) - -// Other compilers - -#else // defined(_MSC_VER) - -#define BIG_CONSTANT(x) (x##LLU) - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- - -uint64_t MurmurHash2 ( const void* key, int len, uint32_t seed ) +uint32_t MurmurHash2(std::ifstream&& file_stream, std::size_t buffer_size, std::function filter_out) { - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. + auto* buffer = new char[buffer_size]; + char data[4]; - const uint32_t m = 0x5bd1e995; - const int r = 24; + int read = 0; + uint32_t size = 0; - // Initialize the hash to a 'random' value + // We need the size without the filtered out characters before actually calculating the hash, + // to setup the initial value for the hash. + do { + file_stream.read(buffer, buffer_size); + read = file_stream.gcount(); + for (int i = 0; i < read; i++) { + if (!filter_out(buffer[i])) + size += 1; + } + } while (!file_stream.eof()); - uint32_t h = seed ^ len; + file_stream.clear(); + file_stream.seekg(0, file_stream.beg); - // Mix 4 bytes at a time into the hash - const auto* data = (const unsigned char*) key; - while(len >= 4) - { - uint32_t k = *(uint32_t*)data; + int index = 0; - k *= m; - k ^= k >> r; - k *= m; + // This forces a seed of 1. + IncrementalHashInfo info{ (uint32_t)1 ^ size, (uint32_t)size }; + do { + file_stream.read(buffer, buffer_size); + read = file_stream.gcount(); + for (int i = 0; i < read; i++) { + char c = buffer[i]; - h *= m; - h ^= k; + if (filter_out(c)) + continue; - data += 4*sizeof(char); - len -= 4; - } + data[index] = c; + index = (index + 1) % 4; - // Handle the last few bytes of the input array + // Mix 4 bytes at a time into the hash + if (index == 0) + FourBytes_MurmurHash2((unsigned char*)&data, info); + } + } while (!file_stream.eof()); - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; + // Do one last bit shuffle in the hash + FourBytes_MurmurHash2((unsigned char*)&data, info); - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. + delete[] buffer; - h ^= h >> 13; - h *= m; - h ^= h >> 15; + file_stream.close(); + return info.h; +} - return h; -} +void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev) +{ + if (prev.len >= 4) { + // Not the final mix + uint32_t k = *(uint32_t*)data; + + k *= m; + k ^= k >> r; + k *= m; + + prev.h *= m; + prev.h ^= k; + + prev.len -= 4; + } else { + // The final mix + + // Handle the last few bytes of the input array + switch (prev.len) { + case 3: + prev.h ^= data[2] << 16; + case 2: + prev.h ^= data[1] << 8; + case 1: + prev.h ^= data[0]; + prev.h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + prev.h ^= prev.h >> 13; + prev.h *= m; + prev.h ^= prev.h >> 15; + + prev.len = 0; + } +} //----------------------------------------------------------------------------- diff --git a/libraries/murmur2/src/MurmurHash2.h b/libraries/murmur2/src/MurmurHash2.h index c7b83bca..acea27ea 100644 --- a/libraries/murmur2/src/MurmurHash2.h +++ b/libraries/murmur2/src/MurmurHash2.h @@ -1,30 +1,30 @@ //----------------------------------------------------------------------------- -// MurmurHash2 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. +// The original MurmurHash2 was written by Austin Appleby, and is placed in the +// public domain. The author hereby disclaims copyright to this source code. +// +// This was modified as to possibilitate it's usage incrementally. +// Those modifications are also placed in the public domain, and the author of +// such modifications hereby disclaims copyright to this source code. #pragma once -//----------------------------------------------------------------------------- -// Platform-specific functions and macros +#include +#include -// Microsoft Visual Studio - -#if defined(_MSC_VER) && (_MSC_VER < 1600) - -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned __int64 uint64_t; - -// Other compilers - -#else // defined(_MSC_VER) - -#include - -#endif // !defined(_MSC_VER) +#include //----------------------------------------------------------------------------- -uint64_t MurmurHash2 ( const void* key, int len, uint32_t seed = 1 ); +uint32_t MurmurHash2( + std::ifstream&& file_stream, + std::size_t buffer_size = 4096, + std::function filter_out = [](char) { return true; }); + +struct IncrementalHashInfo { + uint32_t h; + uint32_t len; +}; + +void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev); //-----------------------------------------------------------------------------