110 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			110 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| #ifndef BASE_ROLLING_HASH_H
 | |
| #define BASE_ROLLING_HASH_H
 | |
| 
 | |
| #include <boost/circular_buffer.hpp>
 | |
| #include <stdint.h>
 | |
| #include <boost/optional.hpp>
 | |
| 
 | |
| //----------------------------------------------------------------
 | |
| 
 | |
| namespace base {
 | |
| 	namespace hash_detail {
 | |
| 		uint32_t const MULTIPLIER = 4294967291UL;
 | |
| 		uint32_t const SEED = 123;
 | |
| 	}
 | |
| 
 | |
| 	class rolling_hash {
 | |
| 	public:
 | |
| 		rolling_hash(unsigned window_size);
 | |
| 
 | |
| 		void reset();
 | |
| 
 | |
| 		// Returns the current hash
 | |
| 		uint32_t step(uint8_t byte) {
 | |
| 			update_hash(byte);
 | |
| 			return hash_;
 | |
| 		}
 | |
| 
 | |
| 		uint32_t get_hash() const {
 | |
| 			return hash_;
 | |
| 		}
 | |
| 
 | |
| 	private:
 | |
| 		void update_hash(uint8_t byte) {
 | |
| 			hash_ -= a_to_k_minus_1_ * (buffer_.front() + hash_detail::SEED);
 | |
| 			buffer_.push_back(byte);
 | |
| 			hash_ = (hash_ * a_) + byte + hash_detail::SEED;
 | |
| 		}
 | |
| 
 | |
| 		uint32_t a_;
 | |
| 		uint32_t a_to_k_minus_1_;
 | |
| 
 | |
| 		uint32_t hash_;
 | |
| 		uint32_t window_size_;
 | |
| 
 | |
| 		boost::circular_buffer<uint8_t> buffer_;
 | |
| 	};
 | |
| 
 | |
| 	class content_based_hash {
 | |
| 	public:
 | |
| 		content_based_hash(unsigned window_size);
 | |
| 		void reset();
 | |
| 
 | |
| 		// Returns a break point relative to the last reset/break.
 | |
| 		boost::optional<unsigned> step(uint8_t byte) {
 | |
| 			boost::optional<unsigned> r;
 | |
| 
 | |
| 			rhash_.step(byte);
 | |
| 			len_++;
 | |
| 
 | |
| 			if (len_ < min_len_)
 | |
| 				return r;
 | |
| 
 | |
| 			if (hit_break(backup_div_))
 | |
| 				backup_break_ = len_;
 | |
| 
 | |
| 			if (hit_break(div_)) {
 | |
| 				// found a break
 | |
| 				r = len_;
 | |
| 				len_ = 0;
 | |
| 				backup_break_.reset();
 | |
| 
 | |
| 			} else if (len_ >= max_len_) {
 | |
| 				// too big, is there a backup?
 | |
| 				if (backup_break_) {
 | |
| 					len_ -= *backup_break_;
 | |
| 					r = backup_break_;
 | |
| 					backup_break_.reset();
 | |
| 
 | |
| 				} else {
 | |
| 					r = len_;
 | |
| 					len_ = 0;
 | |
| 				}
 | |
| 			}
 | |
| 
 | |
| 			return r;
 | |
| 		}
 | |
| 
 | |
| 	private:
 | |
| 		bool hit_break(uint32_t mask) const {
 | |
| 			uint32_t h = rhash_.get_hash() >> 8;
 | |
| 			return !(h & mask);
 | |
| 		}
 | |
| 
 | |
| 		rolling_hash rhash_;
 | |
| 
 | |
| 		uint32_t backup_div_;
 | |
| 		uint32_t div_;
 | |
| 
 | |
| 		unsigned min_len_;
 | |
| 		unsigned max_len_;
 | |
| 
 | |
| 		unsigned len_;
 | |
| 		boost::optional<unsigned> backup_break_;
 | |
| 	};
 | |
| }
 | |
| 
 | |
| //----------------------------------------------------------------
 | |
| 
 | |
| #endif
 |