From 90a7ac14a3de65622de3f8d08ccb89c32dcb45ed Mon Sep 17 00:00:00 2001
From: AAAlvesJr <aalvesju@gmail.com>
Date: Tue, 9 Mar 2021 03:46:10 +0100
Subject: [PATCH] adding two more implementations of squares

---
 random_iterator/Engine.hpp                    |  12 +-
 random_iterator/detail/Macros.h               |  41 +
 random_iterator/detail/Random123/array.h      |   2 +-
 random_iterator/detail/Squares3_128.hpp       | 150 ++++
 .../detail/{Squares3.hpp => Squares3_64.hpp}  |  10 +-
 random_iterator/detail/Squares4_128.hpp       | 152 ++++
 .../detail/{Squares4.hpp => Squares4_64.hpp}  |  10 +-
 random_iterator/detail/uint128.h_             | 816 ++++++++++++++++++
 random_iterator/detail/uint128.hpp            | 677 +++++++++++++++
 random_iterator/detail/uint128.inl            | 179 ++++
 tests/benchmark_engine.cpp                    |  84 +-
 11 files changed, 2114 insertions(+), 19 deletions(-)
 create mode 100644 random_iterator/detail/Macros.h
 create mode 100644 random_iterator/detail/Squares3_128.hpp
 rename random_iterator/detail/{Squares3.hpp => Squares3_64.hpp} (95%)
 create mode 100644 random_iterator/detail/Squares4_128.hpp
 rename random_iterator/detail/{Squares4.hpp => Squares4_64.hpp} (93%)
 create mode 100644 random_iterator/detail/uint128.h_
 create mode 100644 random_iterator/detail/uint128.hpp
 create mode 100644 random_iterator/detail/uint128.inl

diff --git a/random_iterator/Engine.hpp b/random_iterator/Engine.hpp
index 03970fb..1a41da6 100644
--- a/random_iterator/Engine.hpp
+++ b/random_iterator/Engine.hpp
@@ -31,8 +31,10 @@
 #include <limits>
 #include <random_iterator/detail/EngineTraits.hpp>
 #include <random_iterator/detail/SplitMix.hpp>
-#include <random_iterator/detail/Squares3.hpp>
-#include <random_iterator/detail/Squares4.hpp>
+#include <random_iterator/detail/Squares3_128.hpp>
+#include <random_iterator/detail/Squares4_128.hpp>
+#include <random_iterator/detail/Squares3_64.hpp>
+#include <random_iterator/detail/Squares4_64.hpp>
 
 namespace random_iterator {
 
@@ -211,8 +213,10 @@ private:
 
 typedef Engine<random_iterator_r123::Philox2x64>      philox;
 typedef Engine<random_iterator_r123::Threefry2x64>  threefry;
-typedef detail::Squares3                            squares3;
-typedef detail::Squares4                            squares4;
+typedef detail::Squares3_64                      squares3_64;
+typedef detail::Squares4_64                      squares4_64;
+typedef detail::Squares3_128                    squares3_128;
+typedef detail::Squares4_128                    squares4_128;
 
 #if RANDOM_ITERATOR_R123_USE_AES_NI
 typedef Engine<random_iterator_r123::ARS4x32>           ars;
diff --git a/random_iterator/detail/Macros.h b/random_iterator/detail/Macros.h
new file mode 100644
index 0000000..84bd4a4
--- /dev/null
+++ b/random_iterator/detail/Macros.h
@@ -0,0 +1,41 @@
+/*----------------------------------------------------------------------------
+ *
+ *   Copyright (C) 2021 Antonio Augusto Alves Junior
+ *
+ *   This file is part of RandomIterator.
+ *
+ *   RandomIterator is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   RandomIterator is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with RandomIterator.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *---------------------------------------------------------------------------*/
+/*
+ * Macros.h
+ *
+ *  Created on: 04/03/2021
+ *      Author: Antonio Augusto Alves Junior
+ */
+
+#pragma once
+
+#if defined(__has_builtin)
+	#if __has_builtin(__builtin_expect)
+		 #define   _LIKELY_(x) __builtin_expect(x, 1)
+		 #define _UNLIKELY_(x) __builtin_expect(x, 0)
+	#else
+		 #define   _LIKELY_(x) x
+		 #define _UNLIKELY_(x) x
+	#endif //__has_builtin(__builtin_expect)
+#else
+	#define   _LIKELY_(x) x
+	#define _UNLIKELY_(x) x
+#endif //defined(__has_builtin)
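+
+/* Usage sketch (illustrative, not part of the original header): both macros
+ * wrap a branch condition with a branch-prediction hint and fall back to the
+ * bare condition when __builtin_expect is unavailable, e.g.
+ *
+ *    if(_UNLIKELY_(b == 64)) { handle_rare_case(); }  // handle_rare_case() is hypothetical
+ *    if(_LIKELY_(n > 0))     { fast_path(n); }        // fast_path() is hypothetical
+ */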
diff --git a/random_iterator/detail/Random123/array.h b/random_iterator/detail/Random123/array.h
index b6ca0ec..4f4443a 100644
--- a/random_iterator/detail/Random123/array.h
+++ b/random_iterator/detail/Random123/array.h
@@ -305,7 +305,7 @@ namespace random_iterator_r123{
    a typedef equivalent to r123arrayNxW.
 */
 
-#define _r123array_tpl(_N, W, T)                   \
+#define _r123array_tpl(_N, W, T)                    \
     /** @ingroup arrayNxW */                        \
     /** @see arrayNxW */                            \
 struct r123array##_N##x##W{                         \
diff --git a/random_iterator/detail/Squares3_128.hpp b/random_iterator/detail/Squares3_128.hpp
new file mode 100644
index 0000000..4efdccd
--- /dev/null
+++ b/random_iterator/detail/Squares3_128.hpp
@@ -0,0 +1,150 @@
+/*----------------------------------------------------------------------------
+ *
+ *   Copyright (C) 2021 Antonio Augusto Alves Junior
+ *
+ *   This file is part of RandomIterator.
+ *
+ *   RandomIterator is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   RandomIterator is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with RandomIterator.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *---------------------------------------------------------------------------*/
+/*
+ * Squares3_128.hpp
+ *
+ *  Created on: 25/02/2021
+ *      Author: Antonio Augusto Alves Junior
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <random_iterator/detail/SquaresKeys.hpp>
+#include <random_iterator/detail/uint128.hpp>
+
+namespace random_iterator {
+
+namespace detail {
+
+/*
+*  Three round counter-based middle square
+*
+*  Squares3_128 - returns a 64-bit unsigned int [0,0xffffffffffffffff]
+*
+*
+*  Three rounds of squaring are performed and the result is returned.
+*  For the first two rounds, the 128-bit result is rotated right 64 bits.
+*  This places the random data in the best position for the next round.
+*  y = ctr*key or z = (ctr+1)*key is added on each round.  For keys
+*  generated by the key utility, either ctr*key or (ctr+1)*key will
+*  have non-zero digits.  This improves randomization and also provides
+*  for a uniform output.
+*
+*  Note:  The squares RNG was based on ideas derived from Middle Square
+*  Weyl Sequence RNG.  One of the ideas was to obtain uniformity by adding
+*  the Weyl sequence after squaring.  Richard P. Brent (creator of the
+*  xorgens RNG) suggested this method.  It turns out that adding ctr*key
+*  is equivalent.  Brent's idea provides the basis for uniformity in this
+*  generator.
+*
+*  Implementation of the algorithm authored by Bernard Widynski and
+*  described in https://arxiv.org/pdf/2004.06278v2.pdf
+*/
+
+class Squares3_128
+{
+
+public:
+
+	typedef uint128_t    init_type;
+	typedef uint128_t   state_type;
+	typedef uint128_t    seed_type;
+	typedef uint128_t advance_type;
+	typedef uint64_t  result_type;
+
+	Squares3_128()=delete;
+
+	Squares3_128(size_t s, uint32_t  a=0):
+	  state_(0),
+      seed_(seed_type{ splitmix<size_t>(s) })
+    {}
+
+	Squares3_128( Squares3_128 const& other):
+	  state_(other.getState() ),
+      seed_(other.getSeed() )
+     {}
+
+	inline Squares3_128& operator=( Squares3_128 const& other)
+	{
+	  if(this==&other) return *this;
+
+	  state_  = other.getState();
+      seed_   = other.getSeed();
+
+      return *this;
+    }
+
+
+	inline result_type operator()(void)
+	{
+		uint128_t x, y, z;
+
+		y = x = seed_*state_ ; z = y + seed_;
+
+		x = x*x + y; x = rotate_right_64(x);      /* round 1 */
+
+		x = x*x + z; x = rotate_right_64(x);      /* round 2 */
+
+		++state_;                                 /* advance state */
+
+		return (x*x + y) >> 64;                   /* round 3 */
+
+	}
+
+	inline void discard(advance_type n){
+
+		state_ += n;
+	}
+
+	inline seed_type getSeed() const {
+		return seed_;
+	}
+
+	inline void setSeed(seed_type seed) {
+		seed_ = seed;
+	}
+
+	inline state_type getState() const {
+		return state_;
+	}
+
+	inline void setState( state_type state) {
+		state_ = state;
+	}
+
+	inline static uint64_t generateSeed( size_t i ){
+			return keys[i];
+		}
+
+	static constexpr result_type min(){ return 0;}
+
+	static constexpr result_type max(){ return std::numeric_limits<result_type>::max();}
+
+private:
+
+	state_type state_;
+	seed_type   seed_;
+};
+
+}  // namespace detail
+
+}  // namespace random_iterator
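+
+// Usage sketch (illustrative, not part of the original header): the engine
+// provides result_type, min(), max() and operator(), i.e. the C++
+// UniformRandomBitGenerator interface, so it can drive <random> distributions:
+//
+//    random_iterator::detail::Squares3_128 rng(0x548c9decbce65295);  // any 64-bit seed
+//    std::uniform_real_distribution<double> dist(0.0, 1.0);
+//    double u = dist(rng);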
diff --git a/random_iterator/detail/Squares3.hpp b/random_iterator/detail/Squares3_64.hpp
similarity index 95%
rename from random_iterator/detail/Squares3.hpp
rename to random_iterator/detail/Squares3_64.hpp
index 0acd748..4e4454a 100644
--- a/random_iterator/detail/Squares3.hpp
+++ b/random_iterator/detail/Squares3_64.hpp
@@ -59,7 +59,7 @@ namespace detail {
 *  described in https://arxiv.org/pdf/2004.06278v2.pdf
 */
 
-class Squares3
+class Squares3_64
 {
 
 public:
@@ -70,19 +70,19 @@ public:
 	typedef uint64_t advance_type;
 	typedef uint32_t  result_type;
 
-	Squares3()=delete;
+	Squares3_64()=delete;
 
-	Squares3(seed_type s, uint32_t  a=0):
+	Squares3_64(seed_type s, uint32_t  a=0):
 	  state_(0),
       seed_(seed_type{ splitmix<seed_type>(s) })
     {}
 
-	Squares3( Squares3 const& other):
+	Squares3_64( Squares3_64 const& other):
 	  state_(other.getState() ),
       seed_(other.getSeed() )
      {}
 
-	inline Squares3& operator=( Squares3 const& other)
+	inline Squares3_64& operator=( Squares3_64 const& other)
 	{
 	  if(this==&other) return *this;
 
diff --git a/random_iterator/detail/Squares4_128.hpp b/random_iterator/detail/Squares4_128.hpp
new file mode 100644
index 0000000..8605e50
--- /dev/null
+++ b/random_iterator/detail/Squares4_128.hpp
@@ -0,0 +1,152 @@
+/*----------------------------------------------------------------------------
+ *
+ *   Copyright (C) 2021 Antonio Augusto Alves Junior
+ *
+ *   This file is part of RandomIterator.
+ *
+ *   RandomIterator is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   RandomIterator is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with RandomIterator.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *---------------------------------------------------------------------------*/
+/*
+ * Squares4_128.hpp
+ *
+ *  Created on: 25/02/2021
+ *      Author: Antonio Augusto Alves Junior
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <random_iterator/detail/SquaresKeys.hpp>
+#include <random_iterator/detail/uint128.hpp>
+
+namespace random_iterator {
+
+namespace detail {
+
+/*
+*  Four round counter-based middle square
+*
+*  Squares4_128 - returns a 64-bit unsigned int [0,0xffffffffffffffff]
+*
+*
+*  Four rounds of squaring are performed and the result is returned.
+*  For the first three rounds, the 128-bit result is rotated right 64 bits.
+*  This places the random data in the best position for the next round.
+*  y = ctr*key or z = (ctr+1)*key is added on each round.  For keys
+*  generated by the key utility, either ctr*key or (ctr+1)*key will
+*  have non-zero digits.  This improves randomization and also provides
+*  for a uniform output.
+*
+*  Note:  The squares RNG was based on ideas derived from Middle Square
+*  Weyl Sequence RNG.  One of the ideas was to obtain uniformity by adding
+*  the Weyl sequence after squaring.  Richard P. Brent (creator of the
+*  xorgens RNG) suggested this method.  It turns out that adding ctr*key
+*  is equivalent.  Brent's idea provides the basis for uniformity in this
+*  generator.
+*
+*  Implementation of the algorithm authored by Bernard Widynski and
+*  described in https://arxiv.org/pdf/2004.06278v2.pdf
+*/
+
+class Squares4_128
+{
+
+public:
+
+	typedef uint128_t    init_type;
+	typedef uint128_t   state_type;
+	typedef uint128_t    seed_type;
+	typedef uint128_t advance_type;
+	typedef uint64_t  result_type;
+
+	Squares4_128()=delete;
+
+	Squares4_128(size_t s, uint32_t  a=0):
+	  state_(0),
+      seed_(seed_type{ splitmix<size_t>(s) })
+    {}
+
+	Squares4_128( Squares4_128 const& other):
+	  state_(other.getState() ),
+      seed_(other.getSeed() )
+     {}
+
+	inline Squares4_128& operator=( Squares4_128 const& other)
+	{
+	  if(this==&other) return *this;
+
+	  state_  = other.getState();
+      seed_   = other.getSeed();
+
+      return *this;
+    }
+
+
+	inline result_type operator()(void)
+	{
+		uint128_t x, y, z;
+
+		y = x = seed_*state_ ; z = y + seed_;
+
+		x = x*x + y; x = rotate_right_64(x);      /* round 1 */
+
+		x = x*x + z; x = rotate_right_64(x);      /* round 2 */
+
+		x = x*x + y; x = rotate_right_64(x);      /* round 3 */
+
+		++state_;                                 /* advance state */
+
+		return (x*x + z) >> 64;                   /* round 4 */
+
+	}
+
+	inline void discard(advance_type n){
+
+		state_ += n;
+	}
+
+	inline seed_type getSeed() const {
+		return seed_;
+	}
+
+	inline void setSeed(seed_type seed) {
+		seed_ = seed;
+	}
+
+	inline state_type getState() const {
+		return state_;
+	}
+
+	inline void setState( state_type state) {
+		state_ = state;
+	}
+
+	inline static uint64_t generateSeed( size_t i ){
+			return keys[i];
+		}
+
+	static constexpr result_type min(){ return 0;}
+
+	static constexpr result_type max(){ return std::numeric_limits<result_type>::max();}
+
+private:
+
+	state_type state_;
+	seed_type   seed_;
+};
+
+}  // namespace detail
+
+}  // namespace random_iterator
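+
+// Seeding sketch (illustrative, not part of the original header; the intended
+// seeding policy is an assumption): generateSeed(i) returns the i-th entry of
+// the key table from SquaresKeys.hpp, so per-index streams could be set up as
+//
+//    using random_iterator::detail::Squares4_128;
+//    Squares4_128 rng_a(Squares4_128::generateSeed(0));
+//    Squares4_128 rng_b(Squares4_128::generateSeed(1));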
diff --git a/random_iterator/detail/Squares4.hpp b/random_iterator/detail/Squares4_64.hpp
similarity index 93%
rename from random_iterator/detail/Squares4.hpp
rename to random_iterator/detail/Squares4_64.hpp
index e024d09..ed97ac3 100644
--- a/random_iterator/detail/Squares4.hpp
+++ b/random_iterator/detail/Squares4_64.hpp
@@ -34,7 +34,7 @@ namespace random_iterator {
 
 namespace detail {
 
-class Squares4
+class Squares4_64
 {
 
 public:
@@ -45,19 +45,19 @@ public:
 	typedef uint64_t advance_type;
 	typedef uint32_t  result_type;
 
-	Squares4()=delete;
+	Squares4_64()=delete;
 
-	Squares4(seed_type  s, uint32_t a=0):
+	Squares4_64(seed_type  s, uint32_t a=0):
 	  state_(0),
       seed_(seed_type{ splitmix<seed_type>(s)})
     {}
 
-	Squares4( Squares4 const& other):
+	Squares4_64( Squares4_64 const& other):
 	  state_(other.getState() ),
       seed_(other.getSeed() )
      {}
 
-	inline Squares4& operator=( Squares4 const& other)
+	inline Squares4_64& operator=( Squares4_64 const& other)
 	{
 	  if(this==&other) return *this;
 
diff --git a/random_iterator/detail/uint128.h_ b/random_iterator/detail/uint128.h_
new file mode 100644
index 0000000..875e89f
--- /dev/null
+++ b/random_iterator/detail/uint128.h_
@@ -0,0 +1,816 @@
+
+#pragma once
+
+#include <iostream>
+#include <iomanip>
+#include <cinttypes>
+#include <cmath>
+#include <string>
+#include <vector>
+#include <iterator>
+#include <utility>
+
+#ifdef __has_builtin
+# define uint128_t_has_builtin(x) __has_builtin(x)
+#else
+# define uint128_t_has_builtin(x) 0
+#endif
+
+namespace hydra {
+
+class uint128_t;
+
+namespace detail {
+
+template<typename T>
+using check_uint128_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T,uint128_t >::value, uint128_t>::type;
+
+template<typename T>
+using T_type = typename std::enable_if<std::is_integral<T>::value, T>::type;
+
+}
+
+class uint128_t
+{
+
+	uint64_t lo;
+	uint64_t hi;
+
+	uint64_t v_[2];
+
+  public:
+
+	uint128_t() = default;
+
+	__hydra_host__ __hydra_device__
+	uint128_t(uint64_t a){
+
+		lo = a ;
+		hi = 0;
+	}
+
+	__hydra_host__ __hydra_device__
+	uint128_t(uint64_t a, uint64_t b){
+
+		lo = a ;
+		hi = b;
+	}
+
+	__hydra_host__ __hydra_device__
+	uint128_t(uint128_t const& a){
+
+		lo = a.lo ;
+		hi = a.hi;
+	}
+
+    template<typename T, typename=typename std::enable_if<std::is_integral<T>::value>::type>
+    __hydra_host__ __hydra_device__
+	inline   operator T() const {
+
+		return T(lo);
+	}
+
+	__hydra_host__ __hydra_device__
+	inline uint128_t&    operator=( uint128_t const& n){
+
+		lo = n.lo;
+		hi = n.hi;
+		return * this;
+	}
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline typename std::enable_if<std::is_integral<T>::value, uint128_t&>::type
+	   operator=(T const& n){
+
+		hi = 0;
+		lo = n;
+		return * this;
+	}
+
+    //
+	// Compound assignment
+	//--------------------------------------
+	//
+	// arithmetic
+	//
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline detail::check_uint128_t<T> &  operator+=(const T  a){
+
+		uint128_t b = (uint128_t) a;
+		hi += b.hi + ((lo + b.lo) < lo);
+        lo += b.lo;
+        return *this;
+	}
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline detail::check_uint128_t<T>&   operator-=(const T  b){
+
+		uint128_t temp = (uint128_t)b;
+		if(lo < temp.lo) hi--;
+		lo -= temp.lo;
+		return * this;
+	}
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline detail::check_uint128_t<T>&  operator*=( const T  b){
+
+		 *this = mul128(*this, b);
+		return *this;
+	}
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline detail::check_uint128_t<T>& operator/=( const T  b){
+
+		 *this = divide_u128_to_u64(*this, (uint64_t)b);
+		return *this;
+	}
+
+	//
+	// increment
+	//
+	__hydra_host__ __hydra_device__
+	inline uint128_t&  operator++(){
+
+		return *this +=1;
+	}
+
+	__hydra_host__ __hydra_device__
+	inline uint128_t  operator++(int){
+		uint128_t temp = *this;
+          *this +=1;
+		return temp;
+	}
+
+	__hydra_host__ __hydra_device__
+	inline uint128_t &  operator--(){
+
+		return *this -=1;
+	}
+
+	__hydra_host__ __hydra_device__
+	inline uint128_t  operator--(int){
+		uint128_t temp = *this;
+          *this -=1;
+		return temp;
+	}
+	//
+	// bitwise
+	//
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline detail::check_uint128_t<T>& operator>>=(const T  b)
+	{
+		if(HYDRA_HOST_UNLIKELY(b==64)){
+			lo = hi;
+			hi = 0;
+		}
+		else{
+
+		lo = (lo >> b) | (hi << (int)(b - 64));
+		(b < 64) ? hi >>= b : hi = 0;
+		}
+		return *this;
+	}
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline detail::check_uint128_t<T>& operator<<=(const T  b){
+
+		if(HYDRA_HOST_UNLIKELY(b==64)){
+
+			hi = lo;
+			lo = 0;
+
+		}
+		else{
+			hi = (hi << b) | (lo << (int)(b - 64));
+			(b < 64) ? lo <<= b : lo = 0;
+		}
+		return *this;
+	}
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline detail::check_uint128_t<T>  & operator|=(const T  b){
+
+		*this = *this | b;
+		return *this;
+	}
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline detail::check_uint128_t<T>  & operator&=(const T  b){
+
+		*this = *this & b; return *this;
+	}
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	inline detail::check_uint128_t<T>  & operator^=(const T  b){
+
+		*this = *this ^ b;
+		return *this;
+	}
+
+	//
+	// friend functions
+	//-----------------------------
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend detail::check_uint128_t<T> operator>>(uint128_t a, const T  b);
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend detail::check_uint128_t<T>  operator<<(uint128_t a, const T  b);
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend detail::check_uint128_t<T>  operator+(uint128_t a, const T  b);
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend detail::check_uint128_t<T>  operator-(uint128_t a, const T  b);
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend detail::check_uint128_t<T>  operator*(uint128_t a, const T  b);
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend T operator/(uint128_t x, const T  v);
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend T operator%(uint128_t x, const T  v);
+
+	__hydra_host__ __hydra_device__
+	friend bool operator<(uint128_t const  a, uint128_t const  b);
+
+	__hydra_host__ __hydra_device__
+	friend bool operator>(uint128_t const  a, uint128_t const  b);
+
+	__hydra_host__ __hydra_device__
+	friend bool operator<=(uint128_t  const a, uint128_t const  b);
+
+	__hydra_host__ __hydra_device__
+	friend bool operator>=(uint128_t  const a, uint128_t  const b);
+
+	__hydra_host__ __hydra_device__
+	friend bool operator==(uint128_t  const a, uint128_t const b);
+
+	__hydra_host__ __hydra_device__
+	friend bool operator!=(uint128_t  const a, uint128_t  const b);
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend detail::check_uint128_t<T>  operator|(uint128_t a, const T  b);
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend detail::check_uint128_t<T>  operator&(uint128_t a, const T  b);
+
+	template <typename T>
+	__hydra_host__ __hydra_device__
+	friend detail::check_uint128_t<T>  operator^(uint128_t a, const T  b);
+
+	__hydra_host__ __hydra_device__
+	friend uint128_t operator~(uint128_t a);
+
+	__hydra_host__ __hydra_device__
+	friend uint128_t min(uint128_t  const a, uint128_t  const b);
+
+	__hydra_host__ __hydra_device__
+	friend uint128_t max(uint128_t  const a, uint128_t const  b);
+
+    // This just makes it more convenient to count leading zeros for uint128_t
+	__hydra_host__ __hydra_device__
+	friend uint64_t clz128(uint128_t  const x);
+
+	//  iostream
+	//------------------
+	friend std::ostream & operator<<(std::ostream & out, uint128_t x);
+
+	//
+	// static members
+	//------------------------
+	__hydra_host__ __hydra_device__
+	static  inline bool is_less_than(uint128_t  const& a, uint128_t  const& b){
+
+		if(a.hi < b.hi) return 1;
+		if(a.hi > b.hi) return 0;
+		if(a.lo < b.lo) return 1;
+		else return 0;
+	}
+
+	__hydra_host__ __hydra_device__
+	static  inline bool is_less_than_or_equal(uint128_t  const& a, uint128_t const&  b){
+
+		if(a.hi < b.hi) return 1;
+		if(a.hi > b.hi) return 0;
+		if(a.lo <= b.lo) return 1;
+		else return 0;
+	}
+
+	__hydra_host__ __hydra_device__
+	static  inline bool is_greater_than(uint128_t  const& a, uint128_t  const& b){
+
+		if(a.hi < b.hi) return 0;
+		if(a.hi > b.hi) return 1;
+		if(a.lo <= b.lo) return 0;
+		else return 1;
+	}
+
+	__hydra_host__ __hydra_device__
+	static  inline bool is_greater_than_or_equal(uint128_t  const& a, uint128_t  const& b)
+	{
+		if(a.hi < b.hi) return 0;
+		if(a.hi > b.hi) return 1;
+		if(a.lo < b.lo) return 0;
+		else return 1;
+	}
+
+	__hydra_host__ __hydra_device__
+	static  inline bool is_equal_to(uint128_t  const& a, uint128_t  const& b)
+	{
+		if(a.lo == b.lo && a.hi == b.hi) return 1;
+		else return 0;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline  bool is_not_equal_to(uint128_t  const& a, uint128_t  const& b)
+	{
+		if(a.lo != b.lo || a.hi != b.hi) return 1;
+		else return 0;
+	}
+
+
+	//   bit operations
+	//---------------------
+
+	/// This counts leading zeros for 64 bit unsigned integers.  It is used internally
+	/// in a few of the functions defined below.
+	__hydra_host__ __hydra_device__
+	static inline int clz64(uint64_t const&  x)
+	{
+		int res;
+#ifdef __CUDA_ARCH__
+		res = __clzll(x);
+#elif __GNUC__ || uint128_t_has_builtin(__builtin_clzll)
+		res = __builtin_clzll(x);
+#elif __x86_64__
+		asm("bsr %1, %0\nxor $0x3f, %0" : "=r" (res) : "rm" (x) : "cc", "flags");
+#elif __aarch64__
+		asm("clz %0, %1" : "=r" (res) : "r" (x));
+#else
+# error Architecture not supported
+#endif
+		return res;
+	}
+
+
+	__hydra_host__ __hydra_device__
+	static  inline uint128_t bitwise_or(uint128_t a, uint128_t const&  b)
+	{
+		a.lo |= b.lo;
+		a.hi |= b.hi;
+		return a;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline  uint128_t bitwise_and(uint128_t a, uint128_t  const& b)
+	{
+		a.lo &= b.lo;
+		a.hi &= b.hi;
+		return a;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline  uint128_t bitwise_xor(uint128_t a, uint128_t const&  b)
+	{
+		a.lo ^= b.lo;
+		a.hi ^= b.hi;
+		return a;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline  uint128_t bitwise_not(uint128_t a)
+	{
+		a.lo = ~a.lo;
+		a.hi = ~a.hi;
+		return a;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline  uint128_t   rotate_right64( uint128_t x)
+	{
+		uint64_t temp = x.lo;
+		x.lo = x.hi;
+		x.hi = temp;
+		return x;
+	}
+	__hydra_host__ __hydra_device__
+	static inline uint128_t  pow2_add(uint128_t const x, uint128_t const y)
+	{
+		uint128_t z;
+#ifdef __CUDA_ARCH__
+		z.lo = x.lo * x.lo;
+		z.hi = __mul64hi(x.lo, x.lo);
+#elif __x86_64__
+		asm( "mulq %3\n\t"
+				: "=a" (z.lo), "=d" (z.hi)
+				  : "%0" (x.lo), "rm" (x.lo));
+#endif
+		z.hi +=2*(x.hi * x.lo);
+		z.hi += y.hi + ((z.lo + y.lo) < z.lo);
+		z.lo += y.lo;
+
+		return z;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t  pow2_add_rotate64(uint128_t const x, uint128_t const y)
+	{
+		uint128_t z;
+#ifdef __CUDA_ARCH__
+		z.lo = x.lo * x.lo;
+		z.hi = __mul64hi(x.lo, x.lo);
+#elif __x86_64__
+		asm( "mulq %3\n\t"
+				: "=a" (z.lo), "=d" (z.hi)
+				  : "%0" (x.lo), "rm" (x.lo));
+#endif
+		z.hi +=2*(x.hi * x.lo);
+		z.hi += y.hi + ((z.lo + y.lo) < z.lo);
+		z.lo += y.lo;
+		uint64_t temp = z.lo;
+		z.lo = z.hi;
+		z.hi = temp;
+		return z;
+	}
+
+	//   arithmetic
+	//-------------------
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t  add128(uint128_t x, uint128_t const y)
+	{
+		x.hi += y.hi + ((x.lo + y.lo) < x.lo);
+		x.lo += y.lo;
+		return x;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t  add128(uint128_t x, uint64_t  const y)
+	{
+		x+=uint128_t(y);
+		return x;
+
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t   mul128(uint64_t const  x, uint64_t  const y)
+	{
+		uint128_t res;
+#ifdef __CUDA_ARCH__
+		res.lo = x * y;
+		res.hi = __mul64hi(x, y);
+#elif __x86_64__
+		asm( "mulq %3\n\t"
+				: "=a" (res.lo), "=d" (res.hi)
+				  : "%0" (x), "rm" (y));
+#elif __aarch64__
+		asm( "mul %0, %2, %3\n\t"
+				"umulh %1, %2, %3\n\t"
+				: "=&r" (res.lo), "=r" (res.hi)
+				  : "r" (x), "r" (y));
+#else
+# error Architecture not supported
+#endif
+		return res;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t   mul128(uint128_t  const x, uint128_t  const y)
+	{
+		uint128_t z;
+#ifdef __CUDA_ARCH__
+		z.lo = x.lo * y.lo;
+		z.hi = __mul64hi(x.lo, y.lo);
+#elif __x86_64__
+		asm( "mulq %3\n\t"
+				: "=a" (z.lo), "=d" (z.hi)
+				  : "%0" (x.lo), "rm" (y.lo));
+#endif
+		z.hi +=(x.hi * y.lo) + (x.lo * y.hi);
+		return z;
+	}
+
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t   mul128(uint128_t  const x, uint64_t  const y)
+	{
+		uint128_t res;
+#ifdef __CUDA_ARCH__
+		res.lo = x.lo * y;
+		res.hi = __mul64hi(x.lo, y);
+		res.hi += x.hi * y;
+#elif __x86_64__
+		asm( "mulq %3\n\t"
+				: "=a" (res.lo), "=d" (res.hi)
+				  : "%0" (x.lo), "rm" (y));
+		res.hi += x.hi * y;
+#elif __aarch64__
+		res.lo = x.lo * y;
+		asm( "umulh %0, %1, %2\n\t"
+				: "=r" (res.hi)
+				  : "r" (x.lo), "r" (y));
+		res.hi += x.hi * y;
+#else
+# error Architecture not supported
+#endif
+		return res;
+	}
+
+	// taken from libdivide's adaptation of this implementation originally in
+	// Hacker's Delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt
+	// License permits inclusion here per:
+	// http://www.hackersdelight.org/permissions.htm
+	__hydra_host__ __hydra_device__
+	static inline uint64_t divide_u128_to_u64(uint128_t  const& x, uint64_t v, uint64_t * r = NULL) // x / v
+	{
+		const uint64_t b = 1ull << 32;
+		uint64_t  un1, un0,
+		vn1, vn0,
+		q1, q0,
+		un64, un21, un10,
+		rhat;
+		int s;
+
+		if(x.hi >= v){
+			if( r != NULL) *r = (uint64_t) -1;
+			return  (uint64_t) -1;
+		}
+
+		s = clz64(v);
+
+		if(s > 0){
+			v = v << s;
+			un64 = (x.hi << s) | ((x.lo >> (64 - s)) & (-s >> 31));
+			un10 = x.lo << s;
+		}else{
+			un64 = x.lo | x.hi;
+			un10 = x.lo;
+		}
+
+		vn1 = v >> 32;
+		vn0 = v & 0xffffffff;
+
+		un1 = un10 >> 32;
+		un0 = un10 & 0xffffffff;
+
+		q1 = un64/vn1;
+		rhat = un64 - q1*vn1;
+
+		again1:
+		if (q1 >= b || q1*vn0 > b*rhat + un1){
+			q1 -= 1;
+			rhat = rhat + vn1;
+			if(rhat < b) goto again1;
+		}
+
+		un21 = un64*b + un1 - q1*v;
+
+		q0 = un21/vn1;
+		rhat = un21 - q0*vn1;
+		again2:
+		if(q0 >= b || q0 * vn0 > b*rhat + un0){
+			q0 = q0 - 1;
+			rhat = rhat + vn1;
+			if(rhat < b) goto again2;
+		}
+
+		if(r != NULL) *r = (un21*b + un0 - q0*v) >> s;
+		return q1*b + q0;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t divide_u128_to_u128(uint128_t  x, uint64_t v, uint64_t * r = NULL)
+	{
+		uint128_t res;
+
+		res.hi = x.hi/v;
+		x.hi %= v;
+		res.lo = divide_u128_to_u64(x, v, r);
+
+		return res;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t sub128(uint128_t  const& x, uint128_t const&  y) // x - y
+	{
+		uint128_t res;
+
+		res.lo = x.lo - y.lo;
+		res.hi = x.hi - y.hi;
+		if(x.lo < y.lo) res.hi--;
+
+		return res;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint64_t _isqrt(uint64_t  const& x)
+	{
+		uint64_t res0 = 0;
+#ifdef __CUDA_ARCH__
+		res0 = ::sqrtf(x);
+#else
+		res0 = ::sqrt(x);
+		for(uint16_t i = 0; i < 8; i++)
+			res0 = (res0 + x/res0) >> 1;
+#endif
+		return res0;
+	}
+
+	//    roots
+	//-------------------
+
+	__hydra_host__ __hydra_device__
+	static inline uint64_t _isqrt(const uint128_t & x) // this gives errors above 2^124
+	{
+		uint64_t res0 = 0;
+
+		if( (x.hi == 0 && x.lo == 0) || x.hi > 1ull << 60)
+			return 0;
+
+#ifdef __CUDA_ARCH__
+		res0 = ::sqrtf(u128_to_float(x));
+#else
+		res0 = ::sqrt(u128_to_float(x));
+#endif
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+		for(uint16_t i = 0; i < 8; i++)
+			res0 = (res0 + x/res0) >> 1;
+
+		return res0;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint64_t _icbrt(const uint128_t & x)
+	{
+		uint64_t res0 = 0;
+
+#ifdef __CUDA_ARCH__
+		res0 = cbrtf(u128_to_float(x));
+#else
+		res0 = std::cbrt(u128_to_float(x));
+#endif
+#ifdef __CUDA_ARCH__
+#pragma unroll
+#endif
+		for(uint16_t i = 0; i < 47; i++) // there needs to be an odd number of iterations
+			// for the case of numbers of the form x^2 - 1
+			// where this will not converge
+			res0 = (res0 + divide_u128_to_u128(x,res0)/res0) >> 1;
+		return res0;
+	}
+
+	__hydra_host__ __hydra_device__
+	// this function is to avoid off by 1 errors from nesting integer square roots
+	static inline uint64_t _iqrt(const uint128_t & x)
+	{
+		uint64_t res0 = 0, res1 = 0;
+
+		res0 = _isqrt(_isqrt(x));
+		res1 = (res0 + divide_u128_to_u128(x,res0*res0)/res0) >> 1;
+		res0 = (res1 + divide_u128_to_u128(x,res1*res1)/res1) >> 1;
+
+		return res0 < res1 ? res0 : res1;
+	}
+
+	//  typecasting
+	//-------------------------
+	static inline uint128_t string_to_u128(std::string s)
+	{
+		uint128_t res = 0;
+		for(std::string::iterator iter = s.begin(); iter != s.end() && (int) *iter >= 48; iter++){
+			res = mul128(res, 10);
+			res += (uint16_t) *iter - 48;
+		}
+		return res;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint64_t u128_to_u64(uint128_t x){
+
+		return x.lo;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline double u128_to_double(uint128_t x)
+	{
+		double dbl;
+#ifdef __CUDA_ARCH__
+		if(x.hi == 0) return __ull2double_rd(x.lo);
+#else
+		if(x.hi == 0) return (double) x.lo;
+#endif
+		uint64_t r = clz64(x.hi);
+		x <<= r;
+
+#ifdef __CUDA_ARCH__
+		dbl = __ull2double_rd(x.hi);
+#else
+		dbl = (double) x.lo;
+#endif
+
+		dbl *= (1ull << (64-r));
+
+		return dbl;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline float u128_to_float(uint128_t x)
+	{
+		float flt;
+#ifdef __CUDA_ARCH__
+		if(x.hi == 0) return __ull2float_rd(x.lo);
+#else
+		if(x.hi == 0) return (float) x.lo;
+#endif
+		uint64_t r = clz64(x.hi);
+		x <<= r;
+
+#ifdef __CUDA_ARCH__
+		flt = __ull2float_rd(x.hi);
+#else
+		flt = (float) x.hi;
+#endif
+
+		flt *= (1ull << (64-r));
+		flt *= 2;
+
+		return flt;
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t double_to_u128(double dbl)
+	{
+		uint128_t x;
+		if(dbl < 1 || dbl > 1e39) return 0;
+		else{
+
+#ifdef __CUDA_ARCH__
+			uint32_t shft = __double2uint_rd(log2(dbl));
+			uint64_t divisor = 1ull << shft;
+			dbl /= divisor;
+			x.lo = __double2ull_rd(dbl);
+			x <<= shft;
+#else
+			uint32_t shft = (uint32_t) log2(dbl);
+			uint64_t divisor = 1ull << shft;
+			dbl /= divisor;
+			x.lo = (uint64_t) dbl;
+			x <<= shft;
+#endif
+			return x;
+		}
+	}
+
+	__hydra_host__ __hydra_device__
+	static inline uint128_t float_to_u128(float flt)
+	{
+		uint128_t x;
+		if(flt < 1 || flt > 1e39) return 0;
+		else{
+
+#ifdef __CUDA_ARCH__
+			uint32_t shft = __double2uint_rd(log2(flt));
+			uint64_t divisor = 1ull << shft;
+			flt /= divisor;
+			x.lo = __double2ull_rd(flt);
+			x <<= shft;
+#else
+			uint32_t shft = (uint32_t) log2(flt);
+			uint64_t divisor = 1ull << shft;
+			flt /= divisor;
+			x.lo = (uint64_t) flt;
+			x <<= shft;
+#endif
+			return x;
+		}
+	}
+
+}; // class uint128_t
+
+}  // namespace hydra
+
+#include <hydra/detail/random/detail/uint128.inl>
+
diff --git a/random_iterator/detail/uint128.hpp b/random_iterator/detail/uint128.hpp
new file mode 100644
index 0000000..970ddfe
--- /dev/null
+++ b/random_iterator/detail/uint128.hpp
@@ -0,0 +1,677 @@
+/*----------------------------------------------------------------------------
+ *
+ *   Copyright (C) 2021 Antonio Augusto Alves Junior
+ *
+ *   This file is part of RandomIterator.
+ *
+ *   RandomIterator is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   RandomIterator is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with RandomIterator.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *---------------------------------------------------------------------------*/
+/*
+ * uint128.hpp
+ *
+ *  Created on: 01/03/2021
+ *      Author: Antonio Augusto Alves Junior
+ */
+
+#pragma once
+
+#include <random_iterator/detail/Macros.h>
+#include <iostream>
+#include <iomanip>
+#include <cinttypes>
+#include <cmath>
+#include <string>
+#include <vector>
+#include <iterator>
+#include <utility>
+
+namespace random_iterator {
+
+class uint128_t;
+
+namespace detail {
+
+template<typename T>
+using check_uint128_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T,uint128_t >::value, uint128_t>::type;
+
+template<typename T>
+using T_type = typename std::enable_if<std::is_integral<T>::value, T>::type;
+
+}
+
+class uint128_t
+{
+	uint64_t v_[2];
+
+  public:
+
+	uint128_t(): v_({0,0}) {}
+
+	uint128_t(uint64_t a): v_({a,0}){}
+
+	uint128_t(uint64_t a, uint64_t b):
+	 v_({a,b})
+	{}
+
+	uint128_t(uint128_t const& other){
+
+		v_[0] = other.v_[0] ;
+		v_[1] = other.v_[1];
+	}
+
+	inline uint128_t& operator=(uint128_t const& other){
+
+		if(this==&other) return *this;
+		v_[0] = other.v_[0];
+		v_[1] = other.v_[1];
+		return *this;
+	}
+
+	template <typename T>
+	inline typename std::enable_if<std::is_integral<T>::value, uint128_t&>::type
+	operator=(T const& other){
+
+		v_[1] = 0;
+		v_[0] = other;
+		return * this;
+	}
+
+
+    template<typename T,  typename=typename std::enable_if<std::is_integral<T>::value>::type>
+	inline   operator T() const {
+		return T(v_[0]);
+	}
+
+    //
+	// Compound assignment
+	//--------------------------------------
+	//
+	// arithmetic
+	//
+
+	template <typename T>
+	inline uint128_t& operator+=(const T  a){
+
+		uint128_t b = (uint128_t) a;
+		v_[1] += b.v_[1] + ((v_[0] + b.v_[0]) < v_[0]);
+        v_[0] += b.v_[0];
+        return *this;
+	}
+
+	template <typename T>
+	inline uint128_t& operator-=(const T  b){
+
+		uint128_t temp = (uint128_t)b;
+		if(v_[0] < temp.v_[0]) v_[1]--;
+		v_[0] -= temp.v_[0];
+		return * this;
+	}
+
+	template <typename T>
+	inline uint128_t&  operator*=( const T  b){
+
+		*this = uint128_t::mul128(*this, b);
+		return *this;
+	}
+
+	template <typename T>
+	inline uint128_t& operator/=( const T  b){
+
+		*this = uint128_t::divide_u128_to_u64(*this, (uint64_t)b);
+		return *this;
+	}
+
+	//
+	// increment
+	//
+	inline uint128_t&  operator++(){
+
+		return *this +=1;
+	}
+
+	inline uint128_t  operator++(int){
+		uint128_t temp = *this;
+          *this +=1;
+		return temp;
+	}
+
+	inline uint128_t&  operator--(){
+
+		return *this -=1;
+	}
+
+	inline uint128_t  operator--(int){
+		uint128_t temp = *this;
+        *this -=1;
+		return temp;
+	}
+
+	//
+	// bitwise
+	//
+
+	template <typename T>
+	inline uint128_t& operator>>=(const T  b)
+	{
+		if(_LIKELY_(b==64)){
+			v_[0] = v_[1];
+			v_[1] = 0;
+		}
+		else{
+
+		v_[0] = (v_[0] >> b) | (v_[1] << (int)(b - 64));
+		(b < 64) ? v_[1] >>= b : v_[1] = 0;
+		}
+		return *this;
+	}
+
+	template <typename T>
+	inline uint128_t& operator<<=(const T  b){
+
+		if(_LIKELY_(b==64)){
+
+			v_[1] = v_[0];
+			v_[0] = 0;
+		}
+		else{
+			v_[1] = (v_[1] << b) | (v_[0] << (int)(b - 64));
+			(b < 64) ? v_[0] <<= b : v_[0] = 0;
+		}
+		return *this;
+	}
+
+	template <typename T>
+	inline uint128_t& operator|=(const T  b){
+
+		*this = *this | b;
+		return *this;
+	}
+
+	template <typename T>
+	inline uint128_t& operator&=(const T  b){
+
+		*this = *this & b; return *this;
+	}
+
+	template <typename T>
+	inline uint128_t& operator^=(const T  b){
+
+		*this = *this ^ b;
+		return *this;
+	}
+
+	//
+	// friend functions
+	//-----------------------------
+
+	template <typename T>
+	friend uint128_t operator>>(uint128_t a, const T  b);
+
+	template <typename T>
+	friend uint128_t  operator<<(uint128_t a, const T  b);
+
+	template <typename T>
+	friend uint128_t operator+(uint128_t a, const T  b);
+
+	template <typename T>
+	friend uint128_t operator-(uint128_t a, const T  b);
+
+	template <typename T>
+	friend  uint128_t operator*(uint128_t a, const T  b);
+
+	template <typename T>
+	friend  uint128_t operator/(uint128_t x, const T  v);
+
+	template <typename T>
+	friend  uint128_t operator%(uint128_t x, const T  v);
+
+	friend bool operator<(uint128_t const  a, uint128_t const  b);
+
+	friend bool operator>(uint128_t const  a, uint128_t const  b);
+
+	friend bool operator<=(uint128_t  const a, uint128_t const  b);
+
+	friend bool operator>=(uint128_t  const a, uint128_t  const b);
+
+	friend bool operator==(uint128_t  const a, uint128_t const b);
+
+	friend bool operator!=(uint128_t  const a, uint128_t  const b);
+
+	template <typename T>
+	friend uint128_t operator|(uint128_t a, const T  b);
+
+	template <typename T>
+	friend uint128_t operator&(uint128_t a, const T  b);
+
+	template <typename T>
+	friend uint128_t operator^(uint128_t a, const T  b);
+
+	friend uint128_t operator~(uint128_t a);
+
+	friend uint128_t min(uint128_t  const a, uint128_t  const b);
+
+	friend uint128_t max(uint128_t  const a, uint128_t const  b);
+
+    // This just makes it more convenient to count leading zeros for uint128_t
+	friend uint64_t clz128(uint128_t  const x);
+
+	// This just makes it more convenient to count leading zeros for uint128_t
+	friend  uint128_t  rotate_right_64( uint128_t x);
+
+	//  iostream
+	//------------------
+	friend std::ostream & operator<<(std::ostream & out, uint128_t x);
+
+	//
+	// static members
+	//------------------------
+	static  inline bool is_less_than(uint128_t  const& a, uint128_t  const& b){
+
+		if(a.v_[1] < b.v_[1]) return 1;
+		if(a.v_[1] > b.v_[1]) return 0;
+		if(a.v_[0] < b.v_[0]) return 1;
+		else return 0;
+	}
+
+	static  inline bool is_less_than_or_equal(uint128_t  const& a, uint128_t const&  b){
+
+		if(a.v_[1] < b.v_[1]) return 1;
+		if(a.v_[1] > b.v_[1]) return 0;
+		if(a.v_[0] <= b.v_[0]) return 1;
+		else return 0;
+	}
+
+	static  inline bool is_greater_than(uint128_t  const& a, uint128_t  const& b){
+
+		if(a.v_[1] < b.v_[1]) return 0;
+		if(a.v_[1] > b.v_[1]) return 1;
+		if(a.v_[0] <= b.v_[0]) return 0;
+		else return 1;
+	}
+
+	static  inline bool is_greater_than_or_equal(uint128_t  const& a, uint128_t  const& b)
+	{
+		if(a.v_[1] < b.v_[1]) return 0;
+		if(a.v_[1] > b.v_[1]) return 1;
+		if(a.v_[0] < b.v_[0]) return 0;
+		else return 1;
+	}
+
+	static  inline bool is_equal_to(uint128_t  const& a, uint128_t  const& b)
+	{
+		if(a.v_[0] == b.v_[0] && a.v_[1] == b.v_[1]) return 1;
+		else return 0;
+	}
+
+	static inline  bool is_not_equal_to(uint128_t  const& a, uint128_t  const& b)
+	{
+		if(a.v_[0] != b.v_[0] || a.v_[1] != b.v_[1]) return 1;
+		else return 0;
+	}
+
+
+	//   bit operations
+	//---------------------
+
+	/// This counts leading zeros for 64 bit unsigned integers.  It is used internally
+	/// in a few of the functions defined below.
+	static inline int clz64(uint64_t const&  x)
+	{
+		int res;
+
+		asm("bsr %1, %0\nxor $0x3f, %0" : "=r" (res) : "rm" (x) : "cc", "flags");
+
+		return res;
+	}
+
+	static  inline uint128_t bitwise_or(uint128_t a, uint128_t const&  b)
+	{
+		a.v_[0] |= b.v_[0];
+		a.v_[1] |= b.v_[1];
+		return a;
+	}
+
+	static inline  uint128_t bitwise_and(uint128_t a, uint128_t  const& b)
+	{
+		a.v_[0] &= b.v_[0];
+		a.v_[1] &= b.v_[1];
+		return a;
+	}
+
+	static inline  uint128_t bitwise_xor(uint128_t a, uint128_t const&  b)
+	{
+		a.v_[0] ^= b.v_[0];
+		a.v_[1] ^= b.v_[1];
+		return a;
+	}
+
+	static inline  uint128_t bitwise_not(uint128_t a)
+	{
+		a.v_[0] = ~a.v_[0];
+		a.v_[1] = ~a.v_[1];
+		return a;
+	}
+
+	static inline  uint128_t   rotate_right64( uint128_t x)
+	{
+		uint64_t temp = x.v_[0];
+		x.v_[0] = x.v_[1];
+		x.v_[1] = temp;
+		return x;
+	}
+
+	static inline uint128_t  pow2_add(uint128_t const x, uint128_t const y)
+	{
+		uint128_t z;
+
+		asm( "mulq %3\n\t"
+				: "=a" (z.v_[0]), "=d" (z.v_[1])
+				: "%0" (x.v_[0]), "rm" (x.v_[0]));
+
+		z.v_[1] +=2*(x.v_[1] * x.v_[0]);
+		z.v_[1] += y.v_[1] + ((z.v_[0] + y.v_[0]) < z.v_[0]);
+		z.v_[0] += y.v_[0];
+
+		return z;
+	}
+
+	static inline uint128_t  pow2_add_rotate64(uint128_t const x, uint128_t const y)
+	{
+		uint128_t z;
+
+		asm( "mulq %3\n\t"
+				: "=a" (z.v_[0]), "=d" (z.v_[1])
+				  : "%0" (x.v_[0]), "rm" (x.v_[0]));
+
+		z.v_[1] +=2*(x.v_[1] * x.v_[0]);
+		z.v_[1] += y.v_[1] + ((z.v_[0] + y.v_[0]) < z.v_[0]);
+		z.v_[0] += y.v_[0];
+		uint64_t temp = z.v_[0];
+		z.v_[0] = z.v_[1];
+		z.v_[1] = temp;
+		return z;
+	}
+
+	//   arithmetic
+	//-------------------
+
+	static inline uint128_t  add128(uint128_t x, uint128_t const y)
+	{
+		x.v_[1] += y.v_[1] + ((x.v_[0] + y.v_[0]) < x.v_[0]);
+		x.v_[0] += y.v_[0];
+		return x;
+	}
+
+	static inline uint128_t  add128(uint128_t x, uint64_t  const y)
+	{
+		x+=uint128_t(y);
+		return x;
+
+	}
+
+	static inline uint128_t   mul128(uint64_t const  x, uint64_t  const y)
+	{
+		uint128_t res;
+
+		asm( "mulq %3\n\t"
+				: "=a" (res.v_[0]), "=d" (res.v_[1])
+				  : "%0" (x), "rm" (y));
+
+		return res;
+	}
+
+	static inline uint128_t   mul128(uint128_t  const x, uint128_t  const y)
+	{
+		uint128_t z;
+
+		asm( "mulq %3\n\t"
+				: "=a" (z.v_[0]), "=d" (z.v_[1])
+				  : "%0" (x.v_[0]), "rm" (y.v_[0]));
+
+		z.v_[1] +=(x.v_[1] * y.v_[0]) + (x.v_[0] * y.v_[1]);
+		return z;
+	}
+
+	static inline uint128_t   mul128(uint128_t  const x, uint64_t  const y)
+	{
+		uint128_t res;
+
+		asm( "mulq %3\n\t"
+				: "=a" (res.v_[0]), "=d" (res.v_[1])
+				  : "%0" (x.v_[0]), "rm" (y));
+		res.v_[1] += x.v_[1] * y;
+
+		return res;
+	}
+
+	// taken from libdivide's adaptation of this implementation originally in
+	// Hacker's Delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt
+	// License permits inclusion here per:
+	// http://www.hackersdelight.org/permissions.htm
+	static inline uint64_t divide_u128_to_u64(uint128_t  const& x, uint64_t v, uint64_t * r = NULL) // x / v
+	{
+		const uint64_t b = 1ull << 32;
+		uint64_t  un1, un0,
+		vn1, vn0,
+		q1, q0,
+		un64, un21, un10,
+		rhat;
+		int s;
+
+		if(x.v_[1] >= v){
+			if( r != NULL) *r = (uint64_t) -1;
+			return  (uint64_t) -1;
+		}
+
+		s = clz64(v);
+
+		if(s > 0){
+			v = v << s;
+			un64 = (x.v_[1] << s) | ((x.v_[0] >> (64 - s)) & (-s >> 31));
+			un10 = x.v_[0] << s;
+		}else{
+			un64 = x.v_[0] | x.v_[1];
+			un10 = x.v_[0];
+		}
+
+		vn1 = v >> 32;
+		vn0 = v & 0xffffffff;
+
+		un1 = un10 >> 32;
+		un0 = un10 & 0xffffffff;
+
+		q1 = un64/vn1;
+		rhat = un64 - q1*vn1;
+
+		again1:
+		if (q1 >= b || q1*vn0 > b*rhat + un1){
+			q1 -= 1;
+			rhat = rhat + vn1;
+			if(rhat < b) goto again1;
+		}
+
+		un21 = un64*b + un1 - q1*v;
+
+		q0 = un21/vn1;
+		rhat = un21 - q0*vn1;
+		again2:
+		if(q0 >= b || q0 * vn0 > b*rhat + un0){
+			q0 = q0 - 1;
+			rhat = rhat + vn1;
+			if(rhat < b) goto again2;
+		}
+
+		if(r != NULL) *r = (un21*b + un0 - q0*v) >> s;
+		return q1*b + q0;
+	}
+
+	static inline uint128_t divide_u128_to_u128(uint128_t  x, uint64_t v, uint64_t * r = NULL)
+	{
+		uint128_t res;
+
+		res.v_[1] = x.v_[1]/v;
+		x.v_[1] %= v;
+		res.v_[0] = divide_u128_to_u64(x, v, r);
+
+		return res;
+	}
+
+	static inline uint128_t sub128(uint128_t  const& x, uint128_t const&  y) // x - y
+	{
+		uint128_t res;
+
+		res.v_[0] = x.v_[0] - y.v_[0];
+		res.v_[1] = x.v_[1] - y.v_[1];
+		if(x.v_[0] < y.v_[0]) res.v_[1]--;
+
+		return res;
+	}
+
+	static inline uint64_t _isqrt(uint64_t  const& x)
+	{
+		uint64_t res0 = 0;
+		res0 = ::sqrt(x);
+		for(uint16_t i = 0; i < 8; i++)
+			res0 = (res0 + x/res0) >> 1;
+		return res0;
+	}
+
+	//    roots
+	//-------------------
+
+	static inline uint64_t _isqrt(const uint128_t & x) // this gives errors above 2^124
+	{
+		uint64_t res0 = 0;
+
+		if( (x.v_[1] == 0 && x.v_[0] == 0) || x.v_[1] > 1ull << 60)
+			return 0;
+
+		res0 = ::sqrt(u128_to_float(x));
+
+		for(uint16_t i = 0; i < 8; i++)
+			res0 = (res0 + x/res0) >> 1;
+
+		return res0;
+	}
+
+	static inline uint64_t _icbrt(const uint128_t & x)
+	{
+		uint64_t res0 = 0;
+
+		res0 = std::cbrt(u128_to_float(x));
+		for(uint16_t i = 0; i < 47; i++) // there needs to be an odd number of iterations
+			// for the case of numbers of the form x^2 - 1
+			// where this will not converge
+			res0 = (res0 + divide_u128_to_u128(x,res0)/res0) >> 1;
+		return res0;
+	}
+
+	// this function is to avoid off by 1 errors from nesting integer square roots
+	static inline uint64_t _iqrt(const uint128_t & x)
+	{
+		uint64_t res0 = 0, res1 = 0;
+
+		res0 = _isqrt(_isqrt(x));
+		res1 = (res0 + divide_u128_to_u128(x,res0*res0)/res0) >> 1;
+		res0 = (res1 + divide_u128_to_u128(x,res1*res1)/res1) >> 1;
+
+		return res0 < res1 ? res0 : res1;
+	}
+
+	//  typecasting
+	//-------------------------
+	static inline uint128_t string_to_u128(std::string s)
+	{
+		uint128_t res = 0;
+		for(std::string::iterator iter = s.begin(); iter != s.end() && (int) *iter >= 48; iter++){
+			res = mul128(res, 10);
+			res += (uint16_t) *iter - 48;
+		}
+		return res;
+	}
+
+	static inline uint64_t u128_to_u64(uint128_t x){
+
+		return x.v_[0];
+	}
+
+	static inline double u128_to_double(uint128_t x)
+	{
+		double dbl;
+
+		if(x.v_[1] == 0) return (double) x.v_[0];
+
+		uint64_t r = clz64(x.v_[1]);
+		x <<= r;
+
+		dbl = (double) x.v_[0];
+
+		dbl *= (1ull << (64-r));
+
+		return dbl;
+	}
+
+	static inline float u128_to_float(uint128_t x)
+	{
+		float flt;
+
+		if(x.v_[1] == 0) return (float) x.v_[0];
+		uint64_t r = clz64(x.v_[1]);
+		x <<= r;
+
+		flt = (float) x.v_[1];
+
+
+		flt *= (1ull << (64-r));
+		flt *= 2;
+
+		return flt;
+	}
+
+	static inline uint128_t double_to_u128(double dbl)
+	{
+		uint128_t x;
+		if(dbl < 1 || dbl > 1e39) return 0;
+		else{
+
+			uint32_t shft = (uint32_t) log2(dbl);
+			uint64_t divisor = 1ull << shft;
+			dbl /= divisor;
+			x.v_[0] = (uint64_t) dbl;
+			x <<= shft;
+			return x;
+		}
+	}
+
+	static inline uint128_t float_to_u128(float flt)
+	{
+		uint128_t x;
+		if(flt < 1 || flt > 1e39) return 0;
+		else{
+
+			uint32_t shft = (uint32_t) log2(flt);
+			uint64_t divisor = 1ull << shft;
+			flt /= divisor;
+			x.v_[0] = (uint64_t) flt;
+			x <<= shft;
+			return x;
+		}
+	}
+
+}; // class uint128_t
+
+}  // namespace random_iterator
+
+#include <random_iterator/detail/uint128.inl>
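+
+// Illustrative note (assumes an x86_64 host, which the inline asm above
+// requires): the Squares*_128 engines use this type through its overloaded
+// operators, squaring a 128-bit value, adding ctr*key and keeping the high
+// 64 bits:
+//
+//    random_iterator::uint128_t key(0x123456789abcdef1ull);          // made-up key value
+//    random_iterator::uint128_t x = key * random_iterator::uint128_t(42);
+//    uint64_t high = (x*x + key) >> 64;  // same shape as the final round of Squares3_128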
diff --git a/random_iterator/detail/uint128.inl b/random_iterator/detail/uint128.inl
new file mode 100644
index 0000000..f59316e
--- /dev/null
+++ b/random_iterator/detail/uint128.inl
@@ -0,0 +1,179 @@
+/*----------------------------------------------------------------------------
+ *
+ *   Copyright (C) 2016 - 2021 Antonio Augusto Alves Junior
+ *
+ *   This file is part of Hydra Data Analysis Framework.
+ *
+ *   Hydra is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   Hydra is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Hydra.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *---------------------------------------------------------------------------*/
+
+/*
+ * uint128.inl
+ *
+ *  Created on: 04/11/2020
+ *      Author: Antonio Augusto Alves Junior
+ */
+
+#pragma once
+
+namespace random_iterator  {
+
+template <typename T>
+inline uint128_t operator>>(uint128_t a, const T  b) {
+
+	a >>= b;
+	return a;
+}
+
+template <typename T>
+inline uint128_t operator<<(uint128_t a, const T b) {
+
+	a <<= b;
+	return a;
+}
+
+template <typename T>
+inline uint128_t operator+(uint128_t a, const T  b) {
+
+	return uint128_t::add128(a, b);
+}
+
+template <typename T>
+inline uint128_t operator-(uint128_t a, const T  b) {
+
+	return uint128_t::sub128(a, (uint128_t)b);
+}
+
+template <typename T>
+inline uint128_t operator*(uint128_t a, const T  b) {
+
+	return uint128_t::mul128(a, b);
+}
+
+template <typename T>
+inline T operator/(uint128_t x, const T  v) {
+
+	return uint128_t::divide_u128_to_u64(x, (uint64_t)v);
+}
+
+template <typename T>
+inline T operator%(uint128_t x, const T & v) {
+
+	uint64_t res;
+	uint128_t::divide_u128_to_u64(x, v, &res);
+	return (T)res;
+}
+
+inline bool operator<(uint128_t const  a, uint128_t  const b) {
+
+	return uint128_t::is_less_than(a, b);
+}
+
+inline bool operator>(uint128_t  const a, uint128_t  const b) {
+
+	return uint128_t::is_greater_than(a, b);
+}
+
+inline bool operator<=(uint128_t  const a, uint128_t const  b) {
+
+	return uint128_t::is_less_than_or_equal(a, b);
+}
+
+inline bool operator>=(uint128_t  const a, uint128_t  const b) {
+
+	return uint128_t::is_greater_than_or_equal(a, b);
+}
+
+inline bool operator==(uint128_t const  a, uint128_t  const b) {
+
+	return uint128_t::is_equal_to(a, b);
+}
+
+inline bool operator!=(uint128_t const  a, uint128_t const  b) {
+
+	return uint128_t::is_not_equal_to(a, b);
+}
+
+template <typename T>
+inline uint128_t operator|(uint128_t a, const T  b) {
+
+	return uint128_t::bitwise_or(a, (uint128_t)b);
+}
+
+template <typename T>
+inline uint128_t operator&(uint128_t a, const T  b) {
+
+	return uint128_t::bitwise_and(a, (uint128_t)b);
+}
+
+template <typename T>
+inline uint128_t operator^(uint128_t a, const T  b) {
+
+	return uint128_t::bitwise_xor(a, (uint128_t)b);
+}
+
+inline uint128_t operator~(uint128_t a) {
+
+	return uint128_t::bitwise_not(a);
+}
+
+inline uint128_t min(uint128_t const& a, uint128_t const b) {
+
+	return a < b ? a : b;
+}
+
+inline uint128_t max(uint128_t const& a, uint128_t const b) {
+
+	return a > b ? a : b;
+}
+
+inline uint64_t clz128(uint128_t const x){
+
+	uint64_t res;
+
+	res = x.v_[1] != 0 ? uint128_t::clz64(x.v_[1]) : 64 + uint128_t::clz64(x.v_[0]);
+
+	return res;
+}
+
+inline uint128_t  rotate_right_64( uint128_t  x){
+
+	uint64_t temp = x.v_[0];
+	x.v_[0] = x.v_[1];
+	x.v_[1] = temp;
+	return x;
+}
+
+//  iostream
+//------------------
+inline std::ostream & operator<<(std::ostream & out, uint128_t x)
+{
+	std::vector<uint16_t> rout;
+	uint64_t v = 10, r = 0;
+	if (x.v_[1] == 0 && x.v_[0]==0) {
+		out << "0";
+		return out;
+	}
+	do {
+		x = uint128_t::divide_u128_to_u128(x, v, &r);
+		rout.push_back(r);
+	} while(x.v_[0] != 0 || x.v_[1] != 0);
+	for(std::reverse_iterator<std::vector<uint16_t>::iterator> rit = rout.rbegin(); rit != rout.rend(); rit++){
+		out << *rit;
+	}
+	return out;
+}
+
+}  // namespace random_iterator
diff --git a/tests/benchmark_engine.cpp b/tests/benchmark_engine.cpp
index 84e9cec..bec57c0 100644
--- a/tests/benchmark_engine.cpp
+++ b/tests/benchmark_engine.cpp
@@ -43,10 +43,12 @@ static const uint64_t default_seed= 0x548c9decbce65295  ;
 TEST_CASE("Performance of counter based PRNGs and corresponding iterators") {
 
 
-	random_iterator::squares3        RNG_squares3(default_seed);
-	random_iterator::squares4        RNG_squares4(default_seed);
-	random_iterator::threefry        RNG_threefry(default_seed);
-	random_iterator::philox          RNG_philox(default_seed);
+	random_iterator::squares3_64        RNG_squares3(default_seed);
+	random_iterator::squares4_64        RNG_squares4(default_seed);
+	random_iterator::squares3_128       RNG_squares3_long(default_seed);
+	random_iterator::squares4_128       RNG_squares4_long(default_seed);
+	random_iterator::threefry           RNG_threefry(default_seed);
+	random_iterator::philox             RNG_philox(default_seed);
 	std::mt19937         RNG_mt19937(default_seed );
 	std::ranlux48        RNG_ranlux48(default_seed);
 
@@ -57,11 +59,20 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") {
 		return RNG_squares3();
 	};
 
+	BENCHMARK("random_iterator::squares3_long") {
+			return RNG_squares3_long();
+	};
+
 	BENCHMARK("random_iterator::squares4") {
 		return RNG_squares4();
 	};
 
 
+	BENCHMARK("random_iterator::squares4_long") {
+		return RNG_squares4_long();
+	};
+
+
 	BENCHMARK("random_iterator::threefry") {
 		return RNG_threefry();
 	};
@@ -93,6 +104,17 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") {
 		return RNG_squares4.discard(jump_size);
 	};
 
+
+	BENCHMARK("random_iterator::squares3_long") {
+		return RNG_squares3_long.discard(jump_size);
+	};
+
+	BENCHMARK("random_iterator::squares4_long") {
+		return RNG_squares4_long.discard(jump_size);
+	};
+
+
+
 	BENCHMARK("random_iterator::threefry") {
 		return RNG_threefry.discard(jump_size);
 	};
@@ -122,6 +144,15 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") {
 		return RNG_squares4.discard(jump_size);
 	};
 
+
+	BENCHMARK("random_iterator::squares3_long") {
+		return RNG_squares3_long.discard(jump_size);
+	};
+
+	BENCHMARK("random_iterator::squares4_long") {
+		return RNG_squares4_long.discard(jump_size);
+	};
+
 	BENCHMARK("random_iterator::threefry") {
 		return RNG_threefry.discard(jump_size);
 	};
@@ -152,6 +183,15 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") {
 		return RNG_squares4.discard(jump_size);
 	};
 
+
+	BENCHMARK("random_iterator::squares3_long") {
+		return RNG_squares3_long.discard(jump_size);
+	};
+
+	BENCHMARK("random_iterator::squares4_long") {
+		return RNG_squares4_long.discard(jump_size);
+	};
+
 	BENCHMARK("random_iterator::threefry") {
 		return RNG_threefry.discard(jump_size);
 	};
@@ -181,6 +221,15 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") {
 		return RNG_squares4.discard(jump_size);
 	};
 
+
+	BENCHMARK("random_iterator::squares3_long") {
+		return RNG_squares3_long.discard(jump_size);
+	};
+
+	BENCHMARK("random_iterator::squares4_long") {
+		return RNG_squares4_long.discard(jump_size);
+	};
+
 	BENCHMARK("random_iterator::threefry") {
 		return RNG_threefry.discard(jump_size);
 	};
@@ -211,6 +260,15 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") {
 		return RNG_squares4.discard(jump_size);
 	};
 
+
+	BENCHMARK("random_iterator::squares3_long") {
+		return RNG_squares3_long.discard(jump_size);
+	};
+
+	BENCHMARK("random_iterator::squares4_long") {
+		return RNG_squares4_long.discard(jump_size);
+	};
+
 	BENCHMARK("random_iterator::threefry") {
 		return RNG_threefry.discard(jump_size);
 	};
@@ -242,6 +300,14 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") {
 		return RNG_squares4.discard(jump_size);
 	};
 
+	BENCHMARK("random_iterator::squares3_long") {
+		return RNG_squares3_long.discard(jump_size);
+	};
+
+	BENCHMARK("random_iterator::squares4_long") {
+		return RNG_squares4_long.discard(jump_size);
+	};
+
 	BENCHMARK("random_iterator::threefry") {
 		return RNG_threefry.discard(jump_size);
 	};
@@ -272,6 +338,16 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") {
 		return RNG_squares4.discard(jump_size);
 	};
 
+
+
+	BENCHMARK("random_iterator::squares3_long") {
+		return RNG_squares3_long.discard(jump_size);
+	};
+
+	BENCHMARK("random_iterator::squares4_long") {
+		return RNG_squares4_long.discard(jump_size);
+	};
+
 	BENCHMARK("random_iterator::threefry") {
 		return RNG_threefry.discard(jump_size);
 	};
-- 
GitLab