From 90a7ac14a3de65622de3f8d08ccb89c32dcb45ed Mon Sep 17 00:00:00 2001 From: AAAlvesJr <aalvesju@gmail.com> Date: Tue, 9 Mar 2021 03:46:10 +0100 Subject: [PATCH] adding more two implementations of squares --- random_iterator/Engine.hpp | 12 +- random_iterator/detail/Macros.h | 41 + random_iterator/detail/Random123/array.h | 2 +- random_iterator/detail/Squares3_128.hpp | 150 ++++ .../detail/{Squares3.hpp => Squares3_64.hpp} | 10 +- random_iterator/detail/Squares4_128.hpp | 152 ++++ .../detail/{Squares4.hpp => Squares4_64.hpp} | 10 +- random_iterator/detail/uint128.h_ | 816 ++++++++++++++++++ random_iterator/detail/uint128.hpp | 677 +++++++++++++++ random_iterator/detail/uint128.inl | 179 ++++ tests/benchmark_engine.cpp | 84 +- 11 files changed, 2114 insertions(+), 19 deletions(-) create mode 100644 random_iterator/detail/Macros.h create mode 100644 random_iterator/detail/Squares3_128.hpp rename random_iterator/detail/{Squares3.hpp => Squares3_64.hpp} (95%) create mode 100644 random_iterator/detail/Squares4_128.hpp rename random_iterator/detail/{Squares4.hpp => Squares4_64.hpp} (93%) create mode 100644 random_iterator/detail/uint128.h_ create mode 100644 random_iterator/detail/uint128.hpp create mode 100644 random_iterator/detail/uint128.inl diff --git a/random_iterator/Engine.hpp b/random_iterator/Engine.hpp index 03970fb..1a41da6 100644 --- a/random_iterator/Engine.hpp +++ b/random_iterator/Engine.hpp @@ -31,8 +31,10 @@ #include <limits> #include <random_iterator/detail/EngineTraits.hpp> #include <random_iterator/detail/SplitMix.hpp> -#include <random_iterator/detail/Squares3.hpp> -#include <random_iterator/detail/Squares4.hpp> +#include <random_iterator/detail/Squares3_128.hpp> +#include <random_iterator/detail/Squares4_128.hpp> +#include <random_iterator/detail/Squares3_64.hpp> +#include <random_iterator/detail/Squares4_64.hpp> namespace random_iterator { @@ -211,8 +213,10 @@ private: typedef Engine<random_iterator_r123::Philox2x64> philox; typedef Engine<random_iterator_r123::Threefry2x64> threefry; -typedef detail::Squares3 squares3; -typedef detail::Squares4 squares4; +typedef detail::Squares3_64 squares3_64; +typedef detail::Squares4_64 squares4_64; +typedef detail::Squares3_128 squares3_128; +typedef detail::Squares4_128 squares4_128; #if RANDOM_ITERATOR_R123_USE_AES_NI typedef Engine<random_iterator_r123::ARS4x32> ars; diff --git a/random_iterator/detail/Macros.h b/random_iterator/detail/Macros.h new file mode 100644 index 0000000..84bd4a4 --- /dev/null +++ b/random_iterator/detail/Macros.h @@ -0,0 +1,41 @@ +/*---------------------------------------------------------------------------- + * + * Copyright (C) 2021 Antonio Augusto Alves Junior + * + * This file is part of RandomIterator. + * + * RandomIterator is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RandomIterator is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with RandomIterator. If not, see <http://www.gnu.org/licenses/>. 
+ * + *---------------------------------------------------------------------------*/ +/* + * Macros.h + * + * Created on: 04/03/2021 + * Author: Antonio Augusto Alves Junior + */ + +#pragma once + +#if defined(__has_builtin) + #if __has_builtin(__builtin_expect) + #define _LIKELY_(x) __builtin_expect(x, 1) + #define _UNLIKELY_(x) __builtin_expect(x, 0) + #else + #define _LIKELY_(x) x + #define _UNLIKELY_(x) x + #endif //__has_builtin(__builtin_expect) +#else + #define _LIKELY_(x) x + #define _UNLIKELY_(x) x +#endif //defined(__has_builtin) diff --git a/random_iterator/detail/Random123/array.h b/random_iterator/detail/Random123/array.h index b6ca0ec..4f4443a 100644 --- a/random_iterator/detail/Random123/array.h +++ b/random_iterator/detail/Random123/array.h @@ -305,7 +305,7 @@ namespace random_iterator_r123{ a typedef equivalent to r123arrayNxW. */ -#define _r123array_tpl(_N, W, T) \ +#define _r123array_tpl(_N, W, T) \ /** @ingroup arrayNxW */ \ /** @see arrayNxW */ \ struct r123array##_N##x##W{ \ diff --git a/random_iterator/detail/Squares3_128.hpp b/random_iterator/detail/Squares3_128.hpp new file mode 100644 index 0000000..4efdccd --- /dev/null +++ b/random_iterator/detail/Squares3_128.hpp @@ -0,0 +1,150 @@ +/*---------------------------------------------------------------------------- + * + * Copyright (C) 2021 Antonio Augusto Alves Junior + * + * This file is part of RandomIterator. + * + * RandomIterator is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RandomIterator is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with RandomIterator. If not, see <http://www.gnu.org/licenses/>. + * + *---------------------------------------------------------------------------*/ +/* + * Squares3_128.hpp + * + * Created on: 25/02/2021 + * Author: Antonio Augusto Alves Junior + */ + +#pragma once + +#include <stdint.h> +#include <random_iterator/detail/SquaresKeys.hpp> +#include <random_iterator/detail/uint128.hpp> + +namespace random_iterator { + +namespace detail { + +/* +* Three round counter-based middle square +* +* squares3 - returns a 32-bit unsigned int [0,0xffffffff] +* +* +* Three rounds of squaring are performed and the result is returned. +* For the first two rounds, the result is rotated right 32 bits. +* This places the random data in the best position for the next round. +* y = ctr*key or z = (ctr+1)*key is added on each round. For keys +* generated by the key utility, either ctr*key or (ctr+1)*key will +* have non-zero digits. This improves randomization and also provides +* for a uniform output. +* +* Note: The squares RNG was based on ideas derived from Middle Square +* Weyl Sequence RNG. One of the ideas was to obtain uniformity by adding +* the Weyl sequence after squaring. Richard P. Brent (creator of the +* xorgens RNG) suggested this method. It turns out that adding ctr*key +* is equivalent. Brent's idea provides the basis for uniformity in this +* generator. 
+* +* Implementation of the algorithm authored by Bernard Widynski and +* described in https://arxiv.org/pdf/2004.06278v2.pdf +*/ + +class Squares3_128 +{ + +public: + + typedef uint128_t init_type; + typedef uint128_t state_type; + typedef uint128_t seed_type; + typedef uint128_t advance_type; + typedef uint64_t result_type; + + Squares3_128()=delete; + + Squares3_128(size_t s, uint32_t a=0): + state_(0), + seed_(seed_type{ splitmix<size_t>(s) }) + {} + + Squares3_128( Squares3_128 const& other): + state_(other.getState() ), + seed_(other.getSeed() ) + {} + + inline Squares3_128& operator=( Squares3_128 const& other) + { + if(this==&other) return *this; + + state_ = other.getState(); + seed_ = other.getSeed(); + + return *this; + } + + + inline result_type operator()(void) + { + uint128_t x, y, z; + + y = x = seed_*state_ ; z = y + seed_; + + x = x*x + y; x = rotate_right_64(x); /* round 1 */ + + x = x*x + z; x = rotate_right_64(x); /* round 2 */ + + ++state_; /* advance state */ + + return (x*x + y) >> 64; /* round 3 */ + + } + + inline void discard(advance_type n){ + + state_ += n; + } + + inline seed_type getSeed() const { + return seed_; + } + + inline void setSeed(seed_type seed) { + seed_ = seed; + } + + inline state_type getState() const { + return state_; + } + + inline void setState( state_type state) { + state_ = state; + } + + inline static uint64_t generateSeed( size_t i ){ + return keys[i]; + } + + static constexpr result_type min(){ return 0;} + + static constexpr result_type max(){ return std::numeric_limits<result_type>::max();} + +private: + + state_type state_; + seed_type seed_; +}; + +} // namespace detail + +} // namespace random_iterator diff --git a/random_iterator/detail/Squares3.hpp b/random_iterator/detail/Squares3_64.hpp similarity index 95% rename from random_iterator/detail/Squares3.hpp rename to random_iterator/detail/Squares3_64.hpp index 0acd748..4e4454a 100644 --- a/random_iterator/detail/Squares3.hpp +++ b/random_iterator/detail/Squares3_64.hpp @@ -59,7 +59,7 @@ namespace detail { * described in https://arxiv.org/pdf/2004.06278v2.pdf */ -class Squares3 +class Squares3_64 { public: @@ -70,19 +70,19 @@ public: typedef uint64_t advance_type; typedef uint32_t result_type; - Squares3()=delete; + Squares3_64()=delete; - Squares3(seed_type s, uint32_t a=0): + Squares3_64(seed_type s, uint32_t a=0): state_(0), seed_(seed_type{ splitmix<seed_type>(s) }) {} - Squares3( Squares3 const& other): + Squares3_64( Squares3_64 const& other): state_(other.getState() ), seed_(other.getSeed() ) {} - inline Squares3& operator=( Squares3 const& other) + inline Squares3_64& operator=( Squares3_64 const& other) { if(this==&other) return *this; diff --git a/random_iterator/detail/Squares4_128.hpp b/random_iterator/detail/Squares4_128.hpp new file mode 100644 index 0000000..8605e50 --- /dev/null +++ b/random_iterator/detail/Squares4_128.hpp @@ -0,0 +1,152 @@ +/*---------------------------------------------------------------------------- + * + * Copyright (C) 2021 Antonio Augusto Alves Junior + * + * This file is part of RandomIterator. + * + * RandomIterator is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * RandomIterator is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with RandomIterator. If not, see <http://www.gnu.org/licenses/>. + * + *---------------------------------------------------------------------------*/ +/* + * Squares4_128.hpp + * + * Created on: 25/02/2021 + * Author: Antonio Augusto Alves Junior + */ + +#pragma once + +#include <stdint.h> +#include <random_iterator/detail/SquaresKeys.hpp> +#include <random_iterator/detail/uint128.hpp> + +namespace random_iterator { + +namespace detail { + +/* +* Three round counter-based middle square +* +* squares3 - returns a 32-bit unsigned int [0,0xffffffff] +* +* +* Three rounds of squaring are performed and the result is returned. +* For the first two rounds, the result is rotated right 32 bits. +* This places the random data in the best position for the next round. +* y = ctr*key or z = (ctr+1)*key is added on each round. For keys +* generated by the key utility, either ctr*key or (ctr+1)*key will +* have non-zero digits. This improves randomization and also provides +* for a uniform output. +* +* Note: The squares RNG was based on ideas derived from Middle Square +* Weyl Sequence RNG. One of the ideas was to obtain uniformity by adding +* the Weyl sequence after squaring. Richard P. Brent (creator of the +* xorgens RNG) suggested this method. It turns out that adding ctr*key +* is equivalent. Brent's idea provides the basis for uniformity in this +* generator. +* +* Implementation of the algorithm authored by Bernard Widynski and +* described in https://arxiv.org/pdf/2004.06278v2.pdf +*/ + +class Squares4_128 +{ + +public: + + typedef uint128_t init_type; + typedef uint128_t state_type; + typedef uint128_t seed_type; + typedef uint128_t advance_type; + typedef uint64_t result_type; + + Squares4_128()=delete; + + Squares4_128(size_t s, uint32_t a=0): + state_(0), + seed_(seed_type{ splitmix<size_t>(s) }) + {} + + Squares4_128( Squares4_128 const& other): + state_(other.getState() ), + seed_(other.getSeed() ) + {} + + inline Squares4_128& operator=( Squares4_128 const& other) + { + if(this==&other) return *this; + + state_ = other.getState(); + seed_ = other.getSeed(); + + return *this; + } + + + inline result_type operator()(void) + { + uint128_t x, y, z; + + y = x = seed_*state_ ; z = y + seed_; + + x = x*x + y; x = rotate_right_64(x); /* round 1 */ + + x = x*x + z; x = rotate_right_64(x); /* round 2 */ + + x = x*x + y; x = rotate_right_64(x); /* round 3 */ + + ++state_; /* advance state */ + + return (x*x + z) >> 64; /* round 3 */ + + } + + inline void discard(advance_type n){ + + state_ += n; + } + + inline seed_type getSeed() const { + return seed_; + } + + inline void setSeed(seed_type seed) { + seed_ = seed; + } + + inline state_type getState() const { + return state_; + } + + inline void setState( state_type state) { + state_ = state; + } + + inline static uint64_t generateSeed( size_t i ){ + return keys[i]; + } + + static constexpr result_type min(){ return 0;} + + static constexpr result_type max(){ return std::numeric_limits<result_type>::max();} + +private: + + state_type state_; + seed_type seed_; +}; + +} // namespace detail + +} // namespace random_iterator diff --git a/random_iterator/detail/Squares4.hpp b/random_iterator/detail/Squares4_64.hpp 
similarity index 93% rename from random_iterator/detail/Squares4.hpp rename to random_iterator/detail/Squares4_64.hpp index e024d09..ed97ac3 100644 --- a/random_iterator/detail/Squares4.hpp +++ b/random_iterator/detail/Squares4_64.hpp @@ -34,7 +34,7 @@ namespace random_iterator { namespace detail { -class Squares4 +class Squares4_64 { public: @@ -45,19 +45,19 @@ public: typedef uint64_t advance_type; typedef uint32_t result_type; - Squares4()=delete; + Squares4_64()=delete; - Squares4(seed_type s, uint32_t a=0): + Squares4_64(seed_type s, uint32_t a=0): state_(0), seed_(seed_type{ splitmix<seed_type>(s)}) {} - Squares4( Squares4 const& other): + Squares4_64( Squares4_64 const& other): state_(other.getState() ), seed_(other.getSeed() ) {} - inline Squares4& operator=( Squares4 const& other) + inline Squares4_64& operator=( Squares4_64 const& other) { if(this==&other) return *this; diff --git a/random_iterator/detail/uint128.h_ b/random_iterator/detail/uint128.h_ new file mode 100644 index 0000000..875e89f --- /dev/null +++ b/random_iterator/detail/uint128.h_ @@ -0,0 +1,816 @@ + +#pragma once + +#include <iostream> +#include <iomanip> +#include <cinttypes> +#include <cmath> +#include <string> +#include <vector> +#include <iterator> +#include <utility> + +#ifdef __has_builtin +# define uint128_t_has_builtin(x) __has_builtin(x) +#else +# define uint128_t_has_builtin(x) 0 +#endif + +namespace hydra { + +class uint128_t; + +namespace detail { + +template<typename T> +using check_uint128_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T,uint128_t >::value, uint128_t>::type; + +template<typename T> +using T_type = typename std::enable_if<std::is_integral<T>::value, T>::type; + +} + +class uint128_t +{ + + uint64_t lo; + uint64_t hi; + + uint64_t v_[2]; + + public: + + uint128_t() = default; + + __hydra_host__ __hydra_device__ + uint128_t(uint64_t a){ + + lo = a ; + hi = 0; + } + + __hydra_host__ __hydra_device__ + uint128_t(uint64_t a, uint64_t b){ + + lo = a ; + hi = b; + } + + __hydra_host__ __hydra_device__ + uint128_t(uint128_t const& a){ + + lo = a.lo ; + hi = a.hi; + } + + template<typename T, typename=typename std::enable_if<std::is_integral<T>::value>::type> + __hydra_host__ __hydra_device__ + inline operator T() const { + + return T(lo); + } + + __hydra_host__ __hydra_device__ + inline uint128_t& operator=( uint128_t const& n){ + + lo = n.lo; + hi = n.hi; + return * this; + } + + template <typename T> + __hydra_host__ __hydra_device__ + inline typename std::enable_if<std::is_integral<T>::value>::type& + operator=(T const& n){ + + hi = 0; + lo = n; + return * this; + } + + // + // Compound assignment + //-------------------------------------- + // + // arithmetic + // + template <typename T> + __hydra_host__ __hydra_device__ + inline detail::check_uint128_t<T> & operator+=(const T a){ + + uint128_t b = (uint128_t) a; + hi += b.hi + ((lo + b.lo) < lo); + lo += b.lo; + return *this; + } + + template <typename T> + __hydra_host__ __hydra_device__ + inline detail::check_uint128_t<T>& operator-=(const T b){ + + uint128_t temp = (uint128_t)b; + if(lo < temp.lo) hi--; + lo -= temp.lo; + return * this; + } + + template <typename T> + __hydra_host__ __hydra_device__ + inline detail::check_uint128_t<T>& operator*=( const T b){ + + *this = mul128(*this, b); + return *this; + } + + template <typename T> + __hydra_host__ __hydra_device__ + inline detail::check_uint128_t<T>& operator/=( const T b){ + + *this = divide_u128_to_u64(*this, (uint64_t)b); + return *this; + } + + // + // 
increment + // + __hydra_host__ __hydra_device__ + inline uint128_t& operator++(){ + + return *this +=1; + } + + __hydra_host__ __hydra_device__ + inline uint128_t operator++(int){ + uint128_t temp = *this; + *this +=1; + return temp; + } + + __hydra_host__ __hydra_device__ + inline uint128_t & operator--(){ + + return *this -=1; + } + + __hydra_host__ __hydra_device__ + inline uint128_t operator--(int){ + uint128_t temp = *this; + *this -=1; + return temp; + } + // + // bitwise + // + + template <typename T> + __hydra_host__ __hydra_device__ + inline detail::check_uint128_t<T>& operator>>=(const T b) + { + if(HYDRA_HOST_UNLIKELY(b==64)){ + lo = hi; + hi = 0; + } + else{ + + lo = (lo >> b) | (hi << (int)(b - 64)); + (b < 64) ? hi >>= b : hi = 0; + } + return *this; + } + + template <typename T> + __hydra_host__ __hydra_device__ + inline detail::check_uint128_t<T>& operator<<=(const T b){ + + if(HYDRA_HOST_UNLIKELY(b==64)){ + + hi = lo; + lo = 0; + + } + else{ + hi = (hi << b) | (lo << (int)(b - 64)); + (b < 64) ? lo <<= b : lo = 0; + } + return *this; + } + + template <typename T> + __hydra_host__ __hydra_device__ + inline detail::check_uint128_t<T> & operator|=(const T b){ + + *this = *this | b; + return *this; + } + + template <typename T> + __hydra_host__ __hydra_device__ + inline detail::check_uint128_t<T> & operator&=(const T b){ + + *this = *this & b; return *this; + } + + template <typename T> + __hydra_host__ __hydra_device__ + inline detail::check_uint128_t<T> & operator^=(const T b){ + + *this = *this ^ b; + return *this; + } + + // + // friend functions + //----------------------------- + template <typename T> + __hydra_host__ __hydra_device__ + friend detail::check_uint128_t<T> operator>>(uint128_t a, const T b); + + template <typename T> + __hydra_host__ __hydra_device__ + friend detail::check_uint128_t<T> operator<<(uint128_t a, const T b); + + template <typename T> + __hydra_host__ __hydra_device__ + friend detail::check_uint128_t<T> operator+(uint128_t a, const T b); + + template <typename T> + __hydra_host__ __hydra_device__ + friend detail::check_uint128_t<T> operator-(uint128_t a, const T b); + + template <typename T> + __hydra_host__ __hydra_device__ + friend detail::check_uint128_t<T> operator*(uint128_t a, const T b); + + template <typename T> + __hydra_host__ __hydra_device__ + friend T operator/(uint128_t x, const T v); + + template <typename T> + __hydra_host__ __hydra_device__ + friend T operator%(uint128_t x, const T v); + + __hydra_host__ __hydra_device__ + friend bool operator<(uint128_t const a, uint128_t const b); + + __hydra_host__ __hydra_device__ + friend bool operator>(uint128_t const a, uint128_t const b); + + __hydra_host__ __hydra_device__ + friend bool operator<=(uint128_t const a, uint128_t const b); + + __hydra_host__ __hydra_device__ + friend bool operator>=(uint128_t const a, uint128_t const b); + + __hydra_host__ __hydra_device__ + friend bool operator==(uint128_t const a, uint128_t const b); + + __hydra_host__ __hydra_device__ + friend bool operator!=(uint128_t const a, uint128_t const b); + + template <typename T> + __hydra_host__ __hydra_device__ + friend detail::check_uint128_t<T> operator|(uint128_t a, const T b); + + template <typename T> + __hydra_host__ __hydra_device__ + friend detail::check_uint128_t<T> operator&(uint128_t a, const T b); + + template <typename T> + __hydra_host__ __hydra_device__ + friend detail::check_uint128_t<T> operator^(uint128_t a, const T b); + + __hydra_host__ __hydra_device__ + friend uint128_t 
operator~(uint128_t a); + + __hydra_host__ __hydra_device__ + friend uint128_t min(uint128_t const a, uint128_t const b); + + __hydra_host__ __hydra_device__ + friend uint128_t max(uint128_t const a, uint128_t const b); + + // This just makes it more convenient to count leading zeros for uint128_t + __hydra_host__ __hydra_device__ + friend uint64_t clz128(uint128_t const x); + + // iostream + //------------------ + friend std::ostream & operator<<(std::ostream & out, uint128_t x); + + // + // static members + //------------------------ + __hydra_host__ __hydra_device__ + static inline bool is_less_than(uint128_t const& a, uint128_t const& b){ + + if(a.hi < b.hi) return 1; + if(a.hi > b.hi) return 0; + if(a.lo < b.lo) return 1; + else return 0; + } + + __hydra_host__ __hydra_device__ + static inline bool is_less_than_or_equal(uint128_t const& a, uint128_t const& b){ + + if(a.hi < b.hi) return 1; + if(a.hi > b.hi) return 0; + if(a.lo <= b.lo) return 1; + else return 0; + } + + __hydra_host__ __hydra_device__ + static inline bool is_greater_than(uint128_t const& a, uint128_t const& b){ + + if(a.hi < b.hi) return 0; + if(a.hi > b.hi) return 1; + if(a.lo <= b.lo) return 0; + else return 1; + } + + __hydra_host__ __hydra_device__ + static inline bool is_greater_than_or_equal(uint128_t const& a, uint128_t const& b) + { + if(a.hi < b.hi) return 0; + if(a.hi > b.hi) return 1; + if(a.lo < b.lo) return 0; + else return 1; + } + + __hydra_host__ __hydra_device__ + static inline bool is_equal_to(uint128_t const& a, uint128_t const& b) + { + if(a.lo == b.lo && a.hi == b.hi) return 1; + else return 0; + } + + __hydra_host__ __hydra_device__ + static inline bool is_not_equal_to(uint128_t const& a, uint128_t const& b) + { + if(a.lo != b.lo || a.hi != b.hi) return 1; + else return 0; + } + + + // bit operations + //--------------------- + + /// This counts leading zeros for 64 bit unsigned integers. It is used internally + /// in a few of the functions defined below. 
+ __hydra_host__ __hydra_device__ + static inline int clz64(uint64_t const& x) + { + int res; +#ifdef __CUDA_ARCH__ + res = __clzll(x); +#elif __GNUC__ || uint128_t_has_builtin(__builtin_clzll) + res = __builtin_clzll(x); +#elif __x86_64__ + asm("bsr %1, %0\nxor $0x3f, %0" : "=r" (res) : "rm" (x) : "cc", "flags"); +#elif __aarch64__ + asm("clz %0, %1" : "=r" (res) : "r" (x)); +#else +# error Architecture not supported +#endif + return res; + } + + + __hydra_host__ __hydra_device__ + static inline uint128_t bitwise_or(uint128_t a, uint128_t const& b) + { + a.lo |= b.lo; + a.hi |= b.hi; + return a; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t bitwise_and(uint128_t a, uint128_t const& b) + { + a.lo &= b.lo; + a.hi &= b.hi; + return a; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t bitwise_xor(uint128_t a, uint128_t const& b) + { + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t bitwise_not(uint128_t a) + { + a.lo = ~a.lo; + a.hi = ~a.hi; + return a; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t rotate_right64( uint128_t x) + { + uint64_t temp = x.lo; + x.lo = x.hi; + x.hi = temp; + return x; + } + __hydra_host__ __hydra_device__ + static inline uint128_t pow2_add(uint128_t const x, uint128_t const y) + { + uint128_t z; +#ifdef __CUDA_ARCH__ + z.lo = x.lo * x.lo; + z.hi = __mul64hi(x.lo, x.lo); +#elif __x86_64__ + asm( "mulq %3\n\t" + : "=a" (z.lo), "=d" (z.hi) + : "%0" (x.lo), "rm" (x.lo)); +#endif + z.hi +=2*(x.hi * x.lo); + z.hi += y.hi + ((z.lo + y.lo) < z.lo); + z.lo += y.lo; + + return z; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t pow2_add_rotate64(uint128_t const x, uint128_t const y) + { + uint128_t z; +#ifdef __CUDA_ARCH__ + z.lo = x.lo * x.lo; + z.hi = __mul64hi(x.lo, x.lo); +#elif __x86_64__ + asm( "mulq %3\n\t" + : "=a" (z.lo), "=d" (z.hi) + : "%0" (x.lo), "rm" (x.lo)); +#endif + z.hi +=2*(x.hi * x.lo); + z.hi += y.hi + ((z.lo + y.lo) < z.lo); + z.lo += y.lo; + uint64_t temp = z.lo; + z.lo = z.hi; + z.hi = temp; + return z; + } + + // arithmetic + //------------------- + + __hydra_host__ __hydra_device__ + static inline uint128_t add128(uint128_t x, uint128_t const y) + { + x.hi += y.hi + ((x.lo + y.lo) < x.lo); + x.lo += y.lo; + return x; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t add128(uint128_t x, uint64_t const y) + { + x+=uint128_t(y); + return x; + + } + + __hydra_host__ __hydra_device__ + static inline uint128_t mul128(uint64_t const x, uint64_t const y) + { + uint128_t res; +#ifdef __CUDA_ARCH__ + res.lo = x * y; + res.hi = __mul64hi(x, y); +#elif __x86_64__ + asm( "mulq %3\n\t" + : "=a" (res.lo), "=d" (res.hi) + : "%0" (x), "rm" (y)); +#elif __aarch64__ + asm( "mul %0, %2, %3\n\t" + "umulh %1, %2, %3\n\t" + : "=&r" (res.lo), "=r" (res.hi) + : "r" (x), "r" (y)); +#else +# error Architecture not supported +#endif + return res; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t mul128(uint128_t const x, uint128_t const y) + { + uint128_t z; +#ifdef __CUDA_ARCH__ + z.lo = x.lo * y.lo; + z.hi = __mul64hi(x.lo, y.lo); +#elif __x86_64__ + asm( "mulq %3\n\t" + : "=a" (z.lo), "=d" (z.hi) + : "%0" (x.lo), "rm" (y.lo)); +#endif + z.hi +=(x.hi * y.lo) + (x.lo * y.hi); + return z; + } + + + __hydra_host__ __hydra_device__ + static inline uint128_t mul128(uint128_t const x, uint64_t const y) + { + uint128_t res; +#ifdef __CUDA_ARCH__ + res.lo = x.lo * y; + res.hi = __mul64hi(x.lo, y); + res.hi += x.hi * y; 
+#elif __x86_64__ + asm( "mulq %3\n\t" + : "=a" (res.lo), "=d" (res.hi) + : "%0" (x.lo), "rm" (y)); + res.hi += x.hi * y; +#elif __aarch64__ + res.lo = x.lo * y; + asm( "umulh %0, %1, %2\n\t" + : "=r" (res.hi) + : "r" (x.lo), "r" (y)); + res.hi += x.hi * y; +#else +# error Architecture not supported +#endif + return res; + } + + // taken from libdivide's adaptation of this implementation origininally in + // Hacker's Delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt + // License permits inclusion here per: + // http://www.hackersdelight.org/permissions.htm + __hydra_host__ __hydra_device__ + static inline uint64_t divide_u128_to_u64(uint128_t const& x, uint64_t v, uint64_t * r = NULL) // x / v + { + const uint64_t b = 1ull << 32; + uint64_t un1, un0, + vn1, vn0, + q1, q0, + un64, un21, un10, + rhat; + int s; + + if(x.hi >= v){ + if( r != NULL) *r = (uint64_t) -1; + return (uint64_t) -1; + } + + s = clz64(v); + + if(s > 0){ + v = v << s; + un64 = (x.hi << s) | ((x.lo >> (64 - s)) & (-s >> 31)); + un10 = x.lo << s; + }else{ + un64 = x.lo | x.hi; + un10 = x.lo; + } + + vn1 = v >> 32; + vn0 = v & 0xffffffff; + + un1 = un10 >> 32; + un0 = un10 & 0xffffffff; + + q1 = un64/vn1; + rhat = un64 - q1*vn1; + + again1: + if (q1 >= b || q1*vn0 > b*rhat + un1){ + q1 -= 1; + rhat = rhat + vn1; + if(rhat < b) goto again1; + } + + un21 = un64*b + un1 - q1*v; + + q0 = un21/vn1; + rhat = un21 - q0*vn1; + again2: + if(q0 >= b || q0 * vn0 > b*rhat + un0){ + q0 = q0 - 1; + rhat = rhat + vn1; + if(rhat < b) goto again2; + } + + if(r != NULL) *r = (un21*b + un0 - q0*v) >> s; + return q1*b + q0; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t divide_u128_to_u128(uint128_t x, uint64_t v, uint64_t * r = NULL) + { + uint128_t res; + + res.hi = x.hi/v; + x.hi %= v; + res.lo = divide_u128_to_u64(x, v, r); + + return res; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t sub128(uint128_t const& x, uint128_t const& y) // x - y + { + uint128_t res; + + res.lo = x.lo - y.lo; + res.hi = x.hi - y.hi; + if(x.lo < y.lo) res.hi--; + + return res; + } + + __hydra_host__ __hydra_device__ + static inline uint64_t _isqrt(uint64_t const& x) + { + uint64_t res0 = 0; +#ifdef __CUDA_ARCH__ + res0 = ::sqrtf(x); +#else + res0 = ::sqrt(x); + for(uint16_t i = 0; i < 8; i++) + res0 = (res0 + x/res0) >> 1; +#endif + return res0; + } + + // roots + //------------------- + + __hydra_host__ __hydra_device__ + static inline uint64_t _isqrt(const uint128_t & x) // this gives errors above 2^124 + { + uint64_t res0 = 0; + + if( (x.hi == 0 && x.lo == 0) || x.hi > 1ull << 60) + return 0; + +#ifdef __CUDA_ARCH__ + res0 = ::sqrtf(u128_to_float(x)); +#else + res0 = ::sqrt(u128_to_float(x)); +#endif +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for(uint16_t i = 0; i < 8; i++) + res0 = (res0 + x/res0) >> 1; + + return res0; + } + + __hydra_host__ __hydra_device__ + static inline uint64_t _icbrt(const uint128_t & x) + { + uint64_t res0 = 0; + +#ifdef __CUDA_ARCH__ + res0 = cbrtf(u128_to_float(x)); +#else + res0 = std::cbrt(u128_to_float(x)); +#endif +#ifdef __CUDA_ARCH__ +#pragma unroll +#endif + for(uint16_t i = 0; i < 47; i++) // there needs to be an odd number of iterations + // for the case of numbers of the form x^2 - 1 + // where this will not converge + res0 = (res0 + divide_u128_to_u128(x,res0)/res0) >> 1; + return res0; + } + + __hydra_host__ __hydra_device__ + // this function is to avoid off by 1 errors from nesting integer square roots + static inline uint64_t _iqrt(const uint128_t & x) + { + 
uint64_t res0 = 0, res1 = 0; + + res0 = _isqrt(_isqrt(x)); + res1 = (res0 + divide_u128_to_u128(x,res0*res0)/res0) >> 1; + res0 = (res1 + divide_u128_to_u128(x,res1*res1)/res1) >> 1; + + return res0 < res1 ? res0 : res1; + } + + // typecasting + //------------------------- + static inline uint128_t string_to_u128(std::string s) + { + uint128_t res = 0; + for(std::string::iterator iter = s.begin(); iter != s.end() && (int) *iter >= 48; iter++){ + res = mul128(res, 10); + res += (uint16_t) *iter - 48; + } + return res; + } + + __hydra_host__ __hydra_device__ + static inline uint64_t u128_to_u64(uint128_t x){ + + return x.lo; + } + + __hydra_host__ __hydra_device__ + static inline double u128_to_double(uint128_t x) + { + double dbl; +#ifdef __CUDA_ARCH__ + if(x.hi == 0) return __ull2double_rd(x.lo); +#else + if(x.hi == 0) return (double) x.lo; +#endif + uint64_t r = clz64(x.hi); + x <<= r; + +#ifdef __CUDA_ARCH__ + dbl = __ull2double_rd(x.hi); +#else + dbl = (double) x.lo; +#endif + + dbl *= (1ull << (64-r)); + + return dbl; + } + + __hydra_host__ __hydra_device__ + static inline float u128_to_float(uint128_t x) + { + float flt; +#ifdef __CUDA_ARCH__ + if(x.hi == 0) return __ull2float_rd(x.lo); +#else + if(x.hi == 0) return (float) x.lo; +#endif + uint64_t r = clz64(x.hi); + x <<= r; + +#ifdef __CUDA_ARCH__ + flt = __ull2float_rd(x.hi); +#else + flt = (float) x.hi; +#endif + + flt *= (1ull << (64-r)); + flt *= 2; + + return flt; + } + + __hydra_host__ __hydra_device__ + static inline uint128_t double_to_u128(double dbl) + { + uint128_t x; + if(dbl < 1 || dbl > 1e39) return 0; + else{ + +#ifdef __CUDA_ARCH__ + uint32_t shft = __double2uint_rd(log2(dbl)); + uint64_t divisor = 1ull << shft; + dbl /= divisor; + x.lo = __double2ull_rd(dbl); + x <<= shft; +#else + uint32_t shft = (uint32_t) log2(dbl); + uint64_t divisor = 1ull << shft; + dbl /= divisor; + x.lo = (uint64_t) dbl; + x <<= shft; +#endif + return x; + } + } + + __hydra_host__ __hydra_device__ + static inline uint128_t float_to_u128(float flt) + { + uint128_t x; + if(flt < 1 || flt > 1e39) return 0; + else{ + +#ifdef __CUDA_ARCH__ + uint32_t shft = __double2uint_rd(log2(flt)); + uint64_t divisor = 1ull << shft; + flt /= divisor; + x.lo = __double2ull_rd(flt); + x <<= shft; +#else + uint32_t shft = (uint32_t) log2(flt); + uint64_t divisor = 1ull << shft; + flt /= divisor; + x.lo = (uint64_t) flt; + x <<= shft; +#endif + return x; + } + } + +}; // class uint128_t + +} // namespace random_iterator + +#include <hydra/detail/random/detail/uint128.inl> + diff --git a/random_iterator/detail/uint128.hpp b/random_iterator/detail/uint128.hpp new file mode 100644 index 0000000..970ddfe --- /dev/null +++ b/random_iterator/detail/uint128.hpp @@ -0,0 +1,677 @@ +/*---------------------------------------------------------------------------- + * + * Copyright (C) 2021 Antonio Augusto Alves Junior + * + * This file is part of RandomIterator. + * + * RandomIterator is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RandomIterator is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with RandomIterator. 
If not, see <http://www.gnu.org/licenses/>. + * + *---------------------------------------------------------------------------*/ +/* + * uint128.hpp + * + * Created on: 01/03/2021 + * Author: Antonio Augusto Alves Junior + */ + +#pragma once + +#include <random_iterator/detail/Macros.h> +#include <iostream> +#include <iomanip> +#include <cinttypes> +#include <cmath> +#include <string> +#include <vector> +#include <iterator> +#include <utility> + +namespace random_iterator { + +class uint128_t; + +namespace detail { + +template<typename T> +using check_uint128_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T,uint128_t >::value, uint128_t>::type; + +template<typename T> +using T_type = typename std::enable_if<std::is_integral<T>::value, T>::type; + +} + +class uint128_t +{ + uint64_t v_[2]; + + public: + + uint128_t(): v_({0,0}) {} + + uint128_t(uint64_t a): v_({a,0}){} + + uint128_t(uint64_t a, uint64_t b): + v_({a,b}) + {} + + uint128_t(uint128_t const& other){ + + v_[0] = other.v_[0] ; + v_[1] = other.v_[1]; + } + + inline uint128_t& operator=(uint128_t const& other){ + + if(this==&other) return *this; + v_[0] = other.v_[0]; + v_[1] = other.v_[1]; + return *this; + } + + template <typename T> + inline typename std::enable_if<std::is_integral<T>::value>::type& + operator=(T const& other){ + + v_[1] = 0; + v_[0] = other; + return * this; + } + + + template<typename T, typename=typename std::enable_if<std::is_integral<T>::value>::type> + inline operator T() const { + return T(v_[0]); + } + + // + // Compound assignment + //-------------------------------------- + // + // arithmetic + // + + template <typename T> + inline uint128_t& operator+=(const T a){ + + uint128_t b = (uint128_t) a; + v_[1] += b.v_[1] + ((v_[0] + b.v_[0]) < v_[0]); + v_[0] += b.v_[0]; + return *this; + } + + template <typename T> + inline uint128_t& operator-=(const T b){ + + uint128_t temp = (uint128_t)b; + if(v_[0] < temp.v_[0]) v_[1]--; + v_[0] -= temp.v_[0]; + return * this; + } + + template <typename T> + inline uint128_t& operator*=( const T b){ + + *this = uint128_t::mul128(*this, b); + return *this; + } + + template <typename T> + inline uint128_t& operator/=( const T b){ + + *this = uint128_t::divide_u128_to_u64(*this, (uint64_t)b); + return *this; + } + + // + // increment + // + inline uint128_t& operator++(){ + + return *this +=1; + } + + inline uint128_t operator++(int){ + uint128_t temp = *this; + *this +=1; + return temp; + } + + inline uint128_t& operator--(){ + + return *this -=1; + } + + inline uint128_t operator--(int){ + uint128_t temp = *this; + *this -=1; + return temp; + } + + // + // bitwise + // + + template <typename T> + inline uint128_t& operator>>=(const T b) + { + if(_LIKELY_(b==64)){ + v_[0] = v_[1]; + v_[1] = 0; + } + else{ + + v_[0] = (v_[0] >> b) | (v_[1] << (int)(b - 64)); + (b < 64) ? v_[1] >>= b : v_[1] = 0; + } + return *this; + } + + template <typename T> + inline uint128_t& operator<<=(const T b){ + + if(_LIKELY_(b==64)){ + + v_[1] = v_[0]; + v_[0] = 0; + } + else{ + v_[1] = (v_[1] << b) | (v_[0] << (int)(b - 64)); + (b < 64) ? 
v_[0] <<= b : v_[0] = 0; + } + return *this; + } + + template <typename T> + inline uint128_t& operator|=(const T b){ + + *this = *this | b; + return *this; + } + + template <typename T> + inline uint128_t& operator&=(const T b){ + + *this = *this & b; return *this; + } + + template <typename T> + inline uint128_t& operator^=(const T b){ + + *this = *this ^ b; + return *this; + } + + // + // friend functions + //----------------------------- + + template <typename T> + friend uint128_t operator>>(uint128_t a, const T b); + + template <typename T> + friend uint128_t operator<<(uint128_t a, const T b); + + template <typename T> + friend uint128_t operator+(uint128_t a, const T b); + + template <typename T> + friend uint128_t operator-(uint128_t a, const T b); + + template <typename T> + friend uint128_t operator*(uint128_t a, const T b); + + template <typename T> + friend uint128_t operator/(uint128_t x, const T v); + + template <typename T> + friend uint128_t operator%(uint128_t x, const T v); + + friend bool operator<(uint128_t const a, uint128_t const b); + + friend bool operator>(uint128_t const a, uint128_t const b); + + friend bool operator<=(uint128_t const a, uint128_t const b); + + friend bool operator>=(uint128_t const a, uint128_t const b); + + friend bool operator==(uint128_t const a, uint128_t const b); + + friend bool operator!=(uint128_t const a, uint128_t const b); + + template <typename T> + friend uint128_t operator|(uint128_t a, const T b); + + template <typename T> + friend uint128_t operator&(uint128_t a, const T b); + + template <typename T> + friend uint128_t operator^(uint128_t a, const T b); + + friend uint128_t operator~(uint128_t a); + + friend uint128_t min(uint128_t const a, uint128_t const b); + + friend uint128_t max(uint128_t const a, uint128_t const b); + + // This just makes it more convenient to count leading zeros for uint128_t + friend uint64_t clz128(uint128_t const x); + + // This just makes it more convenient to count leading zeros for uint128_t + friend uint128_t rotate_right_64( uint128_t x); + + // iostream + //------------------ + friend std::ostream & operator<<(std::ostream & out, uint128_t x); + + // + // static members + //------------------------ + static inline bool is_less_than(uint128_t const& a, uint128_t const& b){ + + if(a.v_[1] < b.v_[1]) return 1; + if(a.v_[1] > b.v_[1]) return 0; + if(a.v_[0] < b.v_[0]) return 1; + else return 0; + } + + static inline bool is_less_than_or_equal(uint128_t const& a, uint128_t const& b){ + + if(a.v_[1] < b.v_[1]) return 1; + if(a.v_[1] > b.v_[1]) return 0; + if(a.v_[0] <= b.v_[0]) return 1; + else return 0; + } + + static inline bool is_greater_than(uint128_t const& a, uint128_t const& b){ + + if(a.v_[1] < b.v_[1]) return 0; + if(a.v_[1] > b.v_[1]) return 1; + if(a.v_[0] <= b.v_[0]) return 0; + else return 1; + } + + static inline bool is_greater_than_or_equal(uint128_t const& a, uint128_t const& b) + { + if(a.v_[1] < b.v_[1]) return 0; + if(a.v_[1] > b.v_[1]) return 1; + if(a.v_[0] < b.v_[0]) return 0; + else return 1; + } + + static inline bool is_equal_to(uint128_t const& a, uint128_t const& b) + { + if(a.v_[0] == b.v_[0] && a.v_[1] == b.v_[1]) return 1; + else return 0; + } + + static inline bool is_not_equal_to(uint128_t const& a, uint128_t const& b) + { + if(a.v_[0] != b.v_[0] || a.v_[1] != b.v_[1]) return 1; + else return 0; + } + + + // bit operations + //--------------------- + + /// This counts leading zeros for 64 bit unsigned integers. 
It is used internally + /// in a few of the functions defined below. + static inline int clz64(uint64_t const& x) + { + int res; + + asm("bsr %1, %0\nxor $0x3f, %0" : "=r" (res) : "rm" (x) : "cc", "flags"); + + return res; + } + + static inline uint128_t bitwise_or(uint128_t a, uint128_t const& b) + { + a.v_[0] |= b.v_[0]; + a.v_[1] |= b.v_[1]; + return a; + } + + static inline uint128_t bitwise_and(uint128_t a, uint128_t const& b) + { + a.v_[0] &= b.v_[0]; + a.v_[1] &= b.v_[1]; + return a; + } + + static inline uint128_t bitwise_xor(uint128_t a, uint128_t const& b) + { + a.v_[0] ^= b.v_[0]; + a.v_[1] ^= b.v_[1]; + return a; + } + + static inline uint128_t bitwise_not(uint128_t a) + { + a.v_[0] = ~a.v_[0]; + a.v_[1] = ~a.v_[1]; + return a; + } + + static inline uint128_t rotate_right64( uint128_t x) + { + uint64_t temp = x.v_[0]; + x.v_[0] = x.v_[1]; + x.v_[1] = temp; + return x; + } + + static inline uint128_t pow2_add(uint128_t const x, uint128_t const y) + { + uint128_t z; + + asm( "mulq %3\n\t" + : "=a" (z.v_[0]), "=d" (z.v_[1]) + : "%0" (x.v_[0]), "rm" (x.v_[0])); + + z.v_[1] +=2*(x.v_[1] * x.v_[0]); + z.v_[1] += y.v_[1] + ((z.v_[0] + y.v_[0]) < z.v_[0]); + z.v_[0] += y.v_[0]; + + return z; + } + + static inline uint128_t pow2_add_rotate64(uint128_t const x, uint128_t const y) + { + uint128_t z; + + asm( "mulq %3\n\t" + : "=a" (z.v_[0]), "=d" (z.v_[1]) + : "%0" (x.v_[0]), "rm" (x.v_[0])); + + z.v_[1] +=2*(x.v_[1] * x.v_[0]); + z.v_[1] += y.v_[1] + ((z.v_[0] + y.v_[0]) < z.v_[0]); + z.v_[0] += y.v_[0]; + uint64_t temp = z.v_[0]; + z.v_[0] = z.v_[1]; + z.v_[1] = temp; + return z; + } + + // arithmetic + //------------------- + + static inline uint128_t add128(uint128_t x, uint128_t const y) + { + x.v_[1] += y.v_[1] + ((x.v_[0] + y.v_[0]) < x.v_[0]); + x.v_[0] += y.v_[0]; + return x; + } + + static inline uint128_t add128(uint128_t x, uint64_t const y) + { + x+=uint128_t(y); + return x; + + } + + static inline uint128_t mul128(uint64_t const x, uint64_t const y) + { + uint128_t res; + + asm( "mulq %3\n\t" + : "=a" (res.v_[0]), "=d" (res.v_[1]) + : "%0" (x), "rm" (y)); + + return res; + } + + static inline uint128_t mul128(uint128_t const x, uint128_t const y) + { + uint128_t z; + + asm( "mulq %3\n\t" + : "=a" (z.v_[0]), "=d" (z.v_[1]) + : "%0" (x.v_[0]), "rm" (y.v_[0])); + + z.v_[1] +=(x.v_[1] * y.v_[0]) + (x.v_[0] * y.v_[1]); + return z; + } + + static inline uint128_t mul128(uint128_t const x, uint64_t const y) + { + uint128_t res; + + asm( "mulq %3\n\t" + : "=a" (res.v_[0]), "=d" (res.v_[1]) + : "%0" (x.v_[0]), "rm" (y)); + res.v_[1] += x.v_[1] * y; + + return res; + } + + // taken from libdivide's adaptation of this implementation origininally in + // Hacker's Delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt + // License permits inclusion here per: + // http://www.hackersdelight.org/permissions.htm + static inline uint64_t divide_u128_to_u64(uint128_t const& x, uint64_t v, uint64_t * r = NULL) // x / v + { + const uint64_t b = 1ull << 32; + uint64_t un1, un0, + vn1, vn0, + q1, q0, + un64, un21, un10, + rhat; + int s; + + if(x.v_[1] >= v){ + if( r != NULL) *r = (uint64_t) -1; + return (uint64_t) -1; + } + + s = clz64(v); + + if(s > 0){ + v = v << s; + un64 = (x.v_[1] << s) | ((x.v_[0] >> (64 - s)) & (-s >> 31)); + un10 = x.v_[0] << s; + }else{ + un64 = x.v_[0] | x.v_[1]; + un10 = x.v_[0]; + } + + vn1 = v >> 32; + vn0 = v & 0xffffffff; + + un1 = un10 >> 32; + un0 = un10 & 0xffffffff; + + q1 = un64/vn1; + rhat = un64 - q1*vn1; + + again1: + if (q1 >= b || q1*vn0 > b*rhat 
+ un1){ + q1 -= 1; + rhat = rhat + vn1; + if(rhat < b) goto again1; + } + + un21 = un64*b + un1 - q1*v; + + q0 = un21/vn1; + rhat = un21 - q0*vn1; + again2: + if(q0 >= b || q0 * vn0 > b*rhat + un0){ + q0 = q0 - 1; + rhat = rhat + vn1; + if(rhat < b) goto again2; + } + + if(r != NULL) *r = (un21*b + un0 - q0*v) >> s; + return q1*b + q0; + } + + static inline uint128_t divide_u128_to_u128(uint128_t x, uint64_t v, uint64_t * r = NULL) + { + uint128_t res; + + res.v_[1] = x.v_[1]/v; + x.v_[1] %= v; + res.v_[0] = divide_u128_to_u64(x, v, r); + + return res; + } + + static inline uint128_t sub128(uint128_t const& x, uint128_t const& y) // x - y + { + uint128_t res; + + res.v_[0] = x.v_[0] - y.v_[0]; + res.v_[1] = x.v_[1] - y.v_[1]; + if(x.v_[0] < y.v_[0]) res.v_[1]--; + + return res; + } + + static inline uint64_t _isqrt(uint64_t const& x) + { + uint64_t res0 = 0; + res0 = ::sqrt(x); + for(uint16_t i = 0; i < 8; i++) + res0 = (res0 + x/res0) >> 1; + return res0; + } + + // roots + //------------------- + + static inline uint64_t _isqrt(const uint128_t & x) // this gives errors above 2^124 + { + uint64_t res0 = 0; + + if( (x.v_[1] == 0 && x.v_[0] == 0) || x.v_[1] > 1ull << 60) + return 0; + + res0 = ::sqrt(u128_to_float(x)); + + for(uint16_t i = 0; i < 8; i++) + res0 = (res0 + x/res0) >> 1; + + return res0; + } + + static inline uint64_t _icbrt(const uint128_t & x) + { + uint64_t res0 = 0; + + res0 = std::cbrt(u128_to_float(x)); + for(uint16_t i = 0; i < 47; i++) // there needs to be an odd number of iterations + // for the case of numbers of the form x^2 - 1 + // where this will not converge + res0 = (res0 + divide_u128_to_u128(x,res0)/res0) >> 1; + return res0; + } + + // this function is to avoid off by 1 errors from nesting integer square roots + static inline uint64_t _iqrt(const uint128_t & x) + { + uint64_t res0 = 0, res1 = 0; + + res0 = _isqrt(_isqrt(x)); + res1 = (res0 + divide_u128_to_u128(x,res0*res0)/res0) >> 1; + res0 = (res1 + divide_u128_to_u128(x,res1*res1)/res1) >> 1; + + return res0 < res1 ? 
res0 : res1; + } + + // typecasting + //------------------------- + static inline uint128_t string_to_u128(std::string s) + { + uint128_t res = 0; + for(std::string::iterator iter = s.begin(); iter != s.end() && (int) *iter >= 48; iter++){ + res = mul128(res, 10); + res += (uint16_t) *iter - 48; + } + return res; + } + + static inline uint64_t u128_to_u64(uint128_t x){ + + return x.v_[0]; + } + + static inline double u128_to_double(uint128_t x) + { + double dbl; + + if(x.v_[1] == 0) return (double) x.v_[0]; + + uint64_t r = clz64(x.v_[1]); + x <<= r; + + dbl = (double) x.v_[0]; + + dbl *= (1ull << (64-r)); + + return dbl; + } + + static inline float u128_to_float(uint128_t x) + { + float flt; + + if(x.v_[1] == 0) return (float) x.v_[0]; + uint64_t r = clz64(x.v_[1]); + x <<= r; + + flt = (float) x.v_[1]; + + + flt *= (1ull << (64-r)); + flt *= 2; + + return flt; + } + + static inline uint128_t double_to_u128(double dbl) + { + uint128_t x; + if(dbl < 1 || dbl > 1e39) return 0; + else{ + + uint32_t shft = (uint32_t) log2(dbl); + uint64_t divisor = 1ull << shft; + dbl /= divisor; + x.v_[0] = (uint64_t) dbl; + x <<= shft; + return x; + } + } + + static inline uint128_t float_to_u128(float flt) + { + uint128_t x; + if(flt < 1 || flt > 1e39) return 0; + else{ + + uint32_t shft = (uint32_t) log2(flt); + uint64_t divisor = 1ull << shft; + flt /= divisor; + x.v_[0] = (uint64_t) flt; + x <<= shft; + return x; + } + } + +}; // class uint128_t + +} // namespace random_iterator + +#include <random_iterator/detail/uint128.inl> diff --git a/random_iterator/detail/uint128.inl b/random_iterator/detail/uint128.inl new file mode 100644 index 0000000..f59316e --- /dev/null +++ b/random_iterator/detail/uint128.inl @@ -0,0 +1,179 @@ +/*---------------------------------------------------------------------------- + * + * Copyright (C) 2016 - 2021 Antonio Augusto Alves Junior + * + * This file is part of Hydra Data Analysis Framework. + * + * Hydra is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Hydra is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Hydra. If not, see <http://www.gnu.org/licenses/>. 
+ * + *---------------------------------------------------------------------------*/ + +/* + * uint128.inl + * + * Created on: 04/11/2020 + * Author: Antonio Augusto Alves Junior + */ + +#pragma once + +namespace random_iterator { + +template <typename T> +inline uint128_t operator>>(uint128_t a, const T b) { + + a >>= b; + return a; +} + +template <typename T> +inline uint128_t operator<<(uint128_t a, const T b) { + + a <<= b; + return a; +} + +template <typename T> +inline uint128_t operator+(uint128_t a, const T b) { + + return uint128_t::add128(a, b); +} + +template <typename T> +inline uint128_t operator-(uint128_t a, const T b) { + + return uint128_t::sub128(a, (uint128_t)b); +} + +template <typename T> +inline uint128_t operator*(uint128_t a, const T b) { + + return uint128_t::mul128(a, b); +} + +template <typename T> +inline T operator/(uint128_t x, const T v) { + + return uint128_t::divide_u128_to_u64(x, (uint64_t)v); +} + +template <typename T> +inline T operator%(uint128_t x, const T & v) { + + uint64_t res; + uint128_t::divide_u128_to_u64(x, v, &res); + return (T)res; +} + +inline bool operator<(uint128_t const a, uint128_t const b) { + + return uint128_t::is_less_than(a, b); +} + +inline bool operator>(uint128_t const a, uint128_t const b) { + + return uint128_t::is_greater_than(a, b); +} + +inline bool operator<=(uint128_t const a, uint128_t const b) { + + return uint128_t::is_less_than_or_equal(a, b); +} + +inline bool operator>=(uint128_t const a, uint128_t const b) { + + return uint128_t::is_greater_than_or_equal(a, b); +} + +inline bool operator==(uint128_t const a, uint128_t const b) { + + return uint128_t::is_equal_to(a, b); +} + +inline bool operator!=(uint128_t const a, uint128_t const b) { + + return uint128_t::is_not_equal_to(a, b); +} + +template <typename T> +inline uint128_t operator|(uint128_t a, const T b) { + + return uint128_t::bitwise_or(a, (uint128_t)b); +} + +template <typename T> +inline uint128_t operator&(uint128_t a, const T b) { + + return uint128_t::bitwise_and(a, (uint128_t)b); +} + +template <typename T> +inline uint128_t operator^(uint128_t a, const T b) { + + return uint128_t::bitwise_xor(a, (uint128_t)b); +} + +inline uint128_t operator~(uint128_t a) { + + return uint128_t::bitwise_not(a); +} + +inline uint128_t min(uint128_t const& a, uint128_t const b) { + + return a < b ? a : b; +} + +inline uint128_t max(uint128_t const& a, uint128_t const b) { + + return a > b ? a : b; +} + +inline uint64_t clz128(uint128_t const x){ + + uint64_t res; + + res = x.v_[1] != 0 ? 
uint128_t::clz64(x.v_[1]) : 64 + uint128_t::clz64(x.v_[0]); + + return res; +} + +inline uint128_t rotate_right_64( uint128_t x){ + + uint64_t temp = x.v_[0]; + x.v_[0] = x.v_[1]; + x.v_[1] = temp; + return x; +} + +// iostream +//------------------ +inline std::ostream & operator<<(std::ostream & out, uint128_t x) +{ + std::vector<uint16_t> rout; + uint64_t v = 10, r = 0; + if (x.v_[1] == 0 && x.v_[0]==0) { + out << "0"; + return out; + } + do { + x = uint128_t::divide_u128_to_u128(x, v, &r); + rout.push_back(r); + } while(x.v_[0] != 0 || x.v_[1] != 0); + for(std::reverse_iterator<std::vector<uint16_t>::iterator> rit = rout.rbegin(); rit != rout.rend(); rit++){ + out << *rit; + } + return out; +} + +} // namespace random_iterator diff --git a/tests/benchmark_engine.cpp b/tests/benchmark_engine.cpp index 84e9cec..bec57c0 100644 --- a/tests/benchmark_engine.cpp +++ b/tests/benchmark_engine.cpp @@ -43,10 +43,12 @@ static const uint64_t default_seed= 0x548c9decbce65295 ; TEST_CASE("Performance of counter based PRNGs and corresponding iterators") { - random_iterator::squares3 RNG_squares3(default_seed); - random_iterator::squares4 RNG_squares4(default_seed); - random_iterator::threefry RNG_threefry(default_seed); - random_iterator::philox RNG_philox(default_seed); + random_iterator::squares3_64 RNG_squares3(default_seed); + random_iterator::squares4_64 RNG_squares4(default_seed); + random_iterator::squares3_128 RNG_squares3_long(default_seed); + random_iterator::squares4_128 RNG_squares4_long(default_seed); + random_iterator::threefry RNG_threefry(default_seed); + random_iterator::philox RNG_philox(default_seed); std::mt19937 RNG_mt19937(default_seed ); std::ranlux48 RNG_ranlux48(default_seed); @@ -57,11 +59,20 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") { return RNG_squares3(); }; + BENCHMARK("random_iterator::squares3_long") { + return RNG_squares3_long(); + }; + BENCHMARK("random_iterator::squares4") { return RNG_squares4(); }; + BENCHMARK("random_iterator::squares4_long") { + return RNG_squares4_long(); + }; + + BENCHMARK("random_iterator::threefry") { return RNG_threefry(); }; @@ -93,6 +104,17 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") { return RNG_squares4.discard(jump_size); }; + + BENCHMARK("random_iterator::squares3_long") { + RNG_squares3_long.discard(jump_size); + }; + + BENCHMARK("random_iterator::squares4_long") { + return RNG_squares4_long.discard(jump_size); + }; + + + BENCHMARK("random_iterator::threefry") { return RNG_threefry.discard(jump_size); }; @@ -122,6 +144,15 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") { return RNG_squares4.discard(jump_size); }; + + BENCHMARK("random_iterator::squares3_long") { + RNG_squares3_long.discard(jump_size); + }; + + BENCHMARK("random_iterator::squares4_long") { + return RNG_squares4_long.discard(jump_size); + }; + BENCHMARK("random_iterator::threefry") { return RNG_threefry.discard(jump_size); }; @@ -152,6 +183,15 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") { return RNG_squares4.discard(jump_size); }; + + BENCHMARK("random_iterator::squares3_long") { + RNG_squares3_long.discard(jump_size); + }; + + BENCHMARK("random_iterator::squares4_long") { + return RNG_squares4_long.discard(jump_size); + }; + BENCHMARK("random_iterator::threefry") { return RNG_threefry.discard(jump_size); }; @@ -181,6 +221,15 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") { return 
RNG_squares4.discard(jump_size); }; + + BENCHMARK("random_iterator::squares3_long") { + RNG_squares3_long.discard(jump_size); + }; + + BENCHMARK("random_iterator::squares4_long") { + return RNG_squares4_long.discard(jump_size); + }; + BENCHMARK("random_iterator::threefry") { return RNG_threefry.discard(jump_size); }; @@ -211,6 +260,15 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") { return RNG_squares4.discard(jump_size); }; + + BENCHMARK("random_iterator::squares3_long") { + RNG_squares3_long.discard(jump_size); + }; + + BENCHMARK("random_iterator::squares4_long") { + return RNG_squares4_long.discard(jump_size); + }; + BENCHMARK("random_iterator::threefry") { return RNG_threefry.discard(jump_size); }; @@ -242,6 +300,14 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") { return RNG_squares4.discard(jump_size); }; + BENCHMARK("random_iterator::squares3_long") { + RNG_squares3_long.discard(jump_size); + }; + + BENCHMARK("random_iterator::squares4_long") { + return RNG_squares4_long.discard(jump_size); + }; + BENCHMARK("random_iterator::threefry") { return RNG_threefry.discard(jump_size); }; @@ -272,6 +338,16 @@ TEST_CASE("Performance of counter based PRNGs and corresponding iterators") { return RNG_squares4.discard(jump_size); }; + + + BENCHMARK("random_iterator::squares3_long") { + RNG_squares3_long.discard(jump_size); + }; + + BENCHMARK("random_iterator::squares4_long") { + return RNG_squares4_long.discard(jump_size); + }; + BENCHMARK("random_iterator::threefry") { return RNG_threefry.discard(jump_size); }; -- GitLab
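
A sketch (not part of the patch) of what one draw from the new Squares3_128 engine computes, restating the operator() added above with GCC/Clang's unsigned __int128 purely for illustration; the patch itself supplies its own uint128_t for this. On a 128-bit word, the "rotate right 64 bits" step is just a swap of the two 64-bit halves, and the final ">> 64" keeps the high half as the 64-bit result.

```cpp
// Illustration of Squares3_128::operator() from this patch, rewritten with the
// compiler extension unsigned __int128 so the data flow is easy to follow.
#include <cstdint>

using u128 = unsigned __int128;

// uint128_t::rotate_right_64: rotating a 128-bit word by 64 bits swaps its halves.
inline u128 swap_halves(u128 x) { return (x << 64) | (x >> 64); }

inline uint64_t squares3_128_draw(u128 seed, u128& counter) {
    u128 x, y, z;
    y = x = seed * counter;                 // Weyl-style offset: ctr * key
    z = y + seed;                           // (ctr + 1) * key
    x = x * x + y;  x = swap_halves(x);     // round 1
    x = x * x + z;  x = swap_halves(x);     // round 2
    ++counter;                              // advance the counter (the engine's state_)
    return static_cast<uint64_t>((x * x + y) >> 64);  // round 3: keep the high 64 bits
}
```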
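
The new uint128.hpp keeps only the x86-64 inline-assembly path for clz64 and the mul128 overloads (the CUDA and AArch64 branches remain in the retired uint128.h_). The sketch below, also not part of the patch, shows the portable equivalent of the 64x64 -> 128-bit multiply that the "mulq" asm performs, again leaning on the compiler's unsigned __int128.

```cpp
#include <cstdint>

// Low/high words of the full product, matching uint128_t's v_[0] (low) and v_[1] (high).
struct u128_words { uint64_t lo, hi; };

// Portable stand-in for uint128_t::mul128(uint64_t, uint64_t): "mulq" leaves the
// low half of the product in RAX and the high half in RDX; unsigned __int128
// yields the same two words without inline assembly.
inline u128_words mul_64x64_to_128(uint64_t x, uint64_t y) {
    unsigned __int128 p = static_cast<unsigned __int128>(x) * y;
    return { static_cast<uint64_t>(p), static_cast<uint64_t>(p >> 64) };
}
```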
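
A minimal usage sketch for the two new engines, mirroring tests/benchmark_engine.cpp. It assumes random_iterator/Engine.hpp is the public entry point (it is where the squares3_128 and squares4_128 typedefs are added) and that the SplitMix/SquaresKeys headers resolve as laid out in this patch; it is not part of the change itself.

```cpp
#include <cstdint>
#include <iostream>
#include <random>

#include <random_iterator/Engine.hpp>

int main() {
    const uint64_t default_seed = 0x548c9decbce65295;   // same seed the benchmarks use

    random_iterator::squares3_128 rng3(default_seed);   // 3 rounds, 128-bit counter, 64-bit output
    random_iterator::squares4_128 rng4(default_seed);   // 4-round variant

    // Raw 64-bit draws.
    std::cout << rng3() << " " << rng4() << "\n";

    // result_type, min() and max() make the engines valid UniformRandomBitGenerators,
    // so they plug straight into the standard distributions.
    std::uniform_real_distribution<double> uniform(0.0, 1.0);
    std::cout << uniform(rng4) << "\n";

    // Counter-based state: discard() just advances the counter, so long jumps are cheap.
    rng3.discard(uint64_t(1) << 40);
    std::cout << rng3() << "\n";

    return 0;
}
```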