IAP GITLAB

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • AirShowerPhysics/corsika
  • rulrich/corsika
  • AAAlvesJr/corsika
  • Andre/corsika
  • arrabito/corsika
  • Nikos/corsika
  • olheiser73/corsika
  • AirShowerPhysics/papers/corsika
  • pranav/corsika
9 results
Show changes
Showing
with 4345 additions and 0 deletions
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __gccfeatures_dot_hpp
#define __gccfeatures_dot_hpp
/* GCC version folded into one integer (major*10000 + minor*100 + patchlevel)
   so the feature tests below can compare against it in #if expressions. */
#define RANDOM_ITERATOR_R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)

/* Stop the build on any architecture outside the tested set. */
#if !(defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || defined(__arm__) || defined(__aarch64__))
#  error "This code has only been tested on x86, powerpc and a few arm platforms."
#  include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
{ /* maybe an unbalanced brace will terminate the compilation */
/* Feel free to try the Random123 library on other architectures by changing
   the conditions that reach this error, but you should consider it a
   porting exercise and expect to encounter bugs and deficiencies.
   Please let the authors know of any successes (or failures). */
#endif

/* PowerPC builds pull in the PPU intrinsics header. */
#if defined(__powerpc__)
#  include <ppu_intrinsics.h>
#endif

/* Every setting below is defined only if it is not defined already, so a
   value supplied earlier (e.g. with -D on the command line) wins. */

#if !defined(RANDOM_ITERATOR_R123_STATIC_INLINE)
#  define RANDOM_ITERATOR_R123_STATIC_INLINE static __inline__
#endif

/* always_inline is attached only when the reported version is 4.0.0 or later;
   older versions get the bare declaration. */
#if !defined(RANDOM_ITERATOR_R123_FORCE_INLINE)
#  if RANDOM_ITERATOR_R123_GNUC_VERSION < 40000
#    define RANDOM_ITERATOR_R123_FORCE_INLINE(decl) decl
#  else
#    define RANDOM_ITERATOR_R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#  endif
#endif

/* Expands to nothing here. */
#if !defined(RANDOM_ITERATOR_R123_CUDA_DEVICE)
#  define RANDOM_ITERATOR_R123_CUDA_DEVICE
#endif

/* Assertions forward to the standard assert(). */
#if !defined(RANDOM_ITERATOR_R123_ASSERT)
#  include <assert.h>
#  define RANDOM_ITERATOR_R123_ASSERT(x) assert(x)
#endif

/* Branch-prediction hint forwards to the compiler builtin. */
#if !defined(RANDOM_ITERATOR_R123_BUILTIN_EXPECT)
#  define RANDOM_ITERATOR_R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
#endif

/* C++11 detection.
   The standard lets us compare __cplusplus against 199711L (C++98) or
   201103L (C++11), but gcc carried bug
   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=1773 from early 2001 until it
   was fixed in 4.7 (early 2012).  Before 4.7 the only way to learn that
   --std=c++0x was requested is the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol.
   The 1 or 0 in the expression below records whether that symbol is defined
   (a defined() test cannot be used portably inside a macro body). */
#ifndef __GCC_EXPERIMENTAL_CXX0X__
#  define GNU_CXX11 (__cplusplus>=201103L || (RANDOM_ITERATOR_R123_GNUC_VERSION<40700 && 0/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
#else
#  define GNU_CXX11 (__cplusplus>=201103L || (RANDOM_ITERATOR_R123_GNUC_VERSION<40700 && 1/* defined(__GCC_EXPERIMENTAL_CXX0X__) */))
#endif

/* Individual C++11 features: each needs C++11 mode plus a minimum gcc version. */
#if !defined(RANDOM_ITERATOR_R123_USE_CXX11_UNRESTRICTED_UNIONS)
#  define RANDOM_ITERATOR_R123_USE_CXX11_UNRESTRICTED_UNIONS ((RANDOM_ITERATOR_R123_GNUC_VERSION >= 40600) && GNU_CXX11)
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_CXX11_STATIC_ASSERT)
#  define RANDOM_ITERATOR_R123_USE_CXX11_STATIC_ASSERT ((RANDOM_ITERATOR_R123_GNUC_VERSION >= 40300) && GNU_CXX11)
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_CXX11_CONSTEXPR)
#  define RANDOM_ITERATOR_R123_USE_CXX11_CONSTEXPR ((RANDOM_ITERATOR_R123_GNUC_VERSION >= 40600) && GNU_CXX11)
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_CXX11_EXPLICIT_CONVERSIONS)
#  define RANDOM_ITERATOR_R123_USE_CXX11_EXPLICIT_CONVERSIONS ((RANDOM_ITERATOR_R123_GNUC_VERSION >= 40500) && GNU_CXX11)
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_CXX11_RANDOM)
#  define RANDOM_ITERATOR_R123_USE_CXX11_RANDOM ((RANDOM_ITERATOR_R123_GNUC_VERSION>=40500) && GNU_CXX11)
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_CXX11_TYPE_TRAITS)
#  define RANDOM_ITERATOR_R123_USE_CXX11_TYPE_TRAITS ((RANDOM_ITERATOR_R123_GNUC_VERSION>=40400) && GNU_CXX11)
#endif

/* Instruction-set flags mirror the compiler's own predefined symbols. */
#if !defined(RANDOM_ITERATOR_R123_USE_AES_NI)
#  if defined(__AES__)
#    define RANDOM_ITERATOR_R123_USE_AES_NI 1
#  else
#    define RANDOM_ITERATOR_R123_USE_AES_NI 0
#  endif
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SSE4_2)
#  if defined(__SSE4_2__)
#    define RANDOM_ITERATOR_R123_USE_SSE4_2 1
#  else
#    define RANDOM_ITERATOR_R123_USE_SSE4_2 0
#  endif
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SSE4_1)
#  if defined(__SSE4_1__)
#    define RANDOM_ITERATOR_R123_USE_SSE4_1 1
#  else
#    define RANDOM_ITERATOR_R123_USE_SSE4_1 0
#  endif
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SSE)
/* SSE code in Random123 is pointless without SSE2, so the flag keys
   off __SSE2__. */
#  if defined(__SSE2__)
#    define RANDOM_ITERATOR_R123_USE_SSE 1
#  else
#    define RANDOM_ITERATOR_R123_USE_SSE 0
#  endif
#endif

#if !defined(RANDOM_ITERATOR_R123_USE_AES_OPENSSL)
/* Whether openssl is installed cannot be determined reliably at compile
   time without a configure-like step, so the default assumes it is absent.
   To experiment with openssl, add -DRANDOM_ITERATOR_R123_USE_AES_OPENSSL=1
   together with whatever LDFLAGS or LDLIBS are required. */
#  define RANDOM_ITERATOR_R123_USE_AES_OPENSSL 0
#endif

/* The GNU 128-bit integer flag is switched on for x86-64 and aarch64 only. */
#if !defined(RANDOM_ITERATOR_R123_USE_GNU_UINT128)
#  if defined(__x86_64__) || defined(__aarch64__)
#    define RANDOM_ITERATOR_R123_USE_GNU_UINT128 1
#  else
#    define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#  endif
#endif
#ifndef RANDOM_ITERATOR_R123_USE_ASM_GNU
/* Select the GNU inline-assembly code paths only on x86 targets (32- or
   64-bit); every other architecture gets 0.  The flag can still be forced
   either way by defining it before this header is seen.
   NOTE(review): presumably the asm guarded by this flag is x86-specific
   (e.g. cpuid) -- confirm against the users of the macro. */
#if (defined(__x86_64__)||defined(__i386__))
#define RANDOM_ITERATOR_R123_USE_ASM_GNU 1
#else
#define RANDOM_ITERATOR_R123_USE_ASM_GNU 0
#endif
#endif
/* The MSVC-style cpuid path is off in this header. */
#if !defined(RANDOM_ITERATOR_R123_USE_CPUID_MSVC)
#  define RANDOM_ITERATOR_R123_USE_CPUID_MSVC 0
#endif

/* Intrinsic-header selection.  x86intrin.h is chosen on x86 when the
   compiler version is at least 4.4.2; the leading 1 or 0 records the result
   of the architecture test. */
#if !defined(RANDOM_ITERATOR_R123_USE_X86INTRIN_H)
#  if !(defined(__x86_64__)||defined(__i386__))
#    define RANDOM_ITERATOR_R123_USE_X86INTRIN_H (0/* (defined(__x86_64__)||defined(__i386__)) */ && RANDOM_ITERATOR_R123_GNUC_VERSION >= 40402)
#  else
#    define RANDOM_ITERATOR_R123_USE_X86INTRIN_H (1/* (defined(__x86_64__)||defined(__i386__)) */ && RANDOM_ITERATOR_R123_GNUC_VERSION >= 40402)
#  endif
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_IA32INTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_IA32INTRIN_H 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_XMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_XMMINTRIN_H 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_EMMINTRIN_H)
/* Known problem: gcc -m64 on Solaris 10 defines __SSE2__ yet has no
   emmintrin.h on its include search path.  No workaround is attempted
   here.  If you are affected, locate your emmintrin.h and add a matching
   -I to CPPFLAGS, or build with -DRANDOM_ITERATOR_R123_USE_SSE=0. */
#  define RANDOM_ITERATOR_R123_USE_EMMINTRIN_H (RANDOM_ITERATOR_R123_USE_SSE && (RANDOM_ITERATOR_R123_GNUC_VERSION < 40402))
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_SMMINTRIN_H ((RANDOM_ITERATOR_R123_USE_SSE4_1 || RANDOM_ITERATOR_R123_USE_SSE4_2) && (RANDOM_ITERATOR_R123_GNUC_VERSION < 40402))
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_WMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_WMMINTRIN_H 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_INTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_INTRIN_H 0
#endif

/* mulhilo implementation choices: every variant defaults to off except the
   64-bit mulhi intrinsic, which is enabled on 64-bit PowerPC. */
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO32_ASM)
#  define RANDOM_ITERATOR_R123_USE_MULHILO32_ASM 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_ASM)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN)
#  if defined(__powerpc64__)
#    define RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN 1
#  else
#    define RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN 0
#  endif
#endif
/* Names of the intrinsics used when the matching *_MULHI_INTRIN flag is set. */
#if !defined(RANDOM_ITERATOR_R123_MULHILO64_MULHI_INTRIN)
#  define RANDOM_ITERATOR_R123_MULHILO64_MULHI_INTRIN __mulhdu
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO32_MULHI_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO32_MULHI_INTRIN 0
#endif
#if !defined(RANDOM_ITERATOR_R123_MULHILO32_MULHI_INTRIN)
#  define RANDOM_ITERATOR_R123_MULHILO32_MULHI_INTRIN __mulhwu
#endif

/* Request the UINT64_C-style constant macros from <stdint.h> and verify
   that they actually arrived. */
#if !defined(__STDC_CONSTANT_MACROS)
#  define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#if !defined(UINT64_C)
#  error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif

/* Maintenance note: anything added here must also be added to all the
   other XXfeatures.hpp files and to ../ut_features.cpp */
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(__icpcfeatures_dot_hpp)
#define __icpcfeatures_dot_hpp

// icc relies on gcc libraries and other toolchain components, so the gcc
// version it reports is folded into one comparable integer.
#define RANDOM_ITERATOR_R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)

// Stop the build on anything that is not x86 (32- or 64-bit).
#if !(defined(__x86_64__) || defined(__i386__))
#  error "This code has only been tested on x86 platforms."
{ // maybe an unbalanced brace will terminate the compilation
// You are invited to try Random123 on other architectures, by changing
// the conditions that reach this error, but you should consider it a
// porting exercise and expect to encounter bugs and deficiencies.
// Please let the authors know of any successes (or failures).
#endif

#if !defined(RANDOM_ITERATOR_R123_STATIC_INLINE)
#  define RANDOM_ITERATOR_R123_STATIC_INLINE static inline
#endif

#if !defined(RANDOM_ITERATOR_R123_FORCE_INLINE)
#  define RANDOM_ITERATOR_R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#endif

// Expands to nothing here.
#if !defined(RANDOM_ITERATOR_R123_CUDA_DEVICE)
#  define RANDOM_ITERATOR_R123_CUDA_DEVICE
#endif

// Assertions forward to the standard assert().
#if !defined(RANDOM_ITERATOR_R123_ASSERT)
#  include <assert.h>
#  define RANDOM_ITERATOR_R123_ASSERT(x) assert(x)
#endif

#if !defined(RANDOM_ITERATOR_R123_BUILTIN_EXPECT)
#  define RANDOM_ITERATOR_R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
#endif

// Configuration idiom used throughout this header:
//   #ifndef RANDOM_ITERATOR_R123_SOMETHING
//   #if some condition
//   #define RANDOM_ITERATOR_R123_SOMETHING 1
//   #else
//   #define RANDOM_ITERATOR_R123_SOMETHING 0
//   #endif
//   #endif
// Thanks to the outer #ifndef, any decision made in this file can be
// overridden from the command line with -DRANDOM_ITERATOR_R123_SOMETHING=1
// or -DRANDOM_ITERATOR_R123_SOMETHING=0.
// A shorter variant of the same idiom is:
//   #ifndef RANDOM_ITERATOR_R123_SOMETHING
//   #define RANDOM_ITERATOR_R123_SOMETHING (some boolean expression)
//   #endif
// where the boolean expression may refer to previously-defined
// RANDOM_ITERATOR_R123_SOMETHING_ELSE pp-symbols.

// SSE levels mirror the compiler's predefined symbols.
#if !defined(RANDOM_ITERATOR_R123_USE_SSE4_2)
#  if defined(__SSE4_2__)
#    define RANDOM_ITERATOR_R123_USE_SSE4_2 1
#  else
#    define RANDOM_ITERATOR_R123_USE_SSE4_2 0
#  endif
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SSE4_1)
#  if defined(__SSE4_1__)
#    define RANDOM_ITERATOR_R123_USE_SSE4_1 1
#  else
#    define RANDOM_ITERATOR_R123_USE_SSE4_1 0
#  endif
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SSE)
#  if defined(__SSE2__)
#    define RANDOM_ITERATOR_R123_USE_SSE 1
#  else
#    define RANDOM_ITERATOR_R123_USE_SSE 0
#  endif
#endif

#if !defined(RANDOM_ITERATOR_R123_USE_AES_NI)
// Unlike gcc, icc (version 12) does not pre-define an __AES__ pp-symbol
// when -maes or -xHost is given on the command line.  That looks like a
// defect in icc (it does define __SSE4_2__ in analogous circumstances),
// but until Intel fixes it the safe choice is to avoid generating
// instructions that would raise SIGILL when executed.  To get the AES-NI
// instructions with icc, the caller must put something like
// -DRANDOM_ITERATOR_R123_USE_AES_NI=1 or -D__AES__ on the command line.
// FWIW, the AES-NI Whitepaper by Gueron says that icc has supported
// AES-NI from 11.1 onwards.
//
// The trailing 1 or 0 records whether __AES__ is defined.
#  if !defined(__AES__)
#    define RANDOM_ITERATOR_R123_USE_AES_NI ((__ICC>=1101) && 0/*defined(__AES__)*/)
#  else
#    define RANDOM_ITERATOR_R123_USE_AES_NI ((__ICC>=1101) && 1/*defined(__AES__)*/)
#  endif
#endif

#if !defined(RANDOM_ITERATOR_R123_USE_AES_OPENSSL)
/* Whether openssl is installed cannot be determined reliably at compile
   time without a configure-like step, so the default assumes it is absent.
   To experiment with openssl, add -DRANDOM_ITERATOR_R123_USE_AES_OPENSSL=1
   together with whatever LDFLAGS or LDLIBS are required. */
#  define RANDOM_ITERATOR_R123_USE_AES_OPENSSL 0
#endif

#if !defined(RANDOM_ITERATOR_R123_USE_GNU_UINT128)
#  define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_ASM_GNU)
#  define RANDOM_ITERATOR_R123_USE_ASM_GNU 1
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_CPUID_MSVC)
#  define RANDOM_ITERATOR_R123_USE_CPUID_MSVC 0
#endif

// Intrinsic-header selection.
#if !defined(RANDOM_ITERATOR_R123_USE_X86INTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_X86INTRIN_H 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_IA32INTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_IA32INTRIN_H 1
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_XMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_XMMINTRIN_H 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_EMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_EMMINTRIN_H 1
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_SMMINTRIN_H 1
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_WMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_WMMINTRIN_H 1
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_INTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_INTRIN_H 0
#endif

// mulhilo implementation choices: only the 64-bit asm variant is enabled.
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO16_ASM)
#  define RANDOM_ITERATOR_R123_USE_MULHILO16_ASM 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO32_ASM)
#  define RANDOM_ITERATOR_R123_USE_MULHILO32_ASM 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_ASM)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM 1
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif

// Request the UINT64_C-style constant macros from <stdint.h> and verify
// that they actually arrived.
#if !defined(__STDC_CONSTANT_MACROS)
#  define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#if !defined(UINT64_C)
#  error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif

// Maintenance note: anything added here must also be added to all the
// other XXfeatures.hpp files and to ../ut_features.cpp
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Written by Tom Schoonjans <Tom.Schoonjans@me.com>
*/
#if !defined(__metalfeatures_dot_hpp)
#define __metalfeatures_dot_hpp

/* Feature settings for the Metal shading language.  Each setting is defined
   only if it is not defined already, so it can be pre-empted by the
   includer. */

#if !defined(RANDOM_ITERATOR_R123_STATIC_INLINE)
#  define RANDOM_ITERATOR_R123_STATIC_INLINE inline
#endif

#if !defined(RANDOM_ITERATOR_R123_FORCE_INLINE)
#  define RANDOM_ITERATOR_R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#endif

/* Expands to nothing here. */
#if !defined(RANDOM_ITERATOR_R123_CUDA_DEVICE)
#  define RANDOM_ITERATOR_R123_CUDA_DEVICE
#endif

/* Address-space qualifiers spelled with the Metal keywords. */
#if !defined(RANDOM_ITERATOR_R123_METAL_THREAD_ADDRESS_SPACE)
#  define RANDOM_ITERATOR_R123_METAL_THREAD_ADDRESS_SPACE thread
#endif
#if !defined(RANDOM_ITERATOR_R123_METAL_CONSTANT_ADDRESS_SPACE)
#  define RANDOM_ITERATOR_R123_METAL_CONSTANT_ADDRESS_SPACE constant
#endif

/* Assertions compile away to nothing. */
#if !defined(RANDOM_ITERATOR_R123_ASSERT)
#  define RANDOM_ITERATOR_R123_ASSERT(x)
#endif

/* No branch hint: the expression is passed through unchanged. */
#if !defined(RANDOM_ITERATOR_R123_BUILTIN_EXPECT)
#  define RANDOM_ITERATOR_R123_BUILTIN_EXPECT(expr,likely) expr
#endif

#if !defined(RANDOM_ITERATOR_R123_USE_GNU_UINT128)
#  define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#endif

/* 64-bit mulhilo variants are all disabled. */
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_ASM)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif

/* The 32-bit mulhi intrinsic is on by default; when it is enabled the
   intrinsic name is bound to metal::mulhi from <metal_integer>. */
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO32_MULHI_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO32_MULHI_INTRIN 1
#endif
#if RANDOM_ITERATOR_R123_USE_MULHILO32_MULHI_INTRIN
#  include <metal_integer>
#  define RANDOM_ITERATOR_R123_MULHILO32_MULHI_INTRIN metal::mulhi
#endif

#if !defined(RANDOM_ITERATOR_R123_USE_AES_NI)
#  define RANDOM_ITERATOR_R123_USE_AES_NI 0
#endif

/* Metal currently (Feb 2019, Specification-2) does not support 64-bit
   variable types, so 64-bit support is switched off. */
#if !defined(RANDOM_ITERATOR_R123_USE_64BIT)
#  define RANDOM_ITERATOR_R123_USE_64BIT 0
#endif

#if !defined(RANDOM_ITERATOR_R123_ULONG_LONG)
/* the longest integer type in Metal (Feb 2019, Specification-2) is a
 * 32-bit unsigned int, so that is what stands in for "unsigned long long".
 * Let's hope for the best... */
#  define RANDOM_ITERATOR_R123_ULONG_LONG unsigned int
#endif

#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __msvcfeatures_dot_hpp
#define __msvcfeatures_dot_hpp
//#if _MSVC_FULL_VER <= 15
//#error "We've only tested MSVC_FULL_VER==15."
//#endif
// Architecture gate: when neither _M_IX86 nor _M_X64 is defined the build is
// stopped with #error; the stray opening brace that follows is a second
// attempt to make the compilation fail.
#if !defined(_M_IX86) && !defined(_M_X64)
# error "This code has only been tested on x86 platforms."
{ // maybe an unbalanced brace will terminate the compilation
// You are invited to try Random123 on other architectures, by changing
// the conditions that reach this error, but you should consider it a
// porting exercise and expect to encounter bugs and deficiencies.
// Please let the authors know of any successes (or failures).
#endif
// Basic building-block macros for MSVC.  Each one is defined only when it
// has not been defined already, so the includer can pre-empt any of them.

// Specifier placed in front of helper-function definitions.
#ifndef RANDOM_ITERATOR_R123_STATIC_INLINE
#define RANDOM_ITERATOR_R123_STATIC_INLINE static __inline
#endif

// Wraps a declaration so that inlining is forced.  The documented keyword is
// __forceinline (two underscores); the single-underscore _forceinline is only
// a compatibility synonym that disappears when language extensions are
// disabled with /Za.
#ifndef RANDOM_ITERATOR_R123_FORCE_INLINE
#define RANDOM_ITERATOR_R123_FORCE_INLINE(decl) __forceinline decl
#endif

// Expands to nothing here.
#ifndef RANDOM_ITERATOR_R123_CUDA_DEVICE
#define RANDOM_ITERATOR_R123_CUDA_DEVICE
#endif

// Assertions forward to the standard assert().
#ifndef RANDOM_ITERATOR_R123_ASSERT
#include <assert.h>
#define RANDOM_ITERATOR_R123_ASSERT(x) assert(x)
#endif

// No branch hint: the expression is passed through unchanged and the
// "likely" argument is ignored.
#ifndef RANDOM_ITERATOR_R123_BUILTIN_EXPECT
#define RANDOM_ITERATOR_R123_BUILTIN_EXPECT(expr,likely) expr
#endif
// Configuration idiom used throughout this header:
//   #ifndef RANDOM_ITERATOR_R123_SOMETHING
//   #if some condition
//   #define RANDOM_ITERATOR_R123_SOMETHING 1
//   #else
//   #define RANDOM_ITERATOR_R123_SOMETHING 0
//   #endif
//   #endif
// Thanks to the outer #ifndef, any decision made in this file can be
// overridden from the command line with -DRANDOM_ITERATOR_R123_SOMETHING=1
// or -DRANDOM_ITERATOR_R123_SOMETHING=0.
// A shorter variant of the same idiom is:
//   #ifndef RANDOM_ITERATOR_R123_SOMETHING
//   #define RANDOM_ITERATOR_R123_SOMETHING (some boolean expression)
//   #endif
// where the boolean expression may refer to previously-defined
// RANDOM_ITERATOR_R123_SOMETHING_ELSE pp-symbols.

// AES-NI and SSE4.x are switched on exactly when targeting x64 (_M_X64).
#if !defined(RANDOM_ITERATOR_R123_USE_AES_NI)
#  ifdef _M_X64
#    define RANDOM_ITERATOR_R123_USE_AES_NI 1
#  else
#    define RANDOM_ITERATOR_R123_USE_AES_NI 0
#  endif
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SSE4_2)
#  ifdef _M_X64
#    define RANDOM_ITERATOR_R123_USE_SSE4_2 1
#  else
#    define RANDOM_ITERATOR_R123_USE_SSE4_2 0
#  endif
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SSE4_1)
#  ifdef _M_X64
#    define RANDOM_ITERATOR_R123_USE_SSE4_1 1
#  else
#    define RANDOM_ITERATOR_R123_USE_SSE4_1 0
#  endif
#endif
// Baseline SSE is always on.
#if !defined(RANDOM_ITERATOR_R123_USE_SSE)
#  define RANDOM_ITERATOR_R123_USE_SSE 1
#endif

#if !defined(RANDOM_ITERATOR_R123_USE_AES_OPENSSL)
#  define RANDOM_ITERATOR_R123_USE_AES_OPENSSL 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_GNU_UINT128)
#  define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#endif
// GNU-style asm is off; the MSVC cpuid path is used instead.
#if !defined(RANDOM_ITERATOR_R123_USE_ASM_GNU)
#  define RANDOM_ITERATOR_R123_USE_ASM_GNU 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_CPUID_MSVC)
#  define RANDOM_ITERATOR_R123_USE_CPUID_MSVC 1
#endif

// Intrinsic-header selection.
#if !defined(RANDOM_ITERATOR_R123_USE_X86INTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_X86INTRIN_H 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_IA32INTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_IA32INTRIN_H 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_XMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_XMMINTRIN_H 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_EMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_EMMINTRIN_H 1
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_SMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_SMMINTRIN_H 1
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_WMMINTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_WMMINTRIN_H 1
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_INTRIN_H)
#  define RANDOM_ITERATOR_R123_USE_INTRIN_H 1
#endif

// mulhilo implementation choices: no asm variants; the 64-bit MSVC
// intrinsic is used when targeting x64.
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO16_ASM)
#  define RANDOM_ITERATOR_R123_USE_MULHILO16_ASM 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO32_ASM)
#  define RANDOM_ITERATOR_R123_USE_MULHILO32_ASM 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_ASM)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN)
#  ifdef _M_X64
#    define RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN 1
#  else
#    define RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN 0
#  endif
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#if !defined(RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN)
#  define RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif

// Request the UINT64_C-style constant macros from <stdint.h> and verify
// that they actually arrived.
#if !defined(__STDC_CONSTANT_MACROS)
#  define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#if !defined(UINT64_C)
#  error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif

// Compiler warnings 4244 and 4996 are disabled from this point on.
#pragma warning(disable:4244)
#pragma warning(disable:4996)

// Maintenance note: anything added here must also be added to all the
// other XXfeatures.hpp files and to ../ut_features.cpp
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __r123_nvcc_features_dot_h__
#define __r123_nvcc_features_dot_h__
#if !defined(CUDART_VERSION)
#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined"
#endif
#if CUDART_VERSION < 4010
#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h"
// This test was added in Random123-1.08 (August, 2013) because we
// discovered that Ftype(maxTvalue<T>()) with Ftype=double and
// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and
// earlier. We can't be sure this bug doesn't also affect invocations
// of other templated functions, e.g., essentially all of Random123.
// Thus, we no longer trust CUDA versions earlier than 4.1 even though
// we had previously tested and timed Random123 with CUDA 3.x and 4.0.
// If you feel lucky or desperate, you can change #error to #warning, but
// please take extra care to be sure that you are getting correct
// results.
#endif
// nvcc falls through to gcc or msvc. So first define
// a couple of things and then include either gccfeatures.h
// or msvcfeatures.h
//#ifdef __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled
//for both device and host functions in CUDA by setting compiler flags
//for the device function
#ifdef __CUDA_ARCH__
#ifndef RANDOM_ITERATOR_R123_CUDA_DEVICE
#define RANDOM_ITERATOR_R123_CUDA_DEVICE __device__
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN 1
#endif
#ifndef RANDOM_ITERATOR_R123_THROW
// No exceptions in CUDA, at least upto 4.0
#define RANDOM_ITERATOR_R123_THROW(x) RANDOM_ITERATOR_R123_ASSERT(0)
#endif
#ifndef RANDOM_ITERATOR_R123_ASSERT
#define RANDOM_ITERATOR_R123_ASSERT(x) if((x)) ; else asm("trap;")
#endif
#else // ! __CUDA_ARCH__
// If we're using nvcc not compiling for the CUDA architecture,
// then we must be compiling for the host. In that case,
// tell the philox code to use the mulhilo64 asm because
// nvcc doesn't grok uint128_t.
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_ASM
#define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM 1
#endif
#endif // __CUDA_ARCH__
#ifndef RANDOM_ITERATOR_R123_BUILTIN_EXPECT
#define RANDOM_ITERATOR_R123_BUILTIN_EXPECT(expr,likely) expr
#endif
#ifndef RANDOM_ITERATOR_R123_USE_AES_NI
#define RANDOM_ITERATOR_R123_USE_AES_NI 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE4_2
#define RANDOM_ITERATOR_R123_USE_SSE4_2 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE4_1
#define RANDOM_ITERATOR_R123_USE_SSE4_1 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE
#define RANDOM_ITERATOR_R123_USE_SSE 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_GNU_UINT128
#define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#endif
#ifndef RANDOM_ITERATOR_R123_ULONG_LONG
// uint64_t, which is what we'd get without this, is
// not the same as unsigned long long
#define RANDOM_ITERATOR_R123_ULONG_LONG unsigned long long
#endif
#if defined(__GNUC__)
#include "gccfeatures.h"
#elif defined(_MSC_FULL_VER)
#include "msvcfeatures.h"
#endif
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Feature defaults for the Open64 compiler.  Only two settings are chosen
   here; each is skipped if the macro was already defined by the includer.
   Everything else is taken from gccfeatures.h, included at the end. */
#ifndef __open64features_dot_hpp
#define __open64features_dot_hpp
/* The gcc features are mostly right. We just override a few and then include gccfeatures.h */
/* Open64 4.2.3 and 4.2.4 accept the __uint128_t code without complaint
but produce incorrect code for 64-bit philox. The MULHILO64_ASM
seems to work fine */
#ifndef RANDOM_ITERATOR_R123_USE_GNU_UINT128
#define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_ASM
#define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM 1
#endif
/* Must come after the overrides above so that they are already defined
   when gccfeatures.h is processed. */
#include "gccfeatures.h"
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Feature defaults for OpenCL builds.  Every macro is wrapped in #ifndef, so
// a definition supplied before this header is included takes precedence.
#ifndef __openclfeatures_dot_hpp
#define __openclfeatures_dot_hpp
// Qualifier placed in front of the library's helper functions.
#ifndef RANDOM_ITERATOR_R123_STATIC_INLINE
#define RANDOM_ITERATOR_R123_STATIC_INLINE inline
#endif
// Wraps a declaration and appends the always_inline attribute to it.
#ifndef RANDOM_ITERATOR_R123_FORCE_INLINE
#define RANDOM_ITERATOR_R123_FORCE_INLINE(decl) decl __attribute__((always_inline))
#endif
// Expands to nothing in this configuration.
#ifndef RANDOM_ITERATOR_R123_CUDA_DEVICE
#define RANDOM_ITERATOR_R123_CUDA_DEVICE
#endif
// Assertions are compiled out: the macro discards its argument.
#ifndef RANDOM_ITERATOR_R123_ASSERT
#define RANDOM_ITERATOR_R123_ASSERT(x)
#endif
// Branch-prediction hint.  In this configuration the hint ("likely") is
// dropped and the macro yields just the condition.  The expansion is
// parenthesized so that it behaves as a single operand wherever the macro is
// used, e.g. !MACRO(a == b, 0) negates the whole comparison rather than "a".
#ifndef RANDOM_ITERATOR_R123_BUILTIN_EXPECT
#define RANDOM_ITERATOR_R123_BUILTIN_EXPECT(expr,likely) (expr)
#endif
// Implementation switches: 0 disables a code path, 1 enables it.  Of the
// 64-bit multiply-high strategies listed here only the OpenCL one is enabled
// by default.
#ifndef RANDOM_ITERATOR_R123_USE_GNU_UINT128
#define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_ASM
#define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN 1
#endif
#ifndef RANDOM_ITERATOR_R123_USE_AES_NI
#define RANDOM_ITERATOR_R123_USE_AES_NI 0
#endif
// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of
// ulong to mul_hi. And gets lots of complaints from stdint.h
// on some machines.
// But these typedefs mean we cannot include stdint.h with
// these headers? Do we need RANDOM_ITERATOR_R123_64T, RANDOM_ITERATOR_R123_32T, RANDOM_ITERATOR_R123_8T?
// The <stdint.h> names used by the rest of the library are supplied here as
// aliases of ulong, uint and uchar instead of by including <stdint.h>.
typedef ulong uint64_t;
typedef uint uint32_t;
typedef uchar uint8_t;
// Stand-in for the <stdint.h> macro: pastes a UL suffix onto the literal and
// casts the result to ulong.
#define UINT64_C(x) ((ulong)(x##UL))
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Copyright (c) 2013, Los Alamos National Security, LLC
All rights reserved.
Copyright 2013. Los Alamos National Security, LLC. This software was produced
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
the U.S. Department of Energy. The U.S. Government has rights to use,
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
to produce derivative works, such modified software should be clearly marked,
so as not to confuse it with the version available from LANL.
*/
/* Feature defaults for the PGI compiler.  Every setting is wrapped in
   #ifndef, so a value defined before this header is included (for example
   with -D on the command line) takes precedence over the default here. */
#ifndef __pgccfeatures_dot_hpp
#define __pgccfeatures_dot_hpp
/* Stop the build on non-x86 targets.  Three independent mechanisms are used
   (an #error, an #include of a file that does not exist, and an unbalanced
   brace) in case a compiler carries on after one of them. */
#if !defined(__x86_64__) && !defined(__i386__)
# error "This code has only been tested on x86 platforms."
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
{ /* maybe an unbalanced brace will terminate the compilation */
/* Feel free to try the Random123 library on other architectures by changing
the conditions that reach this error, but you should consider it a
porting exercise and expect to encounter bugs and deficiencies.
Please let the authors know of any successes (or failures). */
#endif
/* Qualifier placed in front of the library's helper functions. */
#ifndef RANDOM_ITERATOR_R123_STATIC_INLINE
#define RANDOM_ITERATOR_R123_STATIC_INLINE static inline
#endif
/* Found this example in PGI's emmintrin.h. */
#ifndef RANDOM_ITERATOR_R123_FORCE_INLINE
#define RANDOM_ITERATOR_R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
#endif
/* Expands to nothing in this configuration. */
#ifndef RANDOM_ITERATOR_R123_CUDA_DEVICE
#define RANDOM_ITERATOR_R123_CUDA_DEVICE
#endif
/* Assertions map onto the standard assert() macro. */
#ifndef RANDOM_ITERATOR_R123_ASSERT
#include <assert.h>
#define RANDOM_ITERATOR_R123_ASSERT(x) assert(x)
#endif
/* The branch hint is dropped: the macro yields the (parenthesized)
   condition and ignores "likely". */
#ifndef RANDOM_ITERATOR_R123_BUILTIN_EXPECT
#define RANDOM_ITERATOR_R123_BUILTIN_EXPECT(expr,likely) (expr)
#endif
/* PGI through 13.2 doesn't appear to support AES-NI. */
#ifndef RANDOM_ITERATOR_R123_USE_AES_NI
#define RANDOM_ITERATOR_R123_USE_AES_NI 0
#endif
/* PGI through 13.2 appears to support MMX, SSE, SSE2, SSE3, SSSE3, SSE4a, and
ABM, but not SSE4.1 or SSE4.2. */
#ifndef RANDOM_ITERATOR_R123_USE_SSE4_2
#define RANDOM_ITERATOR_R123_USE_SSE4_2 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE4_1
#define RANDOM_ITERATOR_R123_USE_SSE4_1 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE
/* There's no point in trying to compile SSE code in Random123
unless SSE2 is available. */
#ifdef __SSE2__
#define RANDOM_ITERATOR_R123_USE_SSE 1
#else
#define RANDOM_ITERATOR_R123_USE_SSE 0
#endif
#endif
#ifndef RANDOM_ITERATOR_R123_USE_AES_OPENSSL
/* There isn't really a good way to tell at compile time whether
openssl is available. Without a pre-compilation configure-like
tool, it's less error-prone to guess that it isn't available. Add
-DRANDOM_ITERATOR_R123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
play with openssl */
#define RANDOM_ITERATOR_R123_USE_AES_OPENSSL 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_GNU_UINT128
#define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#endif
/* GNU-style inline assembly is enabled; the MSVC cpuid intrinsic is not. */
#ifndef RANDOM_ITERATOR_R123_USE_ASM_GNU
#define RANDOM_ITERATOR_R123_USE_ASM_GNU 1
#endif
#ifndef RANDOM_ITERATOR_R123_USE_CPUID_MSVC
#define RANDOM_ITERATOR_R123_USE_CPUID_MSVC 0
#endif
/* Which intrinsic headers to #include: 1 includes the header, 0 omits it. */
#ifndef RANDOM_ITERATOR_R123_USE_X86INTRIN_H
#define RANDOM_ITERATOR_R123_USE_X86INTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_IA32INTRIN_H
#define RANDOM_ITERATOR_R123_USE_IA32INTRIN_H 0
#endif
/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time
about undefined references to _mm_castsi128_ps(__m128i). Why? */
#ifndef RANDOM_ITERATOR_R123_USE_XMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_XMMINTRIN_H 1
#endif
#ifndef RANDOM_ITERATOR_R123_USE_EMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_EMMINTRIN_H 1
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_SMMINTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_WMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_WMMINTRIN_H 0
#endif
/* intrin.h is requested only when the compiler advertises ABM. */
#ifndef RANDOM_ITERATOR_R123_USE_INTRIN_H
#ifdef __ABM__
#define RANDOM_ITERATOR_R123_USE_INTRIN_H 1
#else
#define RANDOM_ITERATOR_R123_USE_INTRIN_H 0
#endif
#endif
/* Multiply-high strategy switches.  Of the 64-bit variants only the
   inline-assembly one is enabled by default. */
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO32_ASM
#define RANDOM_ITERATOR_R123_USE_MULHILO32_ASM 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_ASM
#define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM 1
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
/* __STDC_CONSTANT_MACROS is defined before <stdint.h> is included so that
   UINT64_C is provided; the check below fails the build if it still is not. */
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
/* If you add something, it must go in all the other XXfeatures.hpp
and in ../ut_features.cpp */
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _Random123_sse_dot_h__
#define _Random123_sse_dot_h__
#if RANDOM_ITERATOR_R123_USE_SSE
#if RANDOM_ITERATOR_R123_USE_X86INTRIN_H
#include <x86intrin.h>
#endif
#if RANDOM_ITERATOR_R123_USE_IA32INTRIN_H
#include <ia32intrin.h>
#endif
#if RANDOM_ITERATOR_R123_USE_XMMINTRIN_H
#include <xmmintrin.h>
#endif
#if RANDOM_ITERATOR_R123_USE_EMMINTRIN_H
#include <emmintrin.h>
#endif
#if RANDOM_ITERATOR_R123_USE_SMMINTRIN_H
#include <smmintrin.h>
#endif
#if RANDOM_ITERATOR_R123_USE_WMMINTRIN_H
#include <wmmintrin.h>
#endif
#if RANDOM_ITERATOR_R123_USE_INTRIN_H
#include <intrin.h>
#endif
#ifdef __cplusplus
#include <iostream>
#include <limits>
#include <stdexcept>
#endif
/* haveAESNI(): returns 1 if CPUID leaf 1 reports bit 25 of ECX set (the
   AES instruction feature flag) and 0 otherwise.  One of three
   implementations is selected by the RANDOM_ITERATOR_R123_USE_* macros. */
#if RANDOM_ITERATOR_R123_USE_ASM_GNU
/* Bit 25 of ECX tells us whether AES is enabled. */
RANDOM_ITERATOR_R123_STATIC_INLINE int haveAESNI(){
unsigned int eax, ebx, ecx, edx;
/* Execute CPUID with EAX=1; all four result registers are declared as
   outputs, although only ECX is examined. */
__asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
"a" (1));
return (ecx>>25) & 1;
}
#elif RANDOM_ITERATOR_R123_USE_CPUID_MSVC
RANDOM_ITERATOR_R123_STATIC_INLINE int haveAESNI(){
/* __cpuid fills the array with the four result registers; index 2 is the
   one tested for bit 25 here. */
int CPUInfo[4];
__cpuid(CPUInfo, 1);
return (CPUInfo[2]>>25)&1;
}
#else /* RANDOM_ITERATOR_R123_USE_CPUID_??? */
/* No CPUID method is configured: warn at compile time and report "no AES". */
#warning "No RANDOM_ITERATOR_R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
RANDOM_ITERATOR_R123_STATIC_INLINE int haveAESNI(){
return 0;
}
#endif /* RANDOM_ITERATOR_R123_USE_ASM_GNU || RANDOM_ITERATOR_R123_USE_CPUID_MSVC */
// There is a lot of annoying and inexplicable variation in the
// SSE intrinsics available in different compilation environments.
// The details seem to depend on the compiler, the version and
// the target architecture. Rather than insisting on
// RANDOM_ITERATOR_R123_USE_feature tests for each of these in each of the
// compilerfeatures.h files we just keep the complexity localized
// to here...
#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
/* Substitute for the _mm_set_epi64x intrinsic on compilers that lack it:
   builds an __m128i from two 64-bit words (v1 is the high word, v0 the low
   word) by splitting each into 32-bit halves and handing the four halves to
   the 4x32-bit intrinsic _mm_set_epi32.  N.B. It looks like Intel added
   _mm_set_epi64x to icc version 12.1 in Jan 2012.
*/
RANDOM_ITERATOR_R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
    /* Each union views one 64-bit input as a pair of 32-bit halves. */
    union{
        uint32_t half[2];
        uint64_t whole;
    } hi, lo;
    hi.whole = v1;
    lo.whole = v0;
    /* _mm_set_epi32 takes its arguments from the most significant lane down. */
    return _mm_set_epi32(hi.half[1], hi.half[0], lo.half[1], lo.half[0]);
}
#endif
/* _mm_extract_lo64 returns the low 64-bit word of an __m128i.

   Where it is usable, the _mm_cvtsi128_si64 intrinsic does the job on
   64-bit platforms.  It is avoided in three situations:
     - 32-bit platforms, which have no MOVQ and hence no intrinsic;
     - MSVC and Open64, which fail assertions in ut_M128.cpp and
       ut_carray.cpp when the intrinsic is used (see
       https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
   In those cases the vector is stored to memory and the low word read back.
   Finally, even where the intrinsic exists, it may be spelled with or
   without a trailing 'x', so the remaining compilers are split by spelling.
*/
#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
RANDOM_ITERATOR_R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
    /* Spill the vector into a union and read its first 64-bit element. */
    union{
        __m128i vec;
        uint64_t words[2];
    } spill;
    _mm_store_si128(&spill.vec, si);
    return spill.words[0];
}
#elif defined(__llvm__) || defined(__ICC)
RANDOM_ITERATOR_R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
    uint64_t low = (uint64_t)_mm_cvtsi128_si64(si);
    return low;
}
#else /* GNUC, others */
/* FWIW, gcc's emmintrin.h has had the 'x' spelling
   since at least gcc-3.4.4. The no-'x' spelling showed up
   around 4.2. */
RANDOM_ITERATOR_R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
    uint64_t low = (uint64_t)_mm_cvtsi128_si64x(si);
    return low;
}
#endif
#if defined(__GNUC__) && __GNUC__ < 4
/* the cast builtins showed up in gcc4. */
/* Fallback for older gcc: converts an __m128i to an __m128 with a plain
   cast expression in place of the missing intrinsic. */
RANDOM_ITERATOR_R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
return (__m128)si;
}
#endif
#ifdef __cplusplus
// Wrapper around a single __m128i that adds assignment, a truth test and a
// conversion back to __m128i.  Constructors are provided only when
// RANDOM_ITERATOR_R123_USE_CXX11_UNRESTRICTED_UNIONS is set (see below).
struct r123m128i{
__m128i m;
#if RANDOM_ITERATOR_R123_USE_CXX11_UNRESTRICTED_UNIONS
// C++98 forbids a union member from having *any* constructors.
// C++11 relaxes this, and allows union members to have constructors
// as long as there is a "trivial" default constructor. So in C++11
// we can provide a r123m128i constructor with an __m128i argument, and still
// have the default (and hence trivial) default constructor.
r123m128i() = default;
r123m128i(__m128i _m): m(_m){}
#endif
// Assign from a raw vector.
r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
// Assign from an integer: n is placed in the low 64 bits and the high
// 64 bits are set to zero.
r123m128i& operator=(RANDOM_ITERATOR_R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
#if RANDOM_ITERATOR_R123_USE_CXX11_EXPLICIT_CONVERSIONS
// With C++11 we can attach explicit to the bool conversion operator
// to disambiguate undesired promotions. For g++, this works
// only in 4.5 and above.
explicit operator bool() const {return _bool();}
#else
// Pre-C++11, we have to do something else. Google for the "safe bool"
// idiom for other ideas...
// Yields a non-null pointer (this) when the value is nonzero, null otherwise.
operator const void*() const{return _bool()?this:0;}
#endif
// Implicit conversion back to the underlying vector.
operator __m128i() const {return m;}
private:
// _bool(): true iff at least one bit of m is set.
#if RANDOM_ITERATOR_R123_USE_SSE4_1
// _mm_testz_si128(m,m) is nonzero exactly when m AND m is all zero.
bool _bool() const{ return !_mm_testz_si128(m,m); }
#else
// Compare each 32-bit lane with zero; a movemask of 0xf means all four
// lanes matched, i.e. the whole value is zero.
bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
#endif
};
// Pre-increment: treats v as a 128-bit counter and adds one to it.
// The low 64-bit lane is incremented first; if it wrapped around to zero the
// carry is propagated by adding one to the high 64-bit lane.  Returns v.
RANDOM_ITERATOR_R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
    v.m = _mm_add_epi64(v.m, _mm_set_epi64x(RANDOM_ITERATOR_R123_64BIT(0), RANDOM_ITERATOR_R123_64BIT(1)));
#if RANDOM_ITERATOR_R123_USE_SSE4_1
    // Mask selecting only the low 64 bits; testz is nonzero iff they are all zero.
    __m128i low_lane_mask = _mm_set_epi64x(0, ~(RANDOM_ITERATOR_R123_64BIT(0)));
    if( RANDOM_ITERATOR_R123_BUILTIN_EXPECT(_mm_testz_si128(v.m,low_lane_mask), 0) ){
        v.m = _mm_add_epi64(v.m, _mm_set_epi64x(RANDOM_ITERATOR_R123_64BIT(1), RANDOM_ITERATOR_R123_64BIT(0)));
    }
#else
    // One mask bit per 32-bit lane, set when that lane equals zero.
    unsigned zero_lanes = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(v.m, _mm_setzero_si128())));
    // Bits 0 and 1 are both set iff the low 64 bits of the sum are zero,
    // which is exactly the case in which the increment overflowed.
    if( RANDOM_ITERATOR_R123_BUILTIN_EXPECT((zero_lanes&0x3) == 0x3, 0) ){
        v.m = _mm_add_epi64(v.m, _mm_set_epi64x(1,0));
    }
#endif
    return v;
}
// Adds the unsigned 64-bit value n to lhs, treating lhs as a 128-bit counter.
// _mm_add_epi64 adds lane by lane with no carry between lanes, so the carry
// out of the low lane is detected and applied by hand.  Returns lhs.
RANDOM_ITERATOR_R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, RANDOM_ITERATOR_R123_ULONG_LONG n){
    __m128i sum = _mm_add_epi64(lhs.m, _mm_set_epi64x(0, n));
    // Unsigned wrap-around test: the low lane overflowed iff the result is
    // smaller than the addend.  The value is kept unsigned throughout; no
    // detour through a signed type is needed.
    const uint64_t lo64 = _mm_extract_lo64(sum);
    if(lo64 < n)
        sum = _mm_add_epi64(sum, _mm_set_epi64x(1,0));
    lhs.m = sum;
    return lhs;
}
// Ordering comparisons are deliberately unimplemented: each one throws
// std::runtime_error when called.
//
// This overload has to exist because r123array1xm128i::incr mentions it,
// even though it is never actually used there.
RANDOM_ITERATOR_R123_STATIC_INLINE bool operator<=(RANDOM_ITERATOR_R123_ULONG_LONG, const r123m128i &)
{
    throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");
}
// The remaining four are declared so that an expression such as M1 < M2 hits
// one of these throwing stubs; if they were left out, the compiler would do
// an implicit conversion through void* instead.  Sigh...
RANDOM_ITERATOR_R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&)
{
    throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");
}
RANDOM_ITERATOR_R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&)
{
    throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");
}
RANDOM_ITERATOR_R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&)
{
    throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");
}
RANDOM_ITERATOR_R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&)
{
    throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");
}
// Equality of two wrapped vectors: compares the four 32-bit lanes and
// reports true only when the mask shows every lane equal (0xf).
RANDOM_ITERATOR_R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){
    const __m128i lane_eq = _mm_cmpeq_epi32(lhs, rhs);
    return _mm_movemask_ps(_mm_castsi128_ps(lane_eq)) == 0xf;
}
// Inequality is the negation of the equality test above.
RANDOM_ITERATOR_R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){
    const bool same = (lhs==rhs);
    return !same;
}
// Integer-vs-vector equality: the integer is widened to 128 bits (low 64 bits
// = lhs, high 64 bits = 0) and then compared as a vector.
RANDOM_ITERATOR_R123_STATIC_INLINE bool operator==(RANDOM_ITERATOR_R123_ULONG_LONG lhs, const r123m128i &rhs){
    r123m128i widened;
    widened.m = _mm_set_epi64x(0, lhs);
    return widened == rhs;
}
// Integer-vs-vector inequality, via the integer-vs-vector equality.
RANDOM_ITERATOR_R123_STATIC_INLINE bool operator!=(RANDOM_ITERATOR_R123_ULONG_LONG lhs, const r123m128i &rhs){
    const bool same = (lhs==rhs);
    return !same;
}
// Stream output: writes the two 64-bit words of m, low word first, separated
// by a single space.  Returns the stream.
RANDOM_ITERATOR_R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
    // Spill the vector to memory so its words can be read as integers.
    union{
        __m128i vec;
        uint64_t words[2];
    } spill;
    _mm_storeu_si128(&spill.vec, m.m);
    os << spill.words[0];
    os << " ";
    os << spill.words[1];
    return os;
}
// Stream input: reads two unsigned 64-bit words, low word first (the format
// written by operator<<), and stores them in m.
// If either extraction fails, m is left unchanged and the stream's failure
// state is reported to the caller through the returned stream; m is never
// built from values that were not actually read.
RANDOM_ITERATOR_R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
    uint64_t lo = 0;
    uint64_t hi = 0;
    if(is >> lo >> hi)
        m.m = _mm_set_epi64x(hi, lo);
    return is;
}
template<typename T> inline T assemble_from_u32(uint32_t *p32); // forward declaration
// Specialization for r123m128i: packs the four 32-bit words p32[0..3] into
// one vector, with p32[0] in the least significant lane and p32[3] in the
// most significant one.
template <>
inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
    r123m128i assembled;
    assembled.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
    return assembled;
}
#else
/* Plain-C counterpart of the C++ r123m128i: a struct holding one __m128i,
   without any of the operators. */
typedef struct {
__m128i m;
} r123m128i;
#endif /* __cplusplus */
#else /* !RANDOM_ITERATOR_R123_USE_SSE */
/* Stub used when RANDOM_ITERATOR_R123_USE_SSE is off: always returns 0. */
RANDOM_ITERATOR_R123_STATIC_INLINE int haveAESNI(){
return 0;
}
#endif /* RANDOM_ITERATOR_R123_USE_SSE */
#endif /* _Random123_sse_dot_h__ */
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Feature defaults for the SunPro compiler (going by the include-guard name).
// Every macro is wrapped in #ifndef, so a definition supplied before this
// header is included takes precedence.
#ifndef __sunprofeatures_dot_hpp
#define __sunprofeatures_dot_hpp
// Qualifier placed in front of the library's helper functions.
#ifndef RANDOM_ITERATOR_R123_STATIC_INLINE
#define RANDOM_ITERATOR_R123_STATIC_INLINE static inline
#endif
// No forced inlining here: the declaration is passed through unchanged.
#ifndef RANDOM_ITERATOR_R123_FORCE_INLINE
#define RANDOM_ITERATOR_R123_FORCE_INLINE(decl) decl
#endif
// Expands to nothing in this configuration.
#ifndef RANDOM_ITERATOR_R123_CUDA_DEVICE
#define RANDOM_ITERATOR_R123_CUDA_DEVICE
#endif
// Assertions map onto the standard assert() macro.
#ifndef RANDOM_ITERATOR_R123_ASSERT
#include <assert.h>
#define RANDOM_ITERATOR_R123_ASSERT(x) assert(x)
#endif
// Branch-prediction hint.  In this configuration the hint ("likely") is
// dropped and the macro yields just the condition.  The expansion is
// parenthesized so that it behaves as a single operand wherever the macro is
// used, e.g. !MACRO(a == b, 0) negates the whole comparison rather than "a".
#ifndef RANDOM_ITERATOR_R123_BUILTIN_EXPECT
#define RANDOM_ITERATOR_R123_BUILTIN_EXPECT(expr,likely) (expr)
#endif
// The basic idiom is:
// #ifndef RANDOM_ITERATOR_R123_SOMETHING
// #if some condition
// #define RANDOM_ITERATOR_R123_SOMETHING 1
// #else
// #define RANDOM_ITERATOR_R123_SOMETHING 0
// #endif
// #endif
// This idiom allows an external user to override any decision
// in this file with a command-line -DRANDOM_ITERATOR_R123_SOMETHING=1 or -DRANDOM_ITERATOR_R123_SOMETHING=0
// An alternative idiom is:
// #ifndef RANDOM_ITERATOR_R123_SOMETHING
// #define RANDOM_ITERATOR_R123_SOMETHING (some boolean expression)
// #endif
// where the boolean expression might contain previously-defined RANDOM_ITERATOR_R123_SOMETHING_ELSE
// pp-symbols.
//
// Every switch below defaults to 0 (disabled) in this configuration.
// Instruction-set and library features:
#ifndef RANDOM_ITERATOR_R123_USE_AES_NI
#define RANDOM_ITERATOR_R123_USE_AES_NI 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE4_2
#define RANDOM_ITERATOR_R123_USE_SSE4_2 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE4_1
#define RANDOM_ITERATOR_R123_USE_SSE4_1 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE
#define RANDOM_ITERATOR_R123_USE_SSE 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_AES_OPENSSL
#define RANDOM_ITERATOR_R123_USE_AES_OPENSSL 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_GNU_UINT128
#define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#endif
// Inline-assembly / cpuid mechanisms:
#ifndef RANDOM_ITERATOR_R123_USE_ASM_GNU
#define RANDOM_ITERATOR_R123_USE_ASM_GNU 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_CPUID_MSVC
#define RANDOM_ITERATOR_R123_USE_CPUID_MSVC 0
#endif
// Intrinsic headers to #include (none by default):
#ifndef RANDOM_ITERATOR_R123_USE_X86INTRIN_H
#define RANDOM_ITERATOR_R123_USE_X86INTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_IA32INTRIN_H
#define RANDOM_ITERATOR_R123_USE_IA32INTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_XMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_XMMINTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_EMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_EMMINTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_SMMINTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_WMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_WMMINTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_INTRIN_H
#define RANDOM_ITERATOR_R123_USE_INTRIN_H 0
#endif
// Multiply-high strategies:
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO16_ASM
#define RANDOM_ITERATOR_R123_USE_MULHILO16_ASM 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO32_ASM
#define RANDOM_ITERATOR_R123_USE_MULHILO32_ASM 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_ASM
#define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_PHILOX_64BIT
#define RANDOM_ITERATOR_R123_USE_PHILOX_64BIT 0
#endif
// __STDC_CONSTANT_MACROS is defined before <stdint.h> is included so that
// UINT64_C is provided; the check below fails the build if it still is not.
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
// If you add something, it must go in all the other XXfeatures.hpp
// and in ../ut_features.cpp
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Copyright (c) 2013, Los Alamos National Security, LLC
All rights reserved.
Copyright 2013. Los Alamos National Security, LLC. This software was produced
under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
Laboratory (LANL), which is operated by Los Alamos National Security, LLC for
the U.S. Department of Energy. The U.S. Government has rights to use,
reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
to produce derivative works, such modified software should be clearly marked,
so as not to confuse it with the version available from LANL.
*/
/* Feature defaults for the IBM xlc / xlc++ compilers.  Every setting is
   wrapped in #ifndef, so a value defined before this header is included
   takes precedence over the default chosen here. */
#ifndef __xlcfeatures_dot_hpp
#define __xlcfeatures_dot_hpp
/* Stop the build on targets other than x86 and PowerPC.  Three independent
   mechanisms are used (an #error, an #include of a file that does not exist,
   and an unbalanced brace) in case a compiler carries on after one of them. */
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__)
# error "This code has only been tested on x86 and PowerPC platforms."
#include <including_a_nonexistent_file_will_stop_some_compilers_from_continuing_with_a_hopeless_task>
{ /* maybe an unbalanced brace will terminate the compilation */
/* Feel free to try the Random123 library on other architectures by changing
the conditions that reach this error, but you should consider it a
porting exercise and expect to encounter bugs and deficiencies.
Please let the authors know of any successes (or failures). */
#endif
#ifdef __cplusplus
/* builtins are automatically available to xlc. To use them with xlc++,
one must include builtins.h. c.f
http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html
*/
#include <builtins.h>
#endif
/* Qualifier placed in front of the library's helper functions. */
#ifndef RANDOM_ITERATOR_R123_STATIC_INLINE
#define RANDOM_ITERATOR_R123_STATIC_INLINE static inline
#endif
/* Wraps a declaration and appends the always-inline attribute to it. */
#ifndef RANDOM_ITERATOR_R123_FORCE_INLINE
#define RANDOM_ITERATOR_R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__))
#endif
/* Expands to nothing in this configuration. */
#ifndef RANDOM_ITERATOR_R123_CUDA_DEVICE
#define RANDOM_ITERATOR_R123_CUDA_DEVICE
#endif
/* Assertions map onto the standard assert() macro. */
#ifndef RANDOM_ITERATOR_R123_ASSERT
#include <assert.h>
#define RANDOM_ITERATOR_R123_ASSERT(x) assert(x)
#endif
/* Branch hints are forwarded to __builtin_expect. */
#ifndef RANDOM_ITERATOR_R123_BUILTIN_EXPECT
#define RANDOM_ITERATOR_R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely)
#endif
/* AES-NI and the SSE code paths are disabled by default. */
#ifndef RANDOM_ITERATOR_R123_USE_AES_NI
#define RANDOM_ITERATOR_R123_USE_AES_NI 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE4_2
#define RANDOM_ITERATOR_R123_USE_SSE4_2 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE4_1
#define RANDOM_ITERATOR_R123_USE_SSE4_1 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SSE
#define RANDOM_ITERATOR_R123_USE_SSE 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_AES_OPENSSL
/* There isn't really a good way to tell at compile time whether
openssl is available. Without a pre-compilation configure-like
tool, it's less error-prone to guess that it isn't available. Add
-DRANDOM_ITERATOR_R123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to
play with openssl */
#define RANDOM_ITERATOR_R123_USE_AES_OPENSSL 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_GNU_UINT128
#define RANDOM_ITERATOR_R123_USE_GNU_UINT128 0
#endif
/* GNU-style inline assembly is enabled; the MSVC cpuid intrinsic is not. */
#ifndef RANDOM_ITERATOR_R123_USE_ASM_GNU
#define RANDOM_ITERATOR_R123_USE_ASM_GNU 1
#endif
#ifndef RANDOM_ITERATOR_R123_USE_CPUID_MSVC
#define RANDOM_ITERATOR_R123_USE_CPUID_MSVC 0
#endif
/* Which intrinsic headers to #include: 1 includes the header, 0 omits it. */
#ifndef RANDOM_ITERATOR_R123_USE_X86INTRIN_H
#define RANDOM_ITERATOR_R123_USE_X86INTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_IA32INTRIN_H
#define RANDOM_ITERATOR_R123_USE_IA32INTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_XMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_XMMINTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_EMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_EMMINTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_SMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_SMMINTRIN_H 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_WMMINTRIN_H
#define RANDOM_ITERATOR_R123_USE_WMMINTRIN_H 0
#endif
/* intrin.h is requested only when the compiler advertises ABM. */
#ifndef RANDOM_ITERATOR_R123_USE_INTRIN_H
#ifdef __ABM__
#define RANDOM_ITERATOR_R123_USE_INTRIN_H 1
#else
#define RANDOM_ITERATOR_R123_USE_INTRIN_H 0
#endif
#endif
/* Multiply-high strategy switches. */
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO32_ASM
#define RANDOM_ITERATOR_R123_USE_MULHILO32_ASM 0
#endif
/* The 64-bit multiply-high intrinsic is enabled only on 64-bit PowerPC. */
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN
#if (defined(__powerpc64__))
#define RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN 1
#else
#define RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN 0
#endif
#endif
/* Name of the intrinsic used when the 64-bit switch above is on. */
#ifndef RANDOM_ITERATOR_R123_MULHILO64_MULHI_INTRIN
#define RANDOM_ITERATOR_R123_MULHILO64_MULHI_INTRIN __mulhdu
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO32_MULHI_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO32_MULHI_INTRIN 0
#endif
/* Name of the intrinsic used if the 32-bit switch above is turned on. */
#ifndef RANDOM_ITERATOR_R123_MULHILO32_MULHI_INTRIN
#define RANDOM_ITERATOR_R123_MULHILO32_MULHI_INTRIN __mulhwu
#endif
/* The inline-assembly 64-bit multiply is used only on 64-bit PowerPC, and
   only when the intrinsic variant is not selected.  The literal 1 / 0 stands
   in for defined(__powerpc64__), which is resolved by the enclosing #if. */
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_ASM
#if defined(__powerpc64__)
#define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM (1 /*defined(__powerpc64__)*/ && !(RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN))
#else
#define RANDOM_ITERATOR_R123_USE_MULHILO64_ASM (0 /*defined(__powerpc64__)*/ && !(RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN))
#endif
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN 0
#endif
#ifndef RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN
#define RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN 0
#endif
/* __STDC_CONSTANT_MACROS is defined before <stdint.h> is included so that
   UINT64_C is provided; the check below fails the build if it still is not. */
#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#include <stdint.h>
#ifndef UINT64_C
#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include <stdint.h>
#endif
/* If you add something, it must go in all the other XXfeatures.hpp
and in ../ut_features.cpp */
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __r123_gslmicrorng_dot_h__
#define __r123_gslmicrorng_dot_h__
#include <gsl/gsl_rng.h>
#include <string.h>
/** The macro: GSL_MICRORNG(NAME, CBRNGNAME) is the GSL
analog of the C++ r123::MicroURNG template. It declares a gsl_rng
type named gsl_rng_NAME which uses the underlying CBRNGNAME
and can be invoked a limited number of times between calls to NAME_reset.
When the underlying CBRNG's \c ctr_t is an \ref arrayNxW "r123arrayNxW",
the gsl_rng_NAME may be called up to \c N*2^32 times
between calls to \c NAME_reset.
\c NAME_reset takes a gsl_rng_NAME type, a counter and a key as arguments.
It restarts the micro-rng with a new base counter and key.
Note that you must call NAME_reset before the first use
of a gsl_rng. NAME_reset is not called automatically by
gsl_rng_alloc().
@code
#include <Random123/threefry.h>
#include <Random123/gsl_microrng.h> // this file
GSL_MICRORNG(microcbrng, threefry4x64); // creates gsl_rng_microcbrng
int main(int argc, char** argv) {
gsl_rng *r = gsl_rng_alloc(gsl_rng_microcbrng);
threefry4x64_ctr_t c = {{}};
threefry4x64_key_t k = {{}};
for (...) {
c.v[0] = ??; // some application variable
microcbrng_reset(r, c, k);
for (...) {
// gaussian calls r several times. It is safe for
// r to be used up to N*2^32 times (N=4 for threefry4x64) in this loop
something[i] = gsl_ran_gaussian(r, 1.5);
}
}
}
@endcode
*/
#define GSL_MICRORNG(NAME, CBRNGNAME) \
/* Forward declaration of the generator-type handle.  It is marked extern so \
   that it is a pure declaration in both C and C++ (C++ has no tentative \
   definitions); the handle is defined on the last line of this macro. */ \
extern const gsl_rng_type *gsl_rng_##NAME; \
\
/* Per-generator state stored inside the gsl_rng object. */ \
typedef struct{ \
    CBRNGNAME##_ctr_t ctr;  /* base counter installed by NAME##_reset */ \
    CBRNGNAME##_ctr_t r;    /* most recent block of CBRNG output */ \
    CBRNGNAME##_key_t key;  /* key installed by NAME##_reset */ \
    RANDOM_ITERATOR_R123_ULONG_LONG n; /* blocks generated since the last reset */ \
    int elem;               /* words of r that have not been handed out yet */ \
} NAME##_state; \
\
/* Return the next 32-bit value.  When the current block is used up, a new \
   one is produced from the base counter with n OR-ed into the top 32 bits \
   of its last word (bits already set there by the caller are not cleared). \
   Words of a block are handed out from the last one to the first. */ \
static unsigned long int NAME##_get(void *vstate){ \
    NAME##_state *st = (NAME##_state *)vstate; \
    const int N=sizeof(st->ctr.v)/sizeof(st->ctr.v[0]); \
    if( st->elem == 0 ){ \
        CBRNGNAME##_ctr_t c = st->ctr; \
        c.v[N-1] |= st->n<<(RANDOM_ITERATOR_R123_W(CBRNGNAME##_ctr_t)-32); \
        st->n++; \
        st->r = CBRNGNAME(c, st->key); \
        st->elem = N; \
    } \
    return 0xffffffff & st->r.v[--st->elem]; \
} \
\
/* Return the next value scaled by 2^-32, i.e. in [0, 1). */ \
static double \
NAME##_get_double (void * vstate) \
{ \
    return NAME##_get (vstate)/4294967296.; \
} \
\
/* Seeding hook of the gsl_rng_type table; the seed itself is not used. */ \
static void NAME##_set(void *vstate, unsigned long int s){ \
    NAME##_state *st = (NAME##_state *)vstate; \
    (void)s; /* ignored */ \
    st->elem = 0; \
    /* Every bit of n is set here.  NAME##_reset must be called before \
       numbers are drawn; NAME##_get does not test for this value, so \
       using the generator without a reset is NOT diagnosed. */ \
    st->n = ~0; \
} \
\
/* Generator description: name, largest and smallest value returned by \
   NAME##_get, state size, and the three callbacks defined above. */ \
static const gsl_rng_type NAME##_type = { \
    #NAME, \
    0xffffffffUL, \
    0, \
    sizeof(NAME##_state), \
    &NAME##_set, \
    &NAME##_get, \
    &NAME##_get_double \
}; \
\
/* Restart the micro-rng: install a new base counter and key, zero the \
   block count and discard any buffered output. */ \
RANDOM_ITERATOR_R123_STATIC_INLINE void NAME##_reset(const gsl_rng* gr, CBRNGNAME##_ctr_t c, CBRNGNAME##_key_t k) { \
    NAME##_state* state = (NAME##_state *)gr->state; \
    state->ctr = c; \
    state->key = k; \
    state->n = 0; \
    state->elem = 0; \
} \
\
const gsl_rng_type *gsl_rng_##NAME = &NAME##_type
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _philox_dot_h_
#define _philox_dot_h_
/** \cond HIDDEN_FROM_DOXYGEN */
#include "features/compilerfeatures.h"
#include "array.h"
/*
// Macros _Foo_tpl are code generation 'templates'. They define
// inline functions with names obtained by mangling Foo and the
// macro arguments. E.g.,
// _mulhilo_dword_tpl(32, uint32_t, uint64_t)
// expands to a definition of:
// uint32_t mulhilo32(uint32_t, uint32_t, uint32_t *)
// We then 'instantiate the template' to define
// several different functions, e.g.,
// mulhilo32
// mulhilo64
// These functions will be visible to user code, and may
// also be used later in subsequent templates and definitions.
// A template for mulhilo using a temporary of twice the word-width.
// Gcc figures out that this can be reduced to a single 'mul' instruction,
// despite the apparent use of double-wide variables, shifts, etc. It's
// obviously not guaranteed that all compilers will be that smart, so
// other implementations might be preferable, e.g., using an intrinsic
// or an asm block. On the other hand, for 32-bit multiplies,
// this *is* perfectly standard C99 - any C99 compiler should
// understand it and produce correct code. For 64-bit multiplies,
// it's only usable if the compiler recognizes that it can do
// arithmetic on a 128-bit type. That happens to be true for gcc on
// x86-64, and powerpc64 but not much else.
*/
/* Emits mulhilo<W>(a, b, hip): multiplies two W-bit words in the double-width
   type Dword, stores the upper W bits of the product through hip and
   returns the lower W bits. */
#define _mulhilo_dword_tpl(W, Word, Dword) \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
    const Dword wide_a = (Dword)a; \
    const Dword wide_b = (Dword)b; \
    const Dword wide_product = wide_a * wide_b; \
    *hip = wide_product >> W; \
    return (Word)wide_product; \
}
/*
// A template for mulhilo using gnu-style asm syntax.
// INSN can be "mulw", "mull" or "mulq".
// (This file instantiates it with "mull"/"mulq" for the generic variant
// and with "mulhwu"/"mulhdu" for the __powerpc__ variant.)
// FIXME - porting to other architectures, we'll need still-more conditional
// branching here. Note that intrinsics are usually preferable.
*/
#ifdef __powerpc__
/* powerpc variant: INSN takes both factors in general registers and writes
   its result to dx, which is stored through hip; the value returned is
   computed separately as the plain C product ax*b. */
#define _mulhilo_asm_tpl(W, Word, INSN) \
RANDOM_ITERATOR_R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
Word dx = 0; \
__asm__("\n\t" \
INSN " %0,%1,%2\n\t" \
: "=r"(dx) \
: "r"(b), "r"(ax) \
); \
*hip = dx; \
return ax*b; \
}
#else
/* generic variant: INSN is given the single operand b.  ax is tied to the
   first output (constraint "0") and comes back through "=a", dx comes back
   through "=d"; dx is stored through hip and ax is returned. */
#define _mulhilo_asm_tpl(W, Word, INSN) \
RANDOM_ITERATOR_R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
Word dx; \
__asm__("\n\t" \
INSN " %2\n\t" \
: "=a"(ax), "=d"(dx) \
: "r"(b), "0"(ax) \
); \
*hip = dx; \
return ax; \
}
#endif /* __powerpc__ */
/*
// mulhilo built on an MSVC-style intrinsic: INTRIN(a, b, hip) is expected
// to store the high word through hip and to return the low word.
// _umul128 is such an intrinsic, c.f.
// http://msdn.microsoft.com/en-us/library/3dayytw9.aspx
*/
#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \
RANDOM_ITERATOR_R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
    const Word low_word = INTRIN(a, b, hip); \
    return low_word; \
}
/* mulhilo built on a "multiply high" intrinsic: INTRIN(a, b) supplies the
   high word, the low word is the ordinary product a*b.
   N.B. A better name would be _mulhilo_mulhi_intrin; the CUDA name is
   historical, CUDA being the first place the idiom was used. */
#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, RANDOM_ITERATOR_R123_METAL_THREAD_ADDRESS_SPACE Word* hip){ \
    const Word high_word = INTRIN(a, b); \
    *hip = high_word; \
    return a*b; \
}
/*
// mulhilo written with word-size operations and plain C99 operators only
// (no add-with-carry, no multiply-high).  It costs four multiplies plus a
// dozen or so shifts, adds and tests, so it is *SLOW*.
// It makes philoxNx32 possible on platforms with no 64-bit type at all,
// e.g., Metal.  On 32-bit platforms it could also provide philoxNx64, but
// there philoxNx32 and threefryNx64 will perform much better.
// The 64-bit instance is enabled below by
// RANDOM_ITERATOR_R123_USE_MULHILO64_C99, which is currently (Feb 2019)
// only set by the features/metalfeatures.h headers; it can of course also
// be set with a compile-time -D option.
//
// Method: split both factors into half-words.  The high word is
// ahi*bhi, plus the upper halves of the two cross products, plus the
// carries out of the middle column (lower halves of the cross products
// added to the upper half of alo*blo).
*/
#define _mulhilo_c99_tpl(W, Word) \
RANDOM_ITERATOR_R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, RANDOM_ITERATOR_R123_METAL_THREAD_ADDRESS_SPACE Word *hip){ \
    const unsigned half_bits = W/2; \
    const Word half_mask = ((((Word)1)<<half_bits)-1); \
    const Word a_lo = a & half_mask; \
    const Word a_hi = a >> half_bits; \
    const Word b_lo = b & half_mask; \
    const Word b_hi = b >> half_bits; \
    const Word cross_hl = a_hi*b_lo; \
    const Word cross_lh = a_lo*b_hi; \
    const Word low_word = a*b; /* low W bits of the full product */ \
    /* sum of the low halves of both cross products (may need half_bits+1 bits) */ \
    const Word mid_sum = (cross_hl & half_mask) + (cross_lh & half_mask); \
    Word high_word = a_hi*b_hi; \
    high_word += cross_hl >> half_bits; \
    high_word += cross_lh >> half_bits; \
    high_word += mid_sum >> half_bits; /* carry out of mid_sum itself */ \
    /* carry produced when mid_sum is added to the upper half of alo*blo: \
       the upper half of low_word then ends up smaller than the addend */ \
    if( (low_word >> half_bits) < (mid_sum & half_mask) ){ \
        high_word += 1; \
    } \
    *hip = high_word; \
    return low_word; \
}
/*
// A template for mulhilo on a platform that can't do it
// We could put a C version here, but is it better to run *VERY*
// slowly or to just stop and force the user to find another CBRNG?
//
// The generated function consists only of a static assertion with a
// constant-false condition and the message "mulhilo<W> is not implemented
// on this machine"; it has no return statement.
// NOTE(review): whether the assertion fires on definition or only on use
// depends on RANDOM_ITERATOR_R123_STATIC_ASSERT, which is defined elsewhere.
*/
#define _mulhilo_fail_tpl(W, Word) \
RANDOM_ITERATOR_R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
RANDOM_ITERATOR_R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
}
/*
// N.B. There's an MSVC intrinsic called _emul,
// which *might* compile into better code than
// _mulhilo_dword_tpl
*/
/* Define mulhilo32.  Order of preference:
   1. inline asm, when RANDOM_ITERATOR_R123_USE_MULHILO32_ASM is set
      ("mulhwu" on __powerpc__, "mull" otherwise);
   2. a 64-bit temporary, when RANDOM_ITERATOR_R123_USE_64BIT is set;
   3. a multiply-high intrinsic, when
      RANDOM_ITERATOR_R123_USE_MULHILO32_MULHI_INTRIN is set;
   4. the word-size-only C99 implementation otherwise. */
#if RANDOM_ITERATOR_R123_USE_MULHILO32_ASM
#ifdef __powerpc__
_mulhilo_asm_tpl(32, uint32_t, "mulhwu")
#else
_mulhilo_asm_tpl(32, uint32_t, "mull")
#endif /* __powerpc__ */
#else
#if RANDOM_ITERATOR_R123_USE_64BIT
_mulhilo_dword_tpl(32, uint32_t, uint64_t)
#elif RANDOM_ITERATOR_R123_USE_MULHILO32_MULHI_INTRIN
_mulhilo_cuda_intrin_tpl(32, uint32_t, RANDOM_ITERATOR_R123_MULHILO32_MULHI_INTRIN)
#else
_mulhilo_c99_tpl(32, uint32_t)
#endif
#endif
/* Define mulhilo64, but only when 64-bit Philox is enabled at all.  The
   first strategy whose switch is set wins: inline asm ("mulhdu" on
   __powerpc64__, "mulq" otherwise), MSVC _umul128, CUDA __umul64hi,
   OpenCL mul_hi, the configured multiply-high intrinsic, a __uint128_t
   temporary, the word-size-only C99 implementation.  If none is set, the
   "fail" template is instantiated. */
#if RANDOM_ITERATOR_R123_USE_PHILOX_64BIT
#if RANDOM_ITERATOR_R123_USE_MULHILO64_ASM
#ifdef __powerpc64__
_mulhilo_asm_tpl(64, uint64_t, "mulhdu")
#else
_mulhilo_asm_tpl(64, uint64_t, "mulq")
#endif /* __powerpc64__ */
#elif RANDOM_ITERATOR_R123_USE_MULHILO64_MSVC_INTRIN
_mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
#elif RANDOM_ITERATOR_R123_USE_MULHILO64_CUDA_INTRIN
_mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
#elif RANDOM_ITERATOR_R123_USE_MULHILO64_OPENCL_INTRIN
_mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
#elif RANDOM_ITERATOR_R123_USE_MULHILO64_MULHI_INTRIN
_mulhilo_cuda_intrin_tpl(64, uint64_t, RANDOM_ITERATOR_R123_MULHILO64_MULHI_INTRIN)
#elif RANDOM_ITERATOR_R123_USE_GNU_UINT128
_mulhilo_dword_tpl(64, uint64_t, __uint128_t)
#elif RANDOM_ITERATOR_R123_USE_MULHILO64_C99
_mulhilo_c99_tpl(64, uint64_t)
#else
_mulhilo_fail_tpl(64, uint64_t)
#endif
#endif
/*
// The multipliers and Weyl constants are "hard coded".
// To change them, you can #define them with different
// values before #include-ing this file.
// This isn't terribly elegant, but it works for C as
// well as C++. A nice C++-only solution would be to
// use template parameters in the style of <random>
*/
/* PHILOX_M<N>x<W>_<i>: multipliers handed to mulhilo<W> by the round
   templates (one per round for N=2, two per round for N=4). */
#ifndef PHILOX_M2x64_0
#define PHILOX_M2x64_0 RANDOM_ITERATOR_R123_64BIT(0xD2B74407B1CE6E93)
#endif
#ifndef PHILOX_M4x64_0
#define PHILOX_M4x64_0 RANDOM_ITERATOR_R123_64BIT(0xD2E7470EE14C6C93)
#endif
#ifndef PHILOX_M4x64_1
#define PHILOX_M4x64_1 RANDOM_ITERATOR_R123_64BIT(0xCA5A826395121157)
#endif
#ifndef PHILOX_M2x32_0
#define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
#endif
#ifndef PHILOX_M4x32_0
#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
#endif
#ifndef PHILOX_M4x32_1
#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
#endif
/* PHILOX_W<W>_<i>: increments added to key word i by the bumpkey templates. */
#ifndef PHILOX_W64_0
#define PHILOX_W64_0 RANDOM_ITERATOR_R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */
#endif
#ifndef PHILOX_W64_1
#define PHILOX_W64_1 RANDOM_ITERATOR_R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */
#endif
#ifndef PHILOX_W32_0
#define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
#endif
#ifndef PHILOX_W32_1
#define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
#endif
/** \endcond */
/* Default round counts.  Each value becomes the philox<N>x<W>_rounds enum
   constant used by the philox<N>x<W>(c,k) convenience macros; define the
   macro before including this file to choose a different default. */
#ifndef PHILOX2x32_DEFAULT_ROUNDS
#define PHILOX2x32_DEFAULT_ROUNDS 10
#endif
#ifndef PHILOX2x64_DEFAULT_ROUNDS
#define PHILOX2x64_DEFAULT_ROUNDS 10
#endif
#ifndef PHILOX4x32_DEFAULT_ROUNDS
#define PHILOX4x32_DEFAULT_ROUNDS 10
#endif
#ifndef PHILOX4x64_DEFAULT_ROUNDS
#define PHILOX4x64_DEFAULT_ROUNDS 10
#endif
/** \cond HIDDEN_FROM_DOXYGEN */
/* The ignored fourth argument allows us to instantiate the
same macro regardless of N. */
/* Emits _philox2x<W>round(ctr, key), one round of the two-word Philox:
   multiply ctr.v[0] by PHILOX_M2x<W>_0; the new word 0 is the high half of
   the product xor key.v[0] xor ctr.v[1], the new word 1 is the low half. */
#define _philox2xWround_tpl(W, T) \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE RANDOM_ITERATOR_R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
    T product_hi; \
    const T product_lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &product_hi); \
    struct r123array2x##W result; \
    result.v[0] = product_hi ^ key.v[0] ^ ctr.v[1]; \
    result.v[1] = product_lo; \
    return result; \
}
/* Emits _philox2x<W>bumpkey(key): returns a copy of the one-word key
   advanced by the Weyl constant PHILOX_W<W>_0 (wrapping on overflow). */
#define _philox2xWbumpkey_tpl(W) \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
    struct r123array1x##W bumped = key; \
    bumped.v[0] = bumped.v[0] + PHILOX_W##W##_0; \
    return bumped; \
}
/* Emits _philox4x<W>round(ctr, key), one round of the four-word Philox.
   ctr.v[0] and ctr.v[2] are multiplied by PHILOX_M4x<W>_0 and _1; the
   output is { hi1^ctr.v[1]^key.v[0], lo1, hi0^ctr.v[3]^key.v[1], lo0 },
   where hi/lo 0 belong to the first product and hi/lo 1 to the second. */
#define _philox4xWround_tpl(W, T) \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE RANDOM_ITERATOR_R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
    T first_hi; \
    T second_hi; \
    const T first_lo = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &first_hi); \
    const T second_lo = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &second_hi); \
    struct r123array4x##W result; \
    result.v[0] = second_hi ^ ctr.v[1] ^ key.v[0]; \
    result.v[1] = second_lo; \
    result.v[2] = first_hi ^ ctr.v[3] ^ key.v[1]; \
    result.v[3] = first_lo; \
    return result; \
}
/* Emits _philox4x<W>bumpkey(key): returns a copy of the two-word key with
   PHILOX_W<W>_0 added to word 0 and PHILOX_W<W>_1 added to word 1
   (both additions wrap on overflow). */
#define _philox4xWbumpkey_tpl(W) \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
    struct r123array2x##W bumped = key; \
    bumped.v[0] = bumped.v[0] + PHILOX_W##W##_0; \
    bumped.v[1] = bumped.v[1] + PHILOX_W##W##_1; \
    return bumped; \
}
/** \endcond */
/*
// _philoxNxW_tpl(N, Nhalf, W, T) generates the C interface of one Philox
// variant:
//   - the enum constant philox<N>x<W>_rounds (= PHILOX<N>x<W>_DEFAULT_ROUNDS);
//   - the typedefs philox<N>x<W>_ctr_t (N words) and
//     philox<N>x<W>_key_t / philox<N>x<W>_ukey_t (Nhalf words);
//   - philox<N>x<W>keyinit(uk), which returns the user key unchanged;
//   - philox<N>x<W>_R(R, ctr, key), which applies R rounds to ctr.
// The rounds are written out one by one, up to 16 (hence the R<=16
// assertion): the first round uses the key as passed in, every later
// round bumps the key before applying the round function.
// The argument T is not referenced by the expansion.
*/
#define _philoxNxW_tpl(N, Nhalf, W, T) \
/** @ingroup PhiloxNxW */ \
enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \
typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \
typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE RANDOM_ITERATOR_R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
RANDOM_ITERATOR_R123_ASSERT(R<=16); \
if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } \
if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
return ctr; \
}
/* Instantiate the 32-bit Philox functions.  The bumpkey and round helpers
   come first because philox<N>x32_R calls them. */
_philox2xWbumpkey_tpl(32)
_philox4xWbumpkey_tpl(32)
_philox2xWround_tpl(32, uint32_t) /* _philox2x32round */
_philox4xWround_tpl(32, uint32_t) /* _philox4x32round */
_philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32_R and its types */
_philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32_R and its types */
/* The 64-bit counterparts exist only when 64-bit Philox is enabled. */
#if RANDOM_ITERATOR_R123_USE_PHILOX_64BIT
/** \cond HIDDEN_FROM_DOXYGEN */
_philox2xWbumpkey_tpl(64)
_philox4xWbumpkey_tpl(64)
_philox2xWround_tpl(64, uint64_t) /* _philox2x64round */
_philox4xWround_tpl(64, uint64_t) /* _philox4x64round */
/** \endcond */
_philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64_R and its types */
_philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64_R and its types */
#endif /* RANDOM_ITERATOR_R123_USE_PHILOX_64BIT */
/* Convenience forms that run the default number of rounds
   (philox<N>x<W>_rounds). */
#define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
#define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)
#if RANDOM_ITERATOR_R123_USE_PHILOX_64BIT
#define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
#define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
#endif /* RANDOM_ITERATOR_R123_USE_PHILOX_64BIT */
#if defined(__cplusplus)
/* _PhiloxNxW_base_tpl(CType, KType, N, W) generates, inside namespace
   random_iterator_r123, the class template Philox<N>x<W>_R<ROUNDS>:
     - ctr_type is CType; key_type and ukey_type are both KType;
     - rounds is the static constant ROUNDS;
     - operator()(ctr, key) statically asserts ROUNDS<=16 and forwards to
       the C function philox<N>x<W>_R(ROUNDS, ctr, key).
   It also generates the typedef Philox<N>x<W>, which is the class
   instantiated with the default round count philox<N>x<W>_rounds. */
#define _PhiloxNxW_base_tpl(CType, KType, N, W) \
namespace random_iterator_r123{ \
template<unsigned int ROUNDS> \
struct Philox##N##x##W##_R{ \
typedef CType ctr_type; \
typedef KType key_type; \
typedef KType ukey_type; \
static const RANDOM_ITERATOR_R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
inline RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
RANDOM_ITERATOR_R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \
return philox##N##x##W##_R(ROUNDS, ctr, key); \
} \
}; \
typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
} // namespace random_iterator_r123
/* Instantiate the wrappers; the 64-bit ones only when 64-bit Philox is enabled. */
_PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) // Philox2x32_R<R>
_PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) // Philox4x32_R<R>
#if RANDOM_ITERATOR_R123_USE_PHILOX_64BIT
_PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) // Philox2x64_R<R>
_PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) // Philox4x64_R<R>
#endif
/* The _tpl macros don't quite work to do string-pasting inside comments.
so we just write out the boilerplate documentation four times... */
/**
@defgroup PhiloxNxW Philox Classes and Typedefs
The PhiloxNxW classes export the member functions, typedefs and
operator overloads required by a @ref CBRNG "CBRNG" class.
As described in
<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>,
the Philox family of counter-based RNGs uses integer multiplication, xor and permutation of W-bit words
to scramble its N-word input key. (Philox is a mnemonic for Product HI LO Xor.)
@class r123::Philox2x32_R
@ingroup PhiloxNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Philox round
function will be applied.
As of November 2011, the authors know of no statistical flaws with
ROUNDS=6 or more for Philox2x32.
@typedef r123::Philox2x32
@ingroup PhiloxNxW
Philox2x32 is equivalent to Philox2x32_R<10>. With 10 rounds,
Philox2x32 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class r123::Philox2x64_R
@ingroup PhiloxNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Philox round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=6 or more for Philox2x64.
@typedef r123::Philox2x64
@ingroup PhiloxNxW
Philox2x64 is equivalent to Philox2x64_R<10>. With 10 rounds,
Philox2x64 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class r123::Philox4x32_R
@ingroup PhiloxNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Philox round
function will be applied.
In November 2011, the authors recorded some suspicious p-values (approximately 1.e-7) from
some very long (longer than the default BigCrush length) SimpPoker tests. Despite
the fact that even longer tests reverted to "passing" p-values, a cloud remains over
Philox4x32 with 7 rounds. The authors know of no statistical flaws with
ROUNDS=8 or more for Philox4x32.
@typedef r123::Philox4x32
@ingroup PhiloxNxW
Philox4x32 is equivalent to Philox4x32_R<10>. With 10 rounds,
Philox4x32 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class r123::Philox4x64_R
@ingroup PhiloxNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Philox round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=7 or more for Philox4x64.
@typedef r123::Philox4x64
@ingroup PhiloxNxW
Philox4x64 is equivalent to Philox4x64_R<10>. With 10 rounds,
Philox4x64 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
*/
#endif /* __cplusplus */
#endif /* _philox_dot_h_ */
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _threefry_dot_h_
#define _threefry_dot_h_
#include "features/compilerfeatures.h"
#include "array.h"
/** \cond HIDDEN_FROM_DOXYGEN */
/* Significant parts of this file were copied
from:
Skein_FinalRnd/ReferenceImplementation/skein.h
Skein_FinalRnd/ReferenceImplementation/skein_block.c
in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
This file has been modified so that it may no longer perform its originally
intended function. If you're looking for a Skein or Threefish source code,
please consult the original file.
The original file had the following header:
**************************************************************************
**
** Interface declarations and internal definitions for Skein hashing.
**
** Source code author: Doug Whiting, 2008.
**
** This algorithm and source code is released to the public domain.
**
***************************************************************************
*/
/* See comment at the top of philox.h for the macro pre-process
strategy. */
/* Rotation constants: */
/* Rotation amounts for 64-bit words in the 4-word variant, named
   R_64x4_<i>_<j> with i in 0..7 and j in 0..1.
   NOTE(review): the code that consumes these constants is outside this
   part of the file. */
enum r123_enum_threefry64x4 {
/* These are the R_256 constants from the Threefish reference sources
with names changed to R_64x4... */
R_64x4_0_0=14, R_64x4_0_1=16,
R_64x4_1_0=52, R_64x4_1_1=57,
R_64x4_2_0=23, R_64x4_2_1=40,
R_64x4_3_0= 5, R_64x4_3_1=37,
R_64x4_4_0=25, R_64x4_4_1=33,
R_64x4_5_0=46, R_64x4_5_1=12,
R_64x4_6_0=58, R_64x4_6_1=22,
R_64x4_7_0=32, R_64x4_7_1=32
};
/* Rotation amounts for 64-bit words in the 2-word variant.  R_64x2_<i>_0 is
   the amount passed to RotL_64 by the threefry2x64 rounds; the eight
   entries are reused cyclically (the ninth round uses entry 0 again). */
enum r123_enum_threefry64x2 {
/*
// Output from skein_rot_search: (srs64_B64-X1000)
// Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
// Start: Tue Mar 1 10:07:48 2011
// rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
*/
R_64x2_0_0=16,
R_64x2_1_0=42,
R_64x2_2_0=12,
R_64x2_3_0=31,
R_64x2_4_0=16,
R_64x2_5_0=32,
R_64x2_6_0=24,
R_64x2_7_0=21
/* 4 rounds: minHW = 4 [ 4 4 4 4 ]
// 5 rounds: minHW = 8 [ 8 8 8 8 ]
// 6 rounds: minHW = 16 [ 16 16 16 16 ]
// 7 rounds: minHW = 32 [ 32 32 32 32 ]
// 8 rounds: minHW = 64 [ 64 64 64 64 ]
// 9 rounds: minHW = 64 [ 64 64 64 64 ]
//10 rounds: minHW = 64 [ 64 64 64 64 ]
//11 rounds: minHW = 64 [ 64 64 64 64 ] */
};
/* Rotation amounts for 32-bit words in the 4-word variant, named
   R_32x4_<i>_<j> with i in 0..7 and j in 0..1.
   NOTE(review): the code that consumes these constants is outside this
   part of the file. */
enum r123_enum_threefry32x4 {
/* Output from skein_rot_search: (srs-B128-X5000.out)
// Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
// Start: Mon Aug 24 22:41:36 2009
// ...
// rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */
R_32x4_0_0=10, R_32x4_0_1=26,
R_32x4_1_0=11, R_32x4_1_1=21,
R_32x4_2_0=13, R_32x4_2_1=27,
R_32x4_3_0=23, R_32x4_3_1= 5,
R_32x4_4_0= 6, R_32x4_4_1=20,
R_32x4_5_0=17, R_32x4_5_1=11,
R_32x4_6_0=25, R_32x4_6_1=10,
R_32x4_7_0=18, R_32x4_7_1=20
/* 4 rounds: minHW = 3 [ 3 3 3 3 ]
// 5 rounds: minHW = 7 [ 7 7 7 7 ]
// 6 rounds: minHW = 12 [ 13 12 13 12 ]
// 7 rounds: minHW = 22 [ 22 23 22 23 ]
// 8 rounds: minHW = 31 [ 31 31 31 31 ]
// 9 rounds: minHW = 32 [ 32 32 32 32 ]
//10 rounds: minHW = 32 [ 32 32 32 32 ]
//11 rounds: minHW = 32 [ 32 32 32 32 ] */
};
/* Rotation amounts for 32-bit words in the 2-word variant.  R_32x2_<i>_0 is
   the amount passed to RotL_32 by the threefry2x32 rounds; the eight
   entries are reused cyclically (the ninth round uses entry 0 again). */
enum r123_enum_threefry32x2 {
/* Output from skein_rot_search (srs32x2-X5000.out)
// Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
// Start: Tue Jul 12 11:11:33 2011
// rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */
R_32x2_0_0=13,
R_32x2_1_0=15,
R_32x2_2_0=26,
R_32x2_3_0= 6,
R_32x2_4_0=17,
R_32x2_5_0=29,
R_32x2_6_0=16,
R_32x2_7_0=24
/* 4 rounds: minHW = 4 [ 4 4 4 4 ]
// 5 rounds: minHW = 6 [ 6 8 6 8 ]
// 6 rounds: minHW = 9 [ 9 12 9 12 ]
// 7 rounds: minHW = 16 [ 16 24 16 24 ]
// 8 rounds: minHW = 32 [ 32 32 32 32 ]
// 9 rounds: minHW = 32 [ 32 32 32 32 ]
//10 rounds: minHW = 32 [ 32 32 32 32 ]
//11 rounds: minHW = 32 [ 32 32 32 32 ] */
};
/* Named word counts, 2 and 4.
   NOTE(review): no use of WCNT2/WCNT4 is visible in this part of the file;
   presumably they name the number of words per block - confirm. */
enum r123_enum_threefry_wcnt {
WCNT2=2,
WCNT4=4
};
#if RANDOM_ITERATOR_R123_USE_64BIT
/* Rotate the 64-bit word x left by N bit positions.  Both shift counts are
   reduced modulo 64, so N == 0 (or any multiple of 64) returns x unchanged
   and no shift by the full word width is ever performed. */
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE RANDOM_ITERATOR_R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N));
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N)
{
    const unsigned int left_shift = N & 63;
    const unsigned int right_shift = (64-N) & 63;
    const uint64_t upper_part = x << left_shift;
    const uint64_t lower_part = x >> right_shift;
    return upper_part | lower_part;
}
#endif
/* Rotate the 32-bit word x left by N bit positions.  Both shift counts are
   reduced modulo 32, so N == 0 (or any multiple of 32) returns x unchanged
   and no shift by the full word width is ever performed. */
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE RANDOM_ITERATOR_R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N));
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N)
{
    const unsigned int left_shift = N & 31;
    const unsigned int right_shift = (32-N) & 31;
    const uint32_t upper_part = x << left_shift;
    const uint32_t lower_part = x >> right_shift;
    return upper_part | lower_part;
}
/* SKEIN_MK_64 assembles a 64-bit value from two 32-bit halves.
   SKEIN_KS_PARITY<W> is the value the extra key-schedule word ks[2] starts
   from in the threefry2x<W> template, before the key words are xor-ed in;
   the 32-bit constant equals the upper half of the 64-bit one. */
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
#define SKEIN_KS_PARITY32 0x1BD11BDA
/** \endcond */
/* Default round counts for the Threefry variants (20 each).  As with the
   other defaults, a value defined before this point is left untouched.
   NOTE(review): the code that reads these macros is outside this part of
   the file. */
#ifndef THREEFRY2x32_DEFAULT_ROUNDS
#define THREEFRY2x32_DEFAULT_ROUNDS 20
#endif
#ifndef THREEFRY2x64_DEFAULT_ROUNDS
#define THREEFRY2x64_DEFAULT_ROUNDS 20
#endif
#ifndef THREEFRY4x32_DEFAULT_ROUNDS
#define THREEFRY4x32_DEFAULT_ROUNDS 20
#endif
#ifndef THREEFRY4x64_DEFAULT_ROUNDS
#define THREEFRY4x64_DEFAULT_ROUNDS 20
#endif
/* _threefry2x_tpl(W): expands to the two-word Threefry API for word width W.
   It defines
     - threefry2x<W>_ctr_t, _key_t and _ukey_t, all typedefs of
       struct r123array2x<W>;
     - threefry2x<W>keyinit(uk), which returns the user key unchanged;
     - threefry2x<W>_R(Nrounds, in, k), the round function, unrolled for at
       most 32 rounds (checked with RANDOM_ITERATOR_R123_ASSERT);
     - the enum constant threefry2x<W>_rounds (THREEFRY2x<W>_DEFAULT_ROUNDS)
       and threefry2x<W>(in, k), which calls _R with that round count.
   Inside _R, ks[0..1] hold the key words and ks[2] is SKEIN_KS_PARITY<W>
   XORed with both of them.  Each round adds X.v[1] into X.v[0], rotates
   X.v[1] left by R_<W>x2_<round mod 8>_0 and XORs X.v[0] into it.  After
   every fourth round (injection r = 1, 2, ...) ks[r mod 3] is added to
   X.v[0], ks[(r+1) mod 3] to X.v[1], and r itself to X.v[1]. */
#define _threefry2x_tpl(W) \
typedef struct r123array2x##W threefry2x##W##_ctr_t; \
typedef struct r123array2x##W threefry2x##W##_key_t; \
typedef struct r123array2x##W threefry2x##W##_ukey_t; \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE RANDOM_ITERATOR_R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
threefry2x##W##_ctr_t X; \
uint##W##_t ks[2+1]; \
int i; /* avoid size_t to avoid need for stddef.h */ \
RANDOM_ITERATOR_R123_ASSERT(Nrounds<=32); \
ks[2] = SKEIN_KS_PARITY##W; \
for (i=0;i < 2; i++) \
{ \
ks[i] = k.v[i]; \
X.v[i] = in.v[i]; \
ks[2] ^= k.v[i]; \
} \
\
/* Insert initial key before round 0 */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; \
\
if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>3){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; \
X.v[1] += 1; /* X.v[2-1] += r */ \
} \
if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>7){ \
/* InjectKey(r=2) */ \
X.v[0] += ks[2]; X.v[1] += ks[0]; \
X.v[1] += 2; \
} \
if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>11){ \
/* InjectKey(r=3) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; \
X.v[1] += 3; \
} \
if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>15){ \
/* InjectKey(r=4) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; \
X.v[1] += 4; \
} \
if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>19){ \
/* InjectKey(r=5) */ \
X.v[0] += ks[2]; X.v[1] += ks[0]; \
X.v[1] += 5; \
} \
if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>23){ \
/* InjectKey(r=6) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; \
X.v[1] += 6; \
} \
if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>27){ \
/* InjectKey(r=7) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; \
X.v[1] += 7; \
} \
if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \
if(Nrounds>31){ \
/* InjectKey(r=8) */ \
X.v[0] += ks[2]; X.v[1] += ks[0]; \
X.v[1] += 8; \
} \
return X; \
} \
/** @ingroup ThreefryNxW */ \
enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE RANDOM_ITERATOR_R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE \
threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \
return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \
}
/* _threefry4x_tpl(W): expands to the four-word Threefry API for word width W.
   It defines
     - threefry4x<W>_ctr_t, _key_t and _ukey_t, all typedefs of
       struct r123array4x<W>;
     - threefry4x<W>keyinit(uk), which returns the user key unchanged;
     - threefry4x<W>_R(Nrounds, in, k), the round function, unrolled for at
       most 72 rounds (checked with RANDOM_ITERATOR_R123_ASSERT);
     - the enum constant threefry4x<W>_rounds (THREEFRY4x<W>_DEFAULT_ROUNDS)
       and threefry4x<W>(in, k), which calls _R with that round count.
   Inside _R, ks[0..3] hold the key words and ks[4] is SKEIN_KS_PARITY<W>
   XORed with all four of them.  Even-numbered rounds mix the word pairs
   (0,1) and (2,3), odd-numbered rounds mix (0,3) and (2,1); the rotation
   amounts are R_<W>x4_<round mod 8>_0 and R_<W>x4_<round mod 8>_1.  After
   every fourth round (injection r = 1, 2, ...) ks[(r+j) mod 5] is added to
   X.v[j] for j = 0..3, and r itself is added to X.v[3]. */
#define _threefry4x_tpl(W) \
typedef struct r123array4x##W threefry4x##W##_ctr_t; \
typedef struct r123array4x##W threefry4x##W##_key_t; \
typedef struct r123array4x##W threefry4x##W##_ukey_t; \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE RANDOM_ITERATOR_R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
threefry4x##W##_ctr_t X; \
uint##W##_t ks[4+1]; \
int i; /* avoid size_t to avoid need for stddef.h */ \
RANDOM_ITERATOR_R123_ASSERT(Nrounds<=72); \
ks[4] = SKEIN_KS_PARITY##W; \
for (i=0;i < 4; i++) \
{ \
ks[i] = k.v[i]; \
X.v[i] = in.v[i]; \
ks[4] ^= k.v[i]; \
} \
\
/* Insert initial key before round 0 */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
\
if(Nrounds>0){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>1){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>2){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>3){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>3){ \
/* InjectKey(r=1) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>4){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>5){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>6){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>7){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>7){ \
/* InjectKey(r=2) */ \
X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>8){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>9){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>10){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>11){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>11){ \
/* InjectKey(r=3) */ \
X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>12){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>13){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>14){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>15){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>15){ \
/* InjectKey(r=4) */ \
X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>16){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>17){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>18){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>19){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>19){ \
/* InjectKey(r=5) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>20){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>21){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>22){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>23){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>23){ \
/* InjectKey(r=6) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>24){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>25){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>26){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>27){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>27){ \
/* InjectKey(r=7) */ \
X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>28){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>29){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>30){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>31){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>31){ \
/* InjectKey(r=8) */ \
X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>32){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>33){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>34){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>35){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>35){ \
/* InjectKey(r=9) */ \
X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>36){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>37){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>38){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>39){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>39){ \
/* InjectKey(r=10) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>40){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>41){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>42){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>43){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>43){ \
/* InjectKey(r=11) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>44){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>45){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>46){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>47){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>47){ \
/* InjectKey(r=12) */ \
X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>48){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>49){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>50){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>51){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>51){ \
/* InjectKey(r=13) */ \
X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>52){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>53){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>54){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>55){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>55){ \
/* InjectKey(r=14) */ \
X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \
X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>56){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>57){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>58){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>59){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>59){ \
/* InjectKey(r=15) */ \
X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \
X.v[4-1] += 15; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>60){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>61){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>62){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>63){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>63){ \
/* InjectKey(r=16) */ \
X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \
X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>64){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>65){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>66){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>67){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>67){ \
/* InjectKey(r=17) */ \
X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \
X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \
} \
\
if(Nrounds>68){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>69){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>70){ \
X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \
X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \
} \
if(Nrounds>71){ \
X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \
X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \
} \
if(Nrounds>71){ \
/* InjectKey(r=18) */ \
X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \
X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \
} \
\
return X; \
} \
\
/** @ingroup ThreefryNxW */ \
enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE RANDOM_ITERATOR_R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE \
threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \
return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \
}
/* Instantiate the C-level Threefry types and functions.  The 64-bit word
   variants are only generated when RANDOM_ITERATOR_R123_USE_64BIT is
   non-zero; the 32-bit word variants are always generated. */
#if RANDOM_ITERATOR_R123_USE_64BIT
_threefry2x_tpl(64)
_threefry4x_tpl(64)
#endif
_threefry2x_tpl(32)
_threefry4x_tpl(32)
/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
than a static inline function. Why?
These function-like macros are defined after the functions of the same
name, so from here on a call written as threefryNxW(c,k) expands
directly to threefryNxW_R with the default round count.  The 2x64 and
4x64 macros are defined regardless of RANDOM_ITERATOR_R123_USE_64BIT. */
#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k)
#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k)
#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k)
#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k)
#if defined(__cplusplus)
/* Largest ROUNDS value each unrolled threefry<NxW>_R function supports.
   The 2xW round functions are unrolled (and runtime-asserted) up to 32
   rounds, the 4xW ones up to 72.  Checking the per-variant limit at compile
   time prevents e.g. Threefry2x32_R<40> from compiling and then silently
   running only 32 rounds when the runtime assert is disabled. */
#define RANDOM_ITERATOR_R123_THREEFRY_MAX_ROUNDS_2x32 32
#define RANDOM_ITERATOR_R123_THREEFRY_MAX_ROUNDS_2x64 32
#define RANDOM_ITERATOR_R123_THREEFRY_MAX_ROUNDS_4x32 72
#define RANDOM_ITERATOR_R123_THREEFRY_MAX_ROUNDS_4x64 72
/* _threefryNxWclass_tpl(NxW): expands to the C++ wrapper for one Threefry
   variant inside namespace random_iterator_r123:
     - template<unsigned int ROUNDS> struct Threefry<NxW>_R, exposing
       ctr_type, key_type, ukey_type, the constant `rounds`, and an
       operator()(ctr, key) that forwards to threefry<NxW>_R(ROUNDS, ctr, key);
     - typedef Threefry<NxW>, the instantiation with threefry<NxW>_rounds.
   ROUNDS is checked at compile time against the unrolled limit above. */
#define _threefryNxWclass_tpl(NxW) \
namespace random_iterator_r123{ \
template<unsigned int ROUNDS> \
struct Threefry##NxW##_R{ \
typedef threefry##NxW##_ctr_t ctr_type; \
typedef threefry##NxW##_key_t key_type; \
typedef threefry##NxW##_key_t ukey_type; \
static const RANDOM_ITERATOR_R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds=ROUNDS; \
inline RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \
RANDOM_ITERATOR_R123_STATIC_ASSERT(ROUNDS<=RANDOM_ITERATOR_R123_THREEFRY_MAX_ROUNDS_##NxW, "threefry2xW is only unrolled up to 32 rounds and threefry4xW up to 72 rounds\n"); \
return threefry##NxW##_R(ROUNDS, ctr, key); \
} \
}; \
typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
} // namespace random_iterator_r123
/* Instantiate the C++ wrapper classes.  The 32-bit word variants are always
   generated; the 64-bit word variants only when
   RANDOM_ITERATOR_R123_USE_64BIT is non-zero. */
_threefryNxWclass_tpl(2x32)
_threefryNxWclass_tpl(4x32)
#if RANDOM_ITERATOR_R123_USE_64BIT
_threefryNxWclass_tpl(2x64)
_threefryNxWclass_tpl(4x64)
#endif
/* The _tpl macros don't quite work to do string-pasting inside comments,
so we just write out the boilerplate documentation four times... */
/**
@defgroup ThreefryNxW Threefry Classes and Typedefs
The ThreefryNxW classes export the member functions, typedefs and
operator overloads required by a @ref CBRNG "CBRNG" class.
As described in
<a href="http://dl.acm.org/citation.cfm?doid=2063405"><i>Parallel Random Numbers: As Easy as 1, 2, 3</i> </a>,
the Threefry family is closely related to the Threefish block cipher from
<a href="http://www.skein-hash.info/"> Skein Hash Function</a>.
Threefry is \b not suitable for cryptographic use.
Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output.
@class random_iterator_r123::Threefry2x32_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=13 or more for Threefry2x32.
@typedef random_iterator_r123::Threefry2x32
@ingroup ThreefryNxW
Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds,
Threefry2x32 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class random_iterator_r123::Threefry2x64_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
In November 2011, the authors discovered that 13 rounds of
Threefry2x64 sequenced by strided, interleaved key and counter
increments failed a very long (longer than the default BigCrush
length) WeightDistrib test. At the same time, it was confirmed that
14 rounds passes much longer tests (up to 5x10^12 samples) of a
similar nature. The authors know of no statistical flaws with
ROUNDS=14 or more for Threefry2x64.
@typedef random_iterator_r123::Threefry2x64
@ingroup ThreefryNxW
Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds,
Threefry2x64 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class random_iterator_r123::Threefry4x32_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=12 or more for Threefry4x32.
@typedef random_iterator_r123::Threefry4x32
@ingroup ThreefryNxW
Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds,
Threefry4x32 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
@class random_iterator_r123::Threefry4x64_R
@ingroup ThreefryNxW
exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
The template argument, ROUNDS, is the number of times the Threefry round
function will be applied.
As of September 2011, the authors know of no statistical flaws with
ROUNDS=12 or more for Threefry4x64.
@typedef random_iterator_r123::Threefry4x64
@ingroup ThreefryNxW
Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds,
Threefry4x64 has a considerable safety margin over the minimum number
of rounds with no known statistical flaws, but still has excellent
performance.
*/
#endif
#endif
/*
Copyright 2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _random123_ufixed01_dot_h_
#define _random123_ufixed01_dot_h_
#include "features/compilerfeatures.h"
/** @defgroup u01fixedpt The u01fixedpt conversion functions
These functions convert unsigned W-bit integers to uniformly
spaced real values (float or double) between 0.0 and 1.0 with
mantissas of M bits.
PLEASE THINK CAREFULLY BEFORE USING THESE FUNCTIONS. THEY MAY
NOT BE WHAT YOU WANT. YOU MAY BE MUCH BETTER SERVED BY THE
FUNCTIONS IN ./uniform.hpp.
These functions produce a finite number of *uniformly spaced* values
in the range from 0.0 to 1.0 with uniform probability. The price
of uniform spacing is that they may not utilize the entire space
of possible outputs. E.g., u01fixedpt_closed_open_32_float will never
produce a non-zero value less than 2^-24, even though such values
are representable in single-precision floating point.
There are 12 functions, corresponding to the following choices:
- W = 32 or 64
- M = 24 (float) or 53 (double)
- open0 or closed0 : whether the output is open or closed at 0.0
- open1 or closed1 : whether the output is open or closed at 1.0
The W=64 M=24 cases are not implemented. To obtain an M=24 float
from a uint64_t, use a cast (possibly with right-shift and bitwise
and) to convert some of the bits of the uint64_t to a uint32_t and
then use u01fixedpt_x_y_32_float. Note that the 64-bit random integers
produced by the Random123 library are random in "all the bits", so
with a little extra effort you can obtain two floats this way --
one from the high bits and one from the low bits of the 64-bit
value.
If the output is open at one end, then the extreme
value (0.0 or 1.0) will never be returned. Conversely, if the output
is closed at one end, then the extreme value is a possible
return value.
The values returned are as follows. All values are returned
with equal frequency, except as noted in the closed_closed case:
closed_open: Let P=min(M,W)
there are 2^P possible output values:
{0, 1, 2, ..., 2^P-1}/2^P
open_closed: Let P=min(M,W)
there are 2^P possible values:
{1, 2, ..., 2^P}/2^P
open_open: Let P=min(M, W+1)
there are 2^(P-1) possible values:
{1, 3, 5, ..., 2^P-1}/2^P
closed_closed: Let P=min(M, W-1)
there are 1+2^P possible values:
{0, 1, 2, ... 2^P}/2^P
The extreme values (0.0 and 1.0) are
returned with half the frequency of
all others.
On x86 hardware, especially on 32bit machines, the use of
internal 80bit x87-style floating point may result in
'bonus' precision, which may cause closed intervals to not
be really closed, i.e. the conversions below might not
convert UINT{32,64}_MAX to 1.0. This sort of issue is
likely to occur when storing the output of a u01fixedpt_*_32_float
function in a double, though one can imagine getting extra
precision artifacts when going from 64_53 as well. Other
artifacts may exist on some GPU hardware. The tests in
kat_u01_main.h try to expose such issues, but caveat emptor.
@cond HIDDEN_FROM_DOXYGEN
*/
/* Hex floats were standardized by C in 1999, but weren't standardized
by C++ until 2011. So, we're obliged to write out our constants in
decimal, even though they're most naturally expressed in binary.
We cross our fingers and hope that the compiler does the compile-time
constant arithmetic properly.
*/
/* Negative powers of two spelled out in decimal arithmetic.  The name
   suffix is the exponent: _NNf is the float 2^-NN, _NN the double 2^-NN.
   Every factor is itself a power of two, so each product is exact. */
#define RANDOM_ITERATOR_R123_0x1p_31f (1.f/(2.f*1024.f*1024.f*1024.f))
#define RANDOM_ITERATOR_R123_0x1p_24f (RANDOM_ITERATOR_R123_0x1p_31f*128.f)
#define RANDOM_ITERATOR_R123_0x1p_23f (RANDOM_ITERATOR_R123_0x1p_31f*256.f)
#define RANDOM_ITERATOR_R123_0x1p_32 (1./(4.*1024.*1024.*1024.))
#define RANDOM_ITERATOR_R123_0x1p_63 (RANDOM_ITERATOR_R123_0x1p_32*RANDOM_ITERATOR_R123_0x1p_32*2.)
#define RANDOM_ITERATOR_R123_0x1p_53 (RANDOM_ITERATOR_R123_0x1p_63*1024.)
#define RANDOM_ITERATOR_R123_0x1p_52 (RANDOM_ITERATOR_R123_0x1p_63*2048.)
/** @endcond */
#ifndef RANDOM_ITERATOR_R123_USE_U01_DOUBLE
#define RANDOM_ITERATOR_R123_USE_U01_DOUBLE 1
#endif
#ifdef __cplusplus
extern "C"{
#endif
/* narrowing conversions: uint32_t to float */
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE float u01fixedpt_closed_closed_32_float(uint32_t i){
    /* Map a 32-bit integer to a float in [0, 1].
       Bit 31 is discarded, so the output is not monotonic in i.
       Bits 6..30 are kept and bit 6 is added a second time, which makes the
       sum a multiple of 2^7 in [0, 2^31]; scaling by 2^-31 therefore yields
       a multiple of 2^-24. */
    const uint32_t kept_bits = i & 0x7fffffc0;
    const uint32_t doubled_bit = i & 0x40;
    return (kept_bits + doubled_bit)*RANDOM_ITERATOR_R123_0x1p_31f; /* 0x1.p-31f */
}
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE float u01fixedpt_closed_open_32_float(uint32_t i){
    /* Map a 32-bit integer to a float in [0, 1): the top 24 bits of i,
       i.e. a value in {0, ..., 2^24-1}, scaled by 2^-24. */
    const uint32_t top24 = i >> 8;
    return top24*RANDOM_ITERATOR_R123_0x1p_24f; /* 0x1.0p-24f */
}
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE float u01fixedpt_open_closed_32_float(uint32_t i){
    /* Map a 32-bit integer to a float in (0, 1]: one plus the top 24 bits
       of i, i.e. a value in {1, ..., 2^24}, scaled by 2^-24. */
    const uint32_t numerator = 1 + (i >> 8);
    return numerator*RANDOM_ITERATOR_R123_0x1p_24f; /* 0x1.0p-24f */
}
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE float u01fixedpt_open_open_32_float(uint32_t i){
    /* Map a 32-bit integer to a float in (0, 1): the top 23 bits of i,
       offset by one half, scaled by 2^-23. The half-offset keeps the result
       away from both 0.0 and 1.0. */
    const uint32_t top23 = i >> 9;
    return (0.5f + top23)*RANDOM_ITERATOR_R123_0x1p_23f; /* 0x1.p-23f */
}
#if RANDOM_ITERATOR_R123_USE_U01_DOUBLE
/* narrowing conversions: uint64_t to double */
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE double u01fixedpt_closed_closed_64_double(uint64_t i){
    /* Map a 64-bit integer to a double in [0, 1].
       Bit 63 is discarded, so the output is not monotonic in i.
       Bits 9..62 are kept and bit 9 is added a second time, which makes the
       sum a multiple of 2^10 in [0, 2^63]; scaling by 2^-63 therefore yields
       a multiple of 2^-53. */
    const uint64_t kept_bits = i & RANDOM_ITERATOR_R123_64BIT(0x7ffffffffffffe00);
    const uint64_t doubled_bit = i & 0x200;
    return (kept_bits + doubled_bit)*RANDOM_ITERATOR_R123_0x1p_63; /* 0x1.p-63 */
}
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE double u01fixedpt_closed_open_64_double(uint64_t i){
    /* Map a 64-bit integer to a double in [0, 1): the top 53 bits of i,
       i.e. a value in {0, ..., 2^53-1}, scaled by 2^-53. */
    const uint64_t top53 = i >> 11;
    return top53*RANDOM_ITERATOR_R123_0x1p_53; /* 0x1.0p-53 */
}
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE double u01fixedpt_open_closed_64_double(uint64_t i){
    /* Map a 64-bit integer to a double in (0, 1]: one plus the top 53 bits
       of i, i.e. a value in {1, ..., 2^53}, scaled by 2^-53. */
    const uint64_t numerator = 1 + (i >> 11);
    return numerator*RANDOM_ITERATOR_R123_0x1p_53; /* 0x1.0p-53 */
}
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE double u01fixedpt_open_open_64_double(uint64_t i){
    /* Map a 64-bit integer to a double in (0, 1): the top 52 bits of i,
       offset by one half, scaled by 2^-52. The half-offset keeps the result
       away from both 0.0 and 1.0. */
    const uint64_t top52 = i >> 12;
    return (0.5 + top52)*RANDOM_ITERATOR_R123_0x1p_52; /* 0x1.0p-52 */
}
/* widening conversions: u32 to double */
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE double u01fixedpt_closed_closed_32_double(uint32_t i){
    /* Map a 32-bit integer to a double in [0, 1].
       j = i + (i&1) rounds odd inputs up to the next even number, so j takes
       2^31+1 values with a 'trapezoid' distribution:
           p_j =  1 0 2 0 2 .... 2 0 2 0 1
           j   =  0 1 2 3 4 ....        2^32
       Both operands are converted to double *before* the addition, so the
       sum cannot wrap around at 2^32. */
    const double low_bit = (double)(i & 1);
    const double value = (double)i;
    return (low_bit + value)*RANDOM_ITERATOR_R123_0x1p_32; /* 0x1.p-32 */
}
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE double u01fixedpt_closed_open_32_double(uint32_t i){
    /* Map a 32-bit integer to a double in [0, 1): all 32 bits of i,
       scaled by 2^-32. */
    const double value = (double)i;
    return value*RANDOM_ITERATOR_R123_0x1p_32; /* 0x1.p-32 */
}
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE double u01fixedpt_open_closed_32_double(uint32_t i){
    /* Map a 32-bit integer to a double in (0, 1]: (i + 1) scaled by 2^-32.
       The increment is done in double so that i == UINT32_MAX does not wrap. */
    const double value = (double)i;
    return (1. + value)*RANDOM_ITERATOR_R123_0x1p_32; /* 0x1.p-32 */
}
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE double u01fixedpt_open_open_32_double(uint32_t i){
    /* Map a 32-bit integer to a double in (0, 1): (i + 1/2) scaled by 2^-32.
       The half-offset keeps the result away from both 0.0 and 1.0. */
    const double value = (double)i;
    return (0.5 + value)*RANDOM_ITERATOR_R123_0x1p_32; /* 0x1.p-32 */
}
#endif /* RANDOM_ITERATOR_R123_USE_U01_DOUBLE */
#ifdef __cplusplus
}
#endif
/** @} */
#endif
/*
Copyright 2010-2011, D. E. Shaw Research.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of D. E. Shaw Research nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __r123_uniform_dot_hpp
#define __r123_uniform_dot_hpp
/** @defgroup uniform Uniform distribution scalar conversion functions
This file provides some simple functions that can be used to convert
integers of various widths to floats and doubles with various
characteristics. It can be used to generate real-valued, uniformly
distributed random variables from the random integers produced by
the Random123 CBRNGs.
There are three templated functions:
- u01: output is as dense as possible in (0,1}, never 0.0. May
return 1.0 if and only if the number of output mantissa bits
is less than the width of the input.
- uneg11: output is as dense as possible in {-1,1}, never 0.0. May
return 1.0 or -1.0 if and only if the number of output mantissa bits
is less than the width of the input.
- u01fixedpt: output is "fixed point", equispaced, open at both ends,
and is never 0.0, 0.5 nor 1.0.
The behavior of u01 and uneg11 depends on the pre-processor symbol:
RANDOM_ITERATOR_R123_UNIFORM_FLOAT_STORE. When #defined to a non-zero value, u01
and uneg11 declare a volatile intermediate result, with the
intention of forcing architectures that have "extra bits" in their
floating point registers to more closely conform to IEEE
arithmetic. When compiled this way, u01 and uneg11 will be
significantly slower, as they will incur a memory write and read on
every call. Without it, they may fail the "known answer test"
implemented in ut_uniform_IEEEkat.cpp even though they perform
perfectly reasonable int to float conversions. We have used
this option to get 32-bit x86 to produce the same results as
64-bit x86-64 code, but we do not recommend it for normal
use.
Three additional functions are defined when C++11 or newer is in use:
- u01all
- uneg11all
- u01fixedptall
These functions apply the corresponding conversion to every
element of their argument, which must be a statically sized
array, e.g., an r123array or a std::array of an integer type.
This file may not be as portable, and has not been tested as
rigorously as other files in the library, e.g., the generators.
Nevertheless, we hope it is useful and we encourage developers to
copy it and modify it for their own use. We invite comments and
improvements.
*/
#include <Random123/features/compilerfeatures.h>
#include <limits>
#include <cstdint>
#include <cstddef>
#if RANDOM_ITERATOR_R123_USE_CXX11_TYPE_TRAITS
#include <type_traits>
#endif
#if __cplusplus >= 201103L
#include <array>
#endif
namespace random_iterator_r123 {
/**
@{
@cond HIDDEN_FROM_DOXYGEN
*/
// make_signed / make_unsigned: map an integer type to its signed or unsigned
// counterpart. When the C++11 <type_traits> header is usable, the standard
// traits are simply imported into this namespace; otherwise a minimal
// replacement covering the fixed-width integer types is defined below.
#if RANDOM_ITERATOR_R123_USE_CXX11_TYPE_TRAITS
using std::make_signed;
using std::make_unsigned;
#else
// Sigh... We could try to find another <type_traits>, e.g., from
// boost or TR1. Or we can do it ourselves in the r123 namespace.
// It's not clear which will cause less headache...
// Primary templates are empty: using an unsupported type fails to compile
// because there is no nested ::type.
template <typename T>
struct make_signed {};
template <typename T>
struct make_unsigned {};
// Generates the four specializations for one (signed ST, unsigned UT) pair:
// make_signed<ST>, make_signed<UT> -> ST and
// make_unsigned<ST>, make_unsigned<UT> -> UT.
#define RANDOM_ITERATOR_R123_MK_SIGNED_UNSIGNED(ST, UT) \
template <> \
struct make_signed<ST> { \
typedef ST type; \
}; \
template <> \
struct make_signed<UT> { \
typedef ST type; \
}; \
template <> \
struct make_unsigned<ST> { \
typedef UT type; \
}; \
template <> \
struct make_unsigned<UT> { \
typedef UT type; \
}
RANDOM_ITERATOR_R123_MK_SIGNED_UNSIGNED(int8_t, uint8_t);
RANDOM_ITERATOR_R123_MK_SIGNED_UNSIGNED(int16_t, uint16_t);
RANDOM_ITERATOR_R123_MK_SIGNED_UNSIGNED(int32_t, uint32_t);
RANDOM_ITERATOR_R123_MK_SIGNED_UNSIGNED(int64_t, uint64_t);
// The 128-bit pair is only added when the GNU __int128 extension is enabled.
#if RANDOM_ITERATOR_R123_USE_GNU_UINT128
RANDOM_ITERATOR_R123_MK_SIGNED_UNSIGNED(__int128_t, __uint128_t);
#endif
// The generator macro is an implementation detail; do not leak it.
#undef RANDOM_ITERATOR_R123_MK_SIGNED_UNSIGNED
#endif
#if defined(__CUDACC__) || defined(_LIBCPP_HAS_NO_CONSTEXPR)
// Amazing! cuda thinks numeric_limits::max() is a __host__ function, so
// we can't use it in a device function.
//
// The _LIBCPP_HAS_NO_CONSTEXPR test catches situations where the libc++
// library thinks that the compiler doesn't support constexpr, but we
// think it does. As a consequence, the library declares
// numeric_limits::max without constexpr. This workaround should only
// affect a narrow range of compiler/library pairings.
//
// In both cases, we find max() by computing ~(unsigned)0 right-shifted
// by is_signed.

//! Largest value representable by the integer type T (workaround variant).
template <typename T>
RANDOM_ITERATOR_R123_CONSTEXPR RANDOM_ITERATOR_R123_STATIC_INLINE
    RANDOM_ITERATOR_R123_CUDA_DEVICE T
    maxTvalue() {
  typedef typename make_unsigned<T>::type uT;
  // For types narrower than int, ~uT(0) is evaluated after integral promotion
  // and yields the int -1; shifting that right would leave -1 instead of the
  // intended maximum (e.g. for int8_t). Converting back to uT before the
  // shift restores the all-ones pattern of the correct width.
  return T(uT(~uT(0)) >> std::numeric_limits<T>::is_signed);
}
#else
//! Largest value representable by the integer type T.
template <typename T>
RANDOM_ITERATOR_R123_CONSTEXPR RANDOM_ITERATOR_R123_STATIC_INLINE T maxTvalue() {
  return std::numeric_limits<T>::max();
}
#endif
/** @endcond
@}
*/
//! Map an integer onto a uniform real value in (0, 1]
/**
    @ingroup uniform

    Let W be the width in bits of Itype (signed or unsigned). The input is
    reinterpreted as a W-bit unsigned integer, multiplied by Ftype(2^-W) and
    offset by Ftype(2^(-W-1)). A good compiler should reduce this to an
    int-to-float conversion followed by a multiply and an add, which may be
    fused, depending on the architecture.

    If the input is a uniformly distributed integer, and if Ftype
    arithmetic follows IEEE754 round-to-nearest rules, then the
    result is a uniformly distributed floating point number in (0, 1].

    - The result is never exactly 0.0.
    - The smallest value returned is 2^(-W-1) (input 0).
    - Let M be the number of mantissa bits in Ftype (typically 24 or 53).
    - If W>M then the largest value returned is 1.0.
    - If W<=M then the largest value returned is Ftype(1.0 - 2^(-W-1)).

    When RANDOM_ITERATOR_R123_UNIFORM_FLOAT_STORE is non-zero the product is
    routed through a volatile temporary before the offset is added.
*/
template <typename Ftype, typename Itype>
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE Ftype
u01(Itype in) {
  typedef typename make_unsigned<Itype>::type unsigned_in_t;
  // step == 2^-W, the spacing between consecutive outputs.
  RANDOM_ITERATOR_R123_CONSTEXPR Ftype step =
      Ftype(1.) / (maxTvalue<unsigned_in_t>() + Ftype(1.));
  RANDOM_ITERATOR_R123_CONSTEXPR Ftype half_step = Ftype(0.5) * step;
  const unsigned_in_t bits = unsigned_in_t(in);
#if RANDOM_ITERATOR_R123_UNIFORM_FLOAT_STORE
  volatile Ftype scaled = bits * step;
  return scaled + half_step;
#else
  // Kept as one expression so the multiply-add may still be fused.
  return bits * step + half_step;
#endif
}
//! Map an integer onto a signed real value in [-1, 1]
/**
    @ingroup uniform

    Let W be the width in bits of Itype. The argument is reinterpreted as a
    W-bit signed integer, multiplied by Ftype(2^-(W-1)) and offset by
    Ftype(2^-W). A good compiler should reduce this to an int-to-float
    conversion followed by a multiply and an add, which may be fused,
    depending on the architecture.

    If the input is a uniformly distributed integer, and if Ftype
    arithmetic follows IEEE754 round-to-nearest rules, then the
    output is a uniformly distributed floating point number in [-1, 1].

    - The result is never exactly 0.0.
    - The smallest absolute value returned is 2^-W.
    - Let M be the number of mantissa bits in Ftype.
    - If W>M then the largest value returned is 1.0 and the smallest is -1.0.
    - If W<=M then the largest value returned is Ftype(1.0 - 2^-W)
      and the smallest value returned is -Ftype(1.0 - 2^-W).

    When RANDOM_ITERATOR_R123_UNIFORM_FLOAT_STORE is non-zero the product is
    routed through a volatile temporary before the offset is added.
*/
template <typename Ftype, typename Itype>
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE Ftype
uneg11(Itype in) {
  typedef typename make_signed<Itype>::type signed_in_t;
  // step == 2^-(W-1), the spacing between consecutive outputs.
  RANDOM_ITERATOR_R123_CONSTEXPR Ftype step =
      Ftype(1.) / (maxTvalue<signed_in_t>() + Ftype(1.));
  RANDOM_ITERATOR_R123_CONSTEXPR Ftype half_step = Ftype(0.5) * step;
  const signed_in_t bits = signed_in_t(in);
#if RANDOM_ITERATOR_R123_UNIFORM_FLOAT_STORE
  volatile Ftype scaled = bits * step;
  return scaled + half_step;
#else
  // Kept as one expression so the multiply-add may still be fused.
  return bits * step + half_step;
#endif
}
//! Return a value in (0,1) drawn from a set of equally spaced fixed-point values
/**
    @ingroup uniform

    Let:
      - W = width of Itype, e.g., 32 or 64, regardless of signedness.
      - M = mantissa bits of Ftype, e.g., 24, 53 or 64
      - B = min(M, W+1)

    When W >= M the input is shifted right by W-M bits, its lowest bit is
    forced to 1 and the result is scaled by 2^-M. When W < M the call is
    forwarded to u01<Ftype>(in), whose outputs are the odd multiples of
    2^-(W+1).

    In both cases the 2^(B-1) possible output values are:
    2^-B*{1, 3, 5, ..., 2^B - 1}

    The smallest output is: 2^-B

    The largest output is:  1 - 2^-B

    The output is never exactly 0.0, nor 0.5, nor 1.0.

    The 2^(B-1) possible outputs:
      - are equally likely,
      - are uniformly spaced by 2^-(B-1),
      - are balanced around 0.5
*/
template <typename Ftype, typename Itype>
RANDOM_ITERATOR_R123_CUDA_DEVICE RANDOM_ITERATOR_R123_STATIC_INLINE Ftype
u01fixedpt(Itype in) {
  typedef typename make_unsigned<Itype>::type unsigned_in_t;
  // Number of input bits that do not fit in the mantissa of Ftype.
  RANDOM_ITERATOR_R123_CONSTEXPR int surplus_bits =
      std::numeric_limits<unsigned_in_t>::digits - std::numeric_limits<Ftype>::digits;
  if (surplus_bits < 0) return u01<Ftype>(in);

  // Clamped copy of surplus_bits: this code is still compiled when the guard
  // above always returns, and a negative shift count would trigger warnings.
  RANDOM_ITERATOR_R123_CONSTEXPR int shift = (surplus_bits >= 0) ? surplus_bits : 0;
  RANDOM_ITERATOR_R123_CONSTEXPR Ftype scale =
      Ftype(1.) / (Ftype(1.) + ((maxTvalue<unsigned_in_t>() >> shift)));
  // OR-ing in the low bit makes the numerator odd.
  return (1 | (unsigned_in_t(in) >> shift)) * scale;
}
#if RANDOM_ITERATOR_R123_USE_CXX11_STD_ARRAY
//! Convert every element of a collection with u01, returning a std::array
/** @ingroup uniform
 *  Only in C++11 and newer.
 *  The argument type must expose a constexpr static_size member and be
 *  iterable with a range-for loop, e.g., an r123array of an integer type.
 *  NOTE(review): the original text also lists std::array, but std::array
 *  itself has no static_size member -- confirm which collections are meant.
 */
template <typename Ftype, typename CollType>
static inline std::array<Ftype, CollType::static_size> u01all(CollType in) {
  std::array<Ftype, CollType::static_size> converted;
  auto out = converted.begin();
  for (auto const element : in) {
    *out = u01<Ftype>(element);
    ++out;
  }
  return converted;
}
//! Convert every element of a collection with uneg11, returning a std::array
/** @ingroup uniform
 *  Only in C++11 and newer.
 *  The argument type must expose a constexpr static_size member and be
 *  iterable with a range-for loop, e.g., an r123array of an integer type.
 *  NOTE(review): the original text also lists std::array, but std::array
 *  itself has no static_size member -- confirm which collections are meant.
 */
template <typename Ftype, typename CollType>
static inline std::array<Ftype, CollType::static_size> uneg11all(CollType in) {
  std::array<Ftype, CollType::static_size> converted;
  auto out = converted.begin();
  for (auto const element : in) {
    *out = uneg11<Ftype>(element);
    ++out;
  }
  return converted;
}
//! Convert every element of a collection with u01fixedpt, returning a std::array
/** @ingroup uniform
 *  Only in C++11 and newer.
 *  The argument type must expose a constexpr static_size member and be
 *  iterable with a range-for loop, e.g., an r123array of an integer type.
 *  NOTE(review): the original text also lists std::array, but std::array
 *  itself has no static_size member -- confirm which collections are meant.
 */
template <typename Ftype, typename CollType>
static inline std::array<Ftype, CollType::static_size> u01fixedptall(CollType in) {
  std::array<Ftype, CollType::static_size> converted;
  auto out = converted.begin();
  for (auto const element : in) {
    *out = u01fixedpt<Ftype>(element);
    ++out;
  }
  return converted;
}
#endif // RANDOM_ITERATOR_R123_USE_CXX11_STD_ARRAY
} // namespace random_iterator_r123
#endif
/*
* (c) Copyright 2020 CORSIKA Project, corsika-project@lists.kit.edu
*
* This software is distributed under the terms of the 3-clause BSD license.
* See file LICENSE for a full version of the license.
*/
/*
* SplitMix.hpp
*
* Created on: 25/02/2021
* Author: Antonio Augusto Alves Junior
*/
#pragma once
namespace random_iterator {
  namespace detail {

    /**
     * Advance the state @p x in place and return a mixed value derived from
     * the new state. Only the uint32_t and uint64_t specializations below
     * are defined; the primary template is declared but has no body.
     */
    template <typename UIntType>
    inline UIntType splitmix(UIntType&);

    /// 32-bit variant: adds 0x6D2B79F5 to the state, then scrambles it with
    /// two multiply/xor-shift steps and a final xor-shift.
    template <>
    inline uint32_t splitmix<uint32_t>(uint32_t& x) {
      x += 0x6D2B79F5UL;
      uint32_t mixed = x;
      mixed = (mixed ^ (mixed >> 15)) * (mixed | 1UL);
      const uint32_t stirred = mixed + (mixed ^ (mixed >> 7)) * (mixed | 61UL);
      mixed ^= stirred;
      return mixed ^ (mixed >> 14);
    }

    /// 64-bit variant: adds 0x9e3779b97f4a7c15 to the state, then applies two
    /// xor-shift/multiply steps and a final xor-shift.
    template <>
    inline uint64_t splitmix<uint64_t>(uint64_t& x) {
      x += 0x9e3779b97f4a7c15;
      uint64_t mixed = x;
      mixed ^= mixed >> 30;
      mixed *= 0xbf58476d1ce4e5b9;
      mixed ^= mixed >> 27;
      mixed *= 0x94d049bb133111eb;
      return mixed ^ (mixed >> 31);
    }

  } // namespace detail
} // namespace random_iterator
/*
* (c) Copyright 2020 CORSIKA Project, corsika-project@lists.kit.edu
*
* This software is distributed under the terms of the 3-clause BSD license.
* See file LICENSE for a full version of the license.
*/
/*
* Squares3_128.hpp
*
* Created on: 25/02/2021
* Author: Antonio Augusto Alves Junior
*/
#pragma once
#include <stdint.h>
#include "SquaresKeys.hpp"
#include "uint128.hpp"
namespace random_iterator {
namespace detail {
/*
* Three round counter-based middle square
*
* squares3 - returns a 32-bit unsigned int [0,0xffffffff]
*
*
* Three rounds of squaring are performed and the result is returned.
* For the first two rounds, the result is rotated right 32 bits.
* This places the random data in the best position for the next round.
* y = ctr*key or z = (ctr+1)*key is added on each round. For keys
* generated by the key utility, either ctr*key or (ctr+1)*key will
* have non-zero digits. This improves randomization and also provides
* for a uniform output.
*
* Note: The squares RNG was based on ideas derived from Middle Square
* Weyl Sequence RNG. One of the ideas was to obtain uniformity by adding
* the Weyl sequence after squaring. Richard P. Brent (creator of the
* xorgens RNG) suggested this method. It turns out that adding ctr*key
* is equivalent. Brent's idea provides the basis for uniformity in this
* generator.
*
* Implementation of the algorithm authored by Bernard Widynski and
* described in https://arxiv.org/pdf/2004.06278v2.pdf
*/
class Squares3_128 {
public:
typedef uint128_t init_type;
typedef uint128_t state_type;
typedef uint128_t seed_type;
typedef uint64_t advance_type;
typedef uint64_t result_type;
Squares3_128() = delete;
Squares3_128(size_t s, uint32_t stream = 0)
: state_(uint64_t(stream) << 32, 0)
, seed_(seed_type{splitmix<size_t>(s)}) {}
Squares3_128(Squares3_128 const& other)
: state_(other.getState())
, seed_(other.getSeed()) {}
inline Squares3_128& operator=(Squares3_128 const& other) {
if (this == &other) return *this;
state_ = other.getState();
seed_ = other.getSeed();
return *this;
}
inline result_type operator()(void) {
uint128_t x, y, z;
y = x = seed_ * state_;
z = y + seed_;
x = x * x + y;
x = x.rotate_right(); /* round 1 */
x = x * x + z;
x = x.rotate_right(); /* round 2 */
++state_; /* advance state */
return (x * x + y).upper(); /* round 3 */
}
inline void discard(advance_type n) { state_ += n; }
inline seed_type getSeed() const { return seed_; }
inline void setSeed(seed_type seed) { seed_ = seed; }
inline state_type getState() const { return state_; }
inline void setState(state_type state) { state_ = state; }
inline static uint64_t generateSeed(size_t i) { return keys[i]; }
friend inline std::ostream& operator<<(std::ostream& os, const Squares3_128& be) {
return os << "state: " << be.getState() << " seed: " << be.getSeed();
}
static constexpr result_type min() { return 0; }
static constexpr result_type max() {
return std::numeric_limits<result_type>::max();
}
private:
state_type state_;
seed_type seed_;
};
} // namespace detail
} // namespace random_iterator
/*
* (c) Copyright 2020 CORSIKA Project, corsika-project@lists.kit.edu
*
* This software is distributed under the terms of the 3-clause BSD license.
* See file LICENSE for a full version of the license.
*/
/*
* Squares3.hpp
*
* Created on: 25/02/2021
* Author: Antonio Augusto Alves Junior
*/
#pragma once
#include <stdint.h>
#include "SquaresKeys.hpp"
namespace random_iterator {
namespace detail {
/*
* Three round counter-based middle square
*
* squares3 - returns a 32-bit unsigned int [0,0xffffffff]
*
*
* Three rounds of squaring are performed and the result is returned.
* For the first two rounds, the result is rotated right 32 bits.
* This places the random data in the best position for the next round.
* y = ctr*key or z = (ctr+1)*key is added on each round. For keys
* generated by the key utility, either ctr*key or (ctr+1)*key will
* have non-zero digits. This improves randomization and also provides
* for a uniform output.
*
* Note: The squares RNG was based on ideas derived from Middle Square
* Weyl Sequence RNG. One of the ideas was to obtain uniformity by adding
* the Weyl sequence after squaring. Richard P. Brent (creator of the
* xorgens RNG) suggested this method. It turns out that adding ctr*key
* is equivalent. Brent's idea provides the basis for uniformity in this
* generator.
*
* Implementation of the algorithm authored by Bernard Widynski and
* described in https://arxiv.org/pdf/2004.06278v2.pdf
*/
class Squares3_64 {
public:
typedef uint64_t init_type;
typedef uint64_t state_type;
typedef uint64_t seed_type;
typedef uint64_t advance_type;
typedef uint32_t result_type;
Squares3_64() = delete;
Squares3_64(seed_type s, uint32_t)
: state_(0)
, seed_(seed_type{splitmix<seed_type>(s)}) {}
Squares3_64(Squares3_64 const& other)
: state_(other.getState())
, seed_(other.getSeed()) {}
inline Squares3_64& operator=(Squares3_64 const& other) {
if (this == &other) return *this;
state_ = other.getState();
seed_ = other.getSeed();
return *this;
}
inline result_type operator()(void) {
uint64_t x, y, z;
y = x = seed_ * state_;
z = y + seed_;
x = x * x + y;
x = (x >> 32) | (x << 32); /* round 1 */
x = x * x + z;
x = (x >> 32) | (x << 32); /* round 2 */
++state_; /* advance state */
return (x * x + y) >> 32; /* round 3 */
}
inline void discard(advance_type n) { state_ += n; }
inline seed_type getSeed() const { return seed_; }
inline void setSeed(seed_type seed) { seed_ = seed; }
inline state_type getState() const { return state_; }
inline void setState(state_type state) { state_ = state; }
inline static uint64_t generateSeed(size_t i) { return keys[i]; }
static constexpr result_type min() { return 0; }
static constexpr result_type max() {
return std::numeric_limits<result_type>::max();
}
friend inline std::ostream& operator<<(std::ostream& os, const Squares3_64& be) {
return os << "state: " << be.getState() << " seed: " << be.getSeed();
}
private:
state_type state_;
seed_type seed_;
};
} // namespace detail
} // namespace random_iterator
/*
* (c) Copyright 2020 CORSIKA Project, corsika-project@lists.kit.edu
*
* This software is distributed under the terms of the 3-clause BSD license.
* See file LICENSE for a full version of the license.
*/
/*
* Squares4_128.hpp
*
* Created on: 25/02/2021
* Author: Antonio Augusto Alves Junior
*/
#pragma once
#include <stdint.h>
#include "SquaresKeys.hpp"
#include "uint128.hpp"
namespace random_iterator {
namespace detail {
/*
* Three round counter-based middle square
*
* squares3 - returns a 32-bit unsigned int [0,0xffffffff]
*
*
* Three rounds of squaring are performed and the result is returned.
* For the first two rounds, the result is rotated right 32 bits.
* This places the random data in the best position for the next round.
* y = ctr*key or z = (ctr+1)*key is added on each round. For keys
* generated by the key utility, either ctr*key or (ctr+1)*key will
* have non-zero digits. This improves randomization and also provides
* for a uniform output.
*
* Note: The squares RNG was based on ideas derived from Middle Square
* Weyl Sequence RNG. One of the ideas was to obtain uniformity by adding
* the Weyl sequence after squaring. Richard P. Brent (creator of the
* xorgens RNG) suggested this method. It turns out that adding ctr*key
* is equivalent. Brent's idea provides the basis for uniformity in this
* generator.
*
* Implementation of the algorithm authored by Bernard Widynski and
* described in https://arxiv.org/pdf/2004.06278v2.pdf
*/
class Squares4_128 {
public:
typedef uint128_t init_type;
typedef uint128_t state_type;
typedef uint128_t seed_type;
typedef uint64_t advance_type;
typedef uint64_t result_type;
Squares4_128() = delete;
Squares4_128(size_t s, uint32_t stream = 0)
: state_(uint64_t(stream) << 32, 0)
, seed_(seed_type{splitmix<size_t>(s)}) {}
Squares4_128(Squares4_128 const& other)
: state_(other.getState())
, seed_(other.getSeed()) {}
inline Squares4_128& operator=(Squares4_128 const& other) {
if (this == &other) return *this;
state_ = other.getState();
seed_ = other.getSeed();
return *this;
}
inline result_type operator()(void) {
uint128_t x, y, z;
y = x = seed_ * state_;
z = y + seed_;
x = x * x + y;
x = x.rotate_right(); /* round 1 */
x = x * x + z;
x = x.rotate_right(); /* round 2 */
x = x * x + y;
x = x.rotate_right(); /* round 1 */
++state_; /* advance state */
return (x * x + z).upper(); /* round 3 */
}
inline void discard(advance_type n) { state_ += n; }
inline seed_type getSeed() const { return seed_; }
inline void setSeed(seed_type seed) { seed_ = seed; }
inline state_type getState() const { return state_; }
inline void setState(state_type state) { state_ = state; }
inline static uint64_t generateSeed(size_t i) { return keys[i]; }
friend inline std::ostream& operator<<(std::ostream& os, const Squares4_128& be) {
return os << "state: " << be.getState() << " seed: " << be.getSeed();
}
static constexpr result_type min() { return 0; }
static constexpr result_type max() {
return std::numeric_limits<result_type>::max();
}
private:
state_type state_;
seed_type seed_;
};
} // namespace detail
} // namespace random_iterator