/* * Copyright 2010-2014 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ /* * curand_mtgp32_host.h * * * MTGP32-11213 * * Mersenne Twister RNG for the GPU * * The period of generated integers is 2¹¹²¹³-1. * * This code generates 32-bit unsigned integers, and * single precision floating point numbers uniformly distributed * in the range [1, 2). (float r; 1.0 <= r < 2.0) */ /* * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima * University. All rights reserved. * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima * University and University of Tokyo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * Neither the name of the Hiroshima University nor the names of * its contributors may be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if !defined CURAND_MTGP32_HOST_H #define CURAND_MTGP32_HOST_H #if !defined(QUALIFIERS) #define QUALIFIERS static inline __device__ #endif #include #include #include #include #include "curand.h" #include "curand_mtgp32.h" #include "curand_mtgp32dc_p_11213.h" /** * \addtogroup DEVICE Device API * * @{ */ static const unsigned int non_zero = 0x4d544750; /* * This function represents a function used in the initialization * by mtgp32_init_by_array() and mtgp32_init_by_str(). * @param[in] x 32-bit integer * @return 32-bit integer */ static __forceinline__ unsigned int ini_func1(unsigned int x) { return (x ^ (x >> 27)) * (1664525); } /* * This function represents a function used in the initialization * by mtgp32_init_by_array() and mtgp32_init_by_str(). * @param[in] x 32-bit integer * @return 32-bit integer */ static __forceinline__ unsigned int ini_func2(unsigned int x) { return (x ^ (x >> 27)) * (1566083941); } /* * This function initializes the internal state array with a 32-bit * integer seed. The allocated memory should be freed by calling * mtgp32_free(). \b para should be one of the elements in the * parameter table (mtgp32-param-ref.c). * * This function is call by cuda program, because cuda program uses * another structure and another allocation method. * * @param[out] array MTGP internal status vector. * @param[in] para parameter structure * @param[in] seed a 32-bit integer used as the seed. */ static __forceinline__ __host__ void mtgp32_init_state(unsigned int state[], const mtgp32_params_fast_t *para, unsigned int seed) { int i; int size = para->mexp / 32 + 1; unsigned int hidden_seed; unsigned int tmp; hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16); tmp = hidden_seed; tmp += tmp >> 16; tmp += tmp >> 8; memset(state, tmp & 0xff, sizeof(unsigned int) * size); state[0] = seed; state[1] = hidden_seed; for (i = 1; i < size; i++) { state[i] ^= (1812433253) * (state[i - 1] ^ (state[i - 1] >> 30)) + i; } } /* * This function initializes the internal state array * with a 32-bit integer array. \b para should be one of the elements in * the parameter table (mtgp32-param-ref.c). * * @param[out] mtgp32 MTGP structure. * @param[in] para parameter structure * @param[in] array a 32-bit integer array used as a seed. * @param[in] length length of the array. * @return CURAND_STATUS_SUCCESS */ static __forceinline__ __host__ int mtgp32_init_by_array(unsigned int state[], const mtgp32_params_fast_t *para, unsigned int *array, int length) { int i, j, count; unsigned int r; int lag; int mid; int size = para->mexp / 32 + 1; unsigned int hidden_seed; unsigned int tmp; if (size >= 623) { lag = 11; } else if (size >= 68) { lag = 7; } else if (size >= 39) { lag = 5; } else { lag = 3; } mid = (size - lag) / 2; hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16); tmp = hidden_seed; tmp += tmp >> 16; tmp += tmp >> 8; memset(state, tmp & 0xff, sizeof(unsigned int) * size); state[0] = hidden_seed; if (length + 1 > size) { count = length + 1; } else { count = size; } r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]); state[mid] += r; r += length; state[(mid + lag) % size] += r; state[0] = r; i = 1; count--; for (i = 1, j = 0; (j < count) && (j < length); j++) { r = ini_func1(state[i] ^ state[(i + mid) % size] ^ state[(i + size - 1) % size]); state[(i + mid) % size] += r; r += array[j] + i; state[(i + mid + lag) % size] += r; state[i] = r; i = (i + 1) % size; } for (; j < count; j++) { r = ini_func1(state[i] ^ state[(i + mid) % size] ^ state[(i + size - 1) % size]); state[(i + mid) % size] += r; r += i; state[(i + mid + lag) % size] += r; state[i] = r; i = (i + 1) % size; } for (j = 0; j < size; j++) { r = ini_func2(state[i] + state[(i + mid) % size] + state[(i + size - 1) % size]); state[(i + mid) % size] ^= r; r -= i; state[(i + mid + lag) % size] ^= r; state[i] = r; i = (i + 1) % size; } if (state[size - 1] == 0) { state[size - 1] = non_zero; } return 0; } /* * This function initializes the internal state array * with a character array. \b para should be one of the elements in * the parameter table (mtgp32-param-ref.c). * This is the same algorithm with mtgp32_init_by_array(), but hope to * be more useful. * * @param[out] mtgp32 MTGP structure. * @param[in] para parameter structure * @param[in] array a character array used as a seed. (terminated by zero.) * @return memory allocation result. if 0 then O.K. */ static __forceinline__ __host__ int mtgp32_init_by_str(unsigned int state[], const mtgp32_params_fast_t *para, unsigned char *array) { int i, j, count; unsigned int r; int lag; int mid; int size = para->mexp / 32 + 1; int length = (unsigned int)strlen((char *)array); unsigned int hidden_seed; unsigned int tmp; if (size >= 623) { lag = 11; } else if (size >= 68) { lag = 7; } else if (size >= 39) { lag = 5; } else { lag = 3; } mid = (size - lag) / 2; hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16); tmp = hidden_seed; tmp += tmp >> 16; tmp += tmp >> 8; memset(state, tmp & 0xff, sizeof(unsigned int) * size); state[0] = hidden_seed; if (length + 1 > size) { count = length + 1; } else { count = size; } r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]); state[mid] += r; r += length; state[(mid + lag) % size] += r; state[0] = r; i = 1; count--; for (i = 1, j = 0; (j < count) && (j < length); j++) { r = ini_func1(state[i] ^ state[(i + mid) % size] ^ state[(i + size - 1) % size]); state[(i + mid) % size] += r; r += array[j] + i; state[(i + mid + lag) % size] += r; state[i] = r; i = (i + 1) % size; } for (; j < count; j++) { r = ini_func1(state[i] ^ state[(i + mid) % size] ^ state[(i + size - 1) % size]); state[(i + mid) % size] += r; r += i; state[(i + mid + lag) % size] += r; state[i] = r; i = (i + 1) % size; } for (j = 0; j < size; j++) { r = ini_func2(state[i] + state[(i + mid) % size] + state[(i + size - 1) % size]); state[(i + mid) % size] ^= r; r -= i; state[(i + mid + lag) % size] ^= r; state[i] = r; i = (i + 1) % size; } if (state[size - 1] == 0) { state[size - 1] = non_zero; } return 0; } template static __forceinline__ __host__ curandStatus_t curandMakeMTGP32ConstantsImpl(const mtgp32_params_fast_t params[], ParamsType * p, const int block_num) { const int size1 = sizeof(unsigned int) * block_num; const int size2 = sizeof(unsigned int) * block_num * TBL_SIZE; unsigned int *h_pos_tbl; unsigned int *h_sh1_tbl; unsigned int *h_sh2_tbl; unsigned int *h_param_tbl; unsigned int *h_temper_tbl; unsigned int *h_single_temper_tbl; unsigned int *h_mask; curandStatus_t status = CURAND_STATUS_SUCCESS; h_pos_tbl = (unsigned int *)malloc(size1); h_sh1_tbl = (unsigned int *)malloc(size1); h_sh2_tbl = (unsigned int *)malloc(size1); h_param_tbl = (unsigned int *)malloc(size2); h_temper_tbl = (unsigned int *)malloc(size2); h_single_temper_tbl = (unsigned int *)malloc(size2); h_mask = (unsigned int *)malloc(sizeof(unsigned int)); if (h_pos_tbl == NULL || h_sh1_tbl == NULL || h_sh2_tbl == NULL || h_param_tbl == NULL || h_temper_tbl == NULL || h_single_temper_tbl == NULL || h_mask == NULL) { if (h_pos_tbl != NULL) free(h_pos_tbl); if (h_sh1_tbl != NULL) free(h_sh1_tbl); if (h_sh2_tbl != NULL) free(h_sh2_tbl); if (h_param_tbl != NULL) free(h_param_tbl); if (h_temper_tbl != NULL) free(h_temper_tbl); if (h_single_temper_tbl != NULL) free(h_single_temper_tbl); if (h_mask != NULL) free(h_mask); status = CURAND_STATUS_ALLOCATION_FAILED; } else { h_mask[0] = params[0].mask; for (int i = 0; i < block_num; i++) { h_pos_tbl[i] = params[i].pos; h_sh1_tbl[i] = params[i].sh1; h_sh2_tbl[i] = params[i].sh2; for (int j = 0; j < TBL_SIZE; j++) { h_param_tbl[i * TBL_SIZE + j] = params[i].tbl[j]; h_temper_tbl[i * TBL_SIZE + j] = params[i].tmp_tbl[j]; h_single_temper_tbl[i * TBL_SIZE + j] = params[i].flt_tmp_tbl[j]; } } if (cudaMemcpy( p->pos_tbl, h_pos_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess) { status = CURAND_STATUS_INITIALIZATION_FAILED; } else if (cudaMemcpy( p->sh1_tbl, h_sh1_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess) { status = CURAND_STATUS_INITIALIZATION_FAILED; } else if (cudaMemcpy( p->sh2_tbl, h_sh2_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess) { status = CURAND_STATUS_INITIALIZATION_FAILED; } else if (cudaMemcpy( p->param_tbl, h_param_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess) { status = CURAND_STATUS_INITIALIZATION_FAILED; } else if (cudaMemcpy( p->temper_tbl, h_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess) { status = CURAND_STATUS_INITIALIZATION_FAILED; } else if (cudaMemcpy( p->single_temper_tbl, h_single_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess) { status = CURAND_STATUS_INITIALIZATION_FAILED; } else if (cudaMemcpy( p->mask, h_mask, sizeof(unsigned int), cudaMemcpyHostToDevice) != cudaSuccess) { status = CURAND_STATUS_INITIALIZATION_FAILED; } } if (h_pos_tbl != NULL) free(h_pos_tbl); if (h_sh1_tbl != NULL) free(h_sh1_tbl); if (h_sh2_tbl != NULL) free(h_sh2_tbl); if (h_param_tbl != NULL) free(h_param_tbl); if (h_temper_tbl != NULL) free(h_temper_tbl); if (h_single_temper_tbl != NULL)free(h_single_temper_tbl); if (h_mask != NULL) free(h_mask); return status; } /** * \brief Set up constant parameters for the mtgp32 generator * * This host-side helper function re-organizes CURAND_NUM_MTGP32_PARAMS sets of * generator parameters for use by kernel functions and copies the * result to the specified location in device memory. * * \param params - Pointer to an array of type mtgp32_params_fast_t in host memory * \param p - pointer to a structure of type mtgp32_kernel_params_t in device memory. * * \return * - CURAND_STATUS_ALLOCATION_FAILED if host memory could not be allocated * - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed * - CURAND_STATUS_SUCCESS otherwise */ static __forceinline__ __host__ curandStatus_t curandMakeMTGP32Constants(const mtgp32_params_fast_t params[], mtgp32_kernel_params_t * p) { return curandMakeMTGP32ConstantsImpl(params, p, CURAND_NUM_MTGP32_PARAMS); } /** * \brief Set up initial states for the mtgp32 generator * * This host-side helper function initializes a number of states (one parameter set per state) for * an mtgp32 generator. To accomplish this it allocates a state array in host memory, * initializes that array, and copies the result to device memory. * * \param s - pointer to an array of states in device memory * \param params - Pointer to an array of type mtgp32_params_fast_t in host memory * \param k - pointer to a structure of type mtgp32_kernel_params_t in device memory * \param n - number of parameter sets/states to initialize * \param seed - seed value * * \return * - CURAND_STATUS_ALLOCATION_FAILED if host memory state could not be allocated * - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed * - CURAND_STATUS_SUCCESS otherwise */ static __forceinline__ __host__ curandStatus_t CURANDAPI curandMakeMTGP32KernelState(curandStateMtgp32_t *s, mtgp32_params_fast_t params[], mtgp32_kernel_params_t *k, int n, unsigned long long seed) { int i; curandStatus_t status = CURAND_STATUS_SUCCESS; curandStateMtgp32_t *h_status =(curandStateMtgp32_t *) malloc(sizeof(curandStateMtgp32_t) * n); if (h_status == NULL) { status = CURAND_STATUS_ALLOCATION_FAILED; } else { seed = seed ^ (seed >> 32); for (i = 0; i < n; i++) { mtgp32_init_state(&(h_status[i].s[0]), ¶ms[i],(unsigned int)seed + i + 1); h_status[i].offset = 0; h_status[i].pIdx = i; h_status[i].k = k; } if (cudaMemcpy(s, h_status, sizeof(curandStateMtgp32_t) * n, cudaMemcpyHostToDevice) != cudaSuccess) { status = CURAND_STATUS_INITIALIZATION_FAILED; } } free(h_status); return status; } /** @} */ #endif