Merge pull request #59 from VanNostrand/rng

RNG Update
2014-06-10 10:34:25 -04:00 · 2014-06-10 10:34:25 -04:00 · e938c54926
commit e938c54926
parent 02c2b29701 ccab97ca7d
16 changed files with 798 additions and 572 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,4 +1,10 @@
 cmake_minimum_required(VERSION 2.6)
+
+IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+# GNU systems need to define the Mersenne exponent for the RNG to compile w/o warning
+  ADD_DEFINITIONS("-DSFMT_MEXP=19937")
+ENDIF()
+
 add_subdirectory(common)
 if(WITH_SERVER)
 	add_subdirectory(servatrice)
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -5,7 +5,6 @@ SET(common_SOURCES
 	decklist.cpp
 	get_pb_extension.cpp
 	rng_abstract.cpp
-	rng_qt.cpp
 	rng_sfmt.cpp
 	server.cpp
 	server_abstractuserinterface.cpp
@ -25,7 +24,6 @@ SET(common_SOURCES
 SET(common_HEADERS
 	decklist.h
 	rng_abstract.h
-	rng_qt.h
 	rng_sfmt.h
 	server.h
 	server_arrowtarget.h
--- a/common/rng_abstract.cpp
+++ b/common/rng_abstract.cpp
@ -7,9 +7,9 @@ QVector<int> RNG_Abstract::makeNumbersVector(int n, int min, int max)
 	const int bins = max - min + 1;
 	QVector<int> result(bins);
 	for (int i = 0; i < n; ++i) {
-		int number = getNumber(min, max);
+		int number = rand(min, max);
 		if ((number < min) || (number > max))
-			qDebug() << "getNumber(" << min << "," << max << ") returned " << number;
+			qDebug() << "rand(" << min << "," << max << ") returned " << number;
 		else
 			result[number - min]++;
 	}
--- a/common/rng_abstract.h
+++ b/common/rng_abstract.h
@ -8,7 +8,7 @@ class RNG_Abstract : public QObject {
 	Q_OBJECT
 public:
 	RNG_Abstract(QObject *parent = 0) : QObject(parent) { }
-	virtual unsigned int getNumber(unsigned int min, unsigned int max) = 0;
+	virtual unsigned int rand(int min, int max) = 0;
 	QVector<int> makeNumbersVector(int n, int min, int max);
 	double testRandom(const QVector<int> &numbers) const;
 };
--- a/common/rng_qt.cpp
+++ b/common/rng_qt.cpp
@ -1,16 +0,0 @@
-#include "rng_qt.h"
-#include <QDateTime>
-#include <stdlib.h>
-
-RNG_Qt::RNG_Qt(QObject *parent)
-	: RNG_Abstract(parent)
-{
-	int seed = QDateTime::currentDateTime().toTime_t();
-	qsrand(seed);
-}
-
-unsigned int RNG_Qt::getNumber(unsigned int min, unsigned int max)
-{
-	int r = qrand();
-	return min + (unsigned int) (((double) (max + 1 - min)) * r / (RAND_MAX + 1.0));
-}
--- a/common/rng_qt.h
+++ b/common/rng_qt.h
@ -1,13 +0,0 @@
-#ifndef RNG_QT_H
-#define RNG_QT_H
-
-#include "rng_abstract.h"
-
-class RNG_Qt : public RNG_Abstract {
-	Q_OBJECT
-public:
-	RNG_Qt(QObject *parent = 0);
-	unsigned int getNumber(unsigned int min, unsigned int max);
-};
-
-#endif
--- a/common/rng_sfmt.cpp
+++ b/common/rng_sfmt.cpp
@ -1,25 +1,136 @@
 #include "rng_sfmt.h"
-#include "sfmt/SFMT.h"
 #include <QDateTime>
-#include <stdlib.h>
-#include <iostream>
+#include <algorithm>
+#include <climits>
+#include <stdexcept>
+
+// This is from gcc sources, namely from fixincludes/inclhack.def
+// On C++11 systems, <cstdint> could be included instead.
+#ifndef UINT64_MAX
+#define UINT64_MAX (~(uint64_t)0)
+#endif

 RNG_SFMT::RNG_SFMT(QObject *parent)
 	: RNG_Abstract(parent)
 {
-	std::cerr << "Using SFMT random number generator." << std::endl;
-	
-	int seed = QDateTime::currentDateTime().toTime_t();
-	init_gen_rand(seed);
-	for (int i = 0; i < 100000; ++i)
-		gen_rand64();
+	// initialize the random number generator with a 32bit integer seed (timestamp)
+	sfmt_init_gen_rand(&sfmt, QDateTime::currentDateTime().toTime_t());
 }

-unsigned int RNG_SFMT::getNumber(unsigned int min, unsigned int max)
+/**
+ * This method is the rand() equivalent which calls the cdf with proper bounds.
+ *
+ * It is possible to generate random numbers from [-min, +/-max] though the RNG uses
+ * unsigned numbers only, so this wrapper handles some special cases for min and max.
+ *
+ * It is only necessary that the upper bound is larger or equal to the lower bound - with the exception
+ * that someone wants something like rand() % -foo.
+ */
+unsigned int RNG_SFMT::rand(int min, int max) {
+  /* If min is negative, it would be possible to calculate
+   * cdf(0, max - min) + min
+   * There has been no use for negative random numbers with rand() though, so it's treated as error.
+   */
+  if(min < 0) {
+	throw std::invalid_argument(
+	  QString("Invalid bounds for RNG: Got min " +
+	  QString::number(min) + " < 0!\n").toStdString());
+	// at this point, the method exits. No return value is needed, because
+	// basically the exception itself is returned.
+  }
+
+  // For complete fairness and equal timing, this should be a roll, but let's skip it anyway
+  if(min == max)
+    return max;
+
+  // This is actually not used in Cockatrice:
+  // Someone wants rand() % -foo, so we compute -rand(0, +foo)
+  // This is the only time where min > max is (sort of) legal.
+  // Not handling this will cause the application to crash.
+  if(min == 0 && max < 0) {
+    return -cdf(0, -max);
+  }
+
+  // No special cases are left, except !(min > max) which is caught in the cdf itself.
+  return cdf(min, max);
+}
+
+/**
+ * Much thought went into this, please read this comment before you modify the code.
+ * Let SFMT() be an alias for sfmt_genrand_uint64() aka SFMT's rand() function.
+ *
+ * SMFT() returns a uniformly distributed pseudorandom number from 0 to UINT64_MAX.
+ * As SFMT() operates on a limited integer range, it is a _discrete_ function.
+ *
+ * We want a random number from a given interval [min, max] though, so we need to
+ * implement the (discrete) cumulative distribution function SFMT(min, max), which
+ * returns a random number X from [min, max].
+ *
+ * This CDF is by formal definition:
+ * SFMT(X; min, max) = (floor(X) - min + 1) / (max - min + 1)
+ *
+ * To get out the random variable, solve for X:
+ * floor(X) = SFMT(X; min, max) * (max - min + 1) + min - 1
+ * So this is, what rand(min, max) should look like.
+ * Problem: SFMT(X; min, max) * (max - min + 1) could produce an integer overflow,
+ * so it is not safe.
+ *
+ * One solution is to divide the universe into buckets of equal size depending on the
+ * range [min, max] and assign X to the bucket that contains the number generated
+ * by SFMT(). This equals to modulo computation and is not satisfying:
+ * If the buckets don't divide the universe equally, because the bucket size is not
+ * a divisor of 2, there will be a range in the universe that is biased because one
+ * bucket is too small thus will be chosen less equally!
+ *
+ * This is solved by rejection sampling:
+ * As SFMT() is assumed to be unbiased, we are allowed to ignore those random numbers
+ * from SFMT() that would force us to have an unequal bucket and generate new random
+ * numbers until one number fits into one of the other buckets.
+ * This can be compared to an ideal six sided die that is rolled until only sides
+ * 1-5 show up, while 6 represents something that you don't want. So you basically roll
+ * a five sided die.
+ * 
+ * Note: If you replace the SFMT RNG with some other rand() function in the future,
+ * then you _need_ to change the UINT64_MAX constant to the largest possible random 
+ * number which can be created by the new rand() function. This value is often defined
+ * in a RAND_MAX constant.
+ * Otherwise you will probably skew the outcome of the rand() method or worsen the
+ * performance of the application.
+ */
+unsigned int RNG_SFMT::cdf(unsigned int min, unsigned int max)
 {
+	// This all makes no sense if min > max, which should never happen.
+	if(min > max) {
+		throw std::invalid_argument(
+		  QString("Invalid bounds for RNG: min > max! Values were: min = " +
+		  QString::number(min) + ", max = " +
+		  QString::number(max)).toStdString());
+		// at this point, the method exits. No return value is needed, because
+		// basically the exception itself is returned.
+	}
+
+	// First compute the diameter (aka size, length) of the [min, max] interval
+	const unsigned int diameter = max - min + 1;
+
+	// Compute how many buckets (each in size of the diameter) will fit into the
+	// universe.
+	// If the division has a remainder, the result is floored automatically.
+	const uint64_t buckets = UINT64_MAX / diameter;
+
+	// Compute the last valid random number. All numbers beyond have to be ignored.
+	// If there was no remainder in the previous step, limit is equal to UINT64_MAX.
+	const uint64_t limit = diameter * buckets;
+
+	uint64_t rand;
+	// To make the random number generation thread-safe, a mutex is created around
+	// the generation. Outside of the loop of course, to avoid lock/unlock overhead.
 	mutex.lock();
-	uint64_t r = gen_rand64();
+	do {
+		rand = sfmt_genrand_uint64(&sfmt);
+	} while (rand >= limit);
 	mutex.unlock();

-	return min + (unsigned int) (((double) (max + 1 - min)) * r / (18446744073709551616.0 + 1.0));
+	// Now determine the bucket containing the SFMT() random number and after adding
+	// the lower bound, a random number from [min, max] can be returned.
+	return (unsigned int) (rand / buckets + min);
 }
--- a/common/rng_sfmt.h
+++ b/common/rng_sfmt.h
@ -1,16 +1,40 @@
 #ifndef RNG_SFMT_H
 #define RNG_SFMT_H

-#include "rng_abstract.h"
+#include <climits>
 #include <QMutex>
+#include "sfmt/SFMT.h"
+#include "rng_abstract.h"
+
+/**
+ * This class encapsulates a state of the art PRNG and can be used
+ * to return uniformly distributed integer random numbers from a range [min, max].
+ * Though technically possible, min must be >= 0 and max should always be > 0.
+ * If max < 0 and min == 0 it is assumed that rand() % -max is wanted and the result will
+ * be -rand(0, -max).
+ * This is the only exception to the rule that !(min > max) and is actually unused in
+ * Cockatrice.
+ *
+ * Technical details:
+ * The RNG uses the SIMD-oriented Fast Mersenne Twister code v1.4.1 from
+ * http://www.math.sci.hiroshima-u.ac.jp/~%20m-mat/MT/SFMT/index.html
+ * The SFMT RNG creates unsigned int 64bit pseudo random numbers.
+ *
+ * These are mapped to values from the interval [min, max] without bias by using Knuth's
+ * "Algorithm S (Selection sampling technique)" from "The Art of Computer Programming 3rd
+ * Edition Volume 2 / Seminumerical Algorithms".
+ */

 class RNG_SFMT : public RNG_Abstract {
 	Q_OBJECT
 private:
 	QMutex mutex;
+	sfmt_t sfmt;
+	// The discrete cumulative distribution function for the RNG
+	unsigned int cdf(unsigned int min, unsigned int max);
 public:
 	RNG_SFMT(QObject *parent = 0);
-	unsigned int getNumber(unsigned int min, unsigned int max);
+	unsigned int rand(int min, int max);
 };

 #endif
--- a/common/server_cardzone.cpp
+++ b/common/server_cardzone.cpp
@ -45,7 +45,7 @@ void Server_CardZone::shuffle()
 {
 	QList<Server_Card *> temp;
 	for (int i = cards.size(); i; i--)
-		temp.append(cards.takeAt(rng->getNumber(0, i - 1)));
+		temp.append(cards.takeAt(rng->rand(0, i - 1)));
 	cards = temp;
 	
 	playersWithWritePermission.clear();
--- a/common/server_player.cpp
+++ b/common/server_player.cpp
@ -844,7 +844,7 @@ Response::ResponseCode Server_Player::cmdRollDie(const Command_RollDie &cmd, Res
 	
 	Event_RollDie event;
 	event.set_sides(cmd.sides());
-	event.set_value(rng->getNumber(1, cmd.sides()));
+	event.set_value(rng->rand(1, cmd.sides()));
 	ges.enqueueGameEvent(event, playerId);
 	
 	return Response::RespOk;
@ -1524,7 +1524,7 @@ Response::ResponseCode Server_Player::cmdRevealCards(const Command_RevealCards &
 	else if (cmd.card_id() == -2) {
 		if (zone->getCards().isEmpty())
 			return Response::RespContextError;
-		cardsToReveal.append(zone->getCards().at(rng->getNumber(0, zone->getCards().size() - 1)));
+		cardsToReveal.append(zone->getCards().at(rng->rand(0, zone->getCards().size() - 1)));
 	} else {
 		Server_Card *card = zone->getCard(cmd.card_id());
 		if (!card)
--- a/common/sfmt/LICENSE.txt
+++ b/common/sfmt/LICENSE.txt
@ -1,5 +1,8 @@
 Copyright (c) 2006,2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
-University. All rights reserved.
+University.
+Copyright (c) 2012 Mutsuo Saito, Makoto Matsumoto, Hiroshima University
+and The University of Tokyo.
+All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@ -11,10 +14,10 @@ met:
      copyright notice, this list of conditions and the following
      disclaimer in the documentation and/or other materials provided
      with the distribution.
-    * Neither the name of the Hiroshima University nor the names of
-      its contributors may be used to endorse or promote products
-      derived from this software without specific prior written
-      permission.
+    * Neither the names of Hiroshima University, The University of
+      Tokyo nor the names of its contributors may be used to endorse
+      or promote products derived from this software without specific
+      prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
--- a/common/sfmt/SFMT-common.h
+++ b/common/sfmt/SFMT-common.h
@ -0,0 +1,164 @@
+#pragma once
+/**
+ * @file SFMT-common.h
+ *
+ * @brief SIMD oriented Fast Mersenne Twister(SFMT) pseudorandom
+ * number generator with jump function. This file includes common functions
+ * used in random number generation and jump.
+ *
+ * @author Mutsuo Saito (Hiroshima University)
+ * @author Makoto Matsumoto (The University of Tokyo)
+ *
+ * Copyright (C) 2006, 2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ * University.
+ * Copyright (C) 2012 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+ * University and The University of Tokyo.
+ * All rights reserved.
+ *
+ * The 3-clause BSD License is applied to this software, see
+ * LICENSE.txt
+ */
+#ifndef SFMT_COMMON_H
+#define SFMT_COMMON_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include "SFMT.h"
+
+inline static void do_recursion(w128_t * r, w128_t * a, w128_t * b,
+				w128_t * c, w128_t * d);
+
+inline static void rshift128(w128_t *out,  w128_t const *in, int shift);
+inline static void lshift128(w128_t *out,  w128_t const *in, int shift);
+
+/**
+ * This function simulates SIMD 128-bit right shift by the standard C.
+ * The 128-bit integer given in in is shifted by (shift * 8) bits.
+ * This function simulates the LITTLE ENDIAN SIMD.
+ * @param out the output of this function
+ * @param in the 128-bit data to be shifted
+ * @param shift the shift value
+ */
+#ifdef ONLY64
+inline static void rshift128(w128_t *out, w128_t const *in, int shift) {
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[2] << 32) | ((uint64_t)in->u[3]);
+    tl = ((uint64_t)in->u[0] << 32) | ((uint64_t)in->u[1]);
+
+    oh = th >> (shift * 8);
+    ol = tl >> (shift * 8);
+    ol |= th << (64 - shift * 8);
+    out->u[0] = (uint32_t)(ol >> 32);
+    out->u[1] = (uint32_t)ol;
+    out->u[2] = (uint32_t)(oh >> 32);
+    out->u[3] = (uint32_t)oh;
+}
+#else
+inline static void rshift128(w128_t *out, w128_t const *in, int shift)
+{
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[3] << 32) | ((uint64_t)in->u[2]);
+    tl = ((uint64_t)in->u[1] << 32) | ((uint64_t)in->u[0]);
+
+    oh = th >> (shift * 8);
+    ol = tl >> (shift * 8);
+    ol |= th << (64 - shift * 8);
+    out->u[1] = (uint32_t)(ol >> 32);
+    out->u[0] = (uint32_t)ol;
+    out->u[3] = (uint32_t)(oh >> 32);
+    out->u[2] = (uint32_t)oh;
+}
+#endif
+/**
+ * This function simulates SIMD 128-bit left shift by the standard C.
+ * The 128-bit integer given in in is shifted by (shift * 8) bits.
+ * This function simulates the LITTLE ENDIAN SIMD.
+ * @param out the output of this function
+ * @param in the 128-bit data to be shifted
+ * @param shift the shift value
+ */
+#ifdef ONLY64
+inline static void lshift128(w128_t *out, w128_t const *in, int shift) {
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[2] << 32) | ((uint64_t)in->u[3]);
+    tl = ((uint64_t)in->u[0] << 32) | ((uint64_t)in->u[1]);
+
+    oh = th << (shift * 8);
+    ol = tl << (shift * 8);
+    oh |= tl >> (64 - shift * 8);
+    out->u[0] = (uint32_t)(ol >> 32);
+    out->u[1] = (uint32_t)ol;
+    out->u[2] = (uint32_t)(oh >> 32);
+    out->u[3] = (uint32_t)oh;
+}
+#else
+inline static void lshift128(w128_t *out, w128_t const *in, int shift)
+{
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[3] << 32) | ((uint64_t)in->u[2]);
+    tl = ((uint64_t)in->u[1] << 32) | ((uint64_t)in->u[0]);
+
+    oh = th << (shift * 8);
+    ol = tl << (shift * 8);
+    oh |= tl >> (64 - shift * 8);
+    out->u[1] = (uint32_t)(ol >> 32);
+    out->u[0] = (uint32_t)ol;
+    out->u[3] = (uint32_t)(oh >> 32);
+    out->u[2] = (uint32_t)oh;
+}
+#endif
+/**
+ * This function represents the recursion formula.
+ * @param r output
+ * @param a a 128-bit part of the internal state array
+ * @param b a 128-bit part of the internal state array
+ * @param c a 128-bit part of the internal state array
+ * @param d a 128-bit part of the internal state array
+ */
+#ifdef ONLY64
+inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b, w128_t *c,
+				w128_t *d) {
+    w128_t x;
+    w128_t y;
+
+    lshift128(&x, a, SFMT_SL2);
+    rshift128(&y, c, SFMT_SR2);
+    r->u[0] = a->u[0] ^ x.u[0] ^ ((b->u[0] >> SFMT_SR1) & SFMT_MSK2) ^ y.u[0]
+	^ (d->u[0] << SFMT_SL1);
+    r->u[1] = a->u[1] ^ x.u[1] ^ ((b->u[1] >> SFMT_SR1) & SFMT_MSK1) ^ y.u[1]
+	^ (d->u[1] << SFMT_SL1);
+    r->u[2] = a->u[2] ^ x.u[2] ^ ((b->u[2] >> SFMT_SR1) & SFMT_MSK4) ^ y.u[2]
+	^ (d->u[2] << SFMT_SL1);
+    r->u[3] = a->u[3] ^ x.u[3] ^ ((b->u[3] >> SFMT_SR1) & SFMT_MSK3) ^ y.u[3]
+	^ (d->u[3] << SFMT_SL1);
+}
+#else
+inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
+				w128_t *c, w128_t *d)
+{
+    w128_t x;
+    w128_t y;
+
+    lshift128(&x, a, SFMT_SL2);
+    rshift128(&y, c, SFMT_SR2);
+    r->u[0] = a->u[0] ^ x.u[0] ^ ((b->u[0] >> SFMT_SR1) & SFMT_MSK1)
+	^ y.u[0] ^ (d->u[0] << SFMT_SL1);
+    r->u[1] = a->u[1] ^ x.u[1] ^ ((b->u[1] >> SFMT_SR1) & SFMT_MSK2)
+	^ y.u[1] ^ (d->u[1] << SFMT_SL1);
+    r->u[2] = a->u[2] ^ x.u[2] ^ ((b->u[2] >> SFMT_SR1) & SFMT_MSK3)
+	^ y.u[2] ^ (d->u[2] << SFMT_SL1);
+    r->u[3] = a->u[3] ^ x.u[3] ^ ((b->u[3] >> SFMT_SR1) & SFMT_MSK4)
+	^ y.u[3] ^ (d->u[3] << SFMT_SL1);
+}
+#endif
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
--- a/common/sfmt/SFMT-params.h
+++ b/common/sfmt/SFMT-params.h
@ -1,95 +1,96 @@
+#pragma once
 #ifndef SFMT_PARAMS_H
 #define SFMT_PARAMS_H

-#if !defined(MEXP)
-#ifdef __GNUC__
-  #warning "MEXP is not defined. I assume MEXP is 19937."
+#if !defined(SFMT_MEXP)
+#if defined(__GNUC__) && !defined(__ICC)
+  #warning "SFMT_MEXP is not defined. I assume MEXP is 19937."
 #endif
-  #define MEXP 19937
+  #define SFMT_MEXP 19937
 #endif
 /*-----------------
  BASIC DEFINITIONS
  -----------------*/
 /** Mersenne Exponent. The period of the sequence
 *  is a multiple of 2^MEXP-1.
- * #define MEXP 19937 */
+ * #define SFMT_MEXP 19937 */
 /** SFMT generator has an internal state array of 128-bit integers,
 * and N is its size. */
-#define N (MEXP / 128 + 1)
+#define SFMT_N (SFMT_MEXP / 128 + 1)
 /** N32 is the size of internal state array when regarded as an array
 * of 32-bit integers.*/
-#define N32 (N * 4)
+#define SFMT_N32 (SFMT_N * 4)
 /** N64 is the size of internal state array when regarded as an array
 * of 64-bit integers.*/
-#define N64 (N * 2)
+#define SFMT_N64 (SFMT_N * 2)

 /*----------------------
  the parameters of SFMT
  following definitions are in paramsXXXX.h file.
  ----------------------*/
 /** the pick up position of the array.
-#define POS1 122 
+#define SFMT_POS1 122
 */

 /** the parameter of shift left as four 32-bit registers.
-#define SL1 18
+#define SFMT_SL1 18
 */

 /** the parameter of shift left as one 128-bit register.
- * The 128-bit integer is shifted by (SL2 * 8) bits. 
-#define SL2 1 
+ * The 128-bit integer is shifted by (SFMT_SL2 * 8) bits.
+#define SFMT_SL2 1
 */

 /** the parameter of shift right as four 32-bit registers.
-#define SR1 11
+#define SFMT_SR1 11
 */

 /** the parameter of shift right as one 128-bit register.
- * The 128-bit integer is shifted by (SL2 * 8) bits. 
-#define SR2 1 
+ * The 128-bit integer is shifted by (SFMT_SL2 * 8) bits.
+#define SFMT_SR21 1
 */

 /** A bitmask, used in the recursion.  These parameters are introduced
 * to break symmetry of SIMD.
-#define MSK1 0xdfffffefU
-#define MSK2 0xddfecb7fU
-#define MSK3 0xbffaffffU
-#define MSK4 0xbffffff6U 
+#define SFMT_MSK1 0xdfffffefU
+#define SFMT_MSK2 0xddfecb7fU
+#define SFMT_MSK3 0xbffaffffU
+#define SFMT_MSK4 0xbffffff6U
 */

 /** These definitions are part of a 128-bit period certification vector.
-#define PARITY1	0x00000001U
-#define PARITY2	0x00000000U
-#define PARITY3	0x00000000U
-#define PARITY4	0xc98e126aU
+#define SFMT_PARITY1	0x00000001U
+#define SFMT_PARITY2	0x00000000U
+#define SFMT_PARITY3	0x00000000U
+#define SFMT_PARITY4	0xc98e126aU
 */

-#if MEXP == 607
+#if SFMT_MEXP == 607
  #include "SFMT-params607.h"
-#elif MEXP == 1279
+#elif SFMT_MEXP == 1279
  #include "SFMT-params1279.h"
-#elif MEXP == 2281
+#elif SFMT_MEXP == 2281
  #include "SFMT-params2281.h"
-#elif MEXP == 4253
+#elif SFMT_MEXP == 4253
  #include "SFMT-params4253.h"
-#elif MEXP == 11213
+#elif SFMT_MEXP == 11213
  #include "SFMT-params11213.h"
-#elif MEXP == 19937
+#elif SFMT_MEXP == 19937
  #include "SFMT-params19937.h"
-#elif MEXP == 44497
+#elif SFMT_MEXP == 44497
  #include "SFMT-params44497.h"
-#elif MEXP == 86243
+#elif SFMT_MEXP == 86243
  #include "SFMT-params86243.h"
-#elif MEXP == 132049
+#elif SFMT_MEXP == 132049
  #include "SFMT-params132049.h"
-#elif MEXP == 216091
+#elif SFMT_MEXP == 216091
  #include "SFMT-params216091.h"
 #else
-#ifdef __GNUC__
-  #error "MEXP is not valid."
-  #undef MEXP
+#if defined(__GNUC__) && !defined(__ICC)
+  #error "SFMT_MEXP is not valid."
+  #undef SFMT_MEXP
 #else
-  #undef MEXP
+  #undef SFMT_MEXP
 #endif

 #endif
--- a/common/sfmt/SFMT-params19937.h
+++ b/common/sfmt/SFMT-params19937.h
@ -1,46 +1,50 @@
+#pragma once
 #ifndef SFMT_PARAMS19937_H
 #define SFMT_PARAMS19937_H

-#define POS1	122
-#define SL1	18
-#define SL2	1
-#define SR1	11
-#define SR2	1
-#define MSK1	0xdfffffefU
-#define MSK2	0xddfecb7fU
-#define MSK3	0xbffaffffU
-#define MSK4	0xbffffff6U
-#define PARITY1	0x00000001U
-#define PARITY2	0x00000000U
-#define PARITY3	0x00000000U
-#define PARITY4	0x13c9e684U
+#define SFMT_POS1	122
+#define SFMT_SL1	18
+#define SFMT_SL2	1
+#define SFMT_SR1	11
+#define SFMT_SR2	1
+#define SFMT_MSK1	0xdfffffefU
+#define SFMT_MSK2	0xddfecb7fU
+#define SFMT_MSK3	0xbffaffffU
+#define SFMT_MSK4	0xbffffff6U
+#define SFMT_PARITY1	0x00000001U
+#define SFMT_PARITY2	0x00000000U
+#define SFMT_PARITY3	0x00000000U
+#define SFMT_PARITY4	0x13c9e684U


 /* PARAMETERS FOR ALTIVEC */
 #if defined(__APPLE__)	/* For OSX */
-    #define ALTI_SL1	(vector unsigned int)(SL1, SL1, SL1, SL1)
-    #define ALTI_SR1	(vector unsigned int)(SR1, SR1, SR1, SR1)
-    #define ALTI_MSK	(vector unsigned int)(MSK1, MSK2, MSK3, MSK4)
-    #define ALTI_MSK64 \
-	(vector unsigned int)(MSK2, MSK1, MSK4, MSK3)
-    #define ALTI_SL2_PERM \
+    #define SFMT_ALTI_SL1 \
+	(vector unsigned int)(SFMT_SL1, SFMT_SL1, SFMT_SL1, SFMT_SL1)
+    #define SFMT_ALTI_SR1 \
+	(vector unsigned int)(SFMT_SR1, SFMT_SR1, SFMT_SR1, SFMT_SR1)
+    #define SFMT_ALTI_MSK \
+	(vector unsigned int)(SFMT_MSK1, SFMT_MSK2, SFMT_MSK3, SFMT_MSK4)
+    #define SFMT_ALTI_MSK64 \
+	(vector unsigned int)(SFMT_MSK2, SFMT_MSK1, SFMT_MSK4, SFMT_MSK3)
+    #define SFMT_ALTI_SL2_PERM \
 	(vector unsigned char)(1,2,3,23,5,6,7,0,9,10,11,4,13,14,15,8)
-    #define ALTI_SL2_PERM64 \
+    #define SFMT_ALTI_SL2_PERM64 \
 	(vector unsigned char)(1,2,3,4,5,6,7,31,9,10,11,12,13,14,15,0)
-    #define ALTI_SR2_PERM \
+    #define SFMT_ALTI_SR2_PERM \
 	(vector unsigned char)(7,0,1,2,11,4,5,6,15,8,9,10,17,12,13,14)
-    #define ALTI_SR2_PERM64 \
+    #define SFMT_ALTI_SR2_PERM64 \
 	(vector unsigned char)(15,0,1,2,3,4,5,6,17,8,9,10,11,12,13,14)
 #else	/* For OTHER OSs(Linux?) */
-    #define ALTI_SL1	{SL1, SL1, SL1, SL1}
-    #define ALTI_SR1	{SR1, SR1, SR1, SR1}
-    #define ALTI_MSK	{MSK1, MSK2, MSK3, MSK4}
-    #define ALTI_MSK64	{MSK2, MSK1, MSK4, MSK3}
-    #define ALTI_SL2_PERM	{1,2,3,23,5,6,7,0,9,10,11,4,13,14,15,8}
-    #define ALTI_SL2_PERM64	{1,2,3,4,5,6,7,31,9,10,11,12,13,14,15,0}
-    #define ALTI_SR2_PERM	{7,0,1,2,11,4,5,6,15,8,9,10,17,12,13,14}
-    #define ALTI_SR2_PERM64	{15,0,1,2,3,4,5,6,17,8,9,10,11,12,13,14}
+    #define SFMT_ALTI_SL1	{SFMT_SL1, SFMT_SL1, SFMT_SL1, SFMT_SL1}
+    #define SFMT_ALTI_SR1	{SFMT_SR1, SFMT_SR1, SFMT_SR1, SFMT_SR1}
+    #define SFMT_ALTI_MSK	{SFMT_MSK1, SFMT_MSK2, SFMT_MSK3, SFMT_MSK4}
+    #define SFMT_ALTI_MSK64	{SFMT_MSK2, SFMT_MSK1, SFMT_MSK4, SFMT_MSK3}
+    #define SFMT_ALTI_SL2_PERM	{1,2,3,23,5,6,7,0,9,10,11,4,13,14,15,8}
+    #define SFMT_ALTI_SL2_PERM64 {1,2,3,4,5,6,7,31,9,10,11,12,13,14,15,0}
+    #define SFMT_ALTI_SR2_PERM	{7,0,1,2,11,4,5,6,15,8,9,10,17,12,13,14}
+    #define SFMT_ALTI_SR2_PERM64 {15,0,1,2,3,4,5,6,17,8,9,10,11,12,13,14}
 #endif	/* For OSX */
-#define IDSTR	"SFMT-19937:122-18-1-11-1:dfffffef-ddfecb7f-bffaffff-bffffff6"
+#define SFMT_IDSTR	"SFMT-19937:122-18-1-11-1:dfffffef-ddfecb7f-bffaffff-bffffff6"

 #endif /* SFMT_PARAMS19937_H */
--- a/common/sfmt/SFMT.c
+++ b/common/sfmt/SFMT.c
@ -5,15 +5,27 @@
 * @author Mutsuo Saito (Hiroshima University)
 * @author Makoto Matsumoto (Hiroshima University)
 *
- * Copyright (C) 2006,2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
- * University. All rights reserved.
+ * Copyright (C) 2006, 2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ * University.
+ * Copyright (C) 2012 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+ * University and The University of Tokyo.
+ * Copyright (C) 2013 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ * University.
+ * All rights reserved.
 *
- * The new BSD License is applied to this software, see LICENSE.txt
+ * The 3-clause BSD License is applied to this software, see
+ * LICENSE.txt
 */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
 #include <string.h>
 #include <assert.h>
 #include "SFMT.h"
 #include "SFMT-params.h"
+#include "SFMT-common.h"

 #if defined(__BIG_ENDIAN__) && !defined(__amd64) && !defined(BIG_ENDIAN64)
 #define BIG_ENDIAN64 1
@ -27,74 +39,20 @@
  #endif
 #undef ONLY64
 #endif
-/*------------------------------------------------------
-  128-bit SIMD data type for Altivec, SSE2 or standard C
-  ------------------------------------------------------*/
-#if defined(HAVE_ALTIVEC)
-  #if !defined(__APPLE__)
-    #include <altivec.h>
-  #endif
-/** 128-bit data structure */
-union W128_T {
-    vector unsigned int s;
-    uint32_t u[4];
-};
-/** 128-bit data type */
-typedef union W128_T w128_t;
-
-#elif defined(HAVE_SSE2)
-  #include <emmintrin.h>
-
-/** 128-bit data structure */
-union W128_T {
-    __m128i si;
-    uint32_t u[4];
-};
-/** 128-bit data type */
-typedef union W128_T w128_t;
-
-#else
-
-/** 128-bit data structure */
-struct W128_T {
-    uint32_t u[4];
-};
-/** 128-bit data type */
-typedef struct W128_T w128_t;
-
-#endif
-
-/*--------------------------------------
-  FILE GLOBAL VARIABLES
-  internal state, index counter and flag 
-  --------------------------------------*/
-/** the 128-bit internal state array */
-static w128_t sfmt[N];
-/** the 32bit integer pointer to the 128-bit internal state array */
-static uint32_t *psfmt32 = &sfmt[0].u[0];
-#if !defined(BIG_ENDIAN64) || defined(ONLY64)
-/** the 64bit integer pointer to the 128-bit internal state array */
-static uint64_t *psfmt64 = (uint64_t *)&sfmt[0].u[0];
-#endif
-/** index counter to the 32-bit internal state array */
-static int idx;
-/** a flag: it is 0 if and only if the internal state is not yet
- * initialized. */
-static int initialized = 0;
-/** a parity check vector which certificate the period of 2^{MEXP} */
-static uint32_t parity[4] = {PARITY1, PARITY2, PARITY3, PARITY4};

+/**
+ * parameters used by sse2.
+ */
+static const w128_t sse2_param_mask = {{SFMT_MSK1, SFMT_MSK2,
+                                        SFMT_MSK3, SFMT_MSK4}};
 /*----------------
  STATIC FUNCTIONS
  ----------------*/
 inline static int idxof(int i);
-inline static void rshift128(w128_t *out,  w128_t const *in, int shift);
-inline static void lshift128(w128_t *out,  w128_t const *in, int shift);
-inline static void gen_rand_all(void);
-inline static void gen_rand_array(w128_t *array, int size);
+inline static void gen_rand_array(sfmt_t * sfmt, w128_t *array, int size);
 inline static uint32_t func1(uint32_t x);
 inline static uint32_t func2(uint32_t x);
-static void period_certification(void);
+static void period_certification(sfmt_t * sfmt);
 #if defined(BIG_ENDIAN64) && !defined(ONLY64)
 inline static void swap(w128_t *array, int size);
 #endif
@ -102,7 +60,11 @@ inline static void swap(w128_t *array, int size);
 #if defined(HAVE_ALTIVEC)
  #include "SFMT-alti.h"
 #elif defined(HAVE_SSE2)
+  #if defined(_MSC_VER)
+    #include "SFMT-sse2-msc.h"
+  #else
    #include "SFMT-sse2.h"
+  #endif
 #endif

 /**
@ -118,190 +80,48 @@ inline static int idxof(int i) {
    return i;
 }
 #endif
-/**
- * This function simulates SIMD 128-bit right shift by the standard C.
- * The 128-bit integer given in in is shifted by (shift * 8) bits.
- * This function simulates the LITTLE ENDIAN SIMD.
- * @param out the output of this function
- * @param in the 128-bit data to be shifted
- * @param shift the shift value
- */
-#ifdef ONLY64
-inline static void rshift128(w128_t *out, w128_t const *in, int shift) {
-    uint64_t th, tl, oh, ol;
-
-    th = ((uint64_t)in->u[2] << 32) | ((uint64_t)in->u[3]);
-    tl = ((uint64_t)in->u[0] << 32) | ((uint64_t)in->u[1]);
-
-    oh = th >> (shift * 8);
-    ol = tl >> (shift * 8);
-    ol |= th << (64 - shift * 8);
-    out->u[0] = (uint32_t)(ol >> 32);
-    out->u[1] = (uint32_t)ol;
-    out->u[2] = (uint32_t)(oh >> 32);
-    out->u[3] = (uint32_t)oh;
-}
-#else
-inline static void rshift128(w128_t *out, w128_t const *in, int shift) {
-    uint64_t th, tl, oh, ol;
-
-    th = ((uint64_t)in->u[3] << 32) | ((uint64_t)in->u[2]);
-    tl = ((uint64_t)in->u[1] << 32) | ((uint64_t)in->u[0]);
-
-    oh = th >> (shift * 8);
-    ol = tl >> (shift * 8);
-    ol |= th << (64 - shift * 8);
-    out->u[1] = (uint32_t)(ol >> 32);
-    out->u[0] = (uint32_t)ol;
-    out->u[3] = (uint32_t)(oh >> 32);
-    out->u[2] = (uint32_t)oh;
-}
-#endif
-/**
- * This function simulates SIMD 128-bit left shift by the standard C.
- * The 128-bit integer given in in is shifted by (shift * 8) bits.
- * This function simulates the LITTLE ENDIAN SIMD.
- * @param out the output of this function
- * @param in the 128-bit data to be shifted
- * @param shift the shift value
- */
-#ifdef ONLY64
-inline static void lshift128(w128_t *out, w128_t const *in, int shift) {
-    uint64_t th, tl, oh, ol;
-
-    th = ((uint64_t)in->u[2] << 32) | ((uint64_t)in->u[3]);
-    tl = ((uint64_t)in->u[0] << 32) | ((uint64_t)in->u[1]);
-
-    oh = th << (shift * 8);
-    ol = tl << (shift * 8);
-    oh |= tl >> (64 - shift * 8);
-    out->u[0] = (uint32_t)(ol >> 32);
-    out->u[1] = (uint32_t)ol;
-    out->u[2] = (uint32_t)(oh >> 32);
-    out->u[3] = (uint32_t)oh;
-}
-#else
-inline static void lshift128(w128_t *out, w128_t const *in, int shift) {
-    uint64_t th, tl, oh, ol;
-
-    th = ((uint64_t)in->u[3] << 32) | ((uint64_t)in->u[2]);
-    tl = ((uint64_t)in->u[1] << 32) | ((uint64_t)in->u[0]);
-
-    oh = th << (shift * 8);
-    ol = tl << (shift * 8);
-    oh |= tl >> (64 - shift * 8);
-    out->u[1] = (uint32_t)(ol >> 32);
-    out->u[0] = (uint32_t)ol;
-    out->u[3] = (uint32_t)(oh >> 32);
-    out->u[2] = (uint32_t)oh;
-}
-#endif
-
-/**
- * This function represents the recursion formula.
- * @param r output
- * @param a a 128-bit part of the internal state array
- * @param b a 128-bit part of the internal state array
- * @param c a 128-bit part of the internal state array
- * @param d a 128-bit part of the internal state array
- */
-#if (!defined(HAVE_ALTIVEC)) && (!defined(HAVE_SSE2))
-#ifdef ONLY64
-inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b, w128_t *c,
-				w128_t *d) {
-    w128_t x;
-    w128_t y;
-
-    lshift128(&x, a, SL2);
-    rshift128(&y, c, SR2);
-    r->u[0] = a->u[0] ^ x.u[0] ^ ((b->u[0] >> SR1) & MSK2) ^ y.u[0] 
-	^ (d->u[0] << SL1);
-    r->u[1] = a->u[1] ^ x.u[1] ^ ((b->u[1] >> SR1) & MSK1) ^ y.u[1] 
-	^ (d->u[1] << SL1);
-    r->u[2] = a->u[2] ^ x.u[2] ^ ((b->u[2] >> SR1) & MSK4) ^ y.u[2] 
-	^ (d->u[2] << SL1);
-    r->u[3] = a->u[3] ^ x.u[3] ^ ((b->u[3] >> SR1) & MSK3) ^ y.u[3] 
-	^ (d->u[3] << SL1);
-}
-#else
-inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b, w128_t *c,
-				w128_t *d) {
-    w128_t x;
-    w128_t y;
-
-    lshift128(&x, a, SL2);
-    rshift128(&y, c, SR2);
-    r->u[0] = a->u[0] ^ x.u[0] ^ ((b->u[0] >> SR1) & MSK1) ^ y.u[0] 
-	^ (d->u[0] << SL1);
-    r->u[1] = a->u[1] ^ x.u[1] ^ ((b->u[1] >> SR1) & MSK2) ^ y.u[1] 
-	^ (d->u[1] << SL1);
-    r->u[2] = a->u[2] ^ x.u[2] ^ ((b->u[2] >> SR1) & MSK3) ^ y.u[2] 
-	^ (d->u[2] << SL1);
-    r->u[3] = a->u[3] ^ x.u[3] ^ ((b->u[3] >> SR1) & MSK4) ^ y.u[3] 
-	^ (d->u[3] << SL1);
-}
-#endif
-#endif

 #if (!defined(HAVE_ALTIVEC)) && (!defined(HAVE_SSE2))
-/**
- * This function fills the internal state array with pseudorandom
- * integers.
- */
-inline static void gen_rand_all(void) {
-    int i;
-    w128_t *r1, *r2;
-
-    r1 = &sfmt[N - 2];
-    r2 = &sfmt[N - 1];
-    for (i = 0; i < N - POS1; i++) {
-	do_recursion(&sfmt[i], &sfmt[i], &sfmt[i + POS1], r1, r2);
-	r1 = r2;
-	r2 = &sfmt[i];
-    }
-    for (; i < N; i++) {
-	do_recursion(&sfmt[i], &sfmt[i], &sfmt[i + POS1 - N], r1, r2);
-	r1 = r2;
-	r2 = &sfmt[i];
-    }
-}
-
 /**
 * This function fills the user-specified array with pseudorandom
 * integers.
 *
+ * @param sfmt SFMT internal state
 * @param array an 128-bit array to be filled by pseudorandom numbers.
 * @param size number of 128-bit pseudorandom numbers to be generated.
 */
-inline static void gen_rand_array(w128_t *array, int size) {
+inline static void gen_rand_array(sfmt_t * sfmt, w128_t *array, int size) {
    int i, j;
    w128_t *r1, *r2;

-    r1 = &sfmt[N - 2];
-    r2 = &sfmt[N - 1];
-    for (i = 0; i < N - POS1; i++) {
-	do_recursion(&array[i], &sfmt[i], &sfmt[i + POS1], r1, r2);
+    r1 = &sfmt->state[SFMT_N - 2];
+    r2 = &sfmt->state[SFMT_N - 1];
+    for (i = 0; i < SFMT_N - SFMT_POS1; i++) {
+        do_recursion(&array[i], &sfmt->state[i], &sfmt->state[i + SFMT_POS1], r1, r2);
        r1 = r2;
        r2 = &array[i];
    }
-    for (; i < N; i++) {
-	do_recursion(&array[i], &sfmt[i], &array[i + POS1 - N], r1, r2);
+    for (; i < SFMT_N; i++) {
+        do_recursion(&array[i], &sfmt->state[i],
+                     &array[i + SFMT_POS1 - SFMT_N], r1, r2);
        r1 = r2;
        r2 = &array[i];
    }
-    for (; i < size - N; i++) {
-	do_recursion(&array[i], &array[i - N], &array[i + POS1 - N], r1, r2);
+    for (; i < size - SFMT_N; i++) {
+        do_recursion(&array[i], &array[i - SFMT_N],
+                     &array[i + SFMT_POS1 - SFMT_N], r1, r2);
        r1 = r2;
        r2 = &array[i];
    }
-    for (j = 0; j < 2 * N - size; j++) {
-	sfmt[j] = array[j + size - N];
+    for (j = 0; j < 2 * SFMT_N - size; j++) {
+        sfmt->state[j] = array[j + size - SFMT_N];
    }
    for (; i < size; i++, j++) {
-	do_recursion(&array[i], &array[i - N], &array[i + POS1 - N], r1, r2);
+        do_recursion(&array[i], &array[i - SFMT_N],
+                     &array[i + SFMT_POS1 - SFMT_N], r1, r2);
        r1 = r2;
        r2 = &array[i];
-	sfmt[j] = array[i];
+        sfmt->state[j] = array[i];
    }
 }
 #endif
@ -343,11 +163,15 @@ static uint32_t func2(uint32_t x) {

 /**
 * This function certificate the period of 2^{MEXP}
+ * @param sfmt SFMT internal state
 */
-static void period_certification(void) {
+static void period_certification(sfmt_t * sfmt) {
    int inner = 0;
    int i, j;
    uint32_t work;
+    uint32_t *psfmt32 = &sfmt->state[0].u[0];
+    const uint32_t parity[4] = {SFMT_PARITY1, SFMT_PARITY2,
+                                SFMT_PARITY3, SFMT_PARITY4};

    for (i = 0; i < 4; i++)
        inner ^= psfmt32[idxof(i)] & parity[i];
@ -374,83 +198,66 @@ static void period_certification(void) {
 /*----------------
  PUBLIC FUNCTIONS
  ----------------*/
+#define UNUSED_VARIABLE(x) (void)(x)
 /**
 * This function returns the identification string.
 * The string shows the word size, the Mersenne exponent,
 * and all parameters of this generator.
+ * @param sfmt SFMT internal state
 */
-const char *get_idstring(void) {
-    return IDSTR;
+const char *sfmt_get_idstring(sfmt_t * sfmt) {
+    UNUSED_VARIABLE(sfmt);
+    return SFMT_IDSTR;
 }

 /**
 * This function returns the minimum size of array used for \b
 * fill_array32() function.
+ * @param sfmt SFMT internal state
 * @return minimum size of array used for fill_array32() function.
 */
-int get_min_array_size32(void) {
-    return N32;
+int sfmt_get_min_array_size32(sfmt_t * sfmt) {
+    UNUSED_VARIABLE(sfmt);
+    return SFMT_N32;
 }

 /**
 * This function returns the minimum size of array used for \b
 * fill_array64() function.
+ * @param sfmt SFMT internal state
 * @return minimum size of array used for fill_array64() function.
 */
-int get_min_array_size64(void) {
-    return N64;
+int sfmt_get_min_array_size64(sfmt_t * sfmt) {
+    UNUSED_VARIABLE(sfmt);
+    return SFMT_N64;
 }

-#ifndef ONLY64
+#if !defined(HAVE_SSE2) && !defined(HAVE_ALTIVEC)
 /**
- * This function generates and returns 32-bit pseudorandom number.
- * init_gen_rand or init_by_array must be called before this function.
- * @return 32-bit pseudorandom number
+ * This function fills the internal state array with pseudorandom
+ * integers.
+ * @param sfmt SFMT internal state
 */
-uint32_t gen_rand32(void) {
-    uint32_t r;
+void sfmt_gen_rand_all(sfmt_t * sfmt) {
+    int i;
+    w128_t *r1, *r2;

-    assert(initialized);
-    if (idx >= N32) {
-	gen_rand_all();
-	idx = 0;
+    r1 = &sfmt->state[SFMT_N - 2];
+    r2 = &sfmt->state[SFMT_N - 1];
+    for (i = 0; i < SFMT_N - SFMT_POS1; i++) {
+        do_recursion(&sfmt->state[i], &sfmt->state[i],
+                     &sfmt->state[i + SFMT_POS1], r1, r2);
+        r1 = r2;
+        r2 = &sfmt->state[i];
+    }
+    for (; i < SFMT_N; i++) {
+        do_recursion(&sfmt->state[i], &sfmt->state[i],
+                     &sfmt->state[i + SFMT_POS1 - SFMT_N], r1, r2);
+        r1 = r2;
+        r2 = &sfmt->state[i];
    }
-    r = psfmt32[idx++];
-    return r;
 }
 #endif
-/**
- * This function generates and returns 64-bit pseudorandom number.
- * init_gen_rand or init_by_array must be called before this function.
- * The function gen_rand64 should not be called after gen_rand32,
- * unless an initialization is again executed. 
- * @return 64-bit pseudorandom number
- */
-uint64_t gen_rand64(void) {
-#if defined(BIG_ENDIAN64) && !defined(ONLY64)
-    uint32_t r1, r2;
-#else
-    uint64_t r;
-#endif
-
-    assert(initialized);
-    assert(idx % 2 == 0);
-
-    if (idx >= N32) {
-	gen_rand_all();
-	idx = 0;
-    }
-#if defined(BIG_ENDIAN64) && !defined(ONLY64)
-    r1 = psfmt32[idx];
-    r2 = psfmt32[idx + 1];
-    idx += 2;
-    return ((uint64_t)r2 << 32) | r1;
-#else
-    r = psfmt64[idx / 2];
-    idx += 2;
-    return r;
-#endif
-}

 #ifndef ONLY64
 /**
@ -464,6 +271,7 @@ uint64_t gen_rand64(void) {
 * before the first call of this function. This function can not be
 * used after calling gen_rand function, without initialization.
 *
+ * @param sfmt SFMT internal state
 * @param array an array where pseudorandom 32-bit integers are filled
 * by this function.  The pointer to the array must be \b "aligned"
 * (namely, must be a multiple of 16) in the SIMD version, since it
@ -478,14 +286,13 @@ uint64_t gen_rand64(void) {
 * memory. Mac OSX doesn't have these functions, but \b malloc of OSX
 * returns the pointer to the aligned memory block.
 */
-void fill_array32(uint32_t *array, int size) {
-    assert(initialized);
-    assert(idx == N32);
+void sfmt_fill_array32(sfmt_t * sfmt, uint32_t *array, int size) {
+    assert(sfmt->idx == SFMT_N32);
    assert(size % 4 == 0);
-    assert(size >= N32);
+    assert(size >= SFMT_N32);

-    gen_rand_array((w128_t *)array, size / 4);
-    idx = N32;
+    gen_rand_array(sfmt, (w128_t *)array, size / 4);
+    sfmt->idx = SFMT_N32;
 }
 #endif

@ -496,6 +303,7 @@ void fill_array32(uint32_t *array, int size) {
 * multiple of two.  The generation by this function is much faster
 * than the following gen_rand function.
 *
+ * @param sfmt SFMT internal state
 * For initialization, init_gen_rand or init_by_array must be called
 * before the first call of this function. This function can not be
 * used after calling gen_rand function, without initialization.
@ -514,14 +322,13 @@ void fill_array32(uint32_t *array, int size) {
 * memory. Mac OSX doesn't have these functions, but \b malloc of OSX
 * returns the pointer to the aligned memory block.
 */
-void fill_array64(uint64_t *array, int size) {
-    assert(initialized);
-    assert(idx == N32);
+void sfmt_fill_array64(sfmt_t * sfmt, uint64_t *array, int size) {
+    assert(sfmt->idx == SFMT_N32);
    assert(size % 2 == 0);
-    assert(size >= N64);
+    assert(size >= SFMT_N64);

-    gen_rand_array((w128_t *)array, size / 2);
-    idx = N32;
+    gen_rand_array(sfmt, (w128_t *)array, size / 2);
+    sfmt->idx = SFMT_N32;

 #if defined(BIG_ENDIAN64) && !defined(ONLY64)
    swap((w128_t *)array, size /2);
@ -532,34 +339,38 @@ void fill_array64(uint64_t *array, int size) {
 * This function initializes the internal state array with a 32-bit
 * integer seed.
 *
+ * @param sfmt SFMT internal state
 * @param seed a 32-bit integer used as the seed.
 */
-void init_gen_rand(uint32_t seed) {
+void sfmt_init_gen_rand(sfmt_t * sfmt, uint32_t seed) {
    int i;

+    uint32_t *psfmt32 = &sfmt->state[0].u[0];
+
    psfmt32[idxof(0)] = seed;
-    for (i = 1; i < N32; i++) {
+    for (i = 1; i < SFMT_N32; i++) {
        psfmt32[idxof(i)] = 1812433253UL * (psfmt32[idxof(i - 1)]
                                            ^ (psfmt32[idxof(i - 1)] >> 30))
            + i;
    }
-    idx = N32;
-    period_certification();
-    initialized = 1;
+    sfmt->idx = SFMT_N32;
+    period_certification(sfmt);
 }

 /**
 * This function initializes the internal state array,
 * with an array of 32-bit integers used as the seeds
+ * @param sfmt SFMT internal state
 * @param init_key the array of 32-bit integers, used as a seed.
 * @param key_length the length of init_key.
 */
-void init_by_array(uint32_t *init_key, int key_length) {
+void sfmt_init_by_array(sfmt_t * sfmt, uint32_t *init_key, int key_length) {
    int i, j, count;
    uint32_t r;
    int lag;
    int mid;
-    int size = N * 4;
+    int size = SFMT_N * 4;
+    uint32_t *psfmt32 = &sfmt->state[0].u[0];

    if (size >= 623) {
        lag = 11;
@ -572,14 +383,14 @@ void init_by_array(uint32_t *init_key, int key_length) {
    }
    mid = (size - lag) / 2;

-    memset(sfmt, 0x8b, sizeof(sfmt));
-    if (key_length + 1 > N32) {
+    memset(sfmt, 0x8b, sizeof(sfmt_t));
+    if (key_length + 1 > SFMT_N32) {
        count = key_length + 1;
    } else {
-	count = N32;
+        count = SFMT_N32;
    }
    r = func1(psfmt32[idxof(0)] ^ psfmt32[idxof(mid)]
-	      ^ psfmt32[idxof(N32 - 1)]);
+              ^ psfmt32[idxof(SFMT_N32 - 1)]);
    psfmt32[idxof(mid)] += r;
    r += key_length;
    psfmt32[idxof(mid + lag)] += r;
@ -587,34 +398,36 @@ void init_by_array(uint32_t *init_key, int key_length) {

    count--;
    for (i = 1, j = 0; (j < count) && (j < key_length); j++) {
-	r = func1(psfmt32[idxof(i)] ^ psfmt32[idxof((i + mid) % N32)] 
-		  ^ psfmt32[idxof((i + N32 - 1) % N32)]);
-	psfmt32[idxof((i + mid) % N32)] += r;
+        r = func1(psfmt32[idxof(i)] ^ psfmt32[idxof((i + mid) % SFMT_N32)]
+                  ^ psfmt32[idxof((i + SFMT_N32 - 1) % SFMT_N32)]);
+        psfmt32[idxof((i + mid) % SFMT_N32)] += r;
        r += init_key[j] + i;
-	psfmt32[idxof((i + mid + lag) % N32)] += r;
+        psfmt32[idxof((i + mid + lag) % SFMT_N32)] += r;
        psfmt32[idxof(i)] = r;
-	i = (i + 1) % N32;
+        i = (i + 1) % SFMT_N32;
    }
    for (; j < count; j++) {
-	r = func1(psfmt32[idxof(i)] ^ psfmt32[idxof((i + mid) % N32)] 
-		  ^ psfmt32[idxof((i + N32 - 1) % N32)]);
-	psfmt32[idxof((i + mid) % N32)] += r;
+        r = func1(psfmt32[idxof(i)] ^ psfmt32[idxof((i + mid) % SFMT_N32)]
+                  ^ psfmt32[idxof((i + SFMT_N32 - 1) % SFMT_N32)]);
+        psfmt32[idxof((i + mid) % SFMT_N32)] += r;
        r += i;
-	psfmt32[idxof((i + mid + lag) % N32)] += r;
+        psfmt32[idxof((i + mid + lag) % SFMT_N32)] += r;
        psfmt32[idxof(i)] = r;
-	i = (i + 1) % N32;
+        i = (i + 1) % SFMT_N32;
    }
-    for (j = 0; j < N32; j++) {
-	r = func2(psfmt32[idxof(i)] + psfmt32[idxof((i + mid) % N32)] 
-		  + psfmt32[idxof((i + N32 - 1) % N32)]);
-	psfmt32[idxof((i + mid) % N32)] ^= r;
+    for (j = 0; j < SFMT_N32; j++) {
+        r = func2(psfmt32[idxof(i)] + psfmt32[idxof((i + mid) % SFMT_N32)]
+                  + psfmt32[idxof((i + SFMT_N32 - 1) % SFMT_N32)]);
+        psfmt32[idxof((i + mid) % SFMT_N32)] ^= r;
        r -= i;
-	psfmt32[idxof((i + mid + lag) % N32)] ^= r;
+        psfmt32[idxof((i + mid + lag) % SFMT_N32)] ^= r;
        psfmt32[idxof(i)] = r;
-	i = (i + 1) % N32;
+        i = (i + 1) % SFMT_N32;
    }

-    idx = N32;
-    period_certification();
-    initialized = 1;
+    sfmt->idx = SFMT_N32;
+    period_certification(sfmt);
 }
+#if defined(__cplusplus)
+}
+#endif
--- a/common/sfmt/SFMT.h
+++ b/common/sfmt/SFMT.h
@ -1,17 +1,21 @@
+#pragma once
 /**
 * @file SFMT.h
 *
 * @brief SIMD oriented Fast Mersenne Twister(SFMT) pseudorandom
- * number generator
+ * number generator using C structure.
 *
 * @author Mutsuo Saito (Hiroshima University)
- * @author Makoto Matsumoto (Hiroshima University)
+ * @author Makoto Matsumoto (The University of Tokyo)
 *
 * Copyright (C) 2006, 2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
- * University. All rights reserved.
+ * University.
+ * Copyright (C) 2012 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+ * University and The University of Tokyo.
+ * All rights reserved.
 *
- * The new BSD License is applied to this software.
- * see LICENSE.txt
+ * The 3-clause BSD License is applied to this software, see
+ * LICENSE.txt
 *
 * @note We assume that your system has inttypes.h.  If your system
 * doesn't have inttypes.h, you have to typedef uint32_t and uint64_t,
@ -28,14 +32,14 @@
 * unsigned int and 64-bit unsigned int in hexadecimal format.
 */

-#ifndef SFMT_H
-#define SFMT_H
-
-#ifdef __cplusplus
+#ifndef SFMTST_H
+#define SFMTST_H
+#if defined(__cplusplus)
 extern "C" {
 #endif

 #include <stdio.h>
+#include <assert.h>

 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
  #include <inttypes.h>
@ -60,105 +64,232 @@ extern "C" {
  #endif
 #endif

-#if defined(__GNUC__)
-#define ALWAYSINLINE __attribute__((always_inline))
-#else
-#define ALWAYSINLINE
-#endif
+#include "SFMT-params.h"

-#if defined(_MSC_VER)
-  #if _MSC_VER >= 1200
-    #define PRE_ALWAYS __forceinline
-  #else
-    #define PRE_ALWAYS inline
+/*------------------------------------------
+  128-bit SIMD like data type for standard C
+  ------------------------------------------*/
+#if defined(HAVE_ALTIVEC)
+  #if !defined(__APPLE__)
+    #include <altivec.h>
  #endif
+/** 128-bit data structure */
+union W128_T {
+    vector unsigned int s;
+    uint32_t u[4];
+    uint64_t u64[2];
+};
+#elif defined(HAVE_SSE2)
+  #include <emmintrin.h>
+
+/** 128-bit data structure */
+union W128_T {
+    uint32_t u[4];
+    uint64_t u64[2];
+    __m128i si;
+};
 #else
-  #define PRE_ALWAYS inline
+/** 128-bit data structure */
+union W128_T {
+    uint32_t u[4];
+    uint64_t u64[2];
+};
 #endif

-uint32_t gen_rand32(void);
-uint64_t gen_rand64(void);
-void fill_array32(uint32_t *array, int size);
-void fill_array64(uint64_t *array, int size);
-void init_gen_rand(uint32_t seed);
-void init_by_array(uint32_t *init_key, int key_length);
-const char *get_idstring(void);
-int get_min_array_size32(void);
-int get_min_array_size64(void);
+/** 128-bit data type */
+typedef union W128_T w128_t;

-/* These real versions are due to Isaku Wada */
-/** generates a random number on [0,1]-real-interval */
-inline static double to_real1(uint32_t v)
+/**
+ * SFMT internal state
+ */
+struct SFMT_T {
+    /** the 128-bit internal state array */
+    w128_t state[SFMT_N];
+    /** index counter to the 32-bit internal state array */
+    int idx;
+};
+
+typedef struct SFMT_T sfmt_t;
+
+void sfmt_fill_array32(sfmt_t * sfmt, uint32_t * array, int size);
+void sfmt_fill_array64(sfmt_t * sfmt, uint64_t * array, int size);
+void sfmt_init_gen_rand(sfmt_t * sfmt, uint32_t seed);
+void sfmt_init_by_array(sfmt_t * sfmt, uint32_t * init_key, int key_length);
+const char * sfmt_get_idstring(sfmt_t * sfmt);
+int sfmt_get_min_array_size32(sfmt_t * sfmt);
+int sfmt_get_min_array_size64(sfmt_t * sfmt);
+void sfmt_gen_rand_all(sfmt_t * sfmt);
+
+#ifndef ONLY64
+/**
+ * This function generates and returns 32-bit pseudorandom number.
+ * init_gen_rand or init_by_array must be called before this function.
+ * @param sfmt SFMT internal state
+ * @return 32-bit pseudorandom number
+ */
+inline static uint32_t sfmt_genrand_uint32(sfmt_t * sfmt) {
+    uint32_t r;
+    uint32_t * psfmt32 = &sfmt->state[0].u[0];
+
+    if (sfmt->idx >= SFMT_N32) {
+        sfmt_gen_rand_all(sfmt);
+        sfmt->idx = 0;
+    }
+    r = psfmt32[sfmt->idx++];
+    return r;
+}
+#endif
+/**
+ * This function generates and returns 64-bit pseudorandom number.
+ * init_gen_rand or init_by_array must be called before this function.
+ * The function gen_rand64 should not be called after gen_rand32,
+ * unless an initialization is again executed.
+ * @param sfmt SFMT internal state
+ * @return 64-bit pseudorandom number
+ */
+inline static uint64_t sfmt_genrand_uint64(sfmt_t * sfmt) {
+#if defined(BIG_ENDIAN64) && !defined(ONLY64)
+    uint32_t * psfmt32 = &sfmt->state[0].u[0];
+    uint32_t r1, r2;
+#else
+    uint64_t r;
+#endif
+    uint64_t * psfmt64 = &sfmt->state[0].u64[0];
+    assert(sfmt->idx % 2 == 0);
+
+    if (sfmt->idx >= SFMT_N32) {
+        sfmt_gen_rand_all(sfmt);
+        sfmt->idx = 0;
+    }
+#if defined(BIG_ENDIAN64) && !defined(ONLY64)
+    r1 = psfmt32[sfmt->idx];
+    r2 = psfmt32[sfmt->idx + 1];
+    sfmt->idx += 2;
+    return ((uint64_t)r2 << 32) | r1;
+#else
+    r = psfmt64[sfmt->idx / 2];
+    sfmt->idx += 2;
+    return r;
+#endif
+}
+
+/* =================================================
+   The following real versions are due to Isaku Wada
+   ================================================= */
+/**
+ * converts an unsigned 32-bit number to a double on [0,1]-real-interval.
+ * @param v 32-bit unsigned integer
+ * @return double on [0,1]-real-interval
+ */
+inline static double sfmt_to_real1(uint32_t v)
 {
    return v * (1.0/4294967295.0);
    /* divided by 2^32-1 */
 }

-/** generates a random number on [0,1]-real-interval */
-inline static double genrand_real1(void)
+/**
+ * generates a random number on [0,1]-real-interval
+ * @param sfmt SFMT internal state
+ * @return double on [0,1]-real-interval
+ */
+inline static double sfmt_genrand_real1(sfmt_t * sfmt)
 {
-    return to_real1(gen_rand32());
+    return sfmt_to_real1(sfmt_genrand_uint32(sfmt));
 }

-/** generates a random number on [0,1)-real-interval */
-inline static double to_real2(uint32_t v)
+/**
+ * converts an unsigned 32-bit integer to a double on [0,1)-real-interval.
+ * @param v 32-bit unsigned integer
+ * @return double on [0,1)-real-interval
+ */
+inline static double sfmt_to_real2(uint32_t v)
 {
    return v * (1.0/4294967296.0);
    /* divided by 2^32 */
 }

-/** generates a random number on [0,1)-real-interval */
-inline static double genrand_real2(void)
+/**
+ * generates a random number on [0,1)-real-interval
+ * @param sfmt SFMT internal state
+ * @return double on [0,1)-real-interval
+ */
+inline static double sfmt_genrand_real2(sfmt_t * sfmt)
 {
-    return to_real2(gen_rand32());
+    return sfmt_to_real2(sfmt_genrand_uint32(sfmt));
 }

-/** generates a random number on (0,1)-real-interval */
-inline static double to_real3(uint32_t v)
+/**
+ * converts an unsigned 32-bit integer to a double on (0,1)-real-interval.
+ * @param v 32-bit unsigned integer
+ * @return double on (0,1)-real-interval
+ */
+inline static double sfmt_to_real3(uint32_t v)
 {
    return (((double)v) + 0.5)*(1.0/4294967296.0);
    /* divided by 2^32 */
 }

-/** generates a random number on (0,1)-real-interval */
-inline static double genrand_real3(void)
-{
-    return to_real3(gen_rand32());
-}
-/** These real versions are due to Isaku Wada */
-
-/** generates a random number on [0,1) with 53-bit resolution*/
-inline static double to_res53(uint64_t v) 
-{ 
-    return v * (1.0/18446744073709551616.0L);
-}
-
-/** generates a random number on [0,1) with 53-bit resolution from two
- * 32 bit integers */
-inline static double to_res53_mix(uint32_t x, uint32_t y) 
-{ 
-    return to_res53(x | ((uint64_t)y << 32));
-}
-
-/** generates a random number on [0,1) with 53-bit resolution
+/**
+ * generates a random number on (0,1)-real-interval
+ * @param sfmt SFMT internal state
+ * @return double on (0,1)-real-interval
 */
-inline static double genrand_res53(void) 
+inline static double sfmt_genrand_real3(sfmt_t * sfmt)
 {
-    return to_res53(gen_rand64());
+    return sfmt_to_real3(sfmt_genrand_uint32(sfmt));
 }

-/** generates a random number on [0,1) with 53-bit resolution
-    using 32bit integer.
+/**
+ * converts an unsigned 32-bit integer to double on [0,1)
+ * with 53-bit resolution.
+ * @param v 32-bit unsigned integer
+ * @return double on [0,1)-real-interval with 53-bit resolution.
 */
-inline static double genrand_res53_mix(void) 
+inline static double sfmt_to_res53(uint64_t v)
+{
+    return v * (1.0/18446744073709551616.0);
+}
+
+/**
+ * generates a random number on [0,1) with 53-bit resolution
+ * @param sfmt SFMT internal state
+ * @return double on [0,1) with 53-bit resolution
+ */
+inline static double sfmt_genrand_res53(sfmt_t * sfmt)
+{
+    return sfmt_to_res53(sfmt_genrand_uint64(sfmt));
+}
+
+
+/* =================================================
+   The following function are added by Saito.
+   ================================================= */
+/**
+ * generates a random number on [0,1) with 53-bit resolution from two
+ * 32 bit integers
+ */
+inline static double sfmt_to_res53_mix(uint32_t x, uint32_t y)
+{
+    return sfmt_to_res53(x | ((uint64_t)y << 32));
+}
+
+/**
+ * generates a random number on [0,1) with 53-bit resolution
+ * using two 32bit integers.
+ * @param sfmt SFMT internal state
+ * @return double on [0,1) with 53-bit resolution
+ */
+inline static double sfmt_genrand_res53_mix(sfmt_t * sfmt)
 {
    uint32_t x, y;

-    x = gen_rand32();
-    y = gen_rand32();
-    return to_res53_mix(x, y);
+    x = sfmt_genrand_uint32(sfmt);
+    y = sfmt_genrand_uint32(sfmt);
+    return sfmt_to_res53_mix(x, y);
 }
-#ifdef __cplusplus
+
+#if defined(__cplusplus)
 }
 #endif
+
 #endif