Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 122 additions & 6 deletions include/fastmod.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,71 @@ FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) {
return mul128_u32(lowbits, d);
}

// Need _udiv128 to calculate the magic number (maps to x86 64-bit div)
#if defined(_M_AMD64) && ( _MSC_VER >= 1923)
// This is for the 64-bit functions.
// Visual Studio lacks support for 128-bit integers so they simulated are using
// multiword arithmatic and VS specific intrinsics.

FASTMOD_API uint64_t add128_u64(
uint64_t M_hi, uint64_t M_lo, uint64_t addend, uint64_t* sum_hi
) {
uint64_t sum_lo;

bool carry = _addcarry_u64(0, M_lo, addend, &sum_lo);
_addcarry_u64(carry, M_hi, 0, sum_hi); // Encourages 'adc'

return sum_lo;
}

FASTMOD_API uint64_t div128_u64(
uint64_t dividend_hi, uint64_t dividend_lo, uint64_t divisor, uint64_t* quotient_hi
) {
*quotient_hi = dividend_hi / divisor;
uint64_t remainder_hi = dividend_hi % divisor;

// When long div starts to consider the low dividend,
// the high part would have became its remainder.
// Prevents an arithmetic exception when _udiv128 calculates a >64-bit quotient
uint64_t remainder; // Discard
return _udiv128(remainder_hi, dividend_lo, divisor, &remainder);
}

// Multiplies the 128-bit integer by d and returns the lower 128-bits of the product
FASTMOD_API uint64_t mul128_u64_lo(
uint64_t M_hi, uint64_t M_lo, uint64_t d, uint64_t* product_hi
) {
uint64_t lowbits_hi;
uint64_t lowbits_lo = _umul128(M_lo, d, &lowbits_hi);

*product_hi = lowbits_hi + (M_hi * d);

return lowbits_lo;
}

// Multiplies the 128-bit integer by d and returns the highest 64-bits of the product
FASTMOD_API uint64_t mul128_u64_hi(uint64_t lowbits_hi, uint64_t lowbits_lo, uint64_t d) {
uint64_t bottomHalf_hi = __umulh(lowbits_lo, d);

uint64_t topHalf_hi;
uint64_t topHalf_lo = _umul128(lowbits_hi, d, &topHalf_hi);

uint64_t bothHalves_hi;
add128_u64(topHalf_hi, topHalf_lo, bottomHalf_hi, &bothHalves_hi);

return bothHalves_hi;
}

FASTMOD_API bool isgreater_u128(uint64_t a_hi, uint64_t a_low, uint64_t b_hi, uint64_t b_low) {
// Only when low is greater, high equality should return true
uint64_t discard;
bool borrowWhenEql = _subborrow_u64(0, b_low, a_low, &discard);

// borrow(b - (a + C_in)) = C_in? (a >= b) : (a > b)
return _subborrow_u64(borrowWhenEql, b_hi, a_hi, &discard);
}

#endif // End MSVC 64-bit support
#else // _MSC_VER NOT defined

FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
Expand Down Expand Up @@ -143,13 +208,11 @@ FASTMOD_API int32_t fastdiv_s32(int32_t a, uint64_t M, int32_t d) {
return (int32_t)(highbits);
}

#ifndef _MSC_VER

// What follows is the 64-bit functions.
// They are currently not supported on Visual Studio
// due to the lack of a mul128_u64 function.
// They may not be faster than what the compiler
// can produce.
// They may not be faster than what the compiler can produce.

#ifndef _MSC_VER
// No __uint128_t in VS, so they have to use a diffrent method.

FASTMOD_API __uint128_t computeM_u64(uint64_t d) {
// what follows is just ((__uint128_t)0 - 1) / d) + 1 spelled out
Expand All @@ -170,6 +233,59 @@ FASTMOD_API uint64_t fastdiv_u64(uint64_t a, __uint128_t M) {
return mul128_u64(M, a);
}

// given precomputed M, is_divisible checks whether n % d == 0
FASTMOD_API bool is_divisible_u64(uint64_t n, __uint128_t M) { return n * M <= M - 1; }

#elif defined(_MSC_VER) && defined(_M_AMD64) && (_MSC_VER >= 1923)
// Visual Studio lacks support for 128-bit integers
// so they simulated are using multiword arithmatic
// and VS specific intrinsics.

// Using a struct in the multiword arithmetic functions produces
// worse asm output but isn't that bad for public functions

typedef struct { uint64_t low; uint64_t hi; } fastmod_u128_t;

FASTMOD_API fastmod_u128_t computeM_u64(uint64_t d) {
// UINT128MAX / d
uint64_t magic_quotient_hi;
uint64_t magic_quotient_lo = div128_u64(
~UINT64_C(0), ~UINT64_C(0), d, &magic_quotient_hi
);

// quotient_u128 + 1
fastmod_u128_t M;
M.low = add128_u64(magic_quotient_hi, magic_quotient_lo, 1, &M.hi);
return M;
}

// computes (a % d) given precomputed M
FASTMOD_API uint64_t fastmod_u64(uint64_t a, fastmod_u128_t M, uint64_t d) {
uint64_t lowbits_hi;
uint64_t lowbits_lo = mul128_u64_lo(M.hi, M.low, a, &lowbits_hi);

return mul128_u64_hi(lowbits_hi, lowbits_lo, d);
}

// computes (a / d) given precomputed M for d>1
FASTMOD_API uint64_t fastdiv_u64(uint64_t a, fastmod_u128_t M) {
return mul128_u64_hi(M.hi, M.low, a);
}

// given precomputed M, is_divisible checks whether n % d == 0
FASTMOD_API bool is_divisible_u64(uint64_t n, fastmod_u128_t M) {
uint64_t lowBits_hi;
uint64_t lowBits_low = mul128_u64_lo(M.hi, M.low, n, &lowBits_hi);

uint64_t Mdec_low, Mdec_hi;
bool borrow_hi = _subborrow_u64(0, M.low, 1, &Mdec_low);
_subborrow_u64(borrow_hi, M.hi, 0, &Mdec_hi);

// n * M <= M - 1
return !isgreater_u128(lowBits_hi, lowBits_low, Mdec_hi, Mdec_low);
}


// End of the 64-bit functions

#endif // #ifndef _MSC_VER
Expand Down
6 changes: 3 additions & 3 deletions tests/cppincludetest1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ bool testunsigned64(uint64_t min, uint64_t max, bool verbose) {
printf("skipping d = 0\n");
continue;
}
#ifndef _MSC_VER
__uint128_t M64 = computeM_u64(d);

auto M64 = computeM_u64(d);
if (verbose)
printf("d = %" PRIu64 " (unsigned 64-bit) ", d);
else
Expand All @@ -37,7 +37,7 @@ bool testunsigned64(uint64_t min, uint64_t max, bool verbose) {
return false;
}
}
#endif

if (verbose)
printf("ok!\n");
}
Expand Down
10 changes: 5 additions & 5 deletions tests/cppincludetest2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ bool testsigned(int32_t min, int32_t max, bool verbose) {
printf("skipping d = 0 as it cannot be supported\n");
continue;
}
if (d == -2147483648) {
if (d == INT32_MIN) {
printf("skipping d = -2147483648 as it is unsupported\n");
continue;
}
Expand Down Expand Up @@ -166,7 +166,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) {
printf("\n==Testing divsigned with min = %d and max = %d\n", min, max);
assert(min != max + 1); // infinite loop!
size_t count = 0;
static_assert(int32_t(-2147483648) < int32_t(2147483647));
static_assert(int32_t(INT32_MIN) < int32_t(2147483647));
for (int32_t d = min; (d <= max) && (d >= min); d++) {
if (d == 0) {
printf("skipping d = 0\n");
Expand Down Expand Up @@ -205,7 +205,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) {
d, a);
printf("expected %d div %d = %d \n", a, d, computedDiv);
printf("got %d div %d = %d \n", a, d, computedFastDiv);
if (d == -2147483648) {
if (d == INT32_MIN) {
printf("Note: d = -2147483648 is unsupported\n");
} else {
return false;
Expand Down Expand Up @@ -235,7 +235,7 @@ int main(int argc, char *argv[]) {
}

isok = isok && testdivsigned(0x7ffffff8, 0x7fffffff, verbose);
isok = isok && testdivsigned(-0x80000000, -0x7ffffff8, verbose);
isok = isok && testdivsigned(INT32_MIN, -0x7ffffff8, verbose);
isok = isok && testdivsigned(2, 10, verbose);
isok = isok && testdivsigned(-10, -2, verbose);
isok = isok && testunsigned64(1, 0x10, verbose);
Expand All @@ -251,7 +251,7 @@ int main(int argc, char *argv[]) {
isok = isok && testsigned(-8, -1, verbose);
isok = isok && testsigned(1, 8, verbose);
isok = isok && testsigned(0x7ffffff8, 0x7fffffff, verbose);
isok = isok && testsigned(-0x80000000, -0x7ffffff8, verbose);
isok = isok && testsigned(INT32_MIN, -0x7ffffff8, verbose);
for (int k = 0; k < 100; k++) {
int32_t x = rand();
isok = isok && testsigned(x, x, verbose);
Expand Down
20 changes: 18 additions & 2 deletions tests/moddivnbenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,28 @@
#include <cstdio>
#include <random>

#ifdef _MSC_VER

// Taken from Facebook's folly
// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L270-L284
#pragma optimize("", off)
inline void doNotOptimizeDependencySink(const void*) {}

#pragma optimize("", on)
template <class T>
void doNotOptimizeAway(const T& datum) {
doNotOptimizeDependencySink(&datum);
}
#else

template <typename T> inline void doNotOptimizeAway(T &&datum) {
// Taken from Facebook's folly
// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L318-L326
asm volatile("" ::"m"(datum) : "memory");
}

#endif

using namespace fastmod;
template <typename F>
uint64_t time(const F &x, const std::vector<uint64_t> &zomg) {
Expand All @@ -29,7 +45,7 @@ int main() {
std::vector<uint64_t> zomg(10000000);
for (auto &e : zomg)
e = mt();
#ifndef _MSC_VER

const auto M = computeM_u64(mod);
auto fm = time(
[M, mod](uint64_t v) {
Expand All @@ -39,5 +55,5 @@ int main() {
auto sm = time([mod](uint64_t x) { return (x % mod) + (x / mod); }, zomg);
std::fprintf(stderr, "fastmod + fastdiv is %lf as fast as x86 mod + div: \n",
(double)sm / fm);
#endif

}
20 changes: 18 additions & 2 deletions tests/modnbenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,28 @@
#include <iostream>
#include <random>

#ifdef _MSC_VER

// Taken from Facebook's folly
// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L270-L284
#pragma optimize("", off)
inline void doNotOptimizeDependencySink(const void*) {}

#pragma optimize("", on)
template <class T>
void doNotOptimizeAway(const T& datum) {
doNotOptimizeDependencySink(&datum);
}
#else

template <typename T> inline void doNotOptimizeAway(T &&datum) {
// Taken from Facebook's folly
// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L318-L326
asm volatile("" ::"m"(datum) : "memory");
}

#endif

using namespace fastmod;
template <typename F> uint64_t time(const F &x, std::vector<uint64_t> &zomg) {
auto start = std::chrono::high_resolution_clock::now();
Expand All @@ -30,7 +46,7 @@ int main() {
std::vector<uint64_t> zomg(100000000);
for (auto &e : zomg)
e = mt();
#ifndef _MSC_VER

const auto M = computeM_u64(mod);
std::cout << "timing fastmod_u64 " << std::endl;
auto fmtime =
Expand Down Expand Up @@ -59,5 +75,5 @@ int main() {
std::fprintf(stderr,
"masking is %lf as fast as fastmod and %lf as fast as modding\n",
(double)fmtime / masktime, (double)modtime / masktime);
#endif

}
11 changes: 7 additions & 4 deletions tests/unit.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
#ifdef __cplusplus
using namespace fastmod;
#endif
#ifdef _MSC_VER
typedef fastmod_u128_t __uint128_t;
#endif

bool testunsigned64(uint64_t min, uint64_t max, bool verbose) {
for (uint64_t d = min; (d <= max) && (d >= min); d++) {
Expand Down Expand Up @@ -130,7 +133,7 @@ bool testsigned(int32_t min, int32_t max, bool verbose) {
printf("skipping d = 0 as it cannot be supported\n");
continue;
}
if (d == -2147483648) {
if (d == INT32_MIN) {
printf("skipping d = -2147483648 as it is unsupported\n");
continue;
}
Expand Down Expand Up @@ -181,7 +184,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) {
printf("skipping d = -1 as it is not supported\n");
continue;
}
if (d == -2147483648) {
if (d == INT32_MIN) {
printf("skipping d = -2147483648 as it is unsupported\n");
continue;
}
Expand Down Expand Up @@ -228,7 +231,7 @@ int main(int argc, char *argv[]) {
isok = isok && testunsigned64(UINT64_C(0xffffffffff00000), UINT64_C(0xffffffffff00000) + 0x100, verbose);
isok = isok && testunsigned(1, 8, verbose);
isok = isok && testunsigned(0xfffffff8, 0xffffffff, verbose);
isok = isok && testdivsigned(-0x80000000, -0x7ffffff8, verbose);
isok = isok && testdivsigned(INT32_MIN, -0x7ffffff8, verbose);
isok = isok && testdivsigned(2, 10, verbose);
isok = isok && testdivsigned(0x7ffffff8, 0x7fffffff, verbose);
isok = isok && testdivsigned(-10, -2, verbose);
Expand All @@ -239,7 +242,7 @@ int main(int argc, char *argv[]) {
isok = isok && testsigned(-8, -1, verbose);
isok = isok && testsigned(1, 8, verbose);
isok = isok && testsigned(0x7ffffff8, 0x7fffffff, verbose);
isok = isok && testsigned(-0x80000000, -0x7ffffff8, verbose);
isok = isok && testsigned(INT32_MIN, -0x7ffffff8, verbose);
for (int k = 0; k < 100; k++) {
int32_t x = rand();
isok = isok && testsigned(x, x, verbose);
Expand Down
Loading