Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Dockerfile text eol=lf
*.f03 filter=tabs2spaces
*.sql filter=tabs2spaces
*.ino filter=tabs2spaces
*.cu filter=tabs2spaces

# Project files
*.csproj filter=tabs2spaces
Expand Down
40 changes: 34 additions & 6 deletions gfoidl.Stochastics.sln
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "images", "images", "{A741E7
doc\release-notes\images\perf-avg-var-v1.1.0-preview-3.png = doc\release-notes\images\perf-avg-var-v1.1.0-preview-3.png
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gfoidl.Stochastics.Gpu-win-64", "source\gfoidl.Stochastics.Gpu-win-64\gfoidl.Stochastics.Gpu-win-64.vcxproj", "{60D3CEC5-E8DE-45E4-8834-86C9940D942A}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gfoidl.Stochastics.Gpu-win-64.Tests", "tests\gfoidl.Stochastics.Gpu-win-64.Tests\gfoidl.Stochastics.Gpu-win-64.Tests.vcxproj", "{90DA6C71-5A2B-4D26-9EA4-B6BF60A7B7E3}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gfoidl.Stochastics.Gpu-win-64.Tests.Console", "tests\gfoidl.Stochastics.Gpu-win-64.Tests.Console\gfoidl.Stochastics.Gpu-win-64.Tests.Console.vcxproj", "{1C7AA347-76CE-4C7F-A468-B46ADDD93215}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gfoidl.Stochastics.Gpu-linux-64", "source\gfoidl.Stochastics.Gpu-linux-64\gfoidl.Stochastics.Gpu-linux-64.vcxproj", "{0729F977-87F1-4139-A9B1-569BFB4A352E}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand All @@ -83,27 +91,21 @@ Global
{B071CBBC-D096-4E62-A7D1-146FFC9B0431}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{B071CBBC-D096-4E62-A7D1-146FFC9B0431}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B071CBBC-D096-4E62-A7D1-146FFC9B0431}.Debug|x64.ActiveCfg = Debug|Any CPU
{B071CBBC-D096-4E62-A7D1-146FFC9B0431}.Debug|x64.Build.0 = Debug|Any CPU
{B071CBBC-D096-4E62-A7D1-146FFC9B0431}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B071CBBC-D096-4E62-A7D1-146FFC9B0431}.Release|Any CPU.Build.0 = Release|Any CPU
{B071CBBC-D096-4E62-A7D1-146FFC9B0431}.Release|x64.ActiveCfg = Release|Any CPU
{B071CBBC-D096-4E62-A7D1-146FFC9B0431}.Release|x64.Build.0 = Release|Any CPU
{902CB41A-DDB5-4AEB-B5E8-AD6494EF4DB3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{902CB41A-DDB5-4AEB-B5E8-AD6494EF4DB3}.Debug|Any CPU.Build.0 = Debug|Any CPU
{902CB41A-DDB5-4AEB-B5E8-AD6494EF4DB3}.Debug|x64.ActiveCfg = Debug|Any CPU
{902CB41A-DDB5-4AEB-B5E8-AD6494EF4DB3}.Debug|x64.Build.0 = Debug|Any CPU
{902CB41A-DDB5-4AEB-B5E8-AD6494EF4DB3}.Release|Any CPU.ActiveCfg = Release|Any CPU
{902CB41A-DDB5-4AEB-B5E8-AD6494EF4DB3}.Release|Any CPU.Build.0 = Release|Any CPU
{902CB41A-DDB5-4AEB-B5E8-AD6494EF4DB3}.Release|x64.ActiveCfg = Release|Any CPU
{902CB41A-DDB5-4AEB-B5E8-AD6494EF4DB3}.Release|x64.Build.0 = Release|Any CPU
{1B138AD4-AB52-4011-A0CB-C9AD6BB10D7B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{1B138AD4-AB52-4011-A0CB-C9AD6BB10D7B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{1B138AD4-AB52-4011-A0CB-C9AD6BB10D7B}.Debug|x64.ActiveCfg = Debug|Any CPU
{1B138AD4-AB52-4011-A0CB-C9AD6BB10D7B}.Debug|x64.Build.0 = Debug|Any CPU
{1B138AD4-AB52-4011-A0CB-C9AD6BB10D7B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{1B138AD4-AB52-4011-A0CB-C9AD6BB10D7B}.Release|Any CPU.Build.0 = Release|Any CPU
{1B138AD4-AB52-4011-A0CB-C9AD6BB10D7B}.Release|x64.ActiveCfg = Release|Any CPU
{1B138AD4-AB52-4011-A0CB-C9AD6BB10D7B}.Release|x64.Build.0 = Release|Any CPU
{813695B2-09E2-4111-BD6B-18EEF40299C0}.Debug|Any CPU.ActiveCfg = Debug|x64
{813695B2-09E2-4111-BD6B-18EEF40299C0}.Debug|x64.ActiveCfg = Debug|x64
{813695B2-09E2-4111-BD6B-18EEF40299C0}.Release|Any CPU.ActiveCfg = Release|x64
Expand All @@ -120,6 +122,28 @@ Global
{7A4AAA49-6071-4FEA-8920-D8EFC6B8A019}.Release|Any CPU.ActiveCfg = Release|x64
{7A4AAA49-6071-4FEA-8920-D8EFC6B8A019}.Release|x64.ActiveCfg = Release|x64
{7A4AAA49-6071-4FEA-8920-D8EFC6B8A019}.Release|x64.Build.0 = Release|x64
{60D3CEC5-E8DE-45E4-8834-86C9940D942A}.Debug|Any CPU.ActiveCfg = Debug|x64
{60D3CEC5-E8DE-45E4-8834-86C9940D942A}.Debug|x64.ActiveCfg = Debug|x64
{60D3CEC5-E8DE-45E4-8834-86C9940D942A}.Debug|x64.Build.0 = Debug|x64
{60D3CEC5-E8DE-45E4-8834-86C9940D942A}.Release|Any CPU.ActiveCfg = Release|x64
{60D3CEC5-E8DE-45E4-8834-86C9940D942A}.Release|x64.ActiveCfg = Release|x64
{60D3CEC5-E8DE-45E4-8834-86C9940D942A}.Release|x64.Build.0 = Release|x64
{90DA6C71-5A2B-4D26-9EA4-B6BF60A7B7E3}.Debug|Any CPU.ActiveCfg = Debug|x64
{90DA6C71-5A2B-4D26-9EA4-B6BF60A7B7E3}.Debug|x64.ActiveCfg = Debug|x64
{90DA6C71-5A2B-4D26-9EA4-B6BF60A7B7E3}.Debug|x64.Build.0 = Debug|x64
{90DA6C71-5A2B-4D26-9EA4-B6BF60A7B7E3}.Release|Any CPU.ActiveCfg = Release|x64
{90DA6C71-5A2B-4D26-9EA4-B6BF60A7B7E3}.Release|x64.ActiveCfg = Release|x64
{90DA6C71-5A2B-4D26-9EA4-B6BF60A7B7E3}.Release|x64.Build.0 = Release|x64
{1C7AA347-76CE-4C7F-A468-B46ADDD93215}.Debug|Any CPU.ActiveCfg = Debug|x64
{1C7AA347-76CE-4C7F-A468-B46ADDD93215}.Debug|x64.ActiveCfg = Debug|x64
{1C7AA347-76CE-4C7F-A468-B46ADDD93215}.Debug|x64.Build.0 = Debug|x64
{1C7AA347-76CE-4C7F-A468-B46ADDD93215}.Release|Any CPU.ActiveCfg = Release|x64
{1C7AA347-76CE-4C7F-A468-B46ADDD93215}.Release|x64.ActiveCfg = Release|x64
{1C7AA347-76CE-4C7F-A468-B46ADDD93215}.Release|x64.Build.0 = Release|x64
{0729F977-87F1-4139-A9B1-569BFB4A352E}.Debug|Any CPU.ActiveCfg = Debug|x64
{0729F977-87F1-4139-A9B1-569BFB4A352E}.Debug|x64.ActiveCfg = Debug|x64
{0729F977-87F1-4139-A9B1-569BFB4A352E}.Release|Any CPU.ActiveCfg = Release|x64
{0729F977-87F1-4139-A9B1-569BFB4A352E}.Release|x64.ActiveCfg = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand All @@ -137,6 +161,10 @@ Global
{7A4AAA49-6071-4FEA-8920-D8EFC6B8A019} = {EFBEAA6C-2E09-4A10-898C-D20897F3609F}
{2EE5949B-0F90-49E5-B487-CC04C0940BC4} = {7917B2B4-78F0-4BDE-86FE-316549A84E4E}
{A741E7D0-1788-4145-BA6C-2C4698CCECF1} = {2EE5949B-0F90-49E5-B487-CC04C0940BC4}
{60D3CEC5-E8DE-45E4-8834-86C9940D942A} = {0B4051AE-E1F2-4601-B0A8-B4CA660CC4B2}
{90DA6C71-5A2B-4D26-9EA4-B6BF60A7B7E3} = {EFBEAA6C-2E09-4A10-898C-D20897F3609F}
{1C7AA347-76CE-4C7F-A468-B46ADDD93215} = {EFBEAA6C-2E09-4A10-898C-D20897F3609F}
{0729F977-87F1-4139-A9B1-569BFB4A352E} = {0B4051AE-E1F2-4601-B0A8-B4CA660CC4B2}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {A594189A-069D-4FEE-8CD8-1A5786CF97AA}
Expand Down
Binary file added native-out/gfoidl-Stochastics-gpu.dll
Binary file not shown.
Binary file added native-out/libgfoidl-Stochastics-gpu.so
Binary file not shown.
12 changes: 12 additions & 0 deletions source/gfoidl.Stochastics.Gpu-linux-64/SampleStats.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#pragma once
//-----------------------------------------------------------------------------
struct SampleStats
{
double Mean;
double Max;
double Min;
double Delta;
double VarianceCore;
double Skewness;
double Kurtosis;
};
23 changes: 23 additions & 0 deletions source/gfoidl.Stochastics.Gpu-linux-64/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash

set -e

libName=libgfoidl-Stochastics-gpu.so

# https://docs.nvidia.com/cuda/pdf/CUDA_Compiler_Driver_NVCC.pdf
nvcc -std=c++14 \
--disable-warnings \
-Wno-deprecated-declarations \
-o "$libName" \
-O3 \
--ptxas-options=-v \
--machine 64 \
-x cu \
-cudart static \
-shared -rdc=true \
-Xcompiler -fPIC,-fvisibility=hidden \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_35,code=sm_35 \
kernel_utils.cu kernel.cu gpu_core.cu

nm -DC "$libName" | grep " T "
23 changes: 23 additions & 0 deletions source/gfoidl.Stochastics.Gpu-linux-64/dll.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#pragma once
//-----------------------------------------------------------------------------
#if defined _WIN32 || defined __CYGWIN__
#ifdef GFOIDL_STOCHASTICS_GPU_EXPORTS
#define GPU_API __declspec(dllexport)
#else
#define GPU_API __declspec(dllimport)
#endif
#else
#if __GNUC__ >= 4
#define GPU_API __attribute__ ((visibility ("default")))
#else
#define GPU_API
#endif
#endif
//-----------------------------------------------------------------------------
#ifdef __cplusplus
#define BEGIN_EXTERN_C extern "C" {
#define END_EXTERN_C }
#else
#define BEGIN_EXTERN_C
#define END_EXTERN_C
#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{0729f977-87f1-4139-a9b1-569bfb4a352e}</ProjectGuid>
<Keyword>Linux</Keyword>
<RootNamespace>gfoidl_Stochastics_Gpu_linux_64</RootNamespace>
<MinimumVisualStudioVersion>15.0</MinimumVisualStudioVersion>
<ApplicationType>Linux</ApplicationType>
<ApplicationTypeRevision>1.0</ApplicationTypeRevision>
<TargetLinuxPlatform>Generic</TargetLinuxPlatform>
<LinuxProjectType>{2238F9CD-F817-4ECC-BD14-2524D2669B35}</LinuxProjectType>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<UseDebugLibraries>false</UseDebugLibraries>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Label="Shared" />
<ImportGroup Label="PropertySheets" />
<PropertyGroup Label="UserMacros" />
<ItemGroup>
<None Include="build.sh" />
<None Include="gpu_core.cu" />
<None Include="kernel.cu" />
<None Include="kernel_utils.cu" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="dll.h" />
<ClInclude Include="gpu_core.h" />
<ClInclude Include="kernel.h" />
<ClInclude Include="kernel_utils.h" />
<ClInclude Include="SampleStats.h" />
</ItemGroup>
<ItemDefinitionGroup />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
90 changes: 90 additions & 0 deletions source/gfoidl.Stochastics.Gpu-linux-64/gpu_core.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
//#define SINGLE_THREAD
//-----------------------------------------------------------------------------
#include "gpu_core.h"
#include <cuda_runtime.h>
#include "kernel.h"

#if defined(DEBUG) || defined(_DEBUG)
#include <stdio.h>
#include <assert.h>
#endif
//-----------------------------------------------------------------------------
// Forward declarations
inline cudaError_t checkCuda(cudaError_t result);
//-----------------------------------------------------------------------------
bool gpu_available()
{
int deviceCount;
cudaError_t errorId = cudaGetDeviceCount(&deviceCount);

return errorId == cudaSuccess
&& deviceCount > 0;
}
//-----------------------------------------------------------------------------
const char* gpu_get_error_string(const int errorCode)
{
return cudaGetErrorString(static_cast<cudaError>(errorCode));
}
//-----------------------------------------------------------------------------
int gpu_sample_calc_stats(double* sample, const int sampleSize, SampleStats* sampleStats)
{
double* deviceSample;
SampleStats* deviceSampleStats;

try
{
checkCuda(cudaMalloc(&deviceSample, sizeof(double) * sampleSize));
checkCuda(cudaMalloc(&deviceSampleStats, sizeof(SampleStats)));

checkCuda(cudaMemcpy(deviceSample, sample, sizeof(double) * sampleSize, cudaMemcpyHostToDevice));
checkCuda(cudaMemset(deviceSampleStats, 0, sizeof(SampleStats)));

#ifndef SINGLE_THREAD
const int blockSize = 256;
int numBlocks = (sampleSize + blockSize - 1) / blockSize;
#else
const int blockSize = 1;
const int numBlocks = 1;
#endif

#if defined(DEBUG) || defined(_DEBUG)
printf("blockSize: %d\nnumBlocks: %d\n", blockSize, numBlocks);
#endif

// For final fixup of values a separate kernel is queued to the device.
// Otherwise there's no way of syncing all threads in the grid.
Kernel::CalculateAverageAndVarianceCore<<<numBlocks, blockSize>>>(deviceSample, sampleSize, deviceSampleStats);
Kernel::CalculateAverageAndVarianceCoreFinal<<<1, 1>>>(deviceSampleStats, sampleSize);

Kernel::CalculateDeltaSkewnessKurtosis<<<numBlocks, blockSize>>>(deviceSample, sampleSize, deviceSampleStats);

//checkCuda(cudaDeviceSynchronize()); // not necessary
checkCuda(cudaMemcpy(sampleStats, deviceSampleStats, sizeof(SampleStats), cudaMemcpyDeviceToHost));

checkCuda(cudaFree(deviceSample));
checkCuda(cudaFree(deviceSampleStats));

// Final fixup of values, not done yet
// No need to launch a kernel for a simple division.
sampleStats->Delta /= sampleSize;

double sigma = sqrt(sampleStats->VarianceCore / sampleSize);
double t = sampleSize * sigma * sigma * sigma;
sampleStats->Skewness /= t;
sampleStats->Kurtosis /= t * sigma;
}
catch (const int e)
{
return e;
}

return 0;
}
//-----------------------------------------------------------------------------
cudaError_t checkCuda(cudaError_t result)
{
if (result != cudaSuccess)
throw static_cast<int>(result);

return result;
}
13 changes: 13 additions & 0 deletions source/gfoidl.Stochastics.Gpu-linux-64/gpu_core.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once
//-----------------------------------------------------------------------------
#include "dll.h"
#include "SampleStats.h"
//-----------------------------------------------------------------------------
BEGIN_EXTERN_C

GPU_API bool gpu_available();
GPU_API const char* gpu_get_error_string(const int errorCode);

GPU_API int gpu_sample_calc_stats(double* sample, const int sampleSize, SampleStats* sampleStats);

END_EXTERN_C
88 changes: 88 additions & 0 deletions source/gfoidl.Stochastics.Gpu-linux-64/kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include "kernel.h"
#include "kernel_utils.h"
//-----------------------------------------------------------------------------
namespace Kernel
{
__global__
void CalculateAverageAndVarianceCore(const double* sample, const int n, SampleStats* sampleStats)
{
const int index = blockDim.x * blockIdx.x + threadIdx.x;
const int stride = gridDim.x * blockDim.x;

double avg = 0;
double variance = 0;

for (int i = index; i < n; i += stride)
{
avg += sample[i];
variance += sample[i] * sample[i];
}

Utils::TwoDoubles twoDoubles {avg, variance};
twoDoubles = Utils::BlockReduceSum(twoDoubles);

// Final sum in first thread of each block
if (threadIdx.x == 0)
{
#if __CUDA_ARCH__ < 600
Utils::atomicAdd(&sampleStats->Mean , twoDoubles.A);
Utils::atomicAdd(&sampleStats->VarianceCore, twoDoubles.B);
#else
atomicAdd(&sampleStats->Mean , twoDoubles.A);
atomicAdd(&sampleStats->VarianceCore, twoDoubles.B);
#endif
}
}
//-----------------------------------------------------------------------------
__global__
void CalculateAverageAndVarianceCoreFinal(SampleStats* sampleStats, const int n)
{
const int index = blockDim.x * blockIdx.x + threadIdx.x;

if (index == 0)
{
double avg = sampleStats->Mean / n;
sampleStats->Mean = avg;
sampleStats->VarianceCore -= n * avg*avg;
}
}
//-----------------------------------------------------------------------------
__global__
void CalculateDeltaSkewnessKurtosis(const double* sample, const int n, SampleStats* sampleStats)
{
const int index = blockDim.x * blockIdx.x + threadIdx.x;
const int stride = gridDim.x * blockDim.x;

double avg = sampleStats->Mean;
double delta = 0;
double skewness = 0;
double kurtosis = 0;

for (int i = index; i < n; i += stride)
{
double t = sample[i] - avg;
double t1 = t * t*t;

delta += abs(t);
skewness += t1;
kurtosis += t1 * t;
}

Utils::ThreeDoubles threeDoubles {delta, skewness,kurtosis};
threeDoubles = Utils::BlockReduceSum(threeDoubles);

// Final sum in first thread of each block
if (threadIdx.x == 0)
{
#if __CUDA_ARCH__ < 600
Utils::atomicAdd(&sampleStats->Delta , threeDoubles.A);
Utils::atomicAdd(&sampleStats->Skewness, threeDoubles.B);
Utils::atomicAdd(&sampleStats->Kurtosis, threeDoubles.C);
#else
atomicAdd(&sampleStats->Delta , threeDoubles.A);
atomicAdd(&sampleStats->Skewness, threeDoubles.B);
atomicAdd(&sampleStats->Kurtosis, threeDoubles.C);
#endif
}
}
}
Loading