Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# Makefile artifacts
*.o
*.so
# CMake artifacts
/build
cache-opencl.*
bin
profanity2.x64
26 changes: 26 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.10)

project(profanity2)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

include_directories(include/)
aux_source_directory(src/ SOURCES)

set(PROFANITY2_TARGET profanity2.x64)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

add_executable(${PROFANITY2_TARGET} ${SOURCES})

if(APPLE)
target_link_libraries(${PROFANITY2_TARGET} PRIVATE "-framework OpenCL")
target_compile_options(${PROFANITY2_TARGET} PRIVATE -std=c++11 -Wall -mmmx
-O2)
else()
find_package(OpenCL REQUIRED)
target_link_options(${PROFANITY2_TARGET} PRIVATE -s -mcmodel=large)
target_compile_options(${PROFANITY2_TARGET} PRIVATE -std=c++11 -Wall -mmmx
-O2 -mcmodel=large)
target_link_libraries(${PROFANITY2_TARGET} PRIVATE OpenCL)
endif()
26 changes: 0 additions & 26 deletions Makefile

This file was deleted.

3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ $ python3
```

# Usage
```cmake
cmake -S . -B build && cmake --build build
```
```
usage: ./profanity2 [OPTIONS]

Expand Down
278 changes: 139 additions & 139 deletions keccak.cl → cl/keccak.cl
Original file line number Diff line number Diff line change
@@ -1,139 +1,139 @@
/* This Keccak implementation is an amalgamation of:
* Tiny SHA3 implementation by Markku-Juhani O. Saarinen:
* https://github.com/mjosaarinen/tiny_sha3
* Keccak implementation found in xptMiner-gpu @ Github:
* https://github.com/llamasoft/xptMiner-gpu/blob/master/opencl/keccak.cl
*/
typedef union {
uchar b[200];
ulong q[25];
uint d[50];
} ethhash;
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
{ \
t = rotate((ulong)(d0 ^ d1 ^ d2 ^ d3 ^ d4), (ulong)1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \
}
#define THETA(s00, s01, s02, s03, s04, \
s10, s11, s12, s13, s14, \
s20, s21, s22, s23, s24, \
s30, s31, s32, s33, s34, \
s40, s41, s42, s43, s44) \
{ \
TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
s00 ^= t0; s01 ^= t0; s02 ^= t0; s03 ^= t0; s04 ^= t0; \
s10 ^= t1; s11 ^= t1; s12 ^= t1; s13 ^= t1; s14 ^= t1; \
s20 ^= t2; s21 ^= t2; s22 ^= t2; s23 ^= t2; s24 ^= t2; \
s30 ^= t3; s31 ^= t3; s32 ^= t3; s33 ^= t3; s34 ^= t3; \
s40 ^= t4; s41 ^= t4; s42 ^= t4; s43 ^= t4; s44 ^= t4; \
}
#define RHOPI(s00, s01, s02, s03, s04, \
s10, s11, s12, s13, s14, \
s20, s21, s22, s23, s24, \
s30, s31, s32, s33, s34, \
s40, s41, s42, s43, s44) \
{ \
t0 = rotate(s10, (ulong) 1); \
s10 = rotate(s11, (ulong)44); \
s11 = rotate(s41, (ulong)20); \
s41 = rotate(s24, (ulong)61); \
s24 = rotate(s42, (ulong)39); \
s42 = rotate(s04, (ulong)18); \
s04 = rotate(s20, (ulong)62); \
s20 = rotate(s22, (ulong)43); \
s22 = rotate(s32, (ulong)25); \
s32 = rotate(s43, (ulong) 8); \
s43 = rotate(s34, (ulong)56); \
s34 = rotate(s03, (ulong)41); \
s03 = rotate(s40, (ulong)27); \
s40 = rotate(s44, (ulong)14); \
s44 = rotate(s14, (ulong) 2); \
s14 = rotate(s31, (ulong)55); \
s31 = rotate(s13, (ulong)45); \
s13 = rotate(s01, (ulong)36); \
s01 = rotate(s30, (ulong)28); \
s30 = rotate(s33, (ulong)21); \
s33 = rotate(s23, (ulong)15); \
s23 = rotate(s12, (ulong)10); \
s12 = rotate(s21, (ulong) 6); \
s21 = rotate(s02, (ulong) 3); \
s02 = t0; \
}
#define KHI(s00, s01, s02, s03, s04, \
s10, s11, s12, s13, s14, \
s20, s21, s22, s23, s24, \
s30, s31, s32, s33, s34, \
s40, s41, s42, s43, s44) \
{ \
t0 = s00 ^ (~s10 & s20); \
t1 = s10 ^ (~s20 & s30); \
t2 = s20 ^ (~s30 & s40); \
t3 = s30 ^ (~s40 & s00); \
t4 = s40 ^ (~s00 & s10); \
s00 = t0; s10 = t1; s20 = t2; s30 = t3; s40 = t4; \
\
t0 = s01 ^ (~s11 & s21); \
t1 = s11 ^ (~s21 & s31); \
t2 = s21 ^ (~s31 & s41); \
t3 = s31 ^ (~s41 & s01); \
t4 = s41 ^ (~s01 & s11); \
s01 = t0; s11 = t1; s21 = t2; s31 = t3; s41 = t4; \
\
t0 = s02 ^ (~s12 & s22); \
t1 = s12 ^ (~s22 & s32); \
t2 = s22 ^ (~s32 & s42); \
t3 = s32 ^ (~s42 & s02); \
t4 = s42 ^ (~s02 & s12); \
s02 = t0; s12 = t1; s22 = t2; s32 = t3; s42 = t4; \
\
t0 = s03 ^ (~s13 & s23); \
t1 = s13 ^ (~s23 & s33); \
t2 = s23 ^ (~s33 & s43); \
t3 = s33 ^ (~s43 & s03); \
t4 = s43 ^ (~s03 & s13); \
s03 = t0; s13 = t1; s23 = t2; s33 = t3; s43 = t4; \
\
t0 = s04 ^ (~s14 & s24); \
t1 = s14 ^ (~s24 & s34); \
t2 = s24 ^ (~s34 & s44); \
t3 = s34 ^ (~s44 & s04); \
t4 = s44 ^ (~s04 & s14); \
s04 = t0; s14 = t1; s24 = t2; s34 = t3; s44 = t4; \
}
#define IOTA(s00, r) { s00 ^= r; }
__constant ulong keccakf_rndc[24] = {
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};
// Barely a bottleneck. No need to tinker more.
void sha3_keccakf(ethhash * const h)
{
ulong * const st = &h->q;
h->d[33] ^= 0x80000000;
ulong t0, t1, t2, t3, t4;
// Unrolling and removing PI stage gave negligable performance on GTX 1070.
for (int i = 0; i < 24; ++i) {
THETA(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
RHOPI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
KHI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
IOTA(st[0], keccakf_rndc[i]);
}
}
/* This Keccak implementation is an amalgamation of:
* Tiny SHA3 implementation by Markku-Juhani O. Saarinen:
* https://github.com/mjosaarinen/tiny_sha3
* Keccak implementation found in xptMiner-gpu @ Github:
* https://github.com/llamasoft/xptMiner-gpu/blob/master/opencl/keccak.cl
*/

typedef union {
uchar b[200];
ulong q[25];
uint d[50];
} ethhash;

#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
{ \
t = rotate((ulong)(d0 ^ d1 ^ d2 ^ d3 ^ d4), (ulong)1) ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \
}

#define THETA(s00, s01, s02, s03, s04, \
s10, s11, s12, s13, s14, \
s20, s21, s22, s23, s24, \
s30, s31, s32, s33, s34, \
s40, s41, s42, s43, s44) \
{ \
TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
s00 ^= t0; s01 ^= t0; s02 ^= t0; s03 ^= t0; s04 ^= t0; \
s10 ^= t1; s11 ^= t1; s12 ^= t1; s13 ^= t1; s14 ^= t1; \
s20 ^= t2; s21 ^= t2; s22 ^= t2; s23 ^= t2; s24 ^= t2; \
s30 ^= t3; s31 ^= t3; s32 ^= t3; s33 ^= t3; s34 ^= t3; \
s40 ^= t4; s41 ^= t4; s42 ^= t4; s43 ^= t4; s44 ^= t4; \
}

#define RHOPI(s00, s01, s02, s03, s04, \
s10, s11, s12, s13, s14, \
s20, s21, s22, s23, s24, \
s30, s31, s32, s33, s34, \
s40, s41, s42, s43, s44) \
{ \
t0 = rotate(s10, (ulong) 1); \
s10 = rotate(s11, (ulong)44); \
s11 = rotate(s41, (ulong)20); \
s41 = rotate(s24, (ulong)61); \
s24 = rotate(s42, (ulong)39); \
s42 = rotate(s04, (ulong)18); \
s04 = rotate(s20, (ulong)62); \
s20 = rotate(s22, (ulong)43); \
s22 = rotate(s32, (ulong)25); \
s32 = rotate(s43, (ulong) 8); \
s43 = rotate(s34, (ulong)56); \
s34 = rotate(s03, (ulong)41); \
s03 = rotate(s40, (ulong)27); \
s40 = rotate(s44, (ulong)14); \
s44 = rotate(s14, (ulong) 2); \
s14 = rotate(s31, (ulong)55); \
s31 = rotate(s13, (ulong)45); \
s13 = rotate(s01, (ulong)36); \
s01 = rotate(s30, (ulong)28); \
s30 = rotate(s33, (ulong)21); \
s33 = rotate(s23, (ulong)15); \
s23 = rotate(s12, (ulong)10); \
s12 = rotate(s21, (ulong) 6); \
s21 = rotate(s02, (ulong) 3); \
s02 = t0; \
}

#define KHI(s00, s01, s02, s03, s04, \
s10, s11, s12, s13, s14, \
s20, s21, s22, s23, s24, \
s30, s31, s32, s33, s34, \
s40, s41, s42, s43, s44) \
{ \
t0 = s00 ^ (~s10 & s20); \
t1 = s10 ^ (~s20 & s30); \
t2 = s20 ^ (~s30 & s40); \
t3 = s30 ^ (~s40 & s00); \
t4 = s40 ^ (~s00 & s10); \
s00 = t0; s10 = t1; s20 = t2; s30 = t3; s40 = t4; \
\
t0 = s01 ^ (~s11 & s21); \
t1 = s11 ^ (~s21 & s31); \
t2 = s21 ^ (~s31 & s41); \
t3 = s31 ^ (~s41 & s01); \
t4 = s41 ^ (~s01 & s11); \
s01 = t0; s11 = t1; s21 = t2; s31 = t3; s41 = t4; \
\
t0 = s02 ^ (~s12 & s22); \
t1 = s12 ^ (~s22 & s32); \
t2 = s22 ^ (~s32 & s42); \
t3 = s32 ^ (~s42 & s02); \
t4 = s42 ^ (~s02 & s12); \
s02 = t0; s12 = t1; s22 = t2; s32 = t3; s42 = t4; \
\
t0 = s03 ^ (~s13 & s23); \
t1 = s13 ^ (~s23 & s33); \
t2 = s23 ^ (~s33 & s43); \
t3 = s33 ^ (~s43 & s03); \
t4 = s43 ^ (~s03 & s13); \
s03 = t0; s13 = t1; s23 = t2; s33 = t3; s43 = t4; \
\
t0 = s04 ^ (~s14 & s24); \
t1 = s14 ^ (~s24 & s34); \
t2 = s24 ^ (~s34 & s44); \
t3 = s34 ^ (~s44 & s04); \
t4 = s44 ^ (~s04 & s14); \
s04 = t0; s14 = t1; s24 = t2; s34 = t3; s44 = t4; \
}

#define IOTA(s00, r) { s00 ^= r; }

__constant ulong keccakf_rndc[24] = {
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};

// Barely a bottleneck. No need to tinker more.
void sha3_keccakf(ethhash * const h)
{
ulong * const st = &h->q;
h->d[33] ^= 0x80000000;
ulong t0, t1, t2, t3, t4;

// Unrolling and removing PI stage gave negligable performance on GTX 1070.
for (int i = 0; i < 24; ++i) {
THETA(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
RHOPI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
KHI(st[0], st[5], st[10], st[15], st[20], st[1], st[6], st[11], st[16], st[21], st[2], st[7], st[12], st[17], st[22], st[3], st[8], st[13], st[18], st[23], st[4], st[9], st[14], st[19], st[24]);
IOTA(st[0], keccakf_rndc[i]);
}
}
Loading