Skip to content
This repository was archived by the owner on Sep 15, 2022. It is now read-only.
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: johguse/profanity
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: v1.4
Choose a base ref
...
head repository: johguse/profanity
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: master
Choose a head ref
  • 7 commits
  • 6 files changed
  • 1 contributor

Commits on Jul 28, 2019

  1. Add debug output where important kernels are timed in milliseconds. S…

    …plit main memory area for points into two separate areas for X and Y coordinates respectively to improve memory access performance. Switch from Montgomery multiplication to improved interleaved Barrett reduction, eliminating two modular multiplications per point when moving out of Montgomery form every iteration. Add a ton of explanations to the program.
    johguse committed Jul 28, 2019
    Copy the full SHA
    0b54ffc View commit details
  2. Copy the full SHA
    7730859 View commit details

Commits on Aug 1, 2019

  1. New major mathematical optimization giving boost of up to 14% on my n…

    …Vidia GTX 1070. Number of multiplications during point addition phase has been reduced from 3 to 2. In addition to this a visible progress counter has been added during initialization.
    johguse committed Aug 1, 2019
    1
    Copy the full SHA
    90f2c21 View commit details
  2. Copy the full SHA
    75afbad View commit details

Commits on Sep 14, 2022

  1. Copy the full SHA
    d22adaf View commit details

Commits on Sep 15, 2022

  1. Copy the full SHA
    182fc51 View commit details
  2. Make project uncompilable. Update README to reflect that this project…

    … will not receive further updates and will be archived to further reduce risk that someone uses it.
    johguse committed Sep 15, 2022
    Copy the full SHA
    186b9b2 View commit details
Showing with 8,535 additions and 8,486 deletions.
  1. +69 −41 Dispatcher.cpp
  2. +9 −7 Dispatcher.hpp
  3. +7 −78 README.md
  4. +8,162 −8,163 precomp.cpp
  5. +286 −197 profanity.cl
  6. +2 −0 profanity.cpp
110 changes: 69 additions & 41 deletions Dispatcher.cpp
Original file line number Diff line number Diff line change
@@ -55,6 +55,14 @@ static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score
std::cout << ": 0x" << strPublic << std::endl;
}

// Returns the execution time, in microseconds, of the command associated with `e`.
//
// Blocks until the event has completed, then reads the profiling timestamps for
// command start and command end (OpenCL reports these in nanoseconds) and
// converts the difference to microseconds.
//
// Returns 0 when the wait fails, when either timestamp cannot be queried (for
// example because the queue was not created with CL_QUEUE_PROFILING_ENABLE), or
// when the reported end precedes the reported start.
unsigned int getKernelExecutionTimeMicros(cl_event & e) {
	cl_ulong timeStart = 0, timeEnd = 0;

	if (clWaitForEvents(1, &e) != CL_SUCCESS) {
		return 0;
	}

	// Both timestamps are required; a partial result would yield a meaningless duration.
	if (clGetEventProfilingInfo(e, CL_PROFILING_COMMAND_START, sizeof(timeStart), &timeStart, NULL) != CL_SUCCESS ||
		clGetEventProfilingInfo(e, CL_PROFILING_COMMAND_END, sizeof(timeEnd), &timeEnd, NULL) != CL_SUCCESS) {
		return 0;
	}

	// Guard against unsigned wrap-around should the driver report end < start.
	if (timeEnd < timeStart) {
		return 0;
	}

	// Nanoseconds -> microseconds; the narrowing to the declared return type is intentional.
	return static_cast<unsigned int>((timeEnd - timeStart) / 1000);
}

Dispatcher::OpenCLException::OpenCLException(const std::string s, const cl_int res) :
std::runtime_error( s + " (res = " + toString(res) + ")"),
m_res(res)
@@ -70,10 +78,16 @@ void Dispatcher::OpenCLException::OpenCLException::throwIfError(const std::strin

// Creates the OpenCL command queue for the given context and device.
// Throws std::runtime_error when the OpenCL call returns a NULL queue.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The two
// consecutive `ret` declarations in each #ifdef branch appear to be the removed
// and the added version of the same line - confirm against the repository.
cl_command_queue Dispatcher::Device::createQueue(cl_context & clContext, cl_device_id & clDeviceId) {
// nVidia CUDA Toolkit 10.1 only supports OpenCL 1.2 so we revert back to older functions for compatibility
#ifdef PROFANITY_DEBUG
// Debug builds request profiling on the queue.
cl_command_queue_properties p = CL_QUEUE_PROFILING_ENABLE;
#else
// Non-debug builds request no queue properties.
cl_command_queue_properties p = NULL;
#endif

#ifdef CL_VERSION_2_0
const cl_command_queue ret = clCreateCommandQueueWithProperties(clContext, clDeviceId, NULL, NULL);
// NOTE(review): clCreateCommandQueueWithProperties presumably expects a zero-terminated list of
// property name/value pairs (e.g. CL_QUEUE_PROPERTIES followed by the bitfield) rather than a
// pointer to the bitfield itself; passing &p looks wrong when p is non-zero - verify against the OpenCL 2.0 spec.
const cl_command_queue ret = clCreateCommandQueueWithProperties(clContext, clDeviceId, &p, NULL);
#else
const cl_command_queue ret = clCreateCommandQueue(clContext, clDeviceId, NULL, NULL);
const cl_command_queue ret = clCreateCommandQueue(clContext, clDeviceId, p, NULL);
#endif
// The error code out-parameter is passed as NULL above, so failure is only detected via a NULL queue.
return ret == NULL ? throw std::runtime_error("failed to create command queue") : ret;
}
@@ -113,15 +127,15 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr
m_worksizeLocal(worksizeLocal),
m_clScoreMax(0),
m_clQueue(createQueue(clContext, clDeviceId) ),
m_kernelBegin( createKernel(clProgram, "profanity_begin") ),
m_kernelInverse(createKernel(clProgram, "profanity_inverse_multiple")),
m_kernelInversePost(createKernel(clProgram, "profanity_inverse_post")),
m_kernelEnd(createKernel(clProgram, "profanity_end")),
m_kernelInit( createKernel(clProgram, "profanity_init") ),
m_kernelInverse(createKernel(clProgram, "profanity_inverse")),
m_kernelIterate(createKernel(clProgram, "profanity_iterate")),
m_kernelTransform( mode.transformKernel() == "" ? NULL : createKernel(clProgram, mode.transformKernel())),
m_kernelScore(createKernel(clProgram, mode.kernel)),
m_memPrecomp(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, sizeof(g_precomp), g_precomp),
m_memPoints(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
m_memInverse(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
m_memPointsDeltaX(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
m_memInversedNegativeDoubleGy(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
m_memPrevLambda(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, PROFANITY_MAX_SCORE + 1),
m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20),
@@ -190,6 +204,8 @@ void Dispatcher::init() {
std::cout << std::endl;

const auto deviceCount = m_vDevices.size();
m_sizeInitTotal = m_size * deviceCount;
m_sizeInitDone = 0;

cl_event * const pInitEvents = new cl_event[deviceCount];

@@ -223,30 +239,28 @@ void Dispatcher::initBegin(Device & d) {
d.m_memData2.write(true);

// Kernel arguments - profanity_begin
d.m_memPrecomp.setKernelArg(d.m_kernelBegin, 0);
d.m_memPoints.setKernelArg(d.m_kernelBegin, 1);
d.m_memResult.setKernelArg(d.m_kernelBegin, 2);
CLMemory<cl_ulong4>::setKernelArg(d.m_kernelBegin, 3, d.m_clSeed);
d.m_memPrecomp.setKernelArg(d.m_kernelInit, 0);
d.m_memPointsDeltaX.setKernelArg(d.m_kernelInit, 1);
d.m_memPrevLambda.setKernelArg(d.m_kernelInit, 2);
d.m_memResult.setKernelArg(d.m_kernelInit, 3);
CLMemory<cl_ulong4>::setKernelArg(d.m_kernelInit, 4, d.m_clSeed);

// Kernel arguments - profanity_inverse
d.m_memPoints.setKernelArg(d.m_kernelInverse, 0);
d.m_memInverse.setKernelArg(d.m_kernelInverse, 1);

// Kernel arguments - profanity_inverse_post
d.m_memPoints.setKernelArg(d.m_kernelInversePost, 0);
d.m_memInverse.setKernelArg(d.m_kernelInversePost, 1);
d.m_memPointsDeltaX.setKernelArg(d.m_kernelInverse, 0);
d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelInverse, 1);

// Kernel arguments - profanity_end
d.m_memPoints.setKernelArg(d.m_kernelEnd, 0);
d.m_memInverse.setKernelArg(d.m_kernelEnd, 1);
// Kernel arguments - profanity_iterate
d.m_memPointsDeltaX.setKernelArg(d.m_kernelIterate, 0);
d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelIterate, 1);
d.m_memPrevLambda.setKernelArg(d.m_kernelIterate, 2);

// Kernel arguments - profanity_transform_*
if(d.m_kernelTransform) {
d.m_memInverse.setKernelArg(d.m_kernelTransform, 0);
d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelTransform, 0);
}

// Kernel arguments - profanity_score_*
d.m_memInverse.setKernelArg(d.m_kernelScore, 0);
d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelScore, 0);
d.m_memResult.setKernelArg(d.m_kernelScore, 1);
d.m_memData1.setKernelArg(d.m_kernelScore, 2);
d.m_memData2.setKernelArg(d.m_kernelScore, 3);
@@ -259,11 +273,16 @@ void Dispatcher::initBegin(Device & d) {

void Dispatcher::initContinue(Device & d) {
size_t sizeLeft = m_size - d.m_sizeInitialized;
const size_t sizeInitLimit = m_size / 20;

// Print progress
const size_t percentDone = m_sizeInitDone * 100 / m_sizeInitTotal;
std::cout << " " << percentDone << "%\r" << std::flush;

if (sizeLeft) {
cl_event event;
const size_t sizeRun = std::min(sizeLeft, m_worksizeMax);
const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelBegin, 1, &d.m_sizeInitialized, &sizeRun, NULL, 0, NULL, &event);
const size_t sizeRun = std::min(sizeInitLimit, std::min(sizeLeft, m_worksizeMax));
const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelInit, 1, &d.m_sizeInitialized, &sizeRun, NULL, 0, NULL, &event);
OpenCLException::throwIfError("kernel queueing failed during initilization", resEnqueue);

// See: https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clSetEventCallback.html
@@ -274,7 +293,9 @@ void Dispatcher::initContinue(Device & d) {
// clFlush on the queue before returning or arrange for clFlush to be called later on another thread.
clFlush(d.m_clQueue);

std::lock_guard<std::mutex> lock(m_mutex);
d.m_sizeInitialized += sizeRun;
m_sizeInitDone += sizeRun;

const auto resCallback = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d);
OpenCLException::throwIfError("failed to set custom callback during initialization", resCallback);
@@ -286,37 +307,29 @@ void Dispatcher::initContinue(Device & d) {
}
}

// Enqueues clKernel on clQueue for worksizeGlobal work items, split into runs of at most
// m_worksizeMax items each; every run starts at the global offset where the previous one ended.
// A worksizeLocal of 0 passes NULL as the local work size to clEnqueueNDRangeKernel.
// Enqueue failures are reported through OpenCLException::throwIfError.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The two signatures, the two
// clEnqueueNDRangeKernel lines and the bOneAtATime section appear to be removed/added variants of
// the same function - confirm against the repository.
void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, const bool bOneAtATime = false) {
void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, cl_event * pEvent = NULL) {
const size_t worksizeMax = m_worksizeMax;
size_t worksizeOffset = 0;
cl_event clEvent;
while (worksizeGlobal) {
// Never enqueue more than worksizeMax items in a single call.
const size_t worksizeRun = std::min(worksizeGlobal, worksizeMax);
const size_t * const pWorksizeLocal = (worksizeLocal == 0 ? NULL : &worksizeLocal);
const auto res = clEnqueueNDRangeKernel(clQueue, clKernel, 1, &worksizeOffset, &worksizeRun, pWorksizeLocal, 0, NULL, bOneAtATime ? &clEvent : NULL);
// NOTE(review): the same pEvent is handed to every iteration, so when the range is split into
// several runs each call writes to the same cl_event and only the last one stays reachable -
// confirm whether earlier events can leak or whether callers always fit in a single run.
const auto res = clEnqueueNDRangeKernel(clQueue, clKernel, 1, &worksizeOffset, &worksizeRun, pWorksizeLocal, 0, NULL, pEvent);
OpenCLException::throwIfError("kernel queueing failed", res);

// Queueing lots of work exhausted resources on my GTX 1070 during initialization. I don't really know why. Correlated with worksizeMax.
if (bOneAtATime) {
clWaitForEvents(1, &clEvent);
clReleaseEvent(clEvent);
clEvent = NULL;
}

// Advance to the next run.
worksizeGlobal -= worksizeRun;
worksizeOffset += worksizeRun;
}
}

void Dispatcher::enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, const bool bOneAtATime = false) {
void Dispatcher::enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, cl_event * pEvent = NULL) {
try {
enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal, bOneAtATime);
enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal, pEvent);
} catch ( OpenCLException & e ) {
// If local work size is invalid, abandon it and let implementation decide
if ((e.m_res == CL_INVALID_WORK_GROUP_SIZE || e.m_res == CL_INVALID_WORK_ITEM_SIZE) && d.m_worksizeLocal != 0) {
std::cout << std::endl << "warning: local work size abandoned on GPU" << d.m_index << std::endl;
d.m_worksizeLocal = 0;
enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal, bOneAtATime);
enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal, pEvent);
}
else {
throw;
@@ -328,9 +341,16 @@ void Dispatcher::dispatch(Device & d) {
cl_event event;
d.m_memResult.read(false, &event);

#ifdef PROFANITY_DEBUG
cl_event eventInverse;
cl_event eventIterate;

enqueueKernelDevice(d, d.m_kernelInverse, m_size / m_inverseSize, &eventInverse);
enqueueKernelDevice(d, d.m_kernelIterate, m_size, &eventIterate);
#else
enqueueKernelDevice(d, d.m_kernelInverse, m_size / m_inverseSize);
enqueueKernelDevice(d, d.m_kernelInversePost, m_size);
enqueueKernelDevice(d, d.m_kernelEnd, m_size);
enqueueKernelDevice(d, d.m_kernelIterate, m_size);
#endif

if (d.m_kernelTransform) {
enqueueKernelDevice(d, d.m_kernelTransform, m_size);
@@ -339,6 +359,14 @@ void Dispatcher::dispatch(Device & d) {
enqueueKernelDevice(d, d.m_kernelScore, m_size);
clFlush(d.m_clQueue);

#ifdef PROFANITY_DEBUG
// We're actually not allowed to call clFinish here because this function is ultimately asynchronously called by OpenCL.
// However, this happens to work on my computer and it's not really intended for release, just something to aid me in
// optimizations.
clFinish(d.m_clQueue);
std::cout << "Timing: profanity_inverse = " << getKernelExecutionTimeMicros(eventInverse) << "us, profanity_iterate = " << getKernelExecutionTimeMicros(eventIterate) << "us" << std::endl;
#endif

const auto res = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d);
OpenCLException::throwIfError("failed to set custom callback", res);
}
@@ -374,8 +402,8 @@ void Dispatcher::onEvent(cl_event event, cl_int status, Device & d) {
else if (d.m_eventFinished != NULL) {
initContinue(d);
} else {
handleResult(d);
++d.m_round;
handleResult(d);

bool bDispatch = true;
{
16 changes: 9 additions & 7 deletions Dispatcher.hpp
Original file line number Diff line number Diff line change
@@ -49,16 +49,16 @@ class Dispatcher {
cl_uchar m_clScoreMax;
cl_command_queue m_clQueue;

cl_kernel m_kernelBegin;
cl_kernel m_kernelInit;
cl_kernel m_kernelInverse;
cl_kernel m_kernelInversePost;
cl_kernel m_kernelEnd;
cl_kernel m_kernelIterate;
cl_kernel m_kernelTransform;
cl_kernel m_kernelScore;

CLMemory<point> m_memPrecomp;
CLMemory<point> m_memPoints;
CLMemory<mp_number> m_memInverse;
CLMemory<mp_number> m_memPointsDeltaX;
CLMemory<mp_number> m_memInversedNegativeDoubleGy;
CLMemory<mp_number> m_memPrevLambda;
CLMemory<result> m_memResult;

// Data parameters used in some modes
@@ -90,8 +90,8 @@ class Dispatcher {
void initContinue(Device & d);

void dispatch(Device & d);
void enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, const bool bSynchronous);
void enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, const bool bSynchronous);
void enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel, size_t worksizeGlobal, const size_t worksizeLocal, cl_event * pEvent);
void enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, cl_event * pEvent);

void handleResult(Device & d);
void randomizeSeed(Device & d);
@@ -124,6 +124,8 @@ class Dispatcher {
std::chrono::time_point<std::chrono::steady_clock> timeStart;
unsigned int m_countPrint;
unsigned int m_countRunning;
size_t m_sizeInitTotal;
size_t m_sizeInitDone;
bool m_quit;
};

85 changes: 7 additions & 78 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,83 +1,12 @@
# profanity
Profanity is a high performance (probably the fastest!) vanity address generator for Ethereum. Create cool customized addresses that you never realized you needed! Receive Ether in style! Wow!
This project was abandoned by me a couple of years ago. Fundamental security issues in the generation of private keys have been brought to my attention. See: https://github.com/johguse/profanity/issues/61

![Screenshot](/img/screenshot.png?raw=true "Wow! That's a lot of zeros!")
I strongly advise against using this tool in its current state. This repository will soon be further updated with additional information regarding this critical issue.

### Releases
Latest release compiled for 64-bit Windows & Linux can be found [here](https://github.com/johguse/profanity/releases).
## 2022-09-15
All affected binaries have been removed to prevent further unsafe use of this tool, please see the following article for more information:

### Disclaimer
Always verify that a private key generated by this program corresponds to the public key printed by importing it to a wallet of your choice. This program like any software might contain bugs and it does by design cut corners to improve overall performance.
https://blog.1inch.io/a-vulnerability-disclosed-in-profanity-an-ethereum-vanity-address-tool-68ed7455fc8c

### Usage
```
usage: ./profanity [OPTIONS]
Basic modes:
--benchmark Run without any scoring, a benchmark.
--zeros Score on zeros anywhere in hash.
--letters Score on letters anywhere in hash.
--numbers Score on numbers anywhere in hash.
--mirror Score on mirroring from center.
--leading-doubles Score on hashes leading with hexadecimal pairs
Modes with arguments:
--leading <single hex> Score on hashes leading with given hex character.
--matching <hex string> Score on hashes matching given hex string.
Advanced modes:
--contract Instead of account address, score the contract
address created by the account's zeroth transaction.
--leading-range Scores on hashes leading with characters within
given range.
--range Scores on hashes having characters within given
range anywhere.
Range:
-m, --min <0-15> Set range minimum (inclusive), 0 is '0' 15 is 'f'.
-M, --max <0-15> Set range maximum (inclusive), 0 is '0' 15 is 'f'.
Device control:
-s, --skip <index> Skip device given by index.
-n, --no-cache Don't load cached pre-compiled version of kernel.
Tweaking:
-w, --work <size> Set OpenCL local work size. [default = 64]
-W, --work-max <size> Set OpenCL maximum work size. [default = -i * -I]
-i, --inverse-size Set size of modular inverses to calculate in one
work item. [default = 255]
-I, --inverse-multiple Set how many above work items will run in
parallell. [default = 16384]
Examples:
./profanity --leading f
./profanity --matching dead
./profanity --matching badXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXbad
./profanity --leading-range -m 0 -M 1
./profanity --leading-range -m 10 -M 12
./profanity --range -m 0 -M 1
./profanity --contract --leading 0
About:
profanity is a vanity address generator for Ethereum that utilizes
computing power from GPUs using OpenCL.
Author: Johan Gustafsson <profanity@johgu.se>
Beer donations: 0x000dead000ae1c8e8ac27103e4ff65f42a4e9203
```

### Benchmarks - Current version
|Model|Clock Speed|Memory Speed|Modified straps|Speed|Time to match eight characters
|:-:|:-:|:-:|:-:|:-:|:-:|
|GTX 1070 OC|1950|4450|NO|138.0 MH/s| ~31s
|GTX 1070|1750|4000|NO|122.0 MH/s| ~35s
|RX 480|1328|2000|YES|103.0 MH/s| ~42s

### Benchmarks - Outdated versions
|Model|Clock Speed|Memory Speed|Modified straps|Speed|Time to match eight characters|Version
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|RX VEGA56|1408|1100|YES|146 MH/s| ~29 s | 1.1x
|R9 290|1150|1400|NO|100 MH/s| ~43 s | 1.1x
|RX 580|1366|1750|YES|92 MH/s| ~47 s| 1.2x
|R9 290|1040|1300|NO|91 MH/s| ~47 s | 1.1x
|RX 470|1216|1750|YES|73 MH/s| ~59s | 1.2x
## 2022-09-15
As per issue 76 (https://github.com/johguse/profanity/issues/76) I've decided to also archive this repository to further reduce the risk that someone uses this tool. The code will not receive any updates and I've left it in an uncompilable state. Use something else!
Loading