Skip to content

Commit

Permalink
[bug] fix gpu photon-sharing hanging, support mmc binary mode, close #86
Browse files Browse the repository at this point in the history
  • Loading branch information
fangq committed Oct 15, 2024
1 parent b3f9739 commit 072fafe
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 78 deletions.
14 changes: 1 addition & 13 deletions mmclab/mmc2json.m
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@
% mcxpreview supports the cfg input for both mcxlab and mmclab.
% filestub: the filestub is the name stub for all output files,including
% filestub.json: the JSON input file
% filestub_vol.bin: the volume file if cfg.vol is defined
% filestub_shapes.json: the domain shape file if cfg.shapes is defined
% filestub_pattern.bin: the domain shape file if cfg.pattern is defined
%
% if filestub ends with '.json', then mmc2json saves the mesh data
% in the single-file mode, and the mesh information is stored
Expand Down Expand Up @@ -58,16 +55,7 @@
end
end
if (isfield(cfg, 'srcpattern') && ~isempty(cfg.srcpattern))
Optode.Source.Pattern.Nx = size(cfg.srcpattern, 1);
Optode.Source.Pattern.Ny = size(cfg.srcpattern, 2);
Optode.Source.Pattern.Nz = size(cfg.srcpattern, 3);
Optode.Source.Pattern.Data = single(cfg.srcpattern);
if (~singlefile)
Optode.Source.Pattern.Data = [filestub '_pattern.bin'];
fid = fopen(Optode.Source.Pattern.Data, 'wb');
fwrite(fid, cfg.srcpattern, 'float32');
fclose(fid);
end
Optode.Source.Pattern = single(cfg.srcpattern);
end

%% define the domain and optical properties
Expand Down
73 changes: 50 additions & 23 deletions src/mmc_cl_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata; /*read-write buffers*/
cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/

cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum;
cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer
cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; /**< total output data length in float count per time-frame */
cfg->crop0.w = meshlen * cfg->maxgate; /**< total output data length, before double-buffer expansion */

cl_float* field, *dref = NULL;

Expand All @@ -107,6 +107,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {

GPUInfo* gpu = NULL;
float4* propdet;
double energytot = 0.0, energyesc = 0.0;

MCXParam param = {{{cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z}}, {{cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z}},
cfg->tstart, cfg->tend, (uint)cfg->isreflect, (uint)cfg->issavedet, (uint)cfg->issaveexit,
Expand All @@ -124,7 +125,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout,
cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}},
mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}},
cfg->e0, cfg->isextdet, meshlen, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
cfg->e0, cfg->isextdet, (meshlen / cfg->srcnum), (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
(MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
cfg->issaveseed, cfg->seed, cfg->maxjumpdebug
};
Expand All @@ -146,6 +147,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {

param.reclen = sharedmemsize / sizeof(float);

sharedmemsize += sizeof(cl_float) * (cfg->srcnum << 1); /**< store energyesc/energytot */

if (cfg->srctype == stPattern && cfg->srcnum > 1) {
sharedmemsize += sizeof(cl_float) * cfg->srcnum;
}
Expand Down Expand Up @@ -256,7 +259,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
Pphotonseed = (RandType*)calloc(cfg->maxdetphoton, (sizeof(RandType) * RAND_BUF_LEN));
}

fieldlen = meshlen * cfg->maxgate;
fieldlen = cfg->crop0.w; /**< total float counts of the output buffer, before double-buffer expansion (x2) for improving saving accuracy */

if (cfg->seed > 0) {
srand(cfg->seed);
Expand Down Expand Up @@ -310,7 +313,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
OCL_ASSERT(((gparam[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(MCXParam), &param, &status), status)));

Pseed = (cl_uint*)malloc(sizeof(cl_uint) * gpu[i].autothread * RAND_SEED_WORD_LEN);
energy = (cl_float*)calloc(sizeof(cl_float), gpu[i].autothread << 1);
energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[i].autothread << 1);

for (j = 0; j < gpu[i].autothread * RAND_SEED_WORD_LEN; j++) {
Pseed[j] = rand();
Expand All @@ -331,7 +334,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
OCL_ASSERT(((gdebugdata[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (debuglen * cfg->maxjumpdebug), cfg->exportdebugdata, &status), status)));
}

OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1), energy, &status), status)));
OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1) * cfg->srcnum, energy, &status), status)));
OCL_ASSERT(((gdetected[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(cl_uint), &detected, &status), status)));
OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status)));

Expand Down Expand Up @@ -509,6 +512,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
MMC_FPRINTF(cfg->flog, "- [device %d(%d): %s] threadph=%d oddphotons=%d np=%.1f nthread=%d nblock=%d repetition=%d\n", i, gpu[i].id, gpu[i].name, threadphoton, oddphotons,
cfg->nphoton * cfg->workload[i] / fullload, (int)gpu[i].autothread, (int)gpu[i].autoblock, cfg->respin);

MMC_FPRINTF(cfg->flog, "requesting %d bytes of shared memory\n", sharedmemsize * (int)gpu[i].autoblock);

OCL_ASSERT(((mcxkernel[i] = clCreateKernel(mcxprogram, "mmc_main_loop", &status), status)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons)));
Expand Down Expand Up @@ -556,8 +561,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
cfg->exportdebugdata = (float*)calloc(sizeof(float), (debuglen * cfg->maxjumpdebug));
}

cfg->energytot = 0.f;
cfg->energyesc = 0.f;
cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double));
cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double));
energytot = 0.0;
energyesc = 0.0;
cfg->runtime = 0;

//simulate for all time-gates in maxgate groups per run
Expand Down Expand Up @@ -636,13 +643,17 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
reporter.raytet += rep.raytet;
reporter.jumpdebug += rep.jumpdebug;

energy = (cl_float*)calloc(sizeof(cl_float), gpu[devid].autothread << 1);
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1),
energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[devid].autothread << 1);
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1) * cfg->srcnum,
energy, 0, NULL, NULL)));

for (i = 0; i < gpu[devid].autothread; i++) {
cfg->energyesc += energy[(i << 1)];
cfg->energytot += energy[(i << 1) + 1];
for (j = 0; j < cfg->srcnum; j++) {
cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j];
cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j];
energyesc += energy[(i << 1) * cfg->srcnum + j];
energytot += energy[((i << 1) + 1) * cfg->srcnum + j];
}
}

free(energy);
Expand Down Expand Up @@ -707,6 +718,7 @@ is more than what your have specified (%d), please use the -H option to specify
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gdref[devid], CL_TRUE, 0, sizeof(float)*nflen,
rawdref, 0, NULL, NULL)));

//TODO: saving dref has not yet adopting double-buffer
for (i = 0; i < nflen; i++) { //accumulate field, can be done in the GPU
dref[i] += rawdref[i]; //+rawfield[i+fieldlen];
}
Expand All @@ -724,7 +736,7 @@ is more than what your have specified (%d), please use the -H option to specify
mcx_fflush(cfg->flog);

for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU
field[i] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen];
field[i] += rawfield[i] + rawfield[i + fieldlen];
}

free(rawfield);
Expand Down Expand Up @@ -754,15 +766,20 @@ is more than what your have specified (%d), please use the -H option to specify
cfg->exportfield[i] += field[i];
}
} else {
for (i = 0; i < cfg->maxgate; i++)
int srcid;

for (i = 0; i < cfg->maxgate; i++) {
for (j = 0; j < mesh->ne; j++) {
float ww = field[i * mesh->ne + j] * 0.25f;
int k;
for (srcid = 0; srcid < cfg->srcnum; srcid++) {
float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f;
int k;

for (k = 0; k < mesh->elemlen; k++) {
cfg->exportfield[i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1] += ww;
for (k = 0; k < mesh->elemlen; k++) {
cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww;
}
}
}
}
}
}

Expand All @@ -773,11 +790,17 @@ is more than what your have specified (%d), please use the -H option to specify
}

if (cfg->isnormalized) {
MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t");
mcx_fflush(cfg->flog);
double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0;

for (j = 0; j < cfg->srcnum; j++) {
energyabs = cfg->energytot[j] - cfg->energyesc[j];
cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j);
sum_normalizer += cur_normalizer;
MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: "S_BOLD""S_BLUE"%5.5f%%"S_RESET"\tnormalizor=%g\n",
j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer));
}

cfg->energyabs = cfg->energytot - cfg->energyesc;
mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0);
cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources
}

#ifndef MCX_CONTAINER
Expand Down Expand Up @@ -817,7 +840,7 @@ is more than what your have specified (%d), please use the -H option to specify
MMC_FPRINTF(cfg->flog, "simulated %zu photons (%zu) with %d devices (ray-tet %.0f)\nMCX simulation speed: %.2f photon/ms\n",
cfg->nphoton, cfg->nphoton, workdev, reporter.raytet, (double)cfg->nphoton / toc);
MMC_FPRINTF(cfg->flog, "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to initial specular reflection is excluded in the total)\n",
cfg->energytot, (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f);
energytot, (energytot - energyesc) / energytot * 100.f);
mcx_fflush(cfg->flog);

OCL_ASSERT(clReleaseMemObject(gprogress[0]));
Expand Down Expand Up @@ -899,6 +922,10 @@ is more than what your have specified (%d), please use the -H option to specify
free(mcxkernel);

free(waittoread);
free(cfg->energytot);
free(cfg->energyesc);
cfg->energytot = NULL;
cfg->energyesc = NULL;

if (gpu) {
free(gpu);
Expand Down
39 changes: 25 additions & 14 deletions src/mmc_core.cl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ inline __device__ __host__ int get_global_id(int idx) {
inline __device__ __host__ int get_local_id(int idx) {
return (idx == 0) ? threadIdx.x : ( (idx == 1) ? threadIdx.y : threadIdx.z );
}
inline __device__ __host__ int get_local_size(int idx) {
return (idx == 0) ? blockDim.x : ( (idx == 1) ? blockDim.y : blockDim.z );
}
inline __device__ __host__ float3 operator *(float3 a, float3 b) {
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
Expand Down Expand Up @@ -480,7 +483,6 @@ __device__ inline float atomicadd(volatile __global float* address, const float

#endif

#ifdef MCX_SAVE_DETECTORS
__device__ void clearpath(__local float* p, int len) {
uint i;

Expand All @@ -489,6 +491,7 @@ __device__ void clearpath(__local float* p, int len) {
}
}

#ifdef MCX_SAVE_DETECTORS
__device__ uint finddetector(float3* p0, __constant float4* gmed, __constant MCXParam* gcfg) {
uint i;

Expand Down Expand Up @@ -1443,7 +1446,7 @@ __device__ void launchnewphoton(__constant MCXParam* gcfg, ray* r, __global floa

__device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXParam* gcfg, __global float3* node, __global int* elem, __global float* weight, __global float* dref,
__global int* type, __global int* facenb, __global int* srcelem, __global float4* normal, __constant Medium* gmed,
__global float* n_det, __global uint* detectedphoton, float* energytot, float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern,
__global float* n_det, __global uint* detectedphoton, __local float* energytot, __local float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern,
__global float* replayweight, __global float* replaytime, __global RandType* photonseed, __global MCXReporter* reporter, __global float* gdebugdata) {

int oldeid, fixcount = 0;
Expand All @@ -1464,12 +1467,10 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP

/*initialize the photon parameters*/
launchnewphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern);
*energytot += r.weight;

#ifdef MCX_SAVE_DETECTORS

if (GPU_PARAM(gcfg, issavedet)) {
clearpath(ppath, GPU_PARAM(gcfg, reclen)); /*clear shared memory for saving history of a new photon*/

if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) {
ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/
} else if (GPU_PARAM(gcfg, srctype) == stPattern) {
Expand All @@ -1479,9 +1480,12 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP

#endif

if (GPU_PARAM(gcfg, srctype) == stPattern && GPU_PARAM(gcfg, srcnum) > 1) {
for (oldeid = 0; oldeid > GPU_PARAM(gcfg, srcnum); oldeid++) {
if (GPU_PARAM(gcfg, srcnum) == 1) {
*energytot += r.weight;
} else {
for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) {
ppath[GPU_PARAM(gcfg, reclen) + oldeid] = srcpattern[r.posidx * GPU_PARAM(gcfg, srcnum) + oldeid];
energytot[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid];
}
}

Expand Down Expand Up @@ -1700,7 +1704,9 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
savedebugdata(&r, id, reporter, gdebugdata, gcfg);
}

*energyesc += r.weight;
for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) {
energyesc[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid];
}
}

__kernel void mmc_main_loop(const int nphoton, const int ophoton,
Expand All @@ -1714,7 +1720,6 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton,

RandType t[RAND_BUF_LEN];
int idx = get_global_id(0);
float energyesc = 0.f, energytot = 0.f;
int raytet = 0;

#ifdef __NVCC__
Expand All @@ -1725,21 +1730,27 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton,
gpu_rng_init(t, n_seed, idx);
}

clearpath(sharedmem, get_local_size(0) * ((GPU_PARAM(gcfg, srcnum) << 1) +
(GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum))));

/*launch photons*/
for (int i = 0; i < nphoton + (idx < ophoton); i++) {
if (GPU_PARAM(gcfg, seed) == SEED_FROM_FILE)
for (int j = 0; j < RAND_BUF_LEN; j++) {
t[j] = replayseed[(idx * nphoton + MIN(idx, ophoton) + i) * RAND_BUF_LEN + j];
}

onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0) *
(GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem,
weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, &energytot, &energyesc, t, &raytet,
onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_size(0) * (GPU_PARAM(gcfg, srcnum) << 1) +
get_local_id(0) * (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem,
weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, sharedmem + get_local_id(0) * GPU_PARAM(gcfg, srcnum),
sharedmem + (get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum), t, &raytet,
srcpattern, replayweight, replaytime, photonseed, reporter, gdebugdata);
}

energy[idx << 1] = energyesc;
energy[1 + (idx << 1)] = energytot;
for (int i = 0; i < GPU_PARAM(gcfg, srcnum); i++) {
energy[(idx << 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[(get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum) + i];
energy[((idx << 1) + 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[get_local_id(0) * GPU_PARAM(gcfg, srcnum) + i];
}

if (GPU_PARAM(gcfg, debuglevel) & MCX_DEBUG_PROGRESS && progress) {
atomic_inc(progress);
Expand Down
Loading

0 comments on commit 072fafe

Please sign in to comment.