Skip to content

Commit

Permalink
Merge pull request #99 from fangq/photonsharing
Browse files Browse the repository at this point in the history
Support photon sharing on the GPU, fix #86
  • Loading branch information
fangq authored Oct 15, 2024
2 parents 2a7e9dd + 072fafe commit 8867302
Show file tree
Hide file tree
Showing 11 changed files with 417 additions and 236 deletions.
4 changes: 2 additions & 2 deletions mmclab/example/demo_photon_sharing.m
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
cfg.tstart = 0;
cfg.tend = 5e-9;
cfg.tstep = 1e-10;
cfg.gpuid = -1;
cfg.gpuid = 1;
cfg.method = 'elem';
cfg.debuglevel = 'TP';

Expand All @@ -30,7 +30,7 @@
pat(16:25, :, 5) = 1;
pat(:, 16:25, 5) = 1; % pattern with a bright cross

cfg.srcpattern = pat;
cfg.srcpattern = permute(pat, [3 1 2]);

%% run the simulation

Expand Down
14 changes: 1 addition & 13 deletions mmclab/mmc2json.m
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@
% mcxpreview supports the cfg input for both mcxlab and mmclab.
% filestub: the filestub is the name stub for all output files,including
% filestub.json: the JSON input file
% filestub_vol.bin: the volume file if cfg.vol is defined
% filestub_shapes.json: the domain shape file if cfg.shapes is defined
% filestub_pattern.bin: the domain shape file if cfg.pattern is defined
%
% if filestub ends with '.json', then mmc2json saves the mesh data
% in the single-file mode, and the mesh information is stored
Expand Down Expand Up @@ -58,16 +55,7 @@
end
end
if (isfield(cfg, 'srcpattern') && ~isempty(cfg.srcpattern))
Optode.Source.Pattern.Nx = size(cfg.srcpattern, 1);
Optode.Source.Pattern.Ny = size(cfg.srcpattern, 2);
Optode.Source.Pattern.Nz = size(cfg.srcpattern, 3);
Optode.Source.Pattern.Data = single(cfg.srcpattern);
if (~singlefile)
Optode.Source.Pattern.Data = [filestub '_pattern.bin'];
fid = fopen(Optode.Source.Pattern.Data, 'wb');
fwrite(fid, cfg.srcpattern, 'float32');
fclose(fid);
end
Optode.Source.Pattern = single(cfg.srcpattern);
end

%% define the domain and optical properties
Expand Down
4 changes: 2 additions & 2 deletions mmclab/mmclab.m
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@
% by srcpos, srcpos+srcparam1(1:3) and srcpos+srcparam2(1:3)
% 'pattern' - a 3D quadrilateral pattern illumination, same as above, except
% srcparam1(4) and srcparam2(4) specify the pattern array x/y dimensions,
% and srcpattern is a floating-point pattern array, with values between [0-1].
% if cfg.srcnum>1, srcpattern must be a floating-point array with
% and srcpattern is a double-precision pattern array, with values between [0-1].
% if cfg.srcnum>1, srcpattern must be a 3-D double-precision array with
% a dimension of [srcnum srcparam1(4) srcparam2(4)]
% Example: <demo_photon_sharing.m>
% 'fourier' - spatial frequency domain source, similar to 'planar', except
Expand Down
98 changes: 69 additions & 29 deletions src/mmc_cl_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata; /*read-write buffers*/
cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/

cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) << cfg->nbuffer; // use 4 copies to reduce racing
cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer
cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum; /**< total output data length in float count per time-frame */
cfg->crop0.w = meshlen * cfg->maxgate; /**< total output data length, before double-buffer expansion */

cl_float* field, *dref = NULL;

Expand All @@ -103,8 +103,11 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
char opt[MAX_PATH_LENGTH + 1] = {'\0'};
cl_uint detreclen = (2 + ((cfg->ismomentum) > 0)) * mesh->prop + (cfg->issaveexit > 0) * 6 + 1;
cl_uint hostdetreclen = detreclen + 1;
int sharedmemsize = 0;

GPUInfo* gpu = NULL;
float4* propdet;
double energytot = 0.0, energyesc = 0.0;

MCXParam param = {{{cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z}}, {{cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z}},
cfg->tstart, cfg->tend, (uint)cfg->isreflect, (uint)cfg->issavedet, (uint)cfg->issaveexit,
Expand All @@ -122,7 +125,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout,
cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}},
mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}},
cfg->e0, cfg->isextdet, meshlen, cfg->nbuffer, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
cfg->e0, cfg->isextdet, (meshlen / cfg->srcnum), (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
(MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
cfg->issaveseed, cfg->seed, cfg->maxjumpdebug
};
Expand All @@ -138,6 +141,18 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
mcx_error(-99, (char*)("Unable to find devices!"), __FILE__, __LINE__);
}

if (cfg->issavedet) {
sharedmemsize = sizeof(cl_float) * detreclen;
}

param.reclen = sharedmemsize / sizeof(float);

sharedmemsize += sizeof(cl_float) * (cfg->srcnum << 1); /**< store energyesc/energytot */

if (cfg->srctype == stPattern && cfg->srcnum > 1) {
sharedmemsize += sizeof(cl_float) * cfg->srcnum;
}

cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};

/* Use NULL for backward compatibility */
Expand Down Expand Up @@ -244,7 +259,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
Pphotonseed = (RandType*)calloc(cfg->maxdetphoton, (sizeof(RandType) * RAND_BUF_LEN));
}

fieldlen = meshlen * cfg->maxgate;
fieldlen = cfg->crop0.w; /**< total float counts of the output buffer, before double-buffer expansion (x2) for improving saving accuracy */

if (cfg->seed > 0) {
srand(cfg->seed);
Expand Down Expand Up @@ -298,7 +313,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
OCL_ASSERT(((gparam[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(MCXParam), &param, &status), status)));

Pseed = (cl_uint*)malloc(sizeof(cl_uint) * gpu[i].autothread * RAND_SEED_WORD_LEN);
energy = (cl_float*)calloc(sizeof(cl_float), gpu[i].autothread << 1);
energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[i].autothread << 1);

for (j = 0; j < gpu[i].autothread * RAND_SEED_WORD_LEN; j++) {
Pseed[j] = rand();
Expand All @@ -319,14 +334,14 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
OCL_ASSERT(((gdebugdata[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (debuglen * cfg->maxjumpdebug), cfg->exportdebugdata, &status), status)));
}

OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1), energy, &status), status)));
OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1) * cfg->srcnum, energy, &status), status)));
OCL_ASSERT(((gdetected[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(cl_uint), &detected, &status), status)));
OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status)));

if (cfg->srctype == MCX_SRC_PATTERN) {
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w), cfg->srcpattern, &status), status)));
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.w * cfg->srcparam2.w * cfg->srcnum), cfg->srcpattern, &status), status)));
} else if (cfg->srctype == MCX_SRC_PATTERN3D) {
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z), cfg->srcpattern, &status), status)));
OCL_ASSERT(((gsrcpattern[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(float) * (int)(cfg->srcparam1.x * cfg->srcparam1.y * cfg->srcparam1.z * cfg->srcnum), cfg->srcpattern, &status), status)));
} else {
gsrcpattern[i] = NULL;
}
Expand Down Expand Up @@ -417,6 +432,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
sprintf(opt + strlen(opt), " -DUSE_BLBADOUEL");
}

if (cfg->srctype == stPattern && cfg->srcnum > 1) {
sprintf(opt + strlen(opt), " -DUSE_PHOTON_SHARING");
}

if (gpu[0].vendor == dvNVIDIA) {
sprintf(opt + strlen(opt), " -DUSE_NVIDIA_GPU");
}
Expand Down Expand Up @@ -493,11 +512,13 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
MMC_FPRINTF(cfg->flog, "- [device %d(%d): %s] threadph=%d oddphotons=%d np=%.1f nthread=%d nblock=%d repetition=%d\n", i, gpu[i].id, gpu[i].name, threadphoton, oddphotons,
cfg->nphoton * cfg->workload[i] / fullload, (int)gpu[i].autothread, (int)gpu[i].autoblock, cfg->respin);

MMC_FPRINTF(cfg->flog, "requesting %d bytes of shared memory\n", sharedmemsize * (int)gpu[i].autoblock);

OCL_ASSERT(((mcxkernel[i] = clCreateKernel(mcxprogram, "mmc_main_loop", &status), status)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons)));
//OCL_ASSERT((clSetKernelArg(mcxkernel[i], 2, sizeof(cl_mem), (void*)(gparam+i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, cfg->issavedet ? sizeof(cl_float) * ((int)gpu[i].autoblock)*detreclen : sizeof(int), NULL)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 3, sharedmemsize * (int)gpu[i].autoblock, NULL)));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 4, sizeof(cl_mem), (void*)(gproperty + i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 5, sizeof(cl_mem), (void*)(gnode + i))));
OCL_ASSERT((clSetKernelArg(mcxkernel[i], 6, sizeof(cl_mem), (void*)(gelem + i))));
Expand Down Expand Up @@ -540,8 +561,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
cfg->exportdebugdata = (float*)calloc(sizeof(float), (debuglen * cfg->maxjumpdebug));
}

cfg->energytot = 0.f;
cfg->energyesc = 0.f;
cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double));
cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double));
energytot = 0.0;
energyesc = 0.0;
cfg->runtime = 0;

//simulate for all time-gates in maxgate groups per run
Expand Down Expand Up @@ -620,13 +643,17 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
reporter.raytet += rep.raytet;
reporter.jumpdebug += rep.jumpdebug;

energy = (cl_float*)calloc(sizeof(cl_float), gpu[devid].autothread << 1);
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1),
energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[devid].autothread << 1);
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1) * cfg->srcnum,
energy, 0, NULL, NULL)));

for (i = 0; i < gpu[devid].autothread; i++) {
cfg->energyesc += energy[(i << 1)];
cfg->energytot += energy[(i << 1) + 1];
for (j = 0; j < cfg->srcnum; j++) {
cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j];
cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j];
energyesc += energy[(i << 1) * cfg->srcnum + j];
energytot += energy[((i << 1) + 1) * cfg->srcnum + j];
}
}

free(energy);
Expand Down Expand Up @@ -691,6 +718,7 @@ is more than what your have specified (%d), please use the -H option to specify
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gdref[devid], CL_TRUE, 0, sizeof(float)*nflen,
rawdref, 0, NULL, NULL)));

//TODO: saving dref has not yet adopting double-buffer
for (i = 0; i < nflen; i++) { //accumulate field, can be done in the GPU
dref[i] += rawdref[i]; //+rawfield[i+fieldlen];
}
Expand All @@ -708,7 +736,7 @@ is more than what your have specified (%d), please use the -H option to specify
mcx_fflush(cfg->flog);

for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU
field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen];
field[i] += rawfield[i] + rawfield[i + fieldlen];
}

free(rawfield);
Expand All @@ -732,24 +760,26 @@ is more than what your have specified (%d), please use the -H option to specify
}// iteration
}// time gates

fieldlen = (fieldlen >> cfg->nbuffer);
field = realloc(field, sizeof(field[0]) * fieldlen);

if (cfg->exportfield) {
if (cfg->basisorder == 0 || cfg->method == rtBLBadouelGrid) {
for (i = 0; i < fieldlen; i++) {
cfg->exportfield[i] += field[i];
}
} else {
for (i = 0; i < cfg->maxgate; i++)
int srcid;

for (i = 0; i < cfg->maxgate; i++) {
for (j = 0; j < mesh->ne; j++) {
float ww = field[i * mesh->ne + j] * 0.25f;
int k;
for (srcid = 0; srcid < cfg->srcnum; srcid++) {
float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f;
int k;

for (k = 0; k < mesh->elemlen; k++) {
cfg->exportfield[i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1] += ww;
for (k = 0; k < mesh->elemlen; k++) {
cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww;
}
}
}
}
}
}

Expand All @@ -760,11 +790,17 @@ is more than what your have specified (%d), please use the -H option to specify
}

if (cfg->isnormalized) {
MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t");
mcx_fflush(cfg->flog);
double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0;

for (j = 0; j < cfg->srcnum; j++) {
energyabs = cfg->energytot[j] - cfg->energyesc[j];
cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j);
sum_normalizer += cur_normalizer;
MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: "S_BOLD""S_BLUE"%5.5f%%"S_RESET"\tnormalizor=%g\n",
j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer));
}

cfg->energyabs = cfg->energytot - cfg->energyesc;
mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0);
cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources
}

#ifndef MCX_CONTAINER
Expand Down Expand Up @@ -804,7 +840,7 @@ is more than what your have specified (%d), please use the -H option to specify
MMC_FPRINTF(cfg->flog, "simulated %zu photons (%zu) with %d devices (ray-tet %.0f)\nMCX simulation speed: %.2f photon/ms\n",
cfg->nphoton, cfg->nphoton, workdev, reporter.raytet, (double)cfg->nphoton / toc);
MMC_FPRINTF(cfg->flog, "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to initial specular reflection is excluded in the total)\n",
cfg->energytot, (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f);
energytot, (energytot - energyesc) / energytot * 100.f);
mcx_fflush(cfg->flog);

OCL_ASSERT(clReleaseMemObject(gprogress[0]));
Expand Down Expand Up @@ -886,6 +922,10 @@ is more than what your have specified (%d), please use the -H option to specify
free(mcxkernel);

free(waittoread);
free(cfg->energytot);
free(cfg->energyesc);
cfg->energytot = NULL;
cfg->energyesc = NULL;

if (gpu) {
free(gpu);
Expand Down
1 change: 0 additions & 1 deletion src/mmc_cl_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ typedef struct PRE_ALIGN(32) GPU_mcconfig {
cl_int e0;
cl_int isextdet;
cl_int framelen;
cl_uint nbuffer;
cl_uint maxpropdet;
cl_uint normbuf;
cl_int issaveseed;
Expand Down
Loading

0 comments on commit 8867302

Please sign in to comment.