[bug] fix gpu photon-sharing hanging, support mmc binary mode, close #86

fangq · Oct 15, 2024 · 072fafe · 072fafe
1 parent b3f9739
commit 072fafe
Show file tree

Hide file tree

Showing 6 changed files with 171 additions and 78 deletions.
diff --git a/mmclab/mmc2json.m b/mmclab/mmc2json.m
@@ -13,9 +13,6 @@
 %         mcxpreview supports the cfg input for both mcxlab and mmclab.
 %    filestub: the filestub is the name stub for all output files,including
 %         filestub.json: the JSON input file
-%         filestub_vol.bin: the volume file if cfg.vol is defined
-%         filestub_shapes.json: the domain shape file if cfg.shapes is defined
-%         filestub_pattern.bin: the domain shape file if cfg.pattern is defined
 %
 %         if filestub ends with '.json', then mmc2json saves the mesh data
 %         in the single-file mode, and the mesh information is stored
@@ -58,16 +55,7 @@
     end
 end
 if (isfield(cfg, 'srcpattern') && ~isempty(cfg.srcpattern))
-    Optode.Source.Pattern.Nx = size(cfg.srcpattern, 1);
-    Optode.Source.Pattern.Ny = size(cfg.srcpattern, 2);
-    Optode.Source.Pattern.Nz = size(cfg.srcpattern, 3);
-    Optode.Source.Pattern.Data = single(cfg.srcpattern);
-    if (~singlefile)
-        Optode.Source.Pattern.Data = [filestub '_pattern.bin'];
-        fid = fopen(Optode.Source.Pattern.Data, 'wb');
-        fwrite(fid, cfg.srcpattern, 'float32');
-        fclose(fid);
-    end
+    Optode.Source.Pattern = single(cfg.srcpattern);
 end
 
 %% define the domain and optical properties

diff --git a/src/mmc_cl_host.c b/src/mmc_cl_host.c
@@ -92,8 +92,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
     cl_mem* gweight, *gdref, *gdetphoton, *gseed, *genergy, *greporter, *gdebugdata;     /*read-write buffers*/
     cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/
 
-    cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum;
-    cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer
+    cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) * cfg->srcnum;    /**< total output data length in float count per time-frame */
+    cfg->crop0.w = meshlen * cfg->maxgate;    /**< total output data length, before double-buffer expansion */
 
     cl_float*  field, *dref = NULL;
 
@@ -107,6 +107,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
 
     GPUInfo* gpu = NULL;
     float4* propdet;
+    double energytot = 0.0, energyesc = 0.0;
 
     MCXParam param = {{{cfg->srcpos.x, cfg->srcpos.y, cfg->srcpos.z}}, {{cfg->srcdir.x, cfg->srcdir.y, cfg->srcdir.z}},
         cfg->tstart, cfg->tend, (uint)cfg->isreflect, (uint)cfg->issavedet, (uint)cfg->issaveexit,
@@ -124,7 +125,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout,
         cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}},
         mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}},
-        cfg->e0, cfg->isextdet, meshlen, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
+        cfg->e0, cfg->isextdet, (meshlen / cfg->srcnum), (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
         (MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
         cfg->issaveseed, cfg->seed, cfg->maxjumpdebug
     };
@@ -146,6 +147,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
 
     param.reclen = sharedmemsize / sizeof(float);
 
+    sharedmemsize += sizeof(cl_float) * (cfg->srcnum << 1);   /**< store energyesc/energytot */
+
     if (cfg->srctype == stPattern && cfg->srcnum > 1) {
         sharedmemsize += sizeof(cl_float) * cfg->srcnum;
     }
@@ -256,7 +259,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         Pphotonseed = (RandType*)calloc(cfg->maxdetphoton, (sizeof(RandType) * RAND_BUF_LEN));
     }
 
-    fieldlen = meshlen * cfg->maxgate;
+    fieldlen = cfg->crop0.w;  /**< total float counts of the output buffer, before double-buffer expansion (x2) for improving saving accuracy */
 
     if (cfg->seed > 0) {
         srand(cfg->seed);
@@ -310,7 +313,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         OCL_ASSERT(((gparam[i] = clCreateBuffer(mcxcontext, RO_MEM, sizeof(MCXParam), &param, &status), status)));
 
         Pseed = (cl_uint*)malloc(sizeof(cl_uint) * gpu[i].autothread * RAND_SEED_WORD_LEN);
-        energy = (cl_float*)calloc(sizeof(cl_float), gpu[i].autothread << 1);
+        energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[i].autothread << 1);
 
         for (j = 0; j < gpu[i].autothread * RAND_SEED_WORD_LEN; j++) {
             Pseed[j] = rand();
@@ -331,7 +334,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
             OCL_ASSERT(((gdebugdata[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (debuglen * cfg->maxjumpdebug), cfg->exportdebugdata, &status), status)));
         }
 
-        OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1), energy, &status), status)));
+        OCL_ASSERT(((genergy[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * (gpu[i].autothread << 1) * cfg->srcnum, energy, &status), status)));
         OCL_ASSERT(((gdetected[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(cl_uint), &detected, &status), status)));
         OCL_ASSERT(((greporter[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(MCXReporter), &reporter, &status), status)));
 
@@ -509,6 +512,8 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         MMC_FPRINTF(cfg->flog, "- [device %d(%d): %s] threadph=%d oddphotons=%d np=%.1f nthread=%d nblock=%d repetition=%d\n", i, gpu[i].id, gpu[i].name, threadphoton, oddphotons,
                     cfg->nphoton * cfg->workload[i] / fullload, (int)gpu[i].autothread, (int)gpu[i].autoblock, cfg->respin);
 
+        MMC_FPRINTF(cfg->flog, "requesting %d bytes of shared memory\n", sharedmemsize * (int)gpu[i].autoblock);
+
         OCL_ASSERT(((mcxkernel[i] = clCreateKernel(mcxprogram, "mmc_main_loop", &status), status)));
         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 0, sizeof(cl_uint), (void*)&threadphoton)));
         OCL_ASSERT((clSetKernelArg(mcxkernel[i], 1, sizeof(cl_uint), (void*)&oddphotons)));
@@ -556,8 +561,10 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
         cfg->exportdebugdata = (float*)calloc(sizeof(float), (debuglen * cfg->maxjumpdebug));
     }
 
-    cfg->energytot = 0.f;
-    cfg->energyesc = 0.f;
+    cfg->energytot = (double*)calloc(cfg->srcnum, sizeof(double));
+    cfg->energyesc = (double*)calloc(cfg->srcnum, sizeof(double));
+    energytot = 0.0;
+    energyesc = 0.0;
     cfg->runtime = 0;
 
     //simulate for all time-gates in maxgate groups per run
@@ -636,13 +643,17 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer) {
                 reporter.raytet += rep.raytet;
                 reporter.jumpdebug += rep.jumpdebug;
 
-                energy = (cl_float*)calloc(sizeof(cl_float), gpu[devid].autothread << 1);
-                OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1),
+                energy = (cl_float*)calloc(sizeof(cl_float) * cfg->srcnum, gpu[devid].autothread << 1);
+                OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], genergy[devid], CL_TRUE, 0, sizeof(cl_float) * (gpu[devid].autothread << 1) * cfg->srcnum,
                                                 energy, 0, NULL, NULL)));
 
                 for (i = 0; i < gpu[devid].autothread; i++) {
-                    cfg->energyesc += energy[(i << 1)];
-                    cfg->energytot += energy[(i << 1) + 1];
+                    for (j = 0; j < cfg->srcnum; j++) {
+                        cfg->energyesc[j] += energy[(i << 1) * cfg->srcnum + j];
+                        cfg->energytot[j] += energy[((i << 1) + 1) * cfg->srcnum + j];
+                        energyesc += energy[(i << 1) * cfg->srcnum + j];
+                        energytot += energy[((i << 1) + 1) * cfg->srcnum + j];
+                    }
                 }
 
                 free(energy);
@@ -707,6 +718,7 @@ is more than what your have specified (%d), please use the -H option to specify
                     OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gdref[devid], CL_TRUE, 0, sizeof(float)*nflen,
                                                     rawdref, 0, NULL, NULL)));
 
+                    //TODO: saving dref has not yet adopting double-buffer
                     for (i = 0; i < nflen; i++) { //accumulate field, can be done in the GPU
                         dref[i] += rawdref[i];    //+rawfield[i+fieldlen];
                     }
@@ -724,7 +736,7 @@ is more than what your have specified (%d), please use the -H option to specify
                     mcx_fflush(cfg->flog);
 
                     for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU
-                        field[i] += rawfield[i] + rawfield[i + fieldlen];    //+rawfield[i+fieldlen];
+                        field[i] += rawfield[i] + rawfield[i + fieldlen];
                     }
 
                     free(rawfield);
@@ -754,15 +766,20 @@ is more than what your have specified (%d), please use the -H option to specify
                 cfg->exportfield[i] += field[i];
             }
         } else {
-            for (i = 0; i < cfg->maxgate; i++)
+            int srcid;
+
+            for (i = 0; i < cfg->maxgate; i++) {
                 for (j = 0; j < mesh->ne; j++) {
-                    float ww = field[i * mesh->ne + j] * 0.25f;
-                    int k;
+                    for (srcid = 0; srcid < cfg->srcnum; srcid++) {
+                        float ww = field[(i * mesh->ne + j) * cfg->srcnum + srcid] * 0.25f;
+                        int k;
 
-                    for (k = 0; k < mesh->elemlen; k++) {
-                        cfg->exportfield[i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1] += ww;
+                        for (k = 0; k < mesh->elemlen; k++) {
+                            cfg->exportfield[(i * mesh->nn + mesh->elem[j * mesh->elemlen + k] - 1) * cfg->srcnum + srcid] += ww;
+                        }
                     }
                 }
+            }
         }
     }
 
@@ -773,11 +790,17 @@ is more than what your have specified (%d), please use the -H option to specify
     }
 
     if (cfg->isnormalized) {
-        MMC_FPRINTF(cfg->flog, "normalizing raw data ...\t");
-        mcx_fflush(cfg->flog);
+        double cur_normalizer, sum_normalizer = 0.0, energyabs = 0.0;
+
+        for (j = 0; j < cfg->srcnum; j++) {
+            energyabs =  cfg->energytot[j] - cfg->energyesc[j];
+            cur_normalizer = mesh_normalize(mesh, cfg, energyabs, cfg->energytot[j], j);
+            sum_normalizer += cur_normalizer;
+            MMCDEBUG(cfg, dlTime, (cfg->flog, "source %d\ttotal simulated energy: %f\tabsorbed: "S_BOLD""S_BLUE"%5.5f%%"S_RESET"\tnormalizor=%g\n",
+                                   j + 1, cfg->energytot[j], 100.f * energyabs / cfg->energytot[j], cur_normalizer));
+        }
 
-        cfg->energyabs = cfg->energytot - cfg->energyesc;
-        mesh_normalize(mesh, cfg, cfg->energyabs, cfg->energytot, 0);
+        cfg->his.normalizer = sum_normalizer / cfg->srcnum; // average normalizer value for all simulated sources
     }
 
 #ifndef MCX_CONTAINER
@@ -817,7 +840,7 @@ is more than what your have specified (%d), please use the -H option to specify
     MMC_FPRINTF(cfg->flog, "simulated %zu photons (%zu) with %d devices (ray-tet %.0f)\nMCX simulation speed: %.2f photon/ms\n",
                 cfg->nphoton, cfg->nphoton, workdev, reporter.raytet, (double)cfg->nphoton / toc);
     MMC_FPRINTF(cfg->flog, "total simulated energy: %.2f\tabsorbed: %5.5f%%\n(loss due to initial specular reflection is excluded in the total)\n",
-                cfg->energytot, (cfg->energytot - cfg->energyesc) / cfg->energytot * 100.f);
+                energytot, (energytot - energyesc) / energytot * 100.f);
     mcx_fflush(cfg->flog);
 
     OCL_ASSERT(clReleaseMemObject(gprogress[0]));
@@ -899,6 +922,10 @@ is more than what your have specified (%d), please use the -H option to specify
     free(mcxkernel);
 
     free(waittoread);
+    free(cfg->energytot);
+    free(cfg->energyesc);
+    cfg->energytot = NULL;
+    cfg->energyesc = NULL;
 
     if (gpu) {
         free(gpu);

diff --git a/src/mmc_core.cl b/src/mmc_core.cl
@@ -47,6 +47,9 @@ inline __device__ __host__ int get_global_id(int idx) {
 inline __device__ __host__ int get_local_id(int idx) {
     return (idx == 0) ? threadIdx.x : ( (idx == 1) ? threadIdx.y : threadIdx.z );
 }
+inline __device__ __host__ int get_local_size(int idx) {
+    return (idx == 0) ? blockDim.x : ( (idx == 1) ? blockDim.y : blockDim.z );
+}
 inline __device__ __host__ float3 operator *(float3 a, float3 b) {
     return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
 }
@@ -480,7 +483,6 @@ __device__ inline float atomicadd(volatile __global float* address, const float
 
 #endif
 
-#ifdef MCX_SAVE_DETECTORS
 __device__ void clearpath(__local float* p, int len) {
     uint i;
 
@@ -489,6 +491,7 @@ __device__ void clearpath(__local float* p, int len) {
     }
 }
 
+#ifdef MCX_SAVE_DETECTORS
 __device__ uint finddetector(float3* p0, __constant float4* gmed, __constant MCXParam* gcfg) {
     uint i;
 
@@ -1443,7 +1446,7 @@ __device__ void launchnewphoton(__constant MCXParam* gcfg, ray* r, __global floa
 
 __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXParam* gcfg, __global float3* node, __global int* elem, __global float* weight, __global float* dref,
                           __global int* type, __global int* facenb,  __global int* srcelem, __global float4* normal, __constant Medium* gmed,
-                          __global float* n_det, __global uint* detectedphoton, float* energytot, float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern,
+                          __global float* n_det, __global uint* detectedphoton, __local float* energytot, __local float* energyesc, __private RandType* ran, int* raytet, __global float* srcpattern,
                           __global float* replayweight, __global float* replaytime, __global RandType* photonseed, __global MCXReporter* reporter, __global float* gdebugdata) {
 
     int oldeid, fixcount = 0;
@@ -1464,12 +1467,10 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
 
     /*initialize the photon parameters*/
     launchnewphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern);
-    *energytot += r.weight;
+
 #ifdef MCX_SAVE_DETECTORS
 
     if (GPU_PARAM(gcfg, issavedet)) {
-        clearpath(ppath, GPU_PARAM(gcfg, reclen));  /*clear shared memory for saving history of a new photon*/
-
         if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) {
             ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/
         } else if (GPU_PARAM(gcfg, srctype) == stPattern) {
@@ -1479,9 +1480,12 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
 
 #endif
 
-    if (GPU_PARAM(gcfg, srctype) == stPattern && GPU_PARAM(gcfg, srcnum) > 1) {
-        for (oldeid = 0; oldeid > GPU_PARAM(gcfg, srcnum); oldeid++) {
+    if (GPU_PARAM(gcfg, srcnum) == 1) {
+        *energytot += r.weight;
+    } else {
+        for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) {
             ppath[GPU_PARAM(gcfg, reclen) + oldeid] = srcpattern[r.posidx * GPU_PARAM(gcfg, srcnum) + oldeid];
+            energytot[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid];
         }
     }
 
@@ -1700,7 +1704,9 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP
         savedebugdata(&r, id, reporter, gdebugdata, gcfg);
     }
 
-    *energyesc += r.weight;
+    for (oldeid = 0; oldeid < GPU_PARAM(gcfg, srcnum); oldeid++) {
+        energyesc[oldeid] += r.weight * ppath[GPU_PARAM(gcfg, reclen) + oldeid];
+    }
 }
 
 __kernel void mmc_main_loop(const int nphoton, const int ophoton,
@@ -1714,7 +1720,6 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton,
 
     RandType t[RAND_BUF_LEN];
     int idx = get_global_id(0);
-    float  energyesc = 0.f, energytot = 0.f;
     int raytet = 0;
 
 #ifdef __NVCC__
@@ -1725,21 +1730,27 @@ __kernel void mmc_main_loop(const int nphoton, const int ophoton,
         gpu_rng_init(t, n_seed, idx);
     }
 
+    clearpath(sharedmem, get_local_size(0) * ((GPU_PARAM(gcfg, srcnum) << 1) +
+              (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum))));
+
     /*launch photons*/
     for (int i = 0; i < nphoton + (idx < ophoton); i++) {
         if (GPU_PARAM(gcfg, seed) == SEED_FROM_FILE)
             for (int j = 0; j < RAND_BUF_LEN; j++) {
                 t[j] = replayseed[(idx * nphoton + MIN(idx, ophoton) + i) * RAND_BUF_LEN + j];
             }
 
-        onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_id(0) *
-                  (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem,
-                  weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, &energytot, &energyesc, t, &raytet,
+        onephoton(idx * nphoton + MIN(idx, ophoton) + i, sharedmem + get_local_size(0) * (GPU_PARAM(gcfg, srcnum) << 1) +
+                  get_local_id(0) * (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum)), gcfg, node, elem,
+                  weight, dref, type, facenb, srcelem, normal, gmed, n_det, detectedphoton, sharedmem + get_local_id(0) * GPU_PARAM(gcfg, srcnum),
+                  sharedmem + (get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum), t, &raytet,
                   srcpattern, replayweight, replaytime, photonseed, reporter, gdebugdata);
     }
 
-    energy[idx << 1] = energyesc;
-    energy[1 + (idx << 1)] = energytot;
+    for (int i = 0; i < GPU_PARAM(gcfg, srcnum); i++) {
+        energy[(idx << 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[(get_local_size(0) + get_local_id(0)) * GPU_PARAM(gcfg, srcnum) + i];
+        energy[((idx << 1) + 1) * GPU_PARAM(gcfg, srcnum) + i] += sharedmem[get_local_id(0) * GPU_PARAM(gcfg, srcnum) + i];
+    }
 
     if (GPU_PARAM(gcfg, debuglevel) & MCX_DEBUG_PROGRESS && progress) {
         atomic_inc(progress);