From a6c29846e1c09cfe9261d44583b528f15b4fb153 Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Mon, 4 Nov 2024 00:59:39 -0500 Subject: [PATCH] [bug] partially fix cuda saveseed, #98 --- .github/workflows/build_all.yml | 6 +-- mmclab/example/demo_example_replay.m | 3 +- mmclab/example/demo_head_atlas.m | 8 +-- mmclab/example/demo_photon_sharing.m | 4 +- src/mmc_core.cl | 79 ++++++++++++++++++++-------- 5 files changed, 67 insertions(+), 33 deletions(-) diff --git a/.github/workflows/build_all.yml b/.github/workflows/build_all.yml index e204fd4..5aed0ca 100644 --- a/.github/workflows/build_all.yml +++ b/.github/workflows/build_all.yml @@ -14,7 +14,7 @@ jobs: name: Build All strategy: matrix: - os: [ubuntu-20.04, ubuntu-22.04, macos-12, macos-14, windows-2019] + os: [ubuntu-20.04, ubuntu-22.04, macos-13, macos-14, windows-2019] runs-on: ${{ matrix.os }} defaults: run: @@ -164,7 +164,7 @@ jobs: zip -FSr --symlink packages/mmclab-${{ env.RELEASE_TAG }}.zip mmclab fi - name: Upload mmclab package - if: ${{ matrix.os == 'ubuntu-20.04' || matrix.os == 'macos-12' || matrix.os == 'macos-14' || matrix.os == 'windows-2019' }} + if: ${{ matrix.os == 'ubuntu-20.04' || matrix.os == 'macos-13' || matrix.os == 'macos-14' || matrix.os == 'windows-2019' }} uses: actions/upload-artifact@v3 with: name: all-mmc-packages @@ -184,7 +184,7 @@ jobs: zip -FSr --symlink mmc/packages/mmc-${{ env.RELEASE_TAG }}.zip mmc -x 'mmc/packages*' fi - name: Upload mmc package - if: ${{ matrix.os == 'ubuntu-20.04' || matrix.os == 'macos-12' || matrix.os == 'macos-14' || matrix.os == 'windows-2019' }} + if: ${{ matrix.os == 'ubuntu-20.04' || matrix.os == 'macos-13' || matrix.os == 'macos-14' || matrix.os == 'windows-2019' }} uses: actions/upload-artifact@v3 with: name: all-mmc-packages diff --git a/mmclab/example/demo_example_replay.m b/mmclab/example/demo_example_replay.m index 1909290..a832bc9 100644 --- a/mmclab/example/demo_example_replay.m +++ b/mmclab/example/demo_example_replay.m @@ -36,7 +36,6 @@ cfg.issaveexit = 1; % save detected photon exit position and angles cfg.issaveseed = 1; % save detected photon seeds to replay later % cfg.method=0; -cfg.compute='cuda'; newcfg = mmclab(cfg, 'prep'); % preprocessing of the mesh to get the missing fields @@ -82,7 +81,7 @@ % set up for wp replay newcfg.outputtype = 'wp'; % replay and get wp -[cube3, detp3, ~, ~] = mmclab(newcfg); +[cube3, detp3] = mmclab(newcfg); % the two detected photon arrays should be the same. however, because % the program uses multi-threading, the orders may be different diff --git a/mmclab/example/demo_head_atlas.m b/mmclab/example/demo_head_atlas.m index 842e30f..620e4aa 100644 --- a/mmclab/example/demo_head_atlas.m +++ b/mmclab/example/demo_head_atlas.m @@ -6,11 +6,11 @@ % for Fig.9(a) in TranYan2019(submitted). For simplicity, a coarse mesh (lower % sampling rate) that represents part of the head(z>155) is simulated. % -% [Sanchez2012] C.E.Sanchez J.E.Richards and C.R.Almli, “Age-Specific MRI Templates -% for Pediatric Neuroimaging,” Developmental Neuropsychology 37, 379–399 (2012). +% [Sanchez2012] C.E.Sanchez J.E.Richards and C.R.Almli, "Age-Specific MRI Templates +% for Pediatric Neuroimaging," Developmental Neuropsychology 37, 379–399 (2012). % -% [Yan2019] S.Yan, A.P.Tran, and Q.Fang, “Dual-grid mesh-based monte carlo algorithm -% for efficient photon transport simulations in complex three-dimensional media,” +% [Yan2019] S.Yan, A.P.Tran, and Q.Fang, "Dual-grid mesh-based monte carlo algorithm +% for efficient photon transport simulations in complex three-dimensional media," % Journal of Biomedical Optics 24(2), 020503 (2019). % % [TranYan2019] A.P.Tran, S.Yan and Q.Fang, "Improving model-based fNIRS diff --git a/mmclab/example/demo_photon_sharing.m b/mmclab/example/demo_photon_sharing.m index 15c5960..8e8849f 100644 --- a/mmclab/example/demo_photon_sharing.m +++ b/mmclab/example/demo_photon_sharing.m @@ -5,7 +5,7 @@ %% prepare simulation input cfg.nphoton = 3e6; -[cfg.node face cfg.elem] = meshabox([0 0 0], [60 60 20], 2, 2); +[cfg.node, face, cfg.elem] = meshabox([0 0 0], [60 60 20], 2, 2); cfg.elemprop = ones(size(cfg.elem, 1), 1); cfg.srcpos = [10 10 -2]; cfg.srcdir = [0 0 1]; @@ -38,7 +38,7 @@ %% plot results (same as in the mmc example) -[node, ~, elem] = meshabox([0 0 0], [60 60 20], 2, 2); +[node, face, elem] = meshabox([0 0 0], [60 60 20], 2, 2); data = flux.data(:, 1:length(node), :); cwdata = squeeze(sum(data, 3)); diff --git a/src/mmc_core.cl b/src/mmc_core.cl index 7dbf0df..9ca9f15 100644 --- a/src/mmc_core.cl +++ b/src/mmc_core.cl @@ -417,7 +417,7 @@ __device__ static float xorshift128p_nextf (__private RandType t[RAND_BUF_LEN]) return s1.f[0] - 1.0f; } -#ifdef MCX_SAVE_DETECTORS +#if defined(MCX_SAVE_DETECTORS) || defined(__NVCC__) __device__ static void copystate(__local float* v1, __private float* v2, int len) { for (int i = 0; i < len; i++) { v1[i] = v2[i]; @@ -491,7 +491,7 @@ __device__ void clearpath(__local float* p, int len) { } } -#ifdef MCX_SAVE_DETECTORS +#if defined(MCX_SAVE_DETECTORS) || defined(__NVCC__) __device__ uint finddetector(float3* p0, __constant float4* gmed, __constant MCXParam* gcfg) { uint i; @@ -516,12 +516,21 @@ __device__ void savedetphoton(__global float* n_det, __global uint* detectedphot if (baseaddr < GPU_PARAM(gcfg, maxdetphoton)) { uint i; -#ifdef MCX_SAVE_SEED +#if defined(MCX_SAVE_SEED) || defined(__NVCC__) +#ifdef __NVCC__ - for (i = 0; i < RAND_BUF_LEN; i++) { - photonseed[baseaddr * RAND_BUF_LEN + i] = initseed[i]; + if (GPU_PARAM(gcfg, issaveseed)) { +#endif + + for (i = 0; i < RAND_BUF_LEN; i++) { + photonseed[baseaddr * RAND_BUF_LEN + i] = initseed[i]; + } + +#ifdef __NVCC__ } +#endif + #endif baseaddr *= (GPU_PARAM(gcfg, reclen) + 1); n_det[baseaddr++] = detid; @@ -1451,8 +1460,8 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP int oldeid, fixcount = 0; ray r = {gcfg->srcpos, gcfg->srcdir, {MMC_UNDEFINED, 0.f, 0.f}, GPU_PARAM(gcfg, e0), 0, 0, 1.f, 0.f, 0.f, 0.f, ID_UNDEFINED, 0.f}; -#ifdef MCX_SAVE_SEED - RandType initseed[RAND_BUF_LEN]; +#if defined(MCX_SAVE_SEED) || defined(__NVCC__) + RandType initseed[RAND_BUF_LEN] = {NULL}; #endif clearpath(ppath, (GPU_PARAM(gcfg, reclen) + (GPU_PARAM(gcfg, srcnum) > 1) * GPU_PARAM(gcfg, srcnum))); @@ -1461,27 +1470,44 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP r.photonid = id; -#ifdef MCX_SAVE_SEED +#if defined(MCX_SAVE_SEED) || defined(__NVCC__) + +#ifdef __NVCC__ + + if (GPU_PARAM(gcfg, issaveseed)) { +#endif - for (oldeid = 0; oldeid < RAND_BUF_LEN; oldeid++) { - initseed[oldeid] = ran[oldeid]; + for (oldeid = 0; oldeid < RAND_BUF_LEN; oldeid++) { + initseed[oldeid] = ran[oldeid]; + } + +#ifdef __NVCC__ } +#endif + #endif /*initialize the photon parameters*/ launchnewphoton(gcfg, &r, node, elem, srcelem, ran, srcpattern); -#ifdef MCX_SAVE_DETECTORS +#if defined(MCX_SAVE_DETECTORS) || defined(__NVCC__) +#ifdef __NVCC__ if (GPU_PARAM(gcfg, issavedet)) { +#endif + if (GPU_PARAM(gcfg, srctype) != stPattern || GPU_PARAM(gcfg, srcnum) == 1) { ppath[GPU_PARAM(gcfg, reclen) - 1] = r.weight; /*last record in partialpath is the initial photon weight*/ } else if (GPU_PARAM(gcfg, srctype) == stPattern) { *((__local uint*)(ppath + GPU_PARAM(gcfg, reclen) - 1)) = r.posidx; } + +#ifdef __NVCC__ } +#endif + #endif if (GPU_PARAM(gcfg, srcnum) == 1) { @@ -1518,7 +1544,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP r.faceid = -1; } -#ifdef MCX_SAVE_DETECTORS +#if defined(MCX_SAVE_DETECTORS) || defined(__NVCC__) if (GPU_PARAM(gcfg, issavedet) && r.Lmove > 0.f && type[r.eid - 1] > 0) { ppath[GPU_PARAM(gcfg, maxmedia) + type[r.eid - 1] - 1] += r.Lmove; /*second medianum block is the partial path*/ @@ -1576,7 +1602,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); (*raytet)++; -#ifdef MCX_SAVE_DETECTORS +#if defined(MCX_SAVE_DETECTORS) || defined(__NVCC__) if (GPU_PARAM(gcfg, issavedet) && r.Lmove > 0.f && type[r.eid - 1] > 0) { ppath[GPU_PARAM(gcfg, maxmedia) + type[r.eid - 1] - 1] += r.Lmove; @@ -1594,7 +1620,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP fixphoton(&r.p0, node, (__global int*)(elem + (r.eid - 1)*GPU_PARAM(gcfg, elemlen))); r.slen = branchless_badouel_raytet(&r, gcfg, ppath, elem, weight, type[r.eid - 1], facenb, normal, gmed, replayweight, replaytime); (*raytet)++; -#ifdef MCX_SAVE_DETECTORS +#if defined(MCX_SAVE_DETECTORS) || defined(__NVCC__) if (GPU_PARAM(gcfg, issavedet) && r.Lmove > 0.f && type[r.eid - 1] > 0) { ppath[GPU_PARAM(gcfg, maxmedia) + type[r.eid - 1] - 1] += r.Lmove; @@ -1618,7 +1644,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP //if(GPU_PARAM(gcfg,debuglevel)&dlExit) GPUDEBUG(("E %f %f %f %f %f %f %f %d\n", r.p0.x, r.p0.y, r.p0.z, r.vec.x, r.vec.y, r.vec.z, r.weight, r.eid)); -#ifdef MCX_SAVE_DETECTORS +#if defined(MCX_SAVE_DETECTORS) || defined(__NVCC__) if (GPU_PARAM(gcfg, issavedet) && GPU_PARAM(gcfg, issaveexit)) { /*when issaveexit is set to 1*/ copystate(ppath + (GPU_PARAM(gcfg, reclen) - 7), (__private float*) & (r.p0), 3); /*columns 7-5 from the right store the exit positions*/ @@ -1640,16 +1666,21 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP GPUDEBUG(("X %f %f %f %d %u %e\n", r.p0.x, r.p0.y, r.p0.z, r.eid, id, r.slen)); } -#ifdef MCX_SAVE_DETECTORS +#if defined(MCX_SAVE_DETECTORS) || defined(__NVCC__) +#ifdef __NVCC__ if (GPU_PARAM(gcfg, issavedet)) { +#endif + if (r.eid <= 0) { -#ifdef MCX_SAVE_SEED +#if defined(MCX_SAVE_SEED) || defined(__NVCC__) - if (GPU_PARAM(gcfg, isextdet) && type[oldeid - 1] == GPU_PARAM(gcfg, maxmedia) + 1) { - savedetphoton(n_det, detectedphoton, ppath, &r, gmed, oldeid, gcfg, photonseed, initseed); - } else { - savedetphoton(n_det, detectedphoton, ppath, &r, gmed, -1, gcfg, photonseed, initseed); + if (GPU_PARAM(gcfg, issaveseed)) { + if (GPU_PARAM(gcfg, isextdet) && type[oldeid - 1] == GPU_PARAM(gcfg, maxmedia) + 1) { + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, oldeid, gcfg, photonseed, initseed); + } else { + savedetphoton(n_det, detectedphoton, ppath, &r, gmed, -1, gcfg, photonseed, initseed); + } } #else @@ -1662,8 +1693,12 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP #endif } + +#ifdef __NVCC__ } +#endif + #endif break; /*photon exits boundary*/ } @@ -1689,7 +1724,7 @@ __device__ void onephoton(unsigned int id, __local float* ppath, __constant MCXP savedebugdata(&r, id, reporter, gdebugdata, gcfg); } -#ifdef MCX_SAVE_DETECTORS +#if defined(MCX_SAVE_DETECTORS) || defined(__NVCC__) if (GPU_PARAM(gcfg, issavedet)) { if (GPU_PARAM(gcfg, ismomentum) && type[r.eid - 1] > 0) { /*when ismomentum is set to 1*/