fallback sync

hannorein · Aug 22, 2024 · 7747075 · 7747075
1 parent 5f03d30
commit 7747075
Show file tree

Hide file tree

Showing 2 changed files with 74 additions and 115 deletions.
diff --git a/examples/whfast512_unittests/problem.c b/examples/whfast512_unittests/problem.c
@@ -324,7 +324,7 @@ int test_synchronization_fallback(){
     for (int i=0;i<r1->N;i++){
         double dx = fabs(r1->particles[i].x - r2->particles[i].x);
         double dvx = fabs(r1->particles[i].vx - r2->particles[i].vx);
-        if (dx>1e-16 || dvx>1e-16){
+        if (dx>1e-15 || dvx>1e-15){
             printf("Accuracy not met in synchronization fallback test.\n");
             printf("i=%i diff_x=%.16e diff_vx=%.16e\n",i,dx, dvx);
             return 0;

diff --git a/src/integrator_whfast512.c b/src/integrator_whfast512.c
@@ -65,6 +65,78 @@ static void reb_whfast512_com_step(struct reb_simulation* r, const double _dt){
 #endif
 }
 
+// Convert democratic heliocentric coordinates to inertial coordinates
+// Note: this is only called at the end. Speed is not a concern.
+static void democraticheliocentric_to_inertial_posvel(struct reb_simulation* r){
+    struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
+    struct reb_particle* particles = r->particles;
+    struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;
+    const unsigned int N_systems = ri_whfast512->N_systems;
+    const unsigned int p_per_system = 8/N_systems;
+    const unsigned int N_per_system = r->N/N_systems;
+
+
+#ifdef AVX512
+    double m[8];
+    double x[8];
+    double y[8];
+    double z[8];
+    double vx[8];
+    double vy[8];
+    double vz[8];
+    _mm512_storeu_pd(&m, p_jh->m);
+    _mm512_storeu_pd(&x, p_jh->x);
+    _mm512_storeu_pd(&y, p_jh->y);
+    _mm512_storeu_pd(&z, p_jh->z);
+    _mm512_storeu_pd(&vx, p_jh->vx);
+    _mm512_storeu_pd(&vy, p_jh->vy);
+    _mm512_storeu_pd(&vz, p_jh->vz);
+#else // AVX512
+    // Fallback for synchronization for when AVX512 is not available
+    double* m = (double*)p_jh->m;
+    double* x = (double*)p_jh->x;
+    double* y = (double*)p_jh->y;
+    double* z = (double*)p_jh->z;
+    double* vx = p_jh->vx;
+    double* vy = p_jh->vy;
+    double* vz = p_jh->vz;
+#endif // AVX512
+
+    for (unsigned s=0;s<N_systems;s++){
+        double x0s = 0;
+        double y0s = 0;
+        double z0s = 0;
+        double vx0s = 0;
+        double vy0s = 0;
+        double vz0s = 0;
+        for (unsigned int i=1; i<N_per_system; i++){
+            x0s += x[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
+            y0s += y[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
+            z0s += z[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
+            vx0s += vx[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
+            vy0s += vy[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
+            vz0s += vz[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
+            particles[s*N_per_system+i].vx = vx[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vx;
+            particles[s*N_per_system+i].vy = vy[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vy;
+            particles[s*N_per_system+i].vz = vz[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vz;
+        }
+        x0s /= ri_whfast512->p_jh0[s].m;
+        y0s /= ri_whfast512->p_jh0[s].m;
+        z0s /= ri_whfast512->p_jh0[s].m;
+        particles[s*N_per_system].x  = ri_whfast512->p_jh0[s].x - x0s;
+        particles[s*N_per_system].y  = ri_whfast512->p_jh0[s].y - y0s;
+        particles[s*N_per_system].z  = ri_whfast512->p_jh0[s].z - z0s;
+        particles[s*N_per_system].vx = ri_whfast512->p_jh0[s].vx - vx0s;
+        particles[s*N_per_system].vy = ri_whfast512->p_jh0[s].vy - vy0s;
+        particles[s*N_per_system].vz = ri_whfast512->p_jh0[s].vz - vz0s;
+        for (unsigned int i=1; i<N_per_system; i++){
+            particles[s*N_per_system+i].x  = x[s*p_per_system+(i-1)] + particles[s*N_per_system].x;
+            particles[s*N_per_system+i].y  = y[s*p_per_system+(i-1)] + particles[s*N_per_system].y;
+            particles[s*N_per_system+i].z  = z[s*p_per_system+(i-1)] + particles[s*N_per_system].z;
+        }
+    }
+}
+
 #ifdef AVX512
 
 // Fast inverse factorial lookup table
@@ -735,65 +807,6 @@ static void inertial_to_democraticheliocentric_posvel(struct reb_simulation* r){
 
 }
 
-// Convert democratic heliocentric coordinates to inertial coordinates
-// Note: this is only called at the end. Speed is not a concern.
-static void democraticheliocentric_to_inertial_posvel(struct reb_simulation* r){
-    struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
-    struct reb_particle* particles = r->particles;
-    struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;
-    const unsigned int N_systems = ri_whfast512->N_systems;
-    const unsigned int p_per_system = 8/N_systems;
-    const unsigned int N_per_system = r->N/N_systems;
-
-    double m[8];
-    double x[8];
-    double y[8];
-    double z[8];
-    double vx[8];
-    double vy[8];
-    double vz[8];
-    _mm512_storeu_pd(&m, p_jh->m);
-    _mm512_storeu_pd(&x, p_jh->x);
-    _mm512_storeu_pd(&y, p_jh->y);
-    _mm512_storeu_pd(&z, p_jh->z);
-    _mm512_storeu_pd(&vx, p_jh->vx);
-    _mm512_storeu_pd(&vy, p_jh->vy);
-    _mm512_storeu_pd(&vz, p_jh->vz);
-
-    for (unsigned s=0;s<N_systems;s++){
-        double x0s = 0;
-        double y0s = 0;
-        double z0s = 0;
-        double vx0s = 0;
-        double vy0s = 0;
-        double vz0s = 0;
-        for (unsigned int i=1; i<N_per_system; i++){
-            x0s += x[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            y0s += y[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            z0s += z[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            vx0s += vx[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            vy0s += vy[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            vz0s += vz[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            particles[s*N_per_system+i].vx = vx[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vx;
-            particles[s*N_per_system+i].vy = vy[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vy;
-            particles[s*N_per_system+i].vz = vz[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vz;
-        }
-        x0s /= ri_whfast512->p_jh0[s].m;
-        y0s /= ri_whfast512->p_jh0[s].m;
-        z0s /= ri_whfast512->p_jh0[s].m;
-        particles[s*N_per_system].x  = ri_whfast512->p_jh0[s].x - x0s;
-        particles[s*N_per_system].y  = ri_whfast512->p_jh0[s].y - y0s;
-        particles[s*N_per_system].z  = ri_whfast512->p_jh0[s].z - z0s;
-        particles[s*N_per_system].vx = ri_whfast512->p_jh0[s].vx - vx0s;
-        particles[s*N_per_system].vy = ri_whfast512->p_jh0[s].vy - vy0s;
-        particles[s*N_per_system].vz = ri_whfast512->p_jh0[s].vz - vz0s;
-        for (unsigned int i=1; i<N_per_system; i++){
-            particles[s*N_per_system+i].x  = x[s*p_per_system+(i-1)] + particles[s*N_per_system].x;
-            particles[s*N_per_system+i].y  = y[s*p_per_system+(i-1)] + particles[s*N_per_system].y;
-            particles[s*N_per_system+i].z  = z[s*p_per_system+(i-1)] + particles[s*N_per_system].z;
-        }
-    }
-}
 
 // Performs one complete jump step
 static void reb_whfast512_jump_step(struct reb_simulation* r, const double _dt){
@@ -1060,60 +1073,6 @@ void reb_integrator_whfast512_synchronize(struct reb_simulation* const r){
 #endif // AVX512
 }
 
-// Convert democratic heliocentric coordinates to inertial coordinates
-// Fallback for synchronization for when AVX512 is not available
-static void democraticheliocentric_to_inertial_posvel_fallback(struct reb_simulation* r){
-    struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
-    struct reb_particle* particles = r->particles;
-    struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;
-    const unsigned int N_systems = ri_whfast512->N_systems;
-    const unsigned int p_per_system = 8/N_systems;
-    const unsigned int N_per_system = r->N/N_systems;
-
-    double* m = p_jh->m;
-    double* x = p_jh->x;
-    double* y = p_jh->y;
-    double* z = p_jh->z;
-    double* vx = p_jh->vx;
-    double* vy = p_jh->vy;
-    double* vz = p_jh->vz;
-
-    for (unsigned s=0;s<N_systems;s++){
-        double x0s = 0;
-        double y0s = 0;
-        double z0s = 0;
-        double vx0s = 0;
-        double vy0s = 0;
-        double vz0s = 0;
-        for (unsigned int i=1; i<N_per_system; i++){
-            x0s += x[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            y0s += y[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            z0s += z[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            vx0s += vx[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            vy0s += vy[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            vz0s += vz[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
-            particles[s*N_per_system+i].vx = vx[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vx;
-            particles[s*N_per_system+i].vy = vy[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vy;
-            particles[s*N_per_system+i].vz = vz[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vz;
-        }
-        x0s /= ri_whfast512->p_jh0[s].m;
-        y0s /= ri_whfast512->p_jh0[s].m;
-        z0s /= ri_whfast512->p_jh0[s].m;
-        particles[s*N_per_system].x  = ri_whfast512->p_jh0[s].x - x0s;
-        particles[s*N_per_system].y  = ri_whfast512->p_jh0[s].y - y0s;
-        particles[s*N_per_system].z  = ri_whfast512->p_jh0[s].z - z0s;
-        particles[s*N_per_system].vx = ri_whfast512->p_jh0[s].vx - vx0s;
-        particles[s*N_per_system].vy = ri_whfast512->p_jh0[s].vy - vy0s;
-        particles[s*N_per_system].vz = ri_whfast512->p_jh0[s].vz - vz0s;
-        for (unsigned int i=1; i<N_per_system; i++){
-            particles[s*N_per_system+i].x  = x[s*p_per_system+(i-1)] + particles[s*N_per_system].x;
-            particles[s*N_per_system+i].y  = y[s*p_per_system+(i-1)] + particles[s*N_per_system].y;
-            particles[s*N_per_system+i].z  = z[s*p_per_system+(i-1)] + particles[s*N_per_system].z;
-        }
-    }
-}
-
-
 void reb_integrator_whfast512_synchronize_fallback(struct reb_simulation* const r){
     // No AVX512 available
     // Using WHFast as a workaround.
@@ -1147,7 +1106,7 @@ void reb_integrator_whfast512_synchronize_fallback(struct reb_simulation* const
             }
         }
         reb_whfast512_com_step(r, dt/2.0); // does not use AVX512
-        democraticheliocentric_to_inertial_posvel_fallback(r);
+        democraticheliocentric_to_inertial_posvel(r);
         ri_whfast512->is_synchronized = 1;
     }
 }