Skip to content

Commit

Permalink
fallback sync
Browse files Browse the repository at this point in the history
  • Loading branch information
hannorein committed Aug 22, 2024
1 parent 5f03d30 commit 7747075
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 115 deletions.
2 changes: 1 addition & 1 deletion examples/whfast512_unittests/problem.c
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ int test_synchronization_fallback(){
for (int i=0;i<r1->N;i++){
double dx = fabs(r1->particles[i].x - r2->particles[i].x);
double dvx = fabs(r1->particles[i].vx - r2->particles[i].vx);
if (dx>1e-16 || dvx>1e-16){
if (dx>1e-15 || dvx>1e-15){
printf("Accuracy not met in synchronization fallback test.\n");
printf("i=%i diff_x=%.16e diff_vx=%.16e\n",i,dx, dvx);
return 0;
Expand Down
187 changes: 73 additions & 114 deletions src/integrator_whfast512.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,78 @@ static void reb_whfast512_com_step(struct reb_simulation* r, const double _dt){
#endif
}

// Convert democratic heliocentric coordinates to inertial coordinates
// Note: this is only called at the end. Speed is not a concern.
static void democraticheliocentric_to_inertial_posvel(struct reb_simulation* r){
struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
struct reb_particle* particles = r->particles;
struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;
const unsigned int N_systems = ri_whfast512->N_systems;
const unsigned int p_per_system = 8/N_systems;
const unsigned int N_per_system = r->N/N_systems;


#ifdef AVX512
double m[8];
double x[8];
double y[8];
double z[8];
double vx[8];
double vy[8];
double vz[8];
_mm512_storeu_pd(&m, p_jh->m);
_mm512_storeu_pd(&x, p_jh->x);
_mm512_storeu_pd(&y, p_jh->y);
_mm512_storeu_pd(&z, p_jh->z);
_mm512_storeu_pd(&vx, p_jh->vx);
_mm512_storeu_pd(&vy, p_jh->vy);
_mm512_storeu_pd(&vz, p_jh->vz);
#else // AVX512
// Fallback for synchronization for when AVX512 is not available
double* m = (double*)p_jh->m;
double* x = (double*)p_jh->x;
double* y = (double*)p_jh->y;
double* z = (double*)p_jh->z;
double* vx = p_jh->vx;
double* vy = p_jh->vy;
double* vz = p_jh->vz;
#endif // AVX512

for (unsigned s=0;s<N_systems;s++){
double x0s = 0;
double y0s = 0;
double z0s = 0;
double vx0s = 0;
double vy0s = 0;
double vz0s = 0;
for (unsigned int i=1; i<N_per_system; i++){
x0s += x[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
y0s += y[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
z0s += z[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vx0s += vx[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vy0s += vy[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vz0s += vz[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
particles[s*N_per_system+i].vx = vx[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vx;
particles[s*N_per_system+i].vy = vy[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vy;
particles[s*N_per_system+i].vz = vz[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vz;
}
x0s /= ri_whfast512->p_jh0[s].m;
y0s /= ri_whfast512->p_jh0[s].m;
z0s /= ri_whfast512->p_jh0[s].m;
particles[s*N_per_system].x = ri_whfast512->p_jh0[s].x - x0s;
particles[s*N_per_system].y = ri_whfast512->p_jh0[s].y - y0s;
particles[s*N_per_system].z = ri_whfast512->p_jh0[s].z - z0s;
particles[s*N_per_system].vx = ri_whfast512->p_jh0[s].vx - vx0s;
particles[s*N_per_system].vy = ri_whfast512->p_jh0[s].vy - vy0s;
particles[s*N_per_system].vz = ri_whfast512->p_jh0[s].vz - vz0s;
for (unsigned int i=1; i<N_per_system; i++){
particles[s*N_per_system+i].x = x[s*p_per_system+(i-1)] + particles[s*N_per_system].x;
particles[s*N_per_system+i].y = y[s*p_per_system+(i-1)] + particles[s*N_per_system].y;
particles[s*N_per_system+i].z = z[s*p_per_system+(i-1)] + particles[s*N_per_system].z;
}
}
}

#ifdef AVX512

// Fast inverse factorial lookup table
Expand Down Expand Up @@ -735,65 +807,6 @@ static void inertial_to_democraticheliocentric_posvel(struct reb_simulation* r){

}

// Convert democratic heliocentric coordinates to inertial coordinates
// Note: this is only called at the end. Speed is not a concern.
static void democraticheliocentric_to_inertial_posvel(struct reb_simulation* r){
struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
struct reb_particle* particles = r->particles;
struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;
const unsigned int N_systems = ri_whfast512->N_systems;
const unsigned int p_per_system = 8/N_systems;
const unsigned int N_per_system = r->N/N_systems;

double m[8];
double x[8];
double y[8];
double z[8];
double vx[8];
double vy[8];
double vz[8];
_mm512_storeu_pd(&m, p_jh->m);
_mm512_storeu_pd(&x, p_jh->x);
_mm512_storeu_pd(&y, p_jh->y);
_mm512_storeu_pd(&z, p_jh->z);
_mm512_storeu_pd(&vx, p_jh->vx);
_mm512_storeu_pd(&vy, p_jh->vy);
_mm512_storeu_pd(&vz, p_jh->vz);

for (unsigned s=0;s<N_systems;s++){
double x0s = 0;
double y0s = 0;
double z0s = 0;
double vx0s = 0;
double vy0s = 0;
double vz0s = 0;
for (unsigned int i=1; i<N_per_system; i++){
x0s += x[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
y0s += y[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
z0s += z[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vx0s += vx[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vy0s += vy[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vz0s += vz[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
particles[s*N_per_system+i].vx = vx[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vx;
particles[s*N_per_system+i].vy = vy[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vy;
particles[s*N_per_system+i].vz = vz[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vz;
}
x0s /= ri_whfast512->p_jh0[s].m;
y0s /= ri_whfast512->p_jh0[s].m;
z0s /= ri_whfast512->p_jh0[s].m;
particles[s*N_per_system].x = ri_whfast512->p_jh0[s].x - x0s;
particles[s*N_per_system].y = ri_whfast512->p_jh0[s].y - y0s;
particles[s*N_per_system].z = ri_whfast512->p_jh0[s].z - z0s;
particles[s*N_per_system].vx = ri_whfast512->p_jh0[s].vx - vx0s;
particles[s*N_per_system].vy = ri_whfast512->p_jh0[s].vy - vy0s;
particles[s*N_per_system].vz = ri_whfast512->p_jh0[s].vz - vz0s;
for (unsigned int i=1; i<N_per_system; i++){
particles[s*N_per_system+i].x = x[s*p_per_system+(i-1)] + particles[s*N_per_system].x;
particles[s*N_per_system+i].y = y[s*p_per_system+(i-1)] + particles[s*N_per_system].y;
particles[s*N_per_system+i].z = z[s*p_per_system+(i-1)] + particles[s*N_per_system].z;
}
}
}

// Performs one complete jump step
static void reb_whfast512_jump_step(struct reb_simulation* r, const double _dt){
Expand Down Expand Up @@ -1060,60 +1073,6 @@ void reb_integrator_whfast512_synchronize(struct reb_simulation* const r){
#endif // AVX512
}

// Convert democratic heliocentric coordinates to inertial coordinates
// Fallback for synchronization for when AVX512 is not available
static void democraticheliocentric_to_inertial_posvel_fallback(struct reb_simulation* r){
struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
struct reb_particle* particles = r->particles;
struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;
const unsigned int N_systems = ri_whfast512->N_systems;
const unsigned int p_per_system = 8/N_systems;
const unsigned int N_per_system = r->N/N_systems;

double* m = p_jh->m;
double* x = p_jh->x;
double* y = p_jh->y;
double* z = p_jh->z;
double* vx = p_jh->vx;
double* vy = p_jh->vy;
double* vz = p_jh->vz;

for (unsigned s=0;s<N_systems;s++){
double x0s = 0;
double y0s = 0;
double z0s = 0;
double vx0s = 0;
double vy0s = 0;
double vz0s = 0;
for (unsigned int i=1; i<N_per_system; i++){
x0s += x[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
y0s += y[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
z0s += z[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vx0s += vx[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vy0s += vy[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vz0s += vz[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
particles[s*N_per_system+i].vx = vx[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vx;
particles[s*N_per_system+i].vy = vy[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vy;
particles[s*N_per_system+i].vz = vz[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vz;
}
x0s /= ri_whfast512->p_jh0[s].m;
y0s /= ri_whfast512->p_jh0[s].m;
z0s /= ri_whfast512->p_jh0[s].m;
particles[s*N_per_system].x = ri_whfast512->p_jh0[s].x - x0s;
particles[s*N_per_system].y = ri_whfast512->p_jh0[s].y - y0s;
particles[s*N_per_system].z = ri_whfast512->p_jh0[s].z - z0s;
particles[s*N_per_system].vx = ri_whfast512->p_jh0[s].vx - vx0s;
particles[s*N_per_system].vy = ri_whfast512->p_jh0[s].vy - vy0s;
particles[s*N_per_system].vz = ri_whfast512->p_jh0[s].vz - vz0s;
for (unsigned int i=1; i<N_per_system; i++){
particles[s*N_per_system+i].x = x[s*p_per_system+(i-1)] + particles[s*N_per_system].x;
particles[s*N_per_system+i].y = y[s*p_per_system+(i-1)] + particles[s*N_per_system].y;
particles[s*N_per_system+i].z = z[s*p_per_system+(i-1)] + particles[s*N_per_system].z;
}
}
}


void reb_integrator_whfast512_synchronize_fallback(struct reb_simulation* const r){
// No AVX512 available
// Using WHFast as a workaround.
Expand Down Expand Up @@ -1147,7 +1106,7 @@ void reb_integrator_whfast512_synchronize_fallback(struct reb_simulation* const
}
}
reb_whfast512_com_step(r, dt/2.0); // does not use AVX512
democraticheliocentric_to_inertial_posvel_fallback(r);
democraticheliocentric_to_inertial_posvel(r);
ri_whfast512->is_synchronized = 1;
}
}
Expand Down

0 comments on commit 7747075

Please sign in to comment.