From 20b08143e9ba60a2a15d4def1c4b3804b5b59ff8 Mon Sep 17 00:00:00 2001 From: schnetter Date: Sat, 10 May 2003 10:52:30 +0000 Subject: Optimise the common case. Output timing values if desired. git-svn-id: http://svn.cactuscode.org/arrangements/CactusNumerical/Slab/trunk@22 2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8 --- param.ccl | 4 + schedule.ccl | 12 +++ src/slab.c | 266 ++++++++++++++++++++++++++++++++++++++++++----------------- 3 files changed, 205 insertions(+), 77 deletions(-) diff --git a/param.ccl b/param.ccl index 31b7f9c..3be94cc 100644 --- a/param.ccl +++ b/param.ccl @@ -1,2 +1,6 @@ # Parameter definitions for thorn Slab # $Header$ + +BOOLEAN timer_output "Print slabbing timings at shutdown time" STEERABLE=always +{ +} "no" diff --git a/schedule.ccl b/schedule.ccl index 07ea006..cca4c9e 100644 --- a/schedule.ccl +++ b/schedule.ccl @@ -5,3 +5,15 @@ SCHEDULE Slab_InitMPIDatatypes AT startup after Driver_Startup { LANG: C } "Create MPI datatypes for complex variables in C" + +SCHEDULE Slab_InitTimers AT startup +{ + LANG: C +} "Initialise timers" + +if (timer_output) { + SCHEDULE Slab_PrintTimers AT shutdown + { + LANG: C + } "Print timers" +} diff --git a/src/slab.c b/src/slab.c index 64ce644..7b3fed4 100644 --- a/src/slab.c +++ b/src/slab.c @@ -12,6 +12,7 @@ +#define NDEBUG #include #include #include @@ -60,6 +61,29 @@ CCTK_FILEVERSION(TAT_Slab_slab_c); +static int timer_init; +static int timer_copy_in; +static int timer_xfer; +static int timer_copy_back; + +void Slab_InitTimers (void) +{ + timer_init = CCTK_TimerCreate ("Slab/init"); + timer_copy_in = CCTK_TimerCreate ("Slab/copy in"); + timer_xfer = CCTK_TimerCreate ("Slab/xfer"); + timer_copy_back = CCTK_TimerCreate ("Slab/copy back"); +} + +void Slab_PrintTimers (void) +{ + CCTK_TimerPrintDataI (timer_init , -1); + CCTK_TimerPrintDataI (timer_copy_in , -1); + CCTK_TimerPrintDataI (timer_xfer , -1); + CCTK_TimerPrintDataI (timer_copy_back, -1); +} + + + /* Find out which driver to use */ #ifdef CCTK_MPI # if defined CARPET_CARPET @@ -590,6 +614,7 @@ int Slab_Transfer (cGH const * restrict const cctkGH, assert (dsttype >= 0); /* assert (dstptr); */ + CCTK_TimerStartI (timer_init); assert (dim <= SLAB_MAXDIM); info = malloc (SLAB_MAXDIM * sizeof *info); assert (info); @@ -923,53 +948,93 @@ int Slab_Transfer (cGH const * restrict const cctkGH, free (dst2count); } + CCTK_TimerStopI (timer_init); + + CCTK_TimerStartI (timer_copy_in); + for (n = 0; n < size; ++n) { assert (SLAB_MAXDIM == 3); - for (k = 0; k < srcdetail[n*SLAB_MAXDIM+info[2].xpose].len; ++k) { - for (j = 0; j < srcdetail[n*SLAB_MAXDIM+info[1].xpose].len; ++j) { - for (i = 0; i < srcdetail[n*SLAB_MAXDIM+info[0].xpose].len; ++i) { - int ipos[SLAB_MAXDIM]; - int srcipos[SLAB_MAXDIM]; - int bufipos[SLAB_MAXDIM]; - size_t srcind; - size_t bufind; - ipos[0] = i; - ipos[1] = j; - ipos[2] = k; - for (d=0; d= info[c].src.local.off - && srcipos[c] < info[c].src.local.off + info[c].src.local.len); - if (! (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off - && srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1) * allinfo[n*SLAB_MAXDIM+c].src.slab.str)) { - } - assert (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off - && srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1) * allinfo[n*SLAB_MAXDIM+c].src.slab.str); - assert ((srcipos[c] - allinfo[n*SLAB_MAXDIM+c].src.slab.off) % allinfo[n*SLAB_MAXDIM+c].src.slab.str == 0); - bufipos[d] = ipos[d]; - assert (bufipos[d] >= 0 && bufipos[d] < srcdetail[n*SLAB_MAXDIM+c].len); - } - srcind = 0; - bufind = 0; - for (d=SLAB_MAXDIM-1; d>=0; --d) { - int const c = info[d].xpose; - srcind = srcind * info[d].src.local.len + srcipos[d] - info[d].src.local.off; - bufind = bufind * srcdetail[n*SLAB_MAXDIM+c].len + bufipos[d]; - } - assert (srcind < srclentot); - assert (bufind < (size_t)srccount[n]); + + if (info[0].xpose==0 && info[1].xpose==1 && info[2].xpose==2 + && srcdetail[n*SLAB_MAXDIM ].str==1 && srcdetail[n*SLAB_MAXDIM+1].str==1 && srcdetail[n*SLAB_MAXDIM+2].str==1 + && srctype == CCTK_VARIABLE_REAL) { + /* Optimised version for a special case */ + + int const srcleni = info[0].src.local.len; + int const srclenj = info[1].src.local.len; + int const srclenk = info[2].src.local.len; + + int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off; + int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off; + int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off; + + int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len; + int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len; + int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len; + + for (k = 0; k < srcdetaillenk; ++k) { + for (j = 0; j < srcdetaillenj; ++j) { + for (i = 0; i < srcdetailleni; ++i) { + int const srcindi = srcdetailoffi + i; + int const srcindj = srcdetailoffj + j; + int const srcindk = srcdetailoffk + k; + size_t const srcind = srcindi + srcleni * (srcindj + srclenj * srcindk); + size_t const bufind = i + srcdetailleni * (j + srcdetaillenj * k); + ((CCTK_REAL*)srcdata)[srcoffset[n] + bufind] = ((const CCTK_REAL*)srcptr)[srcind]; + } + } + } + + } else { + /* Generic, unoptimised version */ + + for (k = 0; k < srcdetail[n*SLAB_MAXDIM+info[2].xpose].len; ++k) { + for (j = 0; j < srcdetail[n*SLAB_MAXDIM+info[1].xpose].len; ++j) { + for (i = 0; i < srcdetail[n*SLAB_MAXDIM+info[0].xpose].len; ++i) { + int ipos[SLAB_MAXDIM]; + int srcipos[SLAB_MAXDIM]; + int bufipos[SLAB_MAXDIM]; + size_t srcind; + size_t bufind; + ipos[0] = i; + ipos[1] = j; + ipos[2] = k; + for (d=0; d= info[c].src.local.off + && srcipos[c] < info[c].src.local.off + info[c].src.local.len); + if (! (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off + && srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1) * allinfo[n*SLAB_MAXDIM+c].src.slab.str)) { + } + assert (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off + && srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1) * allinfo[n*SLAB_MAXDIM+c].src.slab.str); + assert ((srcipos[c] - allinfo[n*SLAB_MAXDIM+c].src.slab.off) % allinfo[n*SLAB_MAXDIM+c].src.slab.str == 0); + bufipos[d] = ipos[d]; + assert (bufipos[d] >= 0 && bufipos[d] < srcdetail[n*SLAB_MAXDIM+c].len); + } + srcind = 0; + bufind = 0; + for (d=SLAB_MAXDIM-1; d>=0; --d) { + int const c = info[d].xpose; + srcind = srcind * info[d].src.local.len + srcipos[d] - info[d].src.local.off; + bufind = bufind * srcdetail[n*SLAB_MAXDIM+c].len + bufipos[d]; + } + assert (srcind < srclentot); + assert (bufind < (size_t)srccount[n]); /* ((CCTK_REAL*)srcdata)[srcoffset[n] + bufind] */ /* = ((const CCTK_REAL*)srcptr)[srcind]; */ - memcpy ((char *) srcdata + srctypesize * (srcoffset[n] + bufind), - (const char *) srcptr + srctypesize * srcind, - srctypesize); - } + memcpy ((char *) srcdata + srctypesize * (srcoffset[n] + bufind), + (const char *) srcptr + srctypesize * srcind, + srctypesize); + } + } } + } - } + } /* for n */ ifcheck { if (srctype == CCTK_VARIABLE_REAL) { @@ -981,7 +1046,11 @@ int Slab_Transfer (cGH const * restrict const cctkGH, } } } + CCTK_TimerStopI (timer_copy_in); + + + CCTK_TimerStartI (timer_xfer); ifdebug fflush (stdout); MPI_Alltoallv (srcdata, srccount, srcoffset, srcdatatype, @@ -997,51 +1066,94 @@ int Slab_Transfer (cGH const * restrict const cctkGH, } } } + CCTK_TimerStopI (timer_xfer); + + + CCTK_TimerStartI (timer_copy_back); for (n = 0; n < size; ++n) { assert (SLAB_MAXDIM == 3); - for (k = 0; k < dstdetail[n*SLAB_MAXDIM+2].len; ++k) { - for (j = 0; j < dstdetail[n*SLAB_MAXDIM+1].len; ++j) { - for (i = 0; i < dstdetail[n*SLAB_MAXDIM+0].len; ++i) { - int ipos[SLAB_MAXDIM]; - int bufipos[SLAB_MAXDIM]; - int dstipos[SLAB_MAXDIM]; - size_t bufind; - size_t dstind; - ipos[0] = i; - ipos[1] = j; - ipos[2] = k; - for (d=0; d= 0 && bufipos[d] < dstdetail[n*SLAB_MAXDIM+d].len); - dstipos[d] = dstdetail[n*SLAB_MAXDIM+d].off + ipos[d] * info[d].dst.slab.str; - ifcheck assert (dstipos[d] >= info[d].dst.local.off - && dstipos[d] < info[d].dst.local.off + info[d].dst.local.len); - ifcheck assert (dstipos[d] >= info[d].dst.slab.off - && dstipos[d] <= info[d].dst.slab.off + (info[d].dst.slab.len - 1) * info[d].dst.slab.str); - ifcheck assert ((dstipos[d] - info[d].dst.slab.off) % info[d].dst.slab.str == 0); - } - bufind = 0; - dstind = 0; - for (d=SLAB_MAXDIM-1; d>=0; --d) { - bufind = bufind * dstdetail[n*SLAB_MAXDIM+d].len + bufipos[d]; - dstind = dstind * info[d].dst.local.len + dstipos[d] - info[d].dst.local.off; - } - ifcheck assert (bufind < (size_t)dstcount[n]); - ifcheck assert (dstind < dstlentot); + + if (info[0].xpose==0 && info[1].xpose==1 && info[2].xpose==2 + && info[0].flip==0 && info[1].flip==0 && info[2].flip==0 + && dstdetail[n*SLAB_MAXDIM ].str==1 && dstdetail[n*SLAB_MAXDIM+1].str==1 && dstdetail[n*SLAB_MAXDIM+2].str==1 + && dsttype == CCTK_VARIABLE_REAL) { + /* Optimised version for a special case */ + + int const dstleni = info[0].dst.local.len; + int const dstlenj = info[1].dst.local.len; + int const dstlenk = info[2].src.local.len; + + int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; + int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; + int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; + + int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; + int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; + int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; + + for (k = 0; k < dstdetaillenk; ++k) { + for (j = 0; j < dstdetaillenj; ++j) { + for (i = 0; i < dstdetailleni; ++i) { + size_t const bufind = i + dstdetailleni * (j + dstdetaillenj * k); + int const dstindi = dstdetailoffi + i; + int const dstindj = dstdetailoffj + j; + int const dstindk = dstdetailoffk + k; + size_t const dstind = dstindi + dstleni * (dstindj + dstlenj * dstindk); + ((CCTK_REAL*)dstptr)[dstind] = ((const CCTK_REAL*)dstdata)[dstoffset[n] + bufind]; + } + } + } + + } else { + /* Generic, unoptimised version */ + + for (k = 0; k < dstdetail[n*SLAB_MAXDIM+2].len; ++k) { + for (j = 0; j < dstdetail[n*SLAB_MAXDIM+1].len; ++j) { + for (i = 0; i < dstdetail[n*SLAB_MAXDIM+0].len; ++i) { + int ipos[SLAB_MAXDIM]; + int bufipos[SLAB_MAXDIM]; + int dstipos[SLAB_MAXDIM]; + size_t bufind; + size_t dstind; + ipos[0] = i; + ipos[1] = j; + ipos[2] = k; + for (d=0; d= 0 && bufipos[d] < dstdetail[n*SLAB_MAXDIM+d].len); + dstipos[d] = dstdetail[n*SLAB_MAXDIM+d].off + ipos[d] * info[d].dst.slab.str; + ifcheck assert (dstipos[d] >= info[d].dst.local.off + && dstipos[d] < info[d].dst.local.off + info[d].dst.local.len); + ifcheck assert (dstipos[d] >= info[d].dst.slab.off + && dstipos[d] <= info[d].dst.slab.off + (info[d].dst.slab.len - 1) * info[d].dst.slab.str); + ifcheck assert ((dstipos[d] - info[d].dst.slab.off) % info[d].dst.slab.str == 0); + } + bufind = 0; + dstind = 0; + for (d=SLAB_MAXDIM-1; d>=0; --d) { + bufind = bufind * dstdetail[n*SLAB_MAXDIM+d].len + bufipos[d]; + dstind = dstind * info[d].dst.local.len + dstipos[d] - info[d].dst.local.off; + } + ifcheck assert (bufind < (size_t)dstcount[n]); + ifcheck assert (dstind < dstlentot); /* ((CCTK_REAL*)dstptr)[dstind] */ /* = ((const CCTK_REAL*)dstdata)[dstoffset[n] + bufind]; */ - memcpy ((char *) dstptr + dsttypesize * dstind, - (const char *) dstdata + dsttypesize * (dstoffset[n] + bufind), - dsttypesize); - } + memcpy ((char *) dstptr + dsttypesize * dstind, + (const char *) dstdata + dsttypesize * (dstoffset[n] + bufind), + dsttypesize); + } + } } + } - } + + } /* for n */ + CCTK_TimerStopI (timer_copy_back); -- cgit v1.2.3