aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorschnetter <schnetter@2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8>2003-05-10 10:52:30 +0000
committerschnetter <schnetter@2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8>2003-05-10 10:52:30 +0000
commit20b08143e9ba60a2a15d4def1c4b3804b5b59ff8 (patch)
treee4156392dabd410b20a23b8b5375463549f252bb
parent56df8aae484f856e79a0d609b57d0b17a286b280 (diff)
Optimise the common case.
Output timing values if desired. git-svn-id: http://svn.cactuscode.org/arrangements/CactusNumerical/Slab/trunk@22 2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8
-rw-r--r--param.ccl4
-rw-r--r--schedule.ccl12
-rw-r--r--src/slab.c266
3 files changed, 205 insertions, 77 deletions
diff --git a/param.ccl b/param.ccl
index 31b7f9c..3be94cc 100644
--- a/param.ccl
+++ b/param.ccl
@@ -1,2 +1,6 @@
# Parameter definitions for thorn Slab
# $Header$
+
+BOOLEAN timer_output "Print slabbing timings at shutdown time" STEERABLE=always
+{
+} "no"
diff --git a/schedule.ccl b/schedule.ccl
index 07ea006..cca4c9e 100644
--- a/schedule.ccl
+++ b/schedule.ccl
@@ -5,3 +5,15 @@ SCHEDULE Slab_InitMPIDatatypes AT startup after Driver_Startup
{
LANG: C
} "Create MPI datatypes for complex variables in C"
+
+SCHEDULE Slab_InitTimers AT startup
+{
+ LANG: C
+} "Initialise timers"
+
+if (timer_output) {
+ SCHEDULE Slab_PrintTimers AT shutdown
+ {
+ LANG: C
+ } "Print timers"
+}
diff --git a/src/slab.c b/src/slab.c
index 64ce644..7b3fed4 100644
--- a/src/slab.c
+++ b/src/slab.c
@@ -12,6 +12,7 @@
+#define NDEBUG
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
@@ -60,6 +61,29 @@ CCTK_FILEVERSION(TAT_Slab_slab_c);
+static int timer_init;
+static int timer_copy_in;
+static int timer_xfer;
+static int timer_copy_back;
+
+void Slab_InitTimers (void)
+{
+ timer_init = CCTK_TimerCreate ("Slab/init");
+ timer_copy_in = CCTK_TimerCreate ("Slab/copy in");
+ timer_xfer = CCTK_TimerCreate ("Slab/xfer");
+ timer_copy_back = CCTK_TimerCreate ("Slab/copy back");
+}
+
+void Slab_PrintTimers (void)
+{
+ CCTK_TimerPrintDataI (timer_init , -1);
+ CCTK_TimerPrintDataI (timer_copy_in , -1);
+ CCTK_TimerPrintDataI (timer_xfer , -1);
+ CCTK_TimerPrintDataI (timer_copy_back, -1);
+}
+
+
+
/* Find out which driver to use */
#ifdef CCTK_MPI
# if defined CARPET_CARPET
@@ -590,6 +614,7 @@ int Slab_Transfer (cGH const * restrict const cctkGH,
assert (dsttype >= 0);
/* assert (dstptr); */
+ CCTK_TimerStartI (timer_init);
assert (dim <= SLAB_MAXDIM);
info = malloc (SLAB_MAXDIM * sizeof *info);
assert (info);
@@ -923,53 +948,93 @@ int Slab_Transfer (cGH const * restrict const cctkGH,
free (dst2count);
}
+ CCTK_TimerStopI (timer_init);
+
+ CCTK_TimerStartI (timer_copy_in);
+
for (n = 0; n < size; ++n) {
assert (SLAB_MAXDIM == 3);
- for (k = 0; k < srcdetail[n*SLAB_MAXDIM+info[2].xpose].len; ++k) {
- for (j = 0; j < srcdetail[n*SLAB_MAXDIM+info[1].xpose].len; ++j) {
- for (i = 0; i < srcdetail[n*SLAB_MAXDIM+info[0].xpose].len; ++i) {
- int ipos[SLAB_MAXDIM];
- int srcipos[SLAB_MAXDIM];
- int bufipos[SLAB_MAXDIM];
- size_t srcind;
- size_t bufind;
- ipos[0] = i;
- ipos[1] = j;
- ipos[2] = k;
- for (d=0; d<SLAB_MAXDIM; ++d) {
- int const c = info[d].xpose;
- srcipos[c] = srcdetail[n*SLAB_MAXDIM+c].off + ipos[d] * srcdetail[n*SLAB_MAXDIM+c].str;
- assert (srcipos[c] >= info[c].src.local.off
- && srcipos[c] < info[c].src.local.off + info[c].src.local.len);
- if (! (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off
- && srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1) * allinfo[n*SLAB_MAXDIM+c].src.slab.str)) {
- }
- assert (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off
- && srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1) * allinfo[n*SLAB_MAXDIM+c].src.slab.str);
- assert ((srcipos[c] - allinfo[n*SLAB_MAXDIM+c].src.slab.off) % allinfo[n*SLAB_MAXDIM+c].src.slab.str == 0);
- bufipos[d] = ipos[d];
- assert (bufipos[d] >= 0 && bufipos[d] < srcdetail[n*SLAB_MAXDIM+c].len);
- }
- srcind = 0;
- bufind = 0;
- for (d=SLAB_MAXDIM-1; d>=0; --d) {
- int const c = info[d].xpose;
- srcind = srcind * info[d].src.local.len + srcipos[d] - info[d].src.local.off;
- bufind = bufind * srcdetail[n*SLAB_MAXDIM+c].len + bufipos[d];
- }
- assert (srcind < srclentot);
- assert (bufind < (size_t)srccount[n]);
+
+ if (info[0].xpose==0 && info[1].xpose==1 && info[2].xpose==2
+ && srcdetail[n*SLAB_MAXDIM ].str==1 && srcdetail[n*SLAB_MAXDIM+1].str==1 && srcdetail[n*SLAB_MAXDIM+2].str==1
+ && srctype == CCTK_VARIABLE_REAL) {
+ /* Optimised version for a special case */
+
+ int const srcleni = info[0].src.local.len;
+ int const srclenj = info[1].src.local.len;
+ int const srclenk = info[2].src.local.len;
+
+ int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off;
+ int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off;
+ int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off;
+
+ int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len;
+ int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len;
+ int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len;
+
+ for (k = 0; k < srcdetaillenk; ++k) {
+ for (j = 0; j < srcdetaillenj; ++j) {
+ for (i = 0; i < srcdetailleni; ++i) {
+ int const srcindi = srcdetailoffi + i;
+ int const srcindj = srcdetailoffj + j;
+ int const srcindk = srcdetailoffk + k;
+ size_t const srcind = srcindi + srcleni * (srcindj + srclenj * srcindk);
+ size_t const bufind = i + srcdetailleni * (j + srcdetaillenj * k);
+ ((CCTK_REAL*)srcdata)[srcoffset[n] + bufind] = ((const CCTK_REAL*)srcptr)[srcind];
+ }
+ }
+ }
+
+ } else {
+ /* Generic, unoptimised version */
+
+ for (k = 0; k < srcdetail[n*SLAB_MAXDIM+info[2].xpose].len; ++k) {
+ for (j = 0; j < srcdetail[n*SLAB_MAXDIM+info[1].xpose].len; ++j) {
+ for (i = 0; i < srcdetail[n*SLAB_MAXDIM+info[0].xpose].len; ++i) {
+ int ipos[SLAB_MAXDIM];
+ int srcipos[SLAB_MAXDIM];
+ int bufipos[SLAB_MAXDIM];
+ size_t srcind;
+ size_t bufind;
+ ipos[0] = i;
+ ipos[1] = j;
+ ipos[2] = k;
+ for (d=0; d<SLAB_MAXDIM; ++d) {
+ int const c = info[d].xpose;
+ srcipos[c] = srcdetail[n*SLAB_MAXDIM+c].off + ipos[d] * srcdetail[n*SLAB_MAXDIM+c].str;
+ assert (srcipos[c] >= info[c].src.local.off
+ && srcipos[c] < info[c].src.local.off + info[c].src.local.len);
+ if (! (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off
+ && srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1) * allinfo[n*SLAB_MAXDIM+c].src.slab.str)) {
+ }
+ assert (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off
+ && srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1) * allinfo[n*SLAB_MAXDIM+c].src.slab.str);
+ assert ((srcipos[c] - allinfo[n*SLAB_MAXDIM+c].src.slab.off) % allinfo[n*SLAB_MAXDIM+c].src.slab.str == 0);
+ bufipos[d] = ipos[d];
+ assert (bufipos[d] >= 0 && bufipos[d] < srcdetail[n*SLAB_MAXDIM+c].len);
+ }
+ srcind = 0;
+ bufind = 0;
+ for (d=SLAB_MAXDIM-1; d>=0; --d) {
+ int const c = info[d].xpose;
+ srcind = srcind * info[d].src.local.len + srcipos[d] - info[d].src.local.off;
+ bufind = bufind * srcdetail[n*SLAB_MAXDIM+c].len + bufipos[d];
+ }
+ assert (srcind < srclentot);
+ assert (bufind < (size_t)srccount[n]);
/* ((CCTK_REAL*)srcdata)[srcoffset[n] + bufind] */
/* = ((const CCTK_REAL*)srcptr)[srcind]; */
- memcpy ((char *) srcdata + srctypesize * (srcoffset[n] + bufind),
- (const char *) srcptr + srctypesize * srcind,
- srctypesize);
- }
+ memcpy ((char *) srcdata + srctypesize * (srcoffset[n] + bufind),
+ (const char *) srcptr + srctypesize * srcind,
+ srctypesize);
+ }
+ }
}
+
}
- }
+ } /* for n */
ifcheck {
if (srctype == CCTK_VARIABLE_REAL) {
@@ -981,7 +1046,11 @@ int Slab_Transfer (cGH const * restrict const cctkGH,
}
}
}
+ CCTK_TimerStopI (timer_copy_in);
+
+
+ CCTK_TimerStartI (timer_xfer);
ifdebug fflush (stdout);
MPI_Alltoallv
(srcdata, srccount, srcoffset, srcdatatype,
@@ -997,51 +1066,94 @@ int Slab_Transfer (cGH const * restrict const cctkGH,
}
}
}
+ CCTK_TimerStopI (timer_xfer);
+
+
+ CCTK_TimerStartI (timer_copy_back);
for (n = 0; n < size; ++n) {
assert (SLAB_MAXDIM == 3);
- for (k = 0; k < dstdetail[n*SLAB_MAXDIM+2].len; ++k) {
- for (j = 0; j < dstdetail[n*SLAB_MAXDIM+1].len; ++j) {
- for (i = 0; i < dstdetail[n*SLAB_MAXDIM+0].len; ++i) {
- int ipos[SLAB_MAXDIM];
- int bufipos[SLAB_MAXDIM];
- int dstipos[SLAB_MAXDIM];
- size_t bufind;
- size_t dstind;
- ipos[0] = i;
- ipos[1] = j;
- ipos[2] = k;
- for (d=0; d<SLAB_MAXDIM; ++d) {
- if (! info[d].flip) {
- bufipos[d] = ipos[d];
- } else {
- bufipos[d] = dstdetail[n*SLAB_MAXDIM+d].len - 1 - ipos[d];
- }
- ifcheck assert (bufipos[d] >= 0 && bufipos[d] < dstdetail[n*SLAB_MAXDIM+d].len);
- dstipos[d] = dstdetail[n*SLAB_MAXDIM+d].off + ipos[d] * info[d].dst.slab.str;
- ifcheck assert (dstipos[d] >= info[d].dst.local.off
- && dstipos[d] < info[d].dst.local.off + info[d].dst.local.len);
- ifcheck assert (dstipos[d] >= info[d].dst.slab.off
- && dstipos[d] <= info[d].dst.slab.off + (info[d].dst.slab.len - 1) * info[d].dst.slab.str);
- ifcheck assert ((dstipos[d] - info[d].dst.slab.off) % info[d].dst.slab.str == 0);
- }
- bufind = 0;
- dstind = 0;
- for (d=SLAB_MAXDIM-1; d>=0; --d) {
- bufind = bufind * dstdetail[n*SLAB_MAXDIM+d].len + bufipos[d];
- dstind = dstind * info[d].dst.local.len + dstipos[d] - info[d].dst.local.off;
- }
- ifcheck assert (bufind < (size_t)dstcount[n]);
- ifcheck assert (dstind < dstlentot);
+
+ if (info[0].xpose==0 && info[1].xpose==1 && info[2].xpose==2
+ && info[0].flip==0 && info[1].flip==0 && info[2].flip==0
+ && dstdetail[n*SLAB_MAXDIM ].str==1 && dstdetail[n*SLAB_MAXDIM+1].str==1 && dstdetail[n*SLAB_MAXDIM+2].str==1
+ && dsttype == CCTK_VARIABLE_REAL) {
+ /* Optimised version for a special case */
+
+ int const dstleni = info[0].dst.local.len;
+ int const dstlenj = info[1].dst.local.len;
+ int const dstlenk = info[2].src.local.len;
+
+ int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
+ int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
+ int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
+
+ int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
+ int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
+ int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+
+ for (k = 0; k < dstdetaillenk; ++k) {
+ for (j = 0; j < dstdetaillenj; ++j) {
+ for (i = 0; i < dstdetailleni; ++i) {
+ size_t const bufind = i + dstdetailleni * (j + dstdetaillenj * k);
+ int const dstindi = dstdetailoffi + i;
+ int const dstindj = dstdetailoffj + j;
+ int const dstindk = dstdetailoffk + k;
+ size_t const dstind = dstindi + dstleni * (dstindj + dstlenj * dstindk);
+ ((CCTK_REAL*)dstptr)[dstind] = ((const CCTK_REAL*)dstdata)[dstoffset[n] + bufind];
+ }
+ }
+ }
+
+ } else {
+ /* Generic, unoptimised version */
+
+ for (k = 0; k < dstdetail[n*SLAB_MAXDIM+2].len; ++k) {
+ for (j = 0; j < dstdetail[n*SLAB_MAXDIM+1].len; ++j) {
+ for (i = 0; i < dstdetail[n*SLAB_MAXDIM+0].len; ++i) {
+ int ipos[SLAB_MAXDIM];
+ int bufipos[SLAB_MAXDIM];
+ int dstipos[SLAB_MAXDIM];
+ size_t bufind;
+ size_t dstind;
+ ipos[0] = i;
+ ipos[1] = j;
+ ipos[2] = k;
+ for (d=0; d<SLAB_MAXDIM; ++d) {
+ if (! info[d].flip) {
+ bufipos[d] = ipos[d];
+ } else {
+ bufipos[d] = dstdetail[n*SLAB_MAXDIM+d].len - 1 - ipos[d];
+ }
+ ifcheck assert (bufipos[d] >= 0 && bufipos[d] < dstdetail[n*SLAB_MAXDIM+d].len);
+ dstipos[d] = dstdetail[n*SLAB_MAXDIM+d].off + ipos[d] * info[d].dst.slab.str;
+ ifcheck assert (dstipos[d] >= info[d].dst.local.off
+ && dstipos[d] < info[d].dst.local.off + info[d].dst.local.len);
+ ifcheck assert (dstipos[d] >= info[d].dst.slab.off
+ && dstipos[d] <= info[d].dst.slab.off + (info[d].dst.slab.len - 1) * info[d].dst.slab.str);
+ ifcheck assert ((dstipos[d] - info[d].dst.slab.off) % info[d].dst.slab.str == 0);
+ }
+ bufind = 0;
+ dstind = 0;
+ for (d=SLAB_MAXDIM-1; d>=0; --d) {
+ bufind = bufind * dstdetail[n*SLAB_MAXDIM+d].len + bufipos[d];
+ dstind = dstind * info[d].dst.local.len + dstipos[d] - info[d].dst.local.off;
+ }
+ ifcheck assert (bufind < (size_t)dstcount[n]);
+ ifcheck assert (dstind < dstlentot);
/* ((CCTK_REAL*)dstptr)[dstind] */
/* = ((const CCTK_REAL*)dstdata)[dstoffset[n] + bufind]; */
- memcpy ((char *) dstptr + dsttypesize * dstind,
- (const char *) dstdata + dsttypesize * (dstoffset[n] + bufind),
- dsttypesize);
- }
+ memcpy ((char *) dstptr + dsttypesize * dstind,
+ (const char *) dstdata + dsttypesize * (dstoffset[n] + bufind),
+ dsttypesize);
+ }
+ }
}
+
}
- }
+
+ } /* for n */
+ CCTK_TimerStopI (timer_copy_back);